Coverage Report

Created: 2026-02-26 06:25

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/libde265/libde265/fallback-dct.cc
Line
Count
Source
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4
 *
5
 * This file is part of libde265.
6
 *
7
 * libde265 is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as
9
 * published by the Free Software Foundation, either version 3 of
10
 * the License, or (at your option) any later version.
11
 *
12
 * libde265 is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
19
 */
20
21
#include "fallback-dct.h"
22
23
#if defined(_MSC_VER) || defined(__MINGW32__)
24
# include <malloc.h>
25
#elif defined(HAVE_ALLOCA_H)
26
# include <alloca.h>
27
#endif
28
29
#include <assert.h>
30
#include <algorithm>
31
32
33
// Debug helper: prints the n x n row-major matrix `v` to stdout.
// A "--- name ---" header line is printed first, then one text line per
// matrix row with each entry formatted as "%4d ".
static void printMatrix(const char* name, const int16_t* v, int n)
{
  printf("--- %s ---\n", name);

  for (int row = 0; row < n; row++) {
    const int rowBase = row * n;

    for (int col = 0; col < n; col++) {
      printf("%4d ", v[rowBase + col]);
    }

    printf("\n");
  }
}
43
44
45
46
void transform_skip_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
47
0
{
48
0
  int nT = 4;
49
0
  int bdShift2 = 20-8;
50
51
0
  assert(0); // DEPRECATED, should not be used anymore because of fixed 4x4 size
52
53
0
  for (int y=0;y<nT;y++)
54
0
    for (int x=0;x<nT;x++) {
55
0
      int32_t c = coeffs[x+y*nT] << 7;
56
0
      c = (c+(1<<(bdShift2-1)))>>bdShift2;
57
58
0
      dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + c);
59
0
    }
60
0
}
61
62
63
void transform_skip_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth)
64
0
{
65
0
  int nT = 4;
66
0
  int bdShift2 = 20-bit_depth;
67
68
0
  assert(0); // DEPRECATED, should not be used anymore because of fixed 4x4 size
69
70
0
  for (int y=0;y<nT;y++)
71
0
    for (int x=0;x<nT;x++) {
72
0
      int32_t c = coeffs[x+y*nT] << 7;
73
0
      c = (c+(1<<(bdShift2-1)))>>bdShift2;
74
75
0
      dst[y*stride+x] = Clip_BitDepth(dst[y*stride+x] + c, bit_depth);
76
0
    }
77
0
}
78
79
80
// Transform-skip residual computation for an nT x nT block.
//
// Every coefficient is shifted left by tsShift, the rounding offset
// 2^(bdShift-1) is added, and the result is shifted right by bdShift.
// Input and output are both row-major with row length nT.
void transform_skip_residual_fallback(int32_t *residual, const int16_t *coeffs, int nT,
                                      int tsShift,int bdShift)
{
  const int roundingOffset = 1<<(bdShift-1);

  for (int row=0; row<nT; row++) {
    const int rowBase = row*nT;

    for (int col=0; col<nT; col++) {
      const int idx = rowBase + col;

      // The left shift is done on an unsigned value because shifting a
      // negative signed value is undefined behaviour up to C++17.
      const uint32_t widened = (uint32_t)coeffs[idx];
      const int32_t  scaled  = (int32_t)(widened << tsShift);

      residual[idx] = (scaled + roundingOffset) >> bdShift;
    }
  }
}
91
92
93
void transform_skip_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs, int log2nT, ptrdiff_t stride)
94
0
{
95
0
  int bitDepth = 8;
96
0
  int bdShift2 = 20-bitDepth;
97
0
  int offset = (1<<(bdShift2-1));
98
0
  int tsShift = 5 + log2nT; // TODO: extended_precision
99
0
  int nT = 1<<log2nT;
100
101
0
  for (int x=0;x<nT;x++) {
102
0
    int32_t sum = 0;
103
104
0
    for (int y=0;y<nT;y++) {
105
0
      int c = coeffs[x+y*nT] << tsShift;
106
0
      sum += (c+offset)>>bdShift2;
107
108
0
      dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum);
109
0
    }
110
0
  }
111
0
}
112
113
void transform_skip_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs, int log2nT, ptrdiff_t stride)
114
0
{
115
0
  int bitDepth = 8;
116
0
  int bdShift2 = 20-bitDepth;
117
0
  int offset = (1<<(bdShift2-1));
118
0
  int tsShift = 5 + log2nT; // TODO: extended_precision
119
0
  int nT = 1<<log2nT;
120
121
0
  for (int y=0;y<nT;y++) {
122
0
    int32_t sum = 0;
123
124
0
    for (int x=0;x<nT;x++) {
125
0
      int c = coeffs[x+y*nT] << tsShift;
126
0
      sum += (c+offset)>>bdShift2;
127
128
0
      dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum);
129
0
    }
130
0
  }
131
0
}
132
133
134
void transform_bypass_rdpcm_v_8_fallback(uint8_t *dst, const int16_t *coeffs,int nT,ptrdiff_t stride)
135
0
{
136
0
  for (int x=0;x<nT;x++) {
137
0
    int32_t sum=0;
138
0
    for (int y=0;y<nT;y++) {
139
0
      sum += coeffs[x+y*nT];
140
141
0
      dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum);
142
0
    }
143
0
  }
144
0
}
145
146
147
void transform_bypass_rdpcm_h_8_fallback(uint8_t *dst, const int16_t *coeffs,int nT,ptrdiff_t stride)
148
0
{
149
0
  for (int y=0;y<nT;y++) {
150
0
    int32_t sum=0;
151
0
    for (int x=0;x<nT;x++) {
152
0
      sum += coeffs[x+y*nT];
153
154
0
      dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + sum);
155
0
    }
156
0
  }
157
0
}
158
159
160
// Transform-bypass with vertical RDPCM, residual output.
//
// dst[y][x] receives the sum of coeffs[0..y][x], i.e. a running sum down
// each column. Both arrays are row-major with row length nT.
void transform_bypass_rdpcm_v_fallback(int32_t *dst, const int16_t *coeffs,int nT)
{
  for (int col=0; col<nT; col++) {
    int32_t acc = 0;

    // idx steps through one column: col, col+nT, col+2*nT, ...
    for (int row=0, idx=col; row<nT; row++, idx+=nT) {
      acc += coeffs[idx];
      dst[idx] = acc;
    }
  }
}
171
172
173
// Transform-bypass with horizontal RDPCM, residual output.
//
// dst[y][x] receives the sum of coeffs[y][0..x], i.e. a running sum along
// each row. Both arrays are row-major with row length nT.
void transform_bypass_rdpcm_h_fallback(int32_t *dst, const int16_t *coeffs,int nT)
{
  for (int row=0; row<nT; row++) {
    const int rowBase = row*nT;
    int32_t acc = 0;

    for (int col=0; col<nT; col++) {
      const int idx = rowBase + col;
      acc += coeffs[idx];
      dst[idx] = acc;
    }
  }
}
184
185
186
// Vertical RDPCM on scaled coefficients.
//
// Every coefficient is shifted left by tsShift, rounded with 2^(bdShift-1)
// and shifted right by bdShift. The results are accumulated down each column
// and the running sum is written to residual.
//
//   residual - output, nT*nT values, row-major
//   coeffs   - input, nT*nT values, row-major
//   nT       - block size
//   tsShift  - left shift applied to each coefficient
//   bdShift  - right shift applied after rounding
void rdpcm_v_fallback(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift)
{
  const int rnd = (1<<(bdShift-1));

  for (int x=0;x<nT;x++) {
    int sum=0;
    for (int y=0;y<nT;y++) {
      // Shift in unsigned arithmetic: left-shifting a negative signed value
      // is undefined behaviour before C++20.
      int c = (int)((uint32_t)coeffs[x+y*nT] << tsShift);
      sum += (c+rnd)>>bdShift;
      residual[y*nT+x] = sum;
    }
  }
}
199
200
201
// Horizontal RDPCM on scaled coefficients.
//
// Every coefficient is shifted left by tsShift, rounded with 2^(bdShift-1)
// and shifted right by bdShift. The results are accumulated along each row
// and the running sum is written to residual.
//
//   residual - output, nT*nT values, row-major
//   coeffs   - input, nT*nT values, row-major
//   nT       - block size
//   tsShift  - left shift applied to each coefficient
//   bdShift  - right shift applied after rounding
void rdpcm_h_fallback(int32_t* residual, const int16_t* coeffs, int nT,int tsShift,int bdShift)
{
  const int rnd = (1<<(bdShift-1));

  for (int y=0;y<nT;y++) {
    int sum=0;
    for (int x=0;x<nT;x++) {
      // Shift in unsigned arithmetic: left-shifting a negative signed value
      // is undefined behaviour before C++20.
      int c = (int)((uint32_t)coeffs[x+y*nT] << tsShift);
      sum += (c+rnd)>>bdShift;
      residual[y*nT+x] = sum;
    }
  }
}
214
215
216
// Transform-bypass: copies the nT x nT coefficient block unchanged into the
// 32-bit residual buffer dst (both row-major with row length nT).
void transform_bypass_fallback(int32_t *dst, const int16_t *coeffs, int nT)
{
  for (int row=0; row<nT; row++) {
    const int rowBase = row*nT;

    for (int col=0; col<nT; col++) {
      const int idx = rowBase + col;
      dst[idx] = (int32_t)coeffs[idx];
    }
  }
}
225
226
227
void transform_bypass_8_fallback(uint8_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride)
228
0
{
229
0
  for (int y=0;y<nT;y++)
230
0
    for (int x=0;x<nT;x++) {
231
0
      int32_t c = coeffs[x+y*nT];
232
233
0
      dst[y*stride+x] = Clip1_8bit(dst[y*stride+x] + c);
234
0
    }
235
0
}
236
237
238
void transform_bypass_16_fallback(uint16_t *dst, const int16_t *coeffs, int nT, ptrdiff_t stride, int bit_depth)
239
0
{
240
0
  int bdShift2 = 20-bit_depth;
241
242
0
  for (int y=0;y<nT;y++)
243
0
    for (int x=0;x<nT;x++) {
244
0
      int32_t c = coeffs[x+y*nT];
245
246
0
      dst[y*stride+x] = Clip_BitDepth(dst[y*stride+x] + c, bit_depth);
247
0
    }
248
0
}
249
250
251
// Rotates the nT x nT coefficient block by 180 degrees in place.
//
// Every element of the top nT/2 rows is swapped with its point-mirrored
// partner. For odd nT the middle row is left untouched.
void rotate_coefficients_fallback(int16_t *coeff, int nT)
{
  const int lastIdx   = nT*nT - 1;
  const int swapCount = (nT/2) * nT;   // number of elements in the top half

  // Element (y,x) has flat index y*nT+x; its mirror (nT-1-y, nT-1-x) has
  // flat index lastIdx - (y*nT+x).
  for (int front=0; front<swapCount; front++) {
    std::swap(coeff[front], coeff[lastIdx - front]);
  }
}
258
259
260
261
// 4x4 integer matrix used by the 4x4 luma DST routines in this file
// (forward transform indexes it as [i][j], inverse as [j][i]).
// The table is only ever read, hence const.
static const int8_t mat_8_357[4][4] = {
  { 29, 55, 74, 84 },
  { 74, 74,  0,-74 },
  { 84,-29,-74, 55 },
  { 55,-84, 74,-29 }
};
267
268
269
270
void transform_4x4_luma_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
271
0
{
272
0
  int16_t g[4][4];
273
274
0
  int postShift = 20-8; // 8 bit
275
0
  int rndV = 1<<(7-1);
276
0
  int rndH = 1<<(postShift-1);
277
278
279
  // --- V ---
280
281
0
  for (int c=0;c<4;c++) {
282
    /*
283
    logtrace(LogTransform,"DST-V: ");
284
    for (int r=0;r<4;r++) {
285
      logtrace(LogTransform,"%d ",coeffs[c+r*4]);
286
    }
287
    logtrace(LogTransform,"* -> ");
288
    */
289
290
0
    for (int i=0;i<4;i++) {
291
0
      int sum=0;
292
293
0
      for (int j=0;j<4;j++) {
294
0
        sum += mat_8_357[j][i] * coeffs[c+j*4];
295
0
      }
296
297
0
      g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7);
298
0
    }
299
300
    /*
301
    for (int y=0;y<4;y++) {
302
      logtrace(LogTransform,"*%d ",g[y][c]);
303
    }
304
    logtrace(LogTransform,"*\n");
305
    */
306
0
  }
307
308
309
  // --- H ---
310
311
0
  for (int y=0;y<4;y++) {
312
313
    /*
314
    logtrace(LogTransform,"DST-H: ");
315
    for (int c=0;c<4;c++) {
316
      logtrace(LogTransform,"%d ",g[y][c]);
317
    }
318
    logtrace(LogTransform,"* -> ");
319
    */
320
321
0
    for (int i=0;i<4;i++) {
322
0
      int sum=0;
323
324
0
      for (int j=0;j<4;j++) {
325
0
        sum += mat_8_357[j][i] * g[y][j];
326
0
      }
327
328
0
      int out = Clip3(-32768,32767, (sum+rndH)>>postShift);
329
330
0
      dst[y*stride+i] = Clip1_8bit(dst[y*stride+i] + out);
331
332
0
      logtrace(LogTransform,"*%d ",out);
333
0
    }
334
335
0
    logtrace(LogTransform,"*\n");
336
0
  }
337
0
}
338
339
340
void transform_4x4_luma_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride,
341
                                        int bit_depth)
342
0
{
343
0
  int16_t g[4][4];
344
345
0
  int postShift = 20-bit_depth;
346
0
  int rndV = 1<<(7-1);
347
0
  int rndH = 1<<(postShift-1);
348
349
350
  // --- V ---
351
352
0
  for (int c=0;c<4;c++) {
353
    /*
354
    logtrace(LogTransform,"DST-V: ");
355
    for (int r=0;r<4;r++) {
356
      logtrace(LogTransform,"%d ",coeffs[c+r*4]);
357
    }
358
    logtrace(LogTransform,"* -> ");
359
    */
360
361
0
    for (int i=0;i<4;i++) {
362
0
      int sum=0;
363
364
0
      for (int j=0;j<4;j++) {
365
0
        sum += mat_8_357[j][i] * coeffs[c+j*4];
366
0
      }
367
368
0
      g[i][c] = Clip3(-32768,32767, (sum+rndV)>>7);
369
0
    }
370
371
    /*
372
    for (int y=0;y<4;y++) {
373
      logtrace(LogTransform,"*%d ",g[y][c]);
374
    }
375
    logtrace(LogTransform,"*\n");
376
    */
377
0
  }
378
379
380
  // --- H ---
381
382
0
  for (int y=0;y<4;y++) {
383
384
    /*
385
    logtrace(LogTransform,"DST-H: ");
386
    for (int c=0;c<4;c++) {
387
      logtrace(LogTransform,"%d ",g[y][c]);
388
    }
389
    logtrace(LogTransform,"* -> ");
390
    */
391
392
0
    for (int i=0;i<4;i++) {
393
0
      int sum=0;
394
395
0
      for (int j=0;j<4;j++) {
396
0
        sum += mat_8_357[j][i] * g[y][j];
397
0
      }
398
399
0
      int out = Clip3(-32768,32767, (sum+rndH)>>postShift);
400
401
0
      dst[y*stride+i] = Clip_BitDepth(dst[y*stride+i] + out, bit_depth);
402
403
0
      logtrace(LogTransform,"*%d ",out);
404
0
    }
405
406
0
    logtrace(LogTransform,"*\n");
407
0
  }
408
0
}
409
410
411
void fdst_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
412
0
{
413
0
  int16_t g[4*4];
414
415
0
  int BD = 8;
416
0
  int shift1 = Log2(4) + BD -9;
417
0
  int shift2 = Log2(4) + 6;
418
419
0
  int rnd1 = 1<<(shift1-1);
420
0
  int rnd2 = 1<<(shift2-1);
421
422
423
  // --- V ---
424
425
0
  for (int c=0;c<4;c++) {
426
427
    /*
428
    logtrace(LogTransform,"DST-V: ");
429
    for (int r=0;r<4;r++) {
430
      logtrace(LogTransform,"%d ",coeffs[c+r*4]);
431
    }
432
    logtrace(LogTransform,"* -> ");
433
    */
434
435
0
    for (int i=0;i<4;i++) {
436
0
      int sum=0;
437
438
0
      for (int j=0;j<4;j++) {
439
0
        sum += mat_8_357[i][j] * input[c+j*stride];
440
0
      }
441
442
0
      g[c+4*i] = Clip3(-32768,32767, (sum+rnd1)>>shift1);
443
0
    }
444
0
  }
445
446
447
  // --- H ---
448
449
0
  for (int y=0;y<4;y++) {
450
0
    for (int i=0;i<4;i++) {
451
0
      int sum=0;
452
453
0
      for (int j=0;j<4;j++) {
454
0
        sum += mat_8_357[i][j] * g[y*4+j];
455
0
      }
456
457
      // TODO: do we need clipping ?
458
0
      int out = (sum+rnd2)>>shift2; // Clip3(-32768,32767, (sum+rndH)>>postShift);
459
460
0
      coeffs[y*4+i] = out;
461
462
0
      logtrace(LogTransform,"*%d ",out);
463
0
    }
464
465
0
    logtrace(LogTransform,"*\n");
466
0
  }
467
0
}
468
469
470
void transform_idst_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits)
471
0
{
472
0
  int16_t g[4][4];
473
474
0
  int rndV = 1<<(7-1);
475
0
  int rndH = 1<<(bdShift-1);
476
477
0
  int CoeffMax = (1<<max_coeff_bits)-1;
478
0
  int CoeffMin = -(1<<max_coeff_bits);
479
480
481
  // --- V ---
482
483
0
  for (int c=0;c<4;c++) {
484
0
    for (int i=0;i<4;i++) {
485
0
      int sum=0;
486
487
0
      for (int j=0;j<4;j++) {
488
0
        sum += mat_8_357[j][i] * coeffs[c+j*4];
489
0
      }
490
491
0
      g[i][c] = Clip3(CoeffMin,CoeffMax, (sum+rndV)>>7);
492
0
    }
493
0
  }
494
495
496
  // --- H ---
497
498
0
  for (int y=0;y<4;y++) {
499
0
    for (int i=0;i<4;i++) {
500
0
      int sum=0;
501
502
0
      for (int j=0;j<4;j++) {
503
0
        sum += mat_8_357[j][i] * g[y][j];
504
0
      }
505
506
0
      dst[y*4+i] = (sum + rndH)>>bdShift;
507
0
    }
508
0
  }
509
0
}
510
511
512
513
// 32x32 integer DCT matrix. The DCT routines in this file handle smaller
// block sizes nT by using only every (1<<(5-Log2(nT)))-th row of this table.
// The table is only ever read, hence const.
static const int8_t mat_dct[32][32] = {
  { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,      64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
  { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4,      -4,-13,-22,-31,-38,-46,-54,-61,-67,-73,-78,-82,-85,-88,-90,-90},
  { 90, 87, 80, 70, 57, 43, 25,  9, -9,-25,-43,-57,-70,-80,-87,-90,     -90,-87,-80,-70,-57,-43,-25, -9,  9, 25, 43, 57, 70, 80, 87, 90},
  { 90, 82, 67, 46, 22, -4,-31,-54,-73,-85,-90,-88,-78,-61,-38,-13,      13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4,-22,-46,-67,-82,-90},
  { 89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89,      89, 75, 50, 18,-18,-50,-75,-89,-89,-75,-50,-18, 18, 50, 75, 89},
  { 88, 67, 31,-13,-54,-82,-90,-78,-46, -4, 38, 73, 90, 85, 61, 22,     -22,-61,-85,-90,-73,-38,  4, 46, 78, 90, 82, 54, 13,-31,-67,-88},
  { 87, 57,  9,-43,-80,-90,-70,-25, 25, 70, 90, 80, 43, -9,-57,-87,     -87,-57, -9, 43, 80, 90, 70, 25,-25,-70,-90,-80,-43,  9, 57, 87},
  { 85, 46,-13,-67,-90,-73,-22, 38, 82, 88, 54, -4,-61,-90,-78,-31,      31, 78, 90, 61,  4,-54,-88,-82,-38, 22, 73, 90, 67, 13,-46,-85},
  { 83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83,      83, 36,-36,-83,-83,-36, 36, 83, 83, 36,-36,-83,-83,-36, 36, 83},
  { 82, 22,-54,-90,-61, 13, 78, 85, 31,-46,-90,-67,  4, 73, 88, 38,     -38,-88,-73, -4, 67, 90, 46,-31,-85,-78,-13, 61, 90, 54,-22,-82},
  { 80,  9,-70,-87,-25, 57, 90, 43,-43,-90,-57, 25, 87, 70, -9,-80,     -80, -9, 70, 87, 25,-57,-90,-43, 43, 90, 57,-25,-87,-70,  9, 80},
  { 78, -4,-82,-73, 13, 85, 67,-22,-88,-61, 31, 90, 54,-38,-90,-46,      46, 90, 38,-54,-90,-31, 61, 88, 22,-67,-85,-13, 73, 82,  4,-78},
  { 75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75,      75,-18,-89,-50, 50, 89, 18,-75,-75, 18, 89, 50,-50,-89,-18, 75},
  { 73,-31,-90,-22, 78, 67,-38,-90,-13, 82, 61,-46,-88, -4, 85, 54,     -54,-85,  4, 88, 46,-61,-82, 13, 90, 38,-67,-78, 22, 90, 31,-73},
  { 70,-43,-87,  9, 90, 25,-80,-57, 57, 80,-25,-90, -9, 87, 43,-70,     -70, 43, 87, -9,-90,-25, 80, 57,-57,-80, 25, 90,  9,-87,-43, 70},
  { 67,-54,-78, 38, 85,-22,-90,  4, 90, 13,-88,-31, 82, 46,-73,-61,      61, 73,-46,-82, 31, 88,-13,-90, -4, 90, 22,-85,-38, 78, 54,-67},
  { 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64,      64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64, 64,-64,-64, 64},
  { 61,-73,-46, 82, 31,-88,-13, 90, -4,-90, 22, 85,-38,-78, 54, 67,     -67,-54, 78, 38,-85,-22, 90,  4,-90, 13, 88,-31,-82, 46, 73,-61},
  { 57,-80,-25, 90, -9,-87, 43, 70,-70,-43, 87,  9,-90, 25, 80,-57,     -57, 80, 25,-90,  9, 87,-43,-70, 70, 43,-87, -9, 90,-25,-80, 57},
  { 54,-85, -4, 88,-46,-61, 82, 13,-90, 38, 67,-78,-22, 90,-31,-73,      73, 31,-90, 22, 78,-67,-38, 90,-13,-82, 61, 46,-88,  4, 85,-54},
  { 50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50,      50,-89, 18, 75,-75,-18, 89,-50,-50, 89,-18,-75, 75, 18,-89, 50},
  { 46,-90, 38, 54,-90, 31, 61,-88, 22, 67,-85, 13, 73,-82,  4, 78,     -78, -4, 82,-73,-13, 85,-67,-22, 88,-61,-31, 90,-54,-38, 90,-46},
  { 43,-90, 57, 25,-87, 70,  9,-80, 80, -9,-70, 87,-25,-57, 90,-43,     -43, 90,-57,-25, 87,-70, -9, 80,-80,  9, 70,-87, 25, 57,-90, 43},
  { 38,-88, 73, -4,-67, 90,-46,-31, 85,-78, 13, 61,-90, 54, 22,-82,      82,-22,-54, 90,-61,-13, 78,-85, 31, 46,-90, 67,  4,-73, 88,-38},
  { 36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36,      36,-83, 83,-36,-36, 83,-83, 36, 36,-83, 83,-36,-36, 83,-83, 36},
  { 31,-78, 90,-61,  4, 54,-88, 82,-38,-22, 73,-90, 67,-13,-46, 85,     -85, 46, 13,-67, 90,-73, 22, 38,-82, 88,-54, -4, 61,-90, 78,-31},
  { 25,-70, 90,-80, 43,  9,-57, 87,-87, 57, -9,-43, 80,-90, 70,-25,     -25, 70,-90, 80,-43, -9, 57,-87, 87,-57,  9, 43,-80, 90,-70, 25},
  { 22,-61, 85,-90, 73,-38, -4, 46,-78, 90,-82, 54,-13,-31, 67,-88,      88,-67, 31, 13,-54, 82,-90, 78,-46,  4, 38,-73, 90,-85, 61,-22},
  { 18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18,      18,-50, 75,-89, 89,-75, 50,-18,-18, 50,-75, 89,-89, 75,-50, 18},
  { 13,-38, 61,-78, 88,-90, 85,-73, 54,-31,  4, 22,-46, 67,-82, 90,     -90, 82,-67, 46,-22, -4, 31,-54, 73,-85, 90,-88, 78,-61, 38,-13},
  {  9,-25, 43,-57, 70,-80, 87,-90, 90,-87, 80,-70, 57,-43, 25, -9,      -9, 25,-43, 57,-70, 80,-87, 90,-90, 87,-80, 70,-57, 43,-25,  9},
  {  4,-13, 22,-31, 38,-46, 54,-61, 67,-73, 78,-82, 85,-88, 90,-90,      90,-90, 88,-85, 82,-78, 73,-67, 61,-54, 46,-38, 31,-22, 13, -4}
};
547
548
549
550
551
template <class pixel_t>
552
void transform_idct_add(pixel_t *dst, ptrdiff_t stride,
553
                        int nT, const int16_t *coeffs, int bit_depth)
554
0
{
555
  /*
556
    The effective shift is
557
    7 bits right for bit-depth 8,
558
    6 bits right for bit-depth 9,
559
    5 bits right for bit-depth 10.
560
561
    Computation is independent of the block size.
562
    Each multiplication with the table includes a left shift of 6 bits.
563
    Hence, we have 2* 6 bits = 12 bits left shift.
564
    V-pass has fixed 7 bit right shift.
565
    H-pass has 20-BitDepth bit right shift;
566
567
    Effective shift 's' means: residual value 1 gives DC-coeff (1<<s).
568
   */
569
570
571
0
  int postShift = 20-bit_depth;
572
0
  int rnd1 = 1<<(7-1);
573
0
  int rnd2 = 1<<(postShift-1);
574
0
  int fact = (1<<(5-Log2(nT)));
575
576
0
  int16_t g[32*32];  // actually, only [nT*nT] used
577
578
  // TODO: valgrind reports that dst[] contains uninitialized data.
579
  // Probably from intra-prediction.
580
581
  /*
582
  for (int i=0;i<nT*nT;i++) {
583
    printf("%d\n",coeffs[i]);
584
  }
585
586
  for (int y=0;y<nT;y++) {
587
    for (int i=0;i<nT;i++) {
588
      printf("%d ",dst[y*stride+i]);
589
    }
590
  }
591
  printf("\n");
592
  */
593
594
  /*
595
  printf("--- input\n");
596
  for (int r=0;r<nT;r++, printf("\n"))
597
    for (int c=0;c<nT;c++) {
598
      printf("%3d ",coeffs[c+r*nT]);
599
    }
600
  */
601
602
0
  for (int c=0;c<nT;c++) {
603
604
    /*
605
    logtrace(LogTransform,"DCT-V: ");
606
    for (int i=0;i<nT;i++) {
607
      logtrace(LogTransform,"*%d ",coeffs[c+i*nT]);
608
    }
609
    logtrace(LogTransform,"* -> ");
610
    */
611
612
613
    // find last non-zero coefficient to reduce computations carried out in DCT
614
615
0
    int lastCol = nT-1;
616
0
    for (;lastCol>=0;lastCol--) {
617
0
      if (coeffs[c+lastCol*nT]) { break; }
618
0
    }
619
620
0
    for (int i=0;i<nT;i++) {
621
0
      int sum=0;
622
623
      /*
624
      printf("input: ");
625
      for (int j=0;j<nT;j++) {
626
        printf("%3d ",coeffs[c+j*nT]);
627
      }
628
      printf("\n");
629
630
      printf("mat: ");
631
      for (int j=0;j<nT;j++) {
632
        printf("%3d ",mat_dct[fact*j][i]);
633
      }
634
      printf("\n");
635
      */
636
637
0
      for (int j=0;j<=lastCol /*nT*/;j++) {
638
0
        sum += mat_dct[fact*j][i] * coeffs[c+j*nT];
639
0
      }
640
641
0
      g[c+i*nT] = Clip3(-32768,32767, (sum+rnd1)>>7);
642
643
0
      logtrace(LogTransform,"*%d ",g[c+i*nT]);
644
0
    }
645
0
    logtrace(LogTransform,"*\n");
646
0
  }
647
648
  /*
649
  printf("--- temp\n");
650
  for (int r=0;r<nT;r++, printf("\n"))
651
    for (int c=0;c<nT;c++) {
652
      printf("%3d ",g[c+r*nT]);
653
    }
654
  */
655
656
0
  for (int y=0;y<nT;y++) {
657
    /*
658
    logtrace(LogTransform,"DCT-H: ");
659
    for (int i=0;i<nT;i++) {
660
      logtrace(LogTransform,"*%d ",g[i+y*nT]);
661
    }
662
    logtrace(LogTransform,"* -> ");
663
    */
664
665
666
    // find last non-zero coefficient to reduce computations carried out in DCT
667
668
0
    int lastCol = nT-1;
669
0
    for (;lastCol>=0;lastCol--) {
670
0
      if (g[y*nT+lastCol]) { break; }
671
0
    }
672
673
674
0
    for (int i=0;i<nT;i++) {
675
0
      int sum=0;
676
677
0
      for (int j=0;j<=lastCol /*nT*/;j++) {
678
0
        sum += mat_dct[fact*j][i] * g[y*nT+j];
679
0
      }
680
681
      //int out = Clip3(-32768,32767, (sum+rnd2)>>postShift);
682
0
      int out = (sum+rnd2)>>postShift;
683
684
      //fprintf(stderr,"%d*%d+%d = %d\n",y,stride,i,y*stride+i);
685
      //fprintf(stderr,"[%p]=%d\n",&dst[y*stride+i], Clip1_8bit(dst[y*stride+i]));
686
0
      dst[y*stride+i] = Clip_BitDepth(dst[y*stride+i] + out, bit_depth);
687
688
0
      logtrace(LogTransform,"*%d ",out);
689
0
    }
690
0
    logtrace(LogTransform,"*\n");
691
0
  }
692
0
}
Unexecuted instantiation: void transform_idct_add<unsigned char>(unsigned char*, long, int, short const*, int)
Unexecuted instantiation: void transform_idct_add<unsigned short>(unsigned short*, long, int, short const*, int)
693
694
695
696
void transform_idct_fallback(int32_t *dst, int nT, const int16_t *coeffs, int bdShift, int max_coeff_bits)
697
0
{
698
  /*
699
    The effective shift is
700
    7 bits right for bit-depth 8,
701
    6 bits right for bit-depth 9,
702
    5 bits right for bit-depth 10.
703
704
    One transformation with raw transform filter values increases range be 2048 (=32*64).
705
    This equals 11 bits.
706
707
    Computation is independent of the block size.
708
    Each multiplication with the table includes a left shift of 6 bits.
709
    Hence, we have 2* 6 bits = 12 bits left shift.
710
    V-pass has fixed 7 bit right shift.
711
    H-pass has 20-BitDepth bit right shift;
712
713
    Effective shift 's' means: residual value 1 gives DC-coeff (1<<s).
714
   */
715
716
717
0
  int rnd1 = 1<<(7-1);
718
0
  int fact = (1<<(5-Log2(nT)));
719
720
  //int bdShift = 20-bit_depth;
721
0
  int rnd2 = 1<<(bdShift-1);
722
723
0
  int16_t g[32*32];  // actually, only [nT*nT] used
724
725
0
  int CoeffMax = (1<<max_coeff_bits)-1;
726
0
  int CoeffMin = -(1<<max_coeff_bits);
727
728
  // TODO: valgrind reports that dst[] contains uninitialized data.
729
  // Probably from intra-prediction.
730
731
  /*
732
  for (int i=0;i<nT*nT;i++) {
733
    printf("%d\n",coeffs[i]);
734
  }
735
736
  for (int y=0;y<nT;y++) {
737
    for (int i=0;i<nT;i++) {
738
      printf("%d ",dst[y*stride+i]);
739
    }
740
  }
741
  printf("\n");
742
  */
743
744
  /*
745
  printf("--- input\n");
746
  for (int r=0;r<nT;r++, printf("\n"))
747
    for (int c=0;c<nT;c++) {
748
      printf("%3d ",coeffs[c+r*nT]);
749
    }
750
  */
751
752
0
  for (int c=0;c<nT;c++) {
753
754
    /*
755
    logtrace(LogTransform,"DCT-V: ");
756
    for (int i=0;i<nT;i++) {
757
      logtrace(LogTransform,"*%d ",coeffs[c+i*nT]);
758
    }
759
    logtrace(LogTransform,"* -> ");
760
    */
761
762
763
    // find last non-zero coefficient to reduce computations carried out in DCT
764
765
0
    int lastCol = nT-1;
766
0
    for (;lastCol>=0;lastCol--) {
767
0
      if (coeffs[c+lastCol*nT]) { break; }
768
0
    }
769
770
0
    for (int i=0;i<nT;i++) {
771
0
      int sum=0;
772
773
      /*
774
      printf("input: ");
775
      for (int j=0;j<nT;j++) {
776
        printf("%3d ",coeffs[c+j*nT]);
777
      }
778
      printf("\n");
779
780
      printf("mat: ");
781
      for (int j=0;j<nT;j++) {
782
        printf("%3d ",mat_dct[fact*j][i]);
783
      }
784
      printf("\n");
785
      */
786
787
0
      for (int j=0;j<=lastCol /*nT*/;j++) {
788
0
        sum += mat_dct[fact*j][i] * coeffs[c+j*nT];
789
0
      }
790
791
0
      g[c+i*nT] = Clip3(CoeffMin,CoeffMax, (sum+rnd1)>>7);
792
793
0
      logtrace(LogTransform,"*%d ",g[c+i*nT]);
794
0
    }
795
0
    logtrace(LogTransform,"*\n");
796
0
  }
797
798
  /*
799
  printf("--- temp\n");
800
  for (int r=0;r<nT;r++, printf("\n"))
801
    for (int c=0;c<nT;c++) {
802
      printf("%3d ",g[c+r*nT]);
803
    }
804
  */
805
806
0
  for (int y=0;y<nT;y++) {
807
    /*
808
    logtrace(LogTransform,"DCT-H: ");
809
    for (int i=0;i<nT;i++) {
810
      logtrace(LogTransform,"*%d ",g[i+y*nT]);
811
    }
812
    logtrace(LogTransform,"* -> ");
813
    */
814
815
816
    // find last non-zero coefficient to reduce computations carried out in DCT
817
818
0
    int lastCol = nT-1;
819
0
    for (;lastCol>=0;lastCol--) {
820
0
      if (g[y*nT+lastCol]) { break; }
821
0
    }
822
823
824
0
    for (int i=0;i<nT;i++) {
825
0
      int sum=0;
826
827
0
      for (int j=0;j<=lastCol /*nT*/;j++) {
828
0
        sum += mat_dct[fact*j][i] * g[y*nT+j];
829
0
      }
830
831
0
      dst[y*nT+i] = (sum + rnd2)>>bdShift;
832
833
0
      logtrace(LogTransform,"*%d ",sum);
834
0
    }
835
0
    logtrace(LogTransform,"*\n");
836
0
  }
837
0
}
838
839
840
void transform_idct_4x4_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits)
841
0
{
842
0
  transform_idct_fallback(dst,4,coeffs,bdShift,max_coeff_bits);
843
0
}
844
845
void transform_idct_8x8_fallback(int32_t *dst, const int16_t *coeffs, int bdShift, int max_coeff_bits)
846
0
{
847
0
  transform_idct_fallback(dst,8,coeffs,bdShift,max_coeff_bits);
848
0
}
849
850
void transform_idct_16x16_fallback(int32_t *dst, const int16_t *coeffs,
851
                                   int bdShift, int max_coeff_bits)
852
0
{
853
0
  transform_idct_fallback(dst,16,coeffs,bdShift,max_coeff_bits);
854
0
}
855
856
void transform_idct_32x32_fallback(int32_t *dst, const int16_t *coeffs,
857
                                   int bdShift, int max_coeff_bits)
858
0
{
859
0
  transform_idct_fallback(dst,32,coeffs,bdShift,max_coeff_bits);
860
0
}
861
862
863
864
865
void transform_4x4_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
866
0
{
867
0
  transform_idct_add<uint8_t>(dst,stride,  4, coeffs, 8);
868
0
}
869
870
void transform_8x8_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
871
0
{
872
0
  transform_idct_add<uint8_t>(dst,stride,  8, coeffs, 8);
873
0
}
874
875
void transform_16x16_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
876
0
{
877
0
  transform_idct_add<uint8_t>(dst,stride,  16, coeffs, 8);
878
0
}
879
880
void transform_32x32_add_8_fallback(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
881
0
{
882
0
  transform_idct_add<uint8_t>(dst,stride,  32, coeffs, 8);
883
0
}
884
885
886
void transform_4x4_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth)
887
0
{
888
0
  transform_idct_add<uint16_t>(dst,stride,  4, coeffs, bit_depth);
889
0
}
890
891
void transform_8x8_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth)
892
0
{
893
0
  transform_idct_add<uint16_t>(dst,stride,  8, coeffs, bit_depth);
894
0
}
895
896
void transform_16x16_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth)
897
0
{
898
0
  transform_idct_add<uint16_t>(dst,stride,  16, coeffs, bit_depth);
899
0
}
900
901
void transform_32x32_add_16_fallback(uint16_t *dst, const int16_t *coeffs, ptrdiff_t stride, int bit_depth)
902
0
{
903
0
  transform_idct_add<uint16_t>(dst,stride,  32, coeffs, bit_depth);
904
0
}
905
906
907
static void transform_fdct_8(int16_t* coeffs, int nT,
908
                             const int16_t *input, ptrdiff_t stride)
909
0
{
910
  /*
911
    Each sum over a basis vector sums nT elements, which is compensated by
912
    shifting right by Log2(nT), effectively dividing by 2^Log2(nT) = nT.
913
    Do this in each of the H/V passes.
914
915
    Each multiplication with the table includes a left shift of 6 bits.
916
    Hence, we have in total 2* 6 bits = 12 bits left shift because of the
917
    multiplications.
918
919
    We carry out shifts after each pass:
920
    First (V) pass has BitDepth-9 bit right shift,
921
    Second (H) pass has fixed 6 bit right shift.
922
923
    For bit-depth 8, the total shift is 7 bits left.
924
    For bit-depth 9, the total shift is 6 bits left.
925
    For bit-depth 10, the total shift is 5 bits left.
926
927
    I.e.: a constant residual value 1 gives DC-coeff (1<<s).
928
929
    For 8-bit images in a 32x32 block, the input are 8 bits + 1 sign bit.
930
    After the first pass, we need 9+5+6=20 bits for the intermediate sum
931
    (9 bit input, 5 bit because we sum 2^5 elements, 6 bit because of multiplication with 64).
932
    The first pass shift is Log2(32) - 1 -> 4 bits and we are down to 16 bits again.
933
    After the second pass, we need 16+5+6=27 bits for the intermediate sum
934
    (16 bit input, 5 bit because we sum 2^5 elements, 6 bit because of coefficient multiplication).
935
    The second pass shift is Log2(32)+6 = 11 and we are down again to 16 bits.
936
937
    For larger input bit-depths, the intermediate result after the first pass
938
    will be wider accordingly, but the widths after the shifts are the same.
939
  */
940
941
0
  int BitDepth = 8;
942
943
  //          / compensate everything | / effective word length |
944
0
  int shift1 = Log2(nT) + 6 + BitDepth  - 15;
945
0
  int shift2 = Log2(nT) + 6;
946
947
0
  int rnd1 = 1<<(shift1-1);
948
0
  int rnd2 = 1<<(shift2-1);
949
0
  int fact = (1<<(5-Log2(nT)));
950
951
0
  int16_t g[32*32];  // actually, only [nT*nT] used
952
953
0
  for (int c=0;c<nT;c++) {
954
955
0
    for (int i=0;i<nT;i++) {
956
0
      int sum=0;
957
958
0
      for (int j=0;j<nT;j++) {
959
0
        sum += mat_dct[fact*i][j] * input[c+j*stride];
960
0
      }
961
962
0
      g[c+i*nT] = (sum+rnd1)>>shift1; // clipping to -32768;32767 unnecessary
963
0
    }
964
0
  }
965
966
967
0
  for (int y=0;y<nT;y++) {
968
0
    for (int i=0;i<nT;i++) {
969
0
      int sum=0;
970
971
0
      for (int j=0;j<nT;j++) {
972
0
        sum += mat_dct[fact*i][j] * g[y*nT+j];
973
0
      }
974
975
      // no clipping to -32768;32767 required
976
0
      int out = (sum+rnd2)>>shift2;
977
978
0
      coeffs[y*nT+i] = out;
979
0
    }
980
0
  }
981
0
}
982
983
984
void fdct_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
985
0
{
986
0
  transform_fdct_8(coeffs, 4, input,stride);
987
0
}
988
989
void fdct_8x8_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
990
0
{
991
0
  transform_fdct_8(coeffs, 8, input,stride);
992
0
}
993
994
void fdct_16x16_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
995
0
{
996
0
  transform_fdct_8(coeffs, 16, input,stride);
997
0
}
998
999
void fdct_32x32_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
1000
0
{
1001
0
  transform_fdct_8(coeffs, 32, input,stride);
1002
0
}
1003
1004
1005
1006
1007
/* One-dimensional Hadamard butterfly network over n samples
   (helper for hadamard_transform_8).

   Reads   src[0], src[srcStep], ..., src[(n-1)*srcStep]
   writes  dst[0], dst[dstStep], ..., dst[(n-1)*dstStep].

   Every intermediate and final value is stored as int16_t without clipping.
   The butterflies are computed in place: each pair of outputs depends only
   on the pair of inputs at the same two positions, so no second scratch
   buffer is needed.
 */
static void hadamard_1d_pass(int16_t *dst, ptrdiff_t dstStep,
                             const int16_t *src, ptrdiff_t srcStep, int n)
{
  int16_t v[32];  // actually, only [n] used
  const int half = n>>1;

  // first stage: combine sample i with sample i+n/2 while loading
  for (int i=0;i<half;i++) {
    const int16_t lo = src[ i      *srcStep];
    const int16_t hi = src[(i+half)*srcStep];

    v[     i] = lo + hi;
    v[half+i] = lo - hi;
  }

  // middle stages: butterflies of distance n/4, n/8, ..., 2
  for (int group=half, span=(n>>2); span>=2; group>>=1, span>>=1) {
    for (int k=0;k<n;k+=group) {
      for (int i=0;i<span;i++) {
        const int16_t x = v[k+i];
        const int16_t y = v[k+i+span];

        v[k+i     ] = x + y;
        v[k+i+span] = x - y;
      }
    }
  }

  // last stage: neighbouring pairs, written straight to the destination
  for (int k=0;k<n;k+=2) {
    dst[ k   *dstStep] = v[k] + v[k+1];
    dst[(k+1)*dstStep] = v[k] - v[k+1];
  }
}


/* Generic separable n x n Hadamard transform (unnormalized).

   coeffs  output, n*n values stored contiguously with row length n
   n       block size; must be a power of two in the range 4..32
           (the scratch buffers are dimensioned for 32)
   input   top-left sample of the source block
   stride  distance between two source rows, in int16_t elements

   All rows are transformed into a temporary block first, then all columns,
   so 'input' is completely consumed before 'coeffs' is written.

   NOTE(review): results are stored as int16_t without saturation. The first
   coefficient is the plain sum of all n*n inputs, which exceeds the int16_t
   range once the mean input magnitude is above 127 (n=16) or 31 (n=32).
   Confirm the callers' input range.
 */
void hadamard_transform_8(int16_t *coeffs, int n, const int16_t *input, ptrdiff_t stride)
{
  assert(n>=4 && n<=32 && (n & (n-1))==0);

  int16_t tmp[32*32];  // actually, only [n*n] used

  // row transforms
  // (row offset is computed in ptrdiff_t, it is not narrowed to int)

  for (int row=0;row<n;row++) {
    hadamard_1d_pass(tmp + n*row, 1,  input + row*stride, 1,  n);
  }

  // column transforms

  for (int col=0;col<n;col++) {
    hadamard_1d_pass(coeffs + col, n,  tmp + col, n,  n);
  }
}
1082
1083
1084
/* Unrolled 4-point Hadamard butterflies (helper for hadamard_4x4_8_fallback).

   Reads   src[0], src[srcStep], src[2*srcStep], src[3*srcStep]
   writes  dst[0], dst[dstStep], dst[2*dstStep], dst[3*dstStep].
   Intermediate and final values are stored as int16_t without clipping.
 */
static void hadamard4_1d(int16_t *dst, ptrdiff_t dstStep,
                         const int16_t *src, ptrdiff_t srcStep)
{
  const int16_t s0 = src[0];
  const int16_t s1 = src[1*srcStep];
  const int16_t s2 = src[2*srcStep];
  const int16_t s3 = src[3*srcStep];

  // first stage: distance-2 butterflies
  const int16_t e0 = s0 + s2;
  const int16_t e1 = s1 + s3;
  const int16_t d0 = s0 - s2;
  const int16_t d1 = s1 - s3;

  // second stage: neighbouring pairs
  dst[0        ] = e0 + e1;
  dst[1*dstStep] = e0 - e1;
  dst[2*dstStep] = d0 + d1;
  dst[3*dstStep] = d0 - d1;
}


/* Separable 4x4 Hadamard transform (unnormalized).

   coeffs  output, 16 values stored contiguously with row length 4
   input   top-left sample of the source block
   stride  distance between two source rows, in int16_t elements

   All four rows are transformed into a temporary block before the columns
   are processed, so 'input' is completely consumed before 'coeffs' is written.
 */
void hadamard_4x4_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
{
  int16_t tmp[4*4];

  // row transforms
  // (row offset is computed in ptrdiff_t, it is not narrowed to int)

  for (int row=0;row<4;row++) {
    hadamard4_1d(tmp + 4*row, 1,  input + row*stride, 1);
  }

  // column transforms

  for (int col=0;col<4;col++) {
    hadamard4_1d(coeffs + col, 4,  tmp + col, 4);
  }
}
1124
1125
1126
/* Unrolled 8-point Hadamard butterflies (helper for hadamard_8x8_8_fallback).

   Reads   src[k*srcStep]  for k = 0..7
   writes  dst[k*dstStep]  for k = 0..7.
   Intermediate and final values are stored as int16_t without clipping.
 */
static void hadamard8_1d(int16_t *dst, ptrdiff_t dstStep,
                         const int16_t *src, ptrdiff_t srcStep)
{
  int16_t p[8], q[8];

  // first stage: distance-4 butterflies
  for (int i=0;i<4;i++) {
    const int16_t lo = src[ i   *srcStep];
    const int16_t hi = src[(i+4)*srcStep];

    p[i  ] = lo + hi;
    p[i+4] = lo - hi;
  }

  // second stage: distance-2 butterflies inside each half
  for (int k=0;k<8;k+=4) {
    q[k  ] = p[k  ] + p[k+2];
    q[k+1] = p[k+1] + p[k+3];
    q[k+2] = p[k  ] - p[k+2];
    q[k+3] = p[k+1] - p[k+3];
  }

  // third stage: neighbouring pairs, written straight to the destination
  for (int k=0;k<8;k+=2) {
    dst[ k   *dstStep] = q[k] + q[k+1];
    dst[(k+1)*dstStep] = q[k] - q[k+1];
  }
}


/* Separable 8x8 Hadamard transform (unnormalized).

   coeffs  output, 64 values stored contiguously with row length 8
   input   top-left sample of the source block
   stride  distance between two source rows, in int16_t elements

   All eight rows are transformed into a temporary block before the columns
   are processed, so 'input' is completely consumed before 'coeffs' is written.
 */
void hadamard_8x8_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
{
  int16_t tmp[8*8];

  // row transforms
  // (row offset is computed in ptrdiff_t, it is not narrowed to int)

  for (int row=0;row<8;row++) {
    hadamard8_1d(tmp + 8*row, 1,  input + row*stride, 1);
  }

  // column transforms

  for (int col=0;col<8;col++) {
    hadamard8_1d(coeffs + col, 8,  tmp + col, 8);
  }
}
1200
1201
1202
void hadamard_16x16_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
1203
0
{
1204
0
  hadamard_transform_8(coeffs,16, input,stride);
1205
0
}
1206
1207
void hadamard_32x32_8_fallback(int16_t *coeffs, const int16_t *input, ptrdiff_t stride)
1208
0
{
1209
0
  hadamard_transform_8(coeffs,32, input,stride);
1210
0
}