Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vpx_dsp/inv_txfm.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include <math.h>
12
#include <stdlib.h>
13
#include <string.h>
14
15
#include "./vpx_dsp_rtcd.h"
16
#include "vpx_dsp/inv_txfm.h"
17
18
0
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
19
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
20
     0.5 shifts per pixel. */
21
0
  int i;
22
0
  tran_low_t output[16];
23
0
  tran_high_t a1, b1, c1, d1, e1;
24
0
  const tran_low_t *ip = input;
25
0
  tran_low_t *op = output;
26
27
0
  for (i = 0; i < 4; i++) {
28
0
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
29
0
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
30
0
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
31
0
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
32
0
    a1 += c1;
33
0
    d1 -= b1;
34
0
    e1 = (a1 - d1) >> 1;
35
0
    b1 = e1 - b1;
36
0
    c1 = e1 - c1;
37
0
    a1 -= b1;
38
0
    d1 += c1;
39
0
    op[0] = WRAPLOW(a1);
40
0
    op[1] = WRAPLOW(b1);
41
0
    op[2] = WRAPLOW(c1);
42
0
    op[3] = WRAPLOW(d1);
43
0
    ip += 4;
44
0
    op += 4;
45
0
  }
46
47
0
  ip = output;
48
0
  for (i = 0; i < 4; i++) {
49
0
    a1 = ip[4 * 0];
50
0
    c1 = ip[4 * 1];
51
0
    d1 = ip[4 * 2];
52
0
    b1 = ip[4 * 3];
53
0
    a1 += c1;
54
0
    d1 -= b1;
55
0
    e1 = (a1 - d1) >> 1;
56
0
    b1 = e1 - b1;
57
0
    c1 = e1 - c1;
58
0
    a1 -= b1;
59
0
    d1 += c1;
60
0
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
61
0
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
62
0
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
63
0
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
64
65
0
    ip++;
66
0
    dest++;
67
0
  }
68
0
}
69
70
1.94M
void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
71
1.94M
  int i;
72
1.94M
  tran_high_t a1, e1;
73
1.94M
  tran_low_t tmp[4];
74
1.94M
  const tran_low_t *ip = input;
75
1.94M
  tran_low_t *op = tmp;
76
77
1.94M
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
78
1.94M
  e1 = a1 >> 1;
79
1.94M
  a1 -= e1;
80
1.94M
  op[0] = WRAPLOW(a1);
81
1.94M
  op[1] = op[2] = op[3] = WRAPLOW(e1);
82
83
1.94M
  ip = tmp;
84
9.71M
  for (i = 0; i < 4; i++) {
85
7.76M
    e1 = ip[0] >> 1;
86
7.76M
    a1 = ip[0] - e1;
87
7.76M
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
88
7.76M
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
89
7.76M
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
90
7.76M
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
91
7.76M
    ip++;
92
7.76M
    dest++;
93
7.76M
  }
94
1.94M
}
95
96
0
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
97
0
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
98
0
  tran_low_t x0 = input[0];
99
0
  tran_low_t x1 = input[1];
100
0
  tran_low_t x2 = input[2];
101
0
  tran_low_t x3 = input[3];
102
103
0
  if (!(x0 | x1 | x2 | x3)) {
104
0
    memset(output, 0, 4 * sizeof(*output));
105
0
    return;
106
0
  }
107
108
  // 32-bit result is enough for the following multiplications.
109
0
  s0 = sinpi_1_9 * x0;
110
0
  s1 = sinpi_2_9 * x0;
111
0
  s2 = sinpi_3_9 * x1;
112
0
  s3 = sinpi_4_9 * x2;
113
0
  s4 = sinpi_1_9 * x2;
114
0
  s5 = sinpi_2_9 * x3;
115
0
  s6 = sinpi_4_9 * x3;
116
0
  s7 = WRAPLOW(x0 - x2 + x3);
117
118
0
  s0 = s0 + s3 + s5;
119
0
  s1 = s1 - s4 - s6;
120
0
  s3 = s2;
121
0
  s2 = sinpi_3_9 * s7;
122
123
  // 1-D transform scaling factor is sqrt(2).
124
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
125
  // + 1b (addition) = 29b.
126
  // Hence the output bit depth is 15b.
127
0
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
128
0
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
129
0
  output[2] = WRAPLOW(dct_const_round_shift(s2));
130
0
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
131
0
}
132
133
0
void idct4_c(const tran_low_t *input, tran_low_t *output) {
134
0
  int16_t step[4];
135
0
  tran_high_t temp1, temp2;
136
137
  // stage 1
138
0
  temp1 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64;
139
0
  temp2 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64;
140
0
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
141
0
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
142
0
  temp1 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64;
143
0
  temp2 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64;
144
0
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
145
0
  step[3] = WRAPLOW(dct_const_round_shift(temp2));
146
147
  // stage 2
148
0
  output[0] = WRAPLOW(step[0] + step[3]);
149
0
  output[1] = WRAPLOW(step[1] + step[2]);
150
0
  output[2] = WRAPLOW(step[1] - step[2]);
151
0
  output[3] = WRAPLOW(step[0] - step[3]);
152
0
}
153
154
0
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
155
0
  int i, j;
156
0
  tran_low_t out[4 * 4];
157
0
  tran_low_t *outptr = out;
158
0
  tran_low_t temp_in[4], temp_out[4];
159
160
  // Rows
161
0
  for (i = 0; i < 4; ++i) {
162
0
    idct4_c(input, outptr);
163
0
    input += 4;
164
0
    outptr += 4;
165
0
  }
166
167
  // Columns
168
0
  for (i = 0; i < 4; ++i) {
169
0
    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
170
0
    idct4_c(temp_in, temp_out);
171
0
    for (j = 0; j < 4; ++j) {
172
0
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
173
0
                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
174
0
    }
175
0
  }
176
0
}
177
178
0
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
179
0
  int i;
180
0
  tran_high_t a1;
181
0
  tran_low_t out =
182
0
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
183
184
0
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
185
0
  a1 = ROUND_POWER_OF_TWO(out, 4);
186
187
0
  for (i = 0; i < 4; i++) {
188
0
    dest[0] = clip_pixel_add(dest[0], a1);
189
0
    dest[1] = clip_pixel_add(dest[1], a1);
190
0
    dest[2] = clip_pixel_add(dest[2], a1);
191
0
    dest[3] = clip_pixel_add(dest[3], a1);
192
0
    dest += stride;
193
0
  }
194
0
}
195
196
0
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
197
0
  int s0, s1, s2, s3, s4, s5, s6, s7;
198
0
  tran_high_t x0 = input[7];
199
0
  tran_high_t x1 = input[0];
200
0
  tran_high_t x2 = input[5];
201
0
  tran_high_t x3 = input[2];
202
0
  tran_high_t x4 = input[3];
203
0
  tran_high_t x5 = input[4];
204
0
  tran_high_t x6 = input[1];
205
0
  tran_high_t x7 = input[6];
206
207
0
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
208
0
    memset(output, 0, 8 * sizeof(*output));
209
0
    return;
210
0
  }
211
212
  // stage 1
213
0
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
214
0
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
215
0
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
216
0
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
217
0
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
218
0
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
219
0
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
220
0
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
221
222
0
  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
223
0
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
224
0
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
225
0
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
226
0
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
227
0
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
228
0
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
229
0
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
230
231
  // stage 2
232
0
  s0 = (int)x0;
233
0
  s1 = (int)x1;
234
0
  s2 = (int)x2;
235
0
  s3 = (int)x3;
236
0
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
237
0
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
238
0
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
239
0
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
240
241
0
  x0 = WRAPLOW(s0 + s2);
242
0
  x1 = WRAPLOW(s1 + s3);
243
0
  x2 = WRAPLOW(s0 - s2);
244
0
  x3 = WRAPLOW(s1 - s3);
245
0
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
246
0
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
247
0
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
248
0
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
249
250
  // stage 3
251
0
  s2 = (int)(cospi_16_64 * (x2 + x3));
252
0
  s3 = (int)(cospi_16_64 * (x2 - x3));
253
0
  s6 = (int)(cospi_16_64 * (x6 + x7));
254
0
  s7 = (int)(cospi_16_64 * (x6 - x7));
255
256
0
  x2 = WRAPLOW(dct_const_round_shift(s2));
257
0
  x3 = WRAPLOW(dct_const_round_shift(s3));
258
0
  x6 = WRAPLOW(dct_const_round_shift(s6));
259
0
  x7 = WRAPLOW(dct_const_round_shift(s7));
260
261
0
  output[0] = WRAPLOW(x0);
262
0
  output[1] = WRAPLOW(-x4);
263
0
  output[2] = WRAPLOW(x6);
264
0
  output[3] = WRAPLOW(-x2);
265
0
  output[4] = WRAPLOW(x3);
266
0
  output[5] = WRAPLOW(-x7);
267
0
  output[6] = WRAPLOW(x5);
268
0
  output[7] = WRAPLOW(-x1);
269
0
}
270
271
0
void idct8_c(const tran_low_t *input, tran_low_t *output) {
272
0
  int16_t step1[8], step2[8];
273
0
  tran_high_t temp1, temp2;
274
275
  // stage 1
276
0
  step1[0] = (int16_t)input[0];
277
0
  step1[2] = (int16_t)input[4];
278
0
  step1[1] = (int16_t)input[2];
279
0
  step1[3] = (int16_t)input[6];
280
0
  temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64;
281
0
  temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64;
282
0
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
283
0
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
284
0
  temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64;
285
0
  temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64;
286
0
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
287
0
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
288
289
  // stage 2
290
0
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
291
0
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
292
0
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
293
0
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
294
0
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
295
0
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
296
0
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
297
0
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
298
0
  step2[4] = WRAPLOW(step1[4] + step1[5]);
299
0
  step2[5] = WRAPLOW(step1[4] - step1[5]);
300
0
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
301
0
  step2[7] = WRAPLOW(step1[6] + step1[7]);
302
303
  // stage 3
304
0
  step1[0] = WRAPLOW(step2[0] + step2[3]);
305
0
  step1[1] = WRAPLOW(step2[1] + step2[2]);
306
0
  step1[2] = WRAPLOW(step2[1] - step2[2]);
307
0
  step1[3] = WRAPLOW(step2[0] - step2[3]);
308
0
  step1[4] = step2[4];
309
0
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
310
0
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
311
0
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
312
0
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
313
0
  step1[7] = step2[7];
314
315
  // stage 4
316
0
  output[0] = WRAPLOW(step1[0] + step1[7]);
317
0
  output[1] = WRAPLOW(step1[1] + step1[6]);
318
0
  output[2] = WRAPLOW(step1[2] + step1[5]);
319
0
  output[3] = WRAPLOW(step1[3] + step1[4]);
320
0
  output[4] = WRAPLOW(step1[3] - step1[4]);
321
0
  output[5] = WRAPLOW(step1[2] - step1[5]);
322
0
  output[6] = WRAPLOW(step1[1] - step1[6]);
323
0
  output[7] = WRAPLOW(step1[0] - step1[7]);
324
0
}
325
326
0
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
327
0
  int i, j;
328
0
  tran_low_t out[8 * 8];
329
0
  tran_low_t *outptr = out;
330
0
  tran_low_t temp_in[8], temp_out[8];
331
332
  // First transform rows
333
0
  for (i = 0; i < 8; ++i) {
334
0
    idct8_c(input, outptr);
335
0
    input += 8;
336
0
    outptr += 8;
337
0
  }
338
339
  // Then transform columns
340
0
  for (i = 0; i < 8; ++i) {
341
0
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
342
0
    idct8_c(temp_in, temp_out);
343
0
    for (j = 0; j < 8; ++j) {
344
0
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
345
0
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
346
0
    }
347
0
  }
348
0
}
349
350
0
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
351
0
  int i, j;
352
0
  tran_low_t out[8 * 8] = { 0 };
353
0
  tran_low_t *outptr = out;
354
0
  tran_low_t temp_in[8], temp_out[8];
355
356
  // First transform rows
357
  // Only first 4 row has non-zero coefs
358
0
  for (i = 0; i < 4; ++i) {
359
0
    idct8_c(input, outptr);
360
0
    input += 8;
361
0
    outptr += 8;
362
0
  }
363
364
  // Then transform columns
365
0
  for (i = 0; i < 8; ++i) {
366
0
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
367
0
    idct8_c(temp_in, temp_out);
368
0
    for (j = 0; j < 8; ++j) {
369
0
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
370
0
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
371
0
    }
372
0
  }
373
0
}
374
375
0
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
376
0
  int i, j;
377
0
  tran_high_t a1;
378
0
  tran_low_t out =
379
0
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
380
381
0
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
382
0
  a1 = ROUND_POWER_OF_TWO(out, 5);
383
0
  for (j = 0; j < 8; ++j) {
384
0
    for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
385
0
    dest += stride;
386
0
  }
387
0
}
388
389
0
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
390
0
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
391
0
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
392
0
  tran_high_t x0 = input[15];
393
0
  tran_high_t x1 = input[0];
394
0
  tran_high_t x2 = input[13];
395
0
  tran_high_t x3 = input[2];
396
0
  tran_high_t x4 = input[11];
397
0
  tran_high_t x5 = input[4];
398
0
  tran_high_t x6 = input[9];
399
0
  tran_high_t x7 = input[6];
400
0
  tran_high_t x8 = input[7];
401
0
  tran_high_t x9 = input[8];
402
0
  tran_high_t x10 = input[5];
403
0
  tran_high_t x11 = input[10];
404
0
  tran_high_t x12 = input[3];
405
0
  tran_high_t x13 = input[12];
406
0
  tran_high_t x14 = input[1];
407
0
  tran_high_t x15 = input[14];
408
409
0
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
410
0
        x13 | x14 | x15)) {
411
0
    memset(output, 0, 16 * sizeof(*output));
412
0
    return;
413
0
  }
414
415
  // stage 1
416
0
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
417
0
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
418
0
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
419
0
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
420
0
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
421
0
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
422
0
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
423
0
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
424
0
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
425
0
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
426
0
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
427
0
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
428
0
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
429
0
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
430
0
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
431
0
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
432
433
0
  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
434
0
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
435
0
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
436
0
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
437
0
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
438
0
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
439
0
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
440
0
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
441
0
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
442
0
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
443
0
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
444
0
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
445
0
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
446
0
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
447
0
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
448
0
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
449
450
  // stage 2
451
0
  s0 = x0;
452
0
  s1 = x1;
453
0
  s2 = x2;
454
0
  s3 = x3;
455
0
  s4 = x4;
456
0
  s5 = x5;
457
0
  s6 = x6;
458
0
  s7 = x7;
459
0
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
460
0
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
461
0
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
462
0
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
463
0
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
464
0
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
465
0
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
466
0
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
467
468
0
  x0 = WRAPLOW(s0 + s4);
469
0
  x1 = WRAPLOW(s1 + s5);
470
0
  x2 = WRAPLOW(s2 + s6);
471
0
  x3 = WRAPLOW(s3 + s7);
472
0
  x4 = WRAPLOW(s0 - s4);
473
0
  x5 = WRAPLOW(s1 - s5);
474
0
  x6 = WRAPLOW(s2 - s6);
475
0
  x7 = WRAPLOW(s3 - s7);
476
0
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
477
0
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
478
0
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
479
0
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
480
0
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
481
0
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
482
0
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
483
0
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
484
485
  // stage 3
486
0
  s0 = x0;
487
0
  s1 = x1;
488
0
  s2 = x2;
489
0
  s3 = x3;
490
0
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
491
0
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
492
0
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
493
0
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
494
0
  s8 = x8;
495
0
  s9 = x9;
496
0
  s10 = x10;
497
0
  s11 = x11;
498
0
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
499
0
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
500
0
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
501
0
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
502
503
0
  x0 = WRAPLOW(s0 + s2);
504
0
  x1 = WRAPLOW(s1 + s3);
505
0
  x2 = WRAPLOW(s0 - s2);
506
0
  x3 = WRAPLOW(s1 - s3);
507
0
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
508
0
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
509
0
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
510
0
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
511
0
  x8 = WRAPLOW(s8 + s10);
512
0
  x9 = WRAPLOW(s9 + s11);
513
0
  x10 = WRAPLOW(s8 - s10);
514
0
  x11 = WRAPLOW(s9 - s11);
515
0
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
516
0
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
517
0
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
518
0
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
519
520
  // stage 4
521
0
  s2 = (-cospi_16_64) * (x2 + x3);
522
0
  s3 = cospi_16_64 * (x2 - x3);
523
0
  s6 = cospi_16_64 * (x6 + x7);
524
0
  s7 = cospi_16_64 * (-x6 + x7);
525
0
  s10 = cospi_16_64 * (x10 + x11);
526
0
  s11 = cospi_16_64 * (-x10 + x11);
527
0
  s14 = (-cospi_16_64) * (x14 + x15);
528
0
  s15 = cospi_16_64 * (x14 - x15);
529
530
0
  x2 = WRAPLOW(dct_const_round_shift(s2));
531
0
  x3 = WRAPLOW(dct_const_round_shift(s3));
532
0
  x6 = WRAPLOW(dct_const_round_shift(s6));
533
0
  x7 = WRAPLOW(dct_const_round_shift(s7));
534
0
  x10 = WRAPLOW(dct_const_round_shift(s10));
535
0
  x11 = WRAPLOW(dct_const_round_shift(s11));
536
0
  x14 = WRAPLOW(dct_const_round_shift(s14));
537
0
  x15 = WRAPLOW(dct_const_round_shift(s15));
538
539
0
  output[0] = WRAPLOW(x0);
540
0
  output[1] = WRAPLOW(-x8);
541
0
  output[2] = WRAPLOW(x12);
542
0
  output[3] = WRAPLOW(-x4);
543
0
  output[4] = WRAPLOW(x6);
544
0
  output[5] = WRAPLOW(x14);
545
0
  output[6] = WRAPLOW(x10);
546
0
  output[7] = WRAPLOW(x2);
547
0
  output[8] = WRAPLOW(x3);
548
0
  output[9] = WRAPLOW(x11);
549
0
  output[10] = WRAPLOW(x15);
550
0
  output[11] = WRAPLOW(x7);
551
0
  output[12] = WRAPLOW(x5);
552
0
  output[13] = WRAPLOW(-x13);
553
0
  output[14] = WRAPLOW(x9);
554
0
  output[15] = WRAPLOW(-x1);
555
0
}
556
557
0
void idct16_c(const tran_low_t *input, tran_low_t *output) {
558
0
  int16_t step1[16], step2[16];
559
0
  tran_high_t temp1, temp2;
560
561
  // stage 1
562
0
  step1[0] = (int16_t)input[0 / 2];
563
0
  step1[1] = (int16_t)input[16 / 2];
564
0
  step1[2] = (int16_t)input[8 / 2];
565
0
  step1[3] = (int16_t)input[24 / 2];
566
0
  step1[4] = (int16_t)input[4 / 2];
567
0
  step1[5] = (int16_t)input[20 / 2];
568
0
  step1[6] = (int16_t)input[12 / 2];
569
0
  step1[7] = (int16_t)input[28 / 2];
570
0
  step1[8] = (int16_t)input[2 / 2];
571
0
  step1[9] = (int16_t)input[18 / 2];
572
0
  step1[10] = (int16_t)input[10 / 2];
573
0
  step1[11] = (int16_t)input[26 / 2];
574
0
  step1[12] = (int16_t)input[6 / 2];
575
0
  step1[13] = (int16_t)input[22 / 2];
576
0
  step1[14] = (int16_t)input[14 / 2];
577
0
  step1[15] = (int16_t)input[30 / 2];
578
579
  // stage 2
580
0
  step2[0] = step1[0];
581
0
  step2[1] = step1[1];
582
0
  step2[2] = step1[2];
583
0
  step2[3] = step1[3];
584
0
  step2[4] = step1[4];
585
0
  step2[5] = step1[5];
586
0
  step2[6] = step1[6];
587
0
  step2[7] = step1[7];
588
589
0
  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
590
0
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
591
0
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
592
0
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
593
594
0
  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
595
0
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
596
0
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
597
0
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
598
599
0
  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
600
0
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
601
0
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
602
0
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
603
604
0
  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
605
0
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
606
0
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
607
0
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
608
609
  // stage 3
610
0
  step1[0] = step2[0];
611
0
  step1[1] = step2[1];
612
0
  step1[2] = step2[2];
613
0
  step1[3] = step2[3];
614
615
0
  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
616
0
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
617
0
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
618
0
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
619
0
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
620
0
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
621
0
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
622
0
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
623
624
0
  step1[8] = WRAPLOW(step2[8] + step2[9]);
625
0
  step1[9] = WRAPLOW(step2[8] - step2[9]);
626
0
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
627
0
  step1[11] = WRAPLOW(step2[10] + step2[11]);
628
0
  step1[12] = WRAPLOW(step2[12] + step2[13]);
629
0
  step1[13] = WRAPLOW(step2[12] - step2[13]);
630
0
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
631
0
  step1[15] = WRAPLOW(step2[14] + step2[15]);
632
633
  // stage 4
634
0
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
635
0
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
636
0
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
637
0
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
638
0
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
639
0
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
640
0
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
641
0
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
642
0
  step2[4] = WRAPLOW(step1[4] + step1[5]);
643
0
  step2[5] = WRAPLOW(step1[4] - step1[5]);
644
0
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
645
0
  step2[7] = WRAPLOW(step1[6] + step1[7]);
646
647
0
  step2[8] = step1[8];
648
0
  step2[15] = step1[15];
649
0
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
650
0
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
651
0
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
652
0
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
653
0
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
654
0
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
655
0
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
656
0
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
657
0
  step2[11] = step1[11];
658
0
  step2[12] = step1[12];
659
660
  // stage 5
661
0
  step1[0] = WRAPLOW(step2[0] + step2[3]);
662
0
  step1[1] = WRAPLOW(step2[1] + step2[2]);
663
0
  step1[2] = WRAPLOW(step2[1] - step2[2]);
664
0
  step1[3] = WRAPLOW(step2[0] - step2[3]);
665
0
  step1[4] = step2[4];
666
0
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
667
0
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
668
0
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
669
0
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
670
0
  step1[7] = step2[7];
671
672
0
  step1[8] = WRAPLOW(step2[8] + step2[11]);
673
0
  step1[9] = WRAPLOW(step2[9] + step2[10]);
674
0
  step1[10] = WRAPLOW(step2[9] - step2[10]);
675
0
  step1[11] = WRAPLOW(step2[8] - step2[11]);
676
0
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
677
0
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
678
0
  step1[14] = WRAPLOW(step2[13] + step2[14]);
679
0
  step1[15] = WRAPLOW(step2[12] + step2[15]);
680
681
  // stage 6
682
0
  step2[0] = WRAPLOW(step1[0] + step1[7]);
683
0
  step2[1] = WRAPLOW(step1[1] + step1[6]);
684
0
  step2[2] = WRAPLOW(step1[2] + step1[5]);
685
0
  step2[3] = WRAPLOW(step1[3] + step1[4]);
686
0
  step2[4] = WRAPLOW(step1[3] - step1[4]);
687
0
  step2[5] = WRAPLOW(step1[2] - step1[5]);
688
0
  step2[6] = WRAPLOW(step1[1] - step1[6]);
689
0
  step2[7] = WRAPLOW(step1[0] - step1[7]);
690
0
  step2[8] = step1[8];
691
0
  step2[9] = step1[9];
692
0
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
693
0
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
694
0
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
695
0
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
696
0
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
697
0
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
698
0
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
699
0
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
700
0
  step2[14] = step1[14];
701
0
  step2[15] = step1[15];
702
703
  // stage 7
704
0
  output[0] = (tran_low_t)WRAPLOW(step2[0] + step2[15]);
705
0
  output[1] = (tran_low_t)WRAPLOW(step2[1] + step2[14]);
706
0
  output[2] = (tran_low_t)WRAPLOW(step2[2] + step2[13]);
707
0
  output[3] = (tran_low_t)WRAPLOW(step2[3] + step2[12]);
708
0
  output[4] = (tran_low_t)WRAPLOW(step2[4] + step2[11]);
709
0
  output[5] = (tran_low_t)WRAPLOW(step2[5] + step2[10]);
710
0
  output[6] = (tran_low_t)WRAPLOW(step2[6] + step2[9]);
711
0
  output[7] = (tran_low_t)WRAPLOW(step2[7] + step2[8]);
712
0
  output[8] = (tran_low_t)WRAPLOW(step2[7] - step2[8]);
713
0
  output[9] = (tran_low_t)WRAPLOW(step2[6] - step2[9]);
714
0
  output[10] = (tran_low_t)WRAPLOW(step2[5] - step2[10]);
715
0
  output[11] = (tran_low_t)WRAPLOW(step2[4] - step2[11]);
716
0
  output[12] = (tran_low_t)WRAPLOW(step2[3] - step2[12]);
717
0
  output[13] = (tran_low_t)WRAPLOW(step2[2] - step2[13]);
718
0
  output[14] = (tran_low_t)WRAPLOW(step2[1] - step2[14]);
719
0
  output[15] = (tran_low_t)WRAPLOW(step2[0] - step2[15]);
720
0
}
721
722
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
723
0
                             int stride) {
724
0
  int i, j;
725
0
  tran_low_t out[16 * 16];
726
0
  tran_low_t *outptr = out;
727
0
  tran_low_t temp_in[16], temp_out[16];
728
729
  // First transform rows
730
0
  for (i = 0; i < 16; ++i) {
731
0
    idct16_c(input, outptr);
732
0
    input += 16;
733
0
    outptr += 16;
734
0
  }
735
736
  // Then transform columns
737
0
  for (i = 0; i < 16; ++i) {
738
0
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
739
0
    idct16_c(temp_in, temp_out);
740
0
    for (j = 0; j < 16; ++j) {
741
0
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
742
0
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
743
0
    }
744
0
  }
745
0
}
746
747
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
748
0
                            int stride) {
749
0
  int i, j;
750
0
  tran_low_t out[16 * 16] = { 0 };
751
0
  tran_low_t *outptr = out;
752
0
  tran_low_t temp_in[16], temp_out[16];
753
754
  // First transform rows. Since all non-zero dct coefficients are in
755
  // upper-left 8x8 area, we only need to calculate first 8 rows here.
756
0
  for (i = 0; i < 8; ++i) {
757
0
    idct16_c(input, outptr);
758
0
    input += 16;
759
0
    outptr += 16;
760
0
  }
761
762
  // Then transform columns
763
0
  for (i = 0; i < 16; ++i) {
764
0
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
765
0
    idct16_c(temp_in, temp_out);
766
0
    for (j = 0; j < 16; ++j) {
767
0
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
768
0
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
769
0
    }
770
0
  }
771
0
}
772
773
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
774
0
                            int stride) {
775
0
  int i, j;
776
0
  tran_low_t out[16 * 16] = { 0 };
777
0
  tran_low_t *outptr = out;
778
0
  tran_low_t temp_in[16], temp_out[16];
779
780
  // First transform rows. Since all non-zero dct coefficients are in
781
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
782
0
  for (i = 0; i < 4; ++i) {
783
0
    idct16_c(input, outptr);
784
0
    input += 16;
785
0
    outptr += 16;
786
0
  }
787
788
  // Then transform columns
789
0
  for (i = 0; i < 16; ++i) {
790
0
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
791
0
    idct16_c(temp_in, temp_out);
792
0
    for (j = 0; j < 16; ++j) {
793
0
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
794
0
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
795
0
    }
796
0
  }
797
0
}
798
799
0
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
800
0
  int i, j;
801
0
  tran_high_t a1;
802
0
  tran_low_t out =
803
0
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
804
805
0
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
806
0
  a1 = ROUND_POWER_OF_TWO(out, 6);
807
0
  for (j = 0; j < 16; ++j) {
808
0
    for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
809
0
    dest += stride;
810
0
  }
811
0
}
812
813
0
void idct32_c(const tran_low_t *input, tran_low_t *output) {
814
0
  int16_t step1[32], step2[32];
815
0
  tran_high_t temp1, temp2;
816
817
  // stage 1
818
0
  step1[0] = (int16_t)input[0];
819
0
  step1[1] = (int16_t)input[16];
820
0
  step1[2] = (int16_t)input[8];
821
0
  step1[3] = (int16_t)input[24];
822
0
  step1[4] = (int16_t)input[4];
823
0
  step1[5] = (int16_t)input[20];
824
0
  step1[6] = (int16_t)input[12];
825
0
  step1[7] = (int16_t)input[28];
826
0
  step1[8] = (int16_t)input[2];
827
0
  step1[9] = (int16_t)input[18];
828
0
  step1[10] = (int16_t)input[10];
829
0
  step1[11] = (int16_t)input[26];
830
0
  step1[12] = (int16_t)input[6];
831
0
  step1[13] = (int16_t)input[22];
832
0
  step1[14] = (int16_t)input[14];
833
0
  step1[15] = (int16_t)input[30];
834
835
0
  temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64;
836
0
  temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64;
837
0
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
838
0
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
839
840
0
  temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64;
841
0
  temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64;
842
0
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
843
0
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
844
845
0
  temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64;
846
0
  temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64;
847
0
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
848
0
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
849
850
0
  temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64;
851
0
  temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64;
852
0
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
853
0
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
854
855
0
  temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64;
856
0
  temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64;
857
0
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
858
0
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
859
860
0
  temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64;
861
0
  temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64;
862
0
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
863
0
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
864
865
0
  temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64;
866
0
  temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64;
867
0
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
868
0
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
869
870
0
  temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64;
871
0
  temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64;
872
0
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
873
0
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
874
875
  // stage 2
876
0
  step2[0] = step1[0];
877
0
  step2[1] = step1[1];
878
0
  step2[2] = step1[2];
879
0
  step2[3] = step1[3];
880
0
  step2[4] = step1[4];
881
0
  step2[5] = step1[5];
882
0
  step2[6] = step1[6];
883
0
  step2[7] = step1[7];
884
885
0
  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
886
0
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
887
0
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
888
0
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
889
890
0
  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
891
0
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
892
0
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
893
0
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
894
895
0
  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
896
0
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
897
0
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
898
0
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
899
900
0
  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
901
0
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
902
0
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
903
0
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
904
905
0
  step2[16] = WRAPLOW(step1[16] + step1[17]);
906
0
  step2[17] = WRAPLOW(step1[16] - step1[17]);
907
0
  step2[18] = WRAPLOW(-step1[18] + step1[19]);
908
0
  step2[19] = WRAPLOW(step1[18] + step1[19]);
909
0
  step2[20] = WRAPLOW(step1[20] + step1[21]);
910
0
  step2[21] = WRAPLOW(step1[20] - step1[21]);
911
0
  step2[22] = WRAPLOW(-step1[22] + step1[23]);
912
0
  step2[23] = WRAPLOW(step1[22] + step1[23]);
913
0
  step2[24] = WRAPLOW(step1[24] + step1[25]);
914
0
  step2[25] = WRAPLOW(step1[24] - step1[25]);
915
0
  step2[26] = WRAPLOW(-step1[26] + step1[27]);
916
0
  step2[27] = WRAPLOW(step1[26] + step1[27]);
917
0
  step2[28] = WRAPLOW(step1[28] + step1[29]);
918
0
  step2[29] = WRAPLOW(step1[28] - step1[29]);
919
0
  step2[30] = WRAPLOW(-step1[30] + step1[31]);
920
0
  step2[31] = WRAPLOW(step1[30] + step1[31]);
921
922
  // stage 3
923
0
  step1[0] = step2[0];
924
0
  step1[1] = step2[1];
925
0
  step1[2] = step2[2];
926
0
  step1[3] = step2[3];
927
928
0
  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
929
0
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
930
0
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
931
0
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
932
0
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
933
0
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
934
0
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
935
0
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
936
937
0
  step1[8] = WRAPLOW(step2[8] + step2[9]);
938
0
  step1[9] = WRAPLOW(step2[8] - step2[9]);
939
0
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
940
0
  step1[11] = WRAPLOW(step2[10] + step2[11]);
941
0
  step1[12] = WRAPLOW(step2[12] + step2[13]);
942
0
  step1[13] = WRAPLOW(step2[12] - step2[13]);
943
0
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
944
0
  step1[15] = WRAPLOW(step2[14] + step2[15]);
945
946
0
  step1[16] = step2[16];
947
0
  step1[31] = step2[31];
948
0
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
949
0
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
950
0
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
951
0
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
952
0
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
953
0
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
954
0
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
955
0
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
956
0
  step1[19] = step2[19];
957
0
  step1[20] = step2[20];
958
0
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
959
0
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
960
0
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
961
0
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
962
0
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
963
0
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
964
0
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
965
0
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
966
0
  step1[23] = step2[23];
967
0
  step1[24] = step2[24];
968
0
  step1[27] = step2[27];
969
0
  step1[28] = step2[28];
970
971
  // stage 4
972
0
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
973
0
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
974
0
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
975
0
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
976
0
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
977
0
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
978
0
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
979
0
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
980
0
  step2[4] = WRAPLOW(step1[4] + step1[5]);
981
0
  step2[5] = WRAPLOW(step1[4] - step1[5]);
982
0
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
983
0
  step2[7] = WRAPLOW(step1[6] + step1[7]);
984
985
0
  step2[8] = step1[8];
986
0
  step2[15] = step1[15];
987
0
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
988
0
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
989
0
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
990
0
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
991
0
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
992
0
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
993
0
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
994
0
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
995
0
  step2[11] = step1[11];
996
0
  step2[12] = step1[12];
997
998
0
  step2[16] = WRAPLOW(step1[16] + step1[19]);
999
0
  step2[17] = WRAPLOW(step1[17] + step1[18]);
1000
0
  step2[18] = WRAPLOW(step1[17] - step1[18]);
1001
0
  step2[19] = WRAPLOW(step1[16] - step1[19]);
1002
0
  step2[20] = WRAPLOW(-step1[20] + step1[23]);
1003
0
  step2[21] = WRAPLOW(-step1[21] + step1[22]);
1004
0
  step2[22] = WRAPLOW(step1[21] + step1[22]);
1005
0
  step2[23] = WRAPLOW(step1[20] + step1[23]);
1006
1007
0
  step2[24] = WRAPLOW(step1[24] + step1[27]);
1008
0
  step2[25] = WRAPLOW(step1[25] + step1[26]);
1009
0
  step2[26] = WRAPLOW(step1[25] - step1[26]);
1010
0
  step2[27] = WRAPLOW(step1[24] - step1[27]);
1011
0
  step2[28] = WRAPLOW(-step1[28] + step1[31]);
1012
0
  step2[29] = WRAPLOW(-step1[29] + step1[30]);
1013
0
  step2[30] = WRAPLOW(step1[29] + step1[30]);
1014
0
  step2[31] = WRAPLOW(step1[28] + step1[31]);
1015
1016
  // stage 5
1017
0
  step1[0] = WRAPLOW(step2[0] + step2[3]);
1018
0
  step1[1] = WRAPLOW(step2[1] + step2[2]);
1019
0
  step1[2] = WRAPLOW(step2[1] - step2[2]);
1020
0
  step1[3] = WRAPLOW(step2[0] - step2[3]);
1021
0
  step1[4] = step2[4];
1022
0
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
1023
0
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
1024
0
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
1025
0
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
1026
0
  step1[7] = step2[7];
1027
1028
0
  step1[8] = WRAPLOW(step2[8] + step2[11]);
1029
0
  step1[9] = WRAPLOW(step2[9] + step2[10]);
1030
0
  step1[10] = WRAPLOW(step2[9] - step2[10]);
1031
0
  step1[11] = WRAPLOW(step2[8] - step2[11]);
1032
0
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
1033
0
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
1034
0
  step1[14] = WRAPLOW(step2[13] + step2[14]);
1035
0
  step1[15] = WRAPLOW(step2[12] + step2[15]);
1036
1037
0
  step1[16] = step2[16];
1038
0
  step1[17] = step2[17];
1039
0
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1040
0
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1041
0
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
1042
0
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
1043
0
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1044
0
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1045
0
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
1046
0
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
1047
0
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1048
0
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1049
0
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1050
0
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1051
0
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1052
0
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1053
0
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1054
0
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1055
0
  step1[22] = step2[22];
1056
0
  step1[23] = step2[23];
1057
0
  step1[24] = step2[24];
1058
0
  step1[25] = step2[25];
1059
0
  step1[30] = step2[30];
1060
0
  step1[31] = step2[31];
1061
1062
  // stage 6
1063
0
  step2[0] = WRAPLOW(step1[0] + step1[7]);
1064
0
  step2[1] = WRAPLOW(step1[1] + step1[6]);
1065
0
  step2[2] = WRAPLOW(step1[2] + step1[5]);
1066
0
  step2[3] = WRAPLOW(step1[3] + step1[4]);
1067
0
  step2[4] = WRAPLOW(step1[3] - step1[4]);
1068
0
  step2[5] = WRAPLOW(step1[2] - step1[5]);
1069
0
  step2[6] = WRAPLOW(step1[1] - step1[6]);
1070
0
  step2[7] = WRAPLOW(step1[0] - step1[7]);
1071
0
  step2[8] = step1[8];
1072
0
  step2[9] = step1[9];
1073
0
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1074
0
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
1075
0
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
1076
0
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
1077
0
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1078
0
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
1079
0
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
1080
0
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
1081
0
  step2[14] = step1[14];
1082
0
  step2[15] = step1[15];
1083
1084
0
  step2[16] = WRAPLOW(step1[16] + step1[23]);
1085
0
  step2[17] = WRAPLOW(step1[17] + step1[22]);
1086
0
  step2[18] = WRAPLOW(step1[18] + step1[21]);
1087
0
  step2[19] = WRAPLOW(step1[19] + step1[20]);
1088
0
  step2[20] = WRAPLOW(step1[19] - step1[20]);
1089
0
  step2[21] = WRAPLOW(step1[18] - step1[21]);
1090
0
  step2[22] = WRAPLOW(step1[17] - step1[22]);
1091
0
  step2[23] = WRAPLOW(step1[16] - step1[23]);
1092
1093
0
  step2[24] = WRAPLOW(-step1[24] + step1[31]);
1094
0
  step2[25] = WRAPLOW(-step1[25] + step1[30]);
1095
0
  step2[26] = WRAPLOW(-step1[26] + step1[29]);
1096
0
  step2[27] = WRAPLOW(-step1[27] + step1[28]);
1097
0
  step2[28] = WRAPLOW(step1[27] + step1[28]);
1098
0
  step2[29] = WRAPLOW(step1[26] + step1[29]);
1099
0
  step2[30] = WRAPLOW(step1[25] + step1[30]);
1100
0
  step2[31] = WRAPLOW(step1[24] + step1[31]);
1101
1102
  // stage 7
1103
0
  step1[0] = WRAPLOW(step2[0] + step2[15]);
1104
0
  step1[1] = WRAPLOW(step2[1] + step2[14]);
1105
0
  step1[2] = WRAPLOW(step2[2] + step2[13]);
1106
0
  step1[3] = WRAPLOW(step2[3] + step2[12]);
1107
0
  step1[4] = WRAPLOW(step2[4] + step2[11]);
1108
0
  step1[5] = WRAPLOW(step2[5] + step2[10]);
1109
0
  step1[6] = WRAPLOW(step2[6] + step2[9]);
1110
0
  step1[7] = WRAPLOW(step2[7] + step2[8]);
1111
0
  step1[8] = WRAPLOW(step2[7] - step2[8]);
1112
0
  step1[9] = WRAPLOW(step2[6] - step2[9]);
1113
0
  step1[10] = WRAPLOW(step2[5] - step2[10]);
1114
0
  step1[11] = WRAPLOW(step2[4] - step2[11]);
1115
0
  step1[12] = WRAPLOW(step2[3] - step2[12]);
1116
0
  step1[13] = WRAPLOW(step2[2] - step2[13]);
1117
0
  step1[14] = WRAPLOW(step2[1] - step2[14]);
1118
0
  step1[15] = WRAPLOW(step2[0] - step2[15]);
1119
1120
0
  step1[16] = step2[16];
1121
0
  step1[17] = step2[17];
1122
0
  step1[18] = step2[18];
1123
0
  step1[19] = step2[19];
1124
0
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1125
0
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
1126
0
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1127
0
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1128
0
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1129
0
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
1130
0
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1131
0
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1132
0
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1133
0
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
1134
0
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
1135
0
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
1136
0
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1137
0
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
1138
0
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
1139
0
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
1140
0
  step1[28] = step2[28];
1141
0
  step1[29] = step2[29];
1142
0
  step1[30] = step2[30];
1143
0
  step1[31] = step2[31];
1144
1145
  // final stage
1146
0
  output[0] = WRAPLOW(step1[0] + step1[31]);
1147
0
  output[1] = WRAPLOW(step1[1] + step1[30]);
1148
0
  output[2] = WRAPLOW(step1[2] + step1[29]);
1149
0
  output[3] = WRAPLOW(step1[3] + step1[28]);
1150
0
  output[4] = WRAPLOW(step1[4] + step1[27]);
1151
0
  output[5] = WRAPLOW(step1[5] + step1[26]);
1152
0
  output[6] = WRAPLOW(step1[6] + step1[25]);
1153
0
  output[7] = WRAPLOW(step1[7] + step1[24]);
1154
0
  output[8] = WRAPLOW(step1[8] + step1[23]);
1155
0
  output[9] = WRAPLOW(step1[9] + step1[22]);
1156
0
  output[10] = WRAPLOW(step1[10] + step1[21]);
1157
0
  output[11] = WRAPLOW(step1[11] + step1[20]);
1158
0
  output[12] = WRAPLOW(step1[12] + step1[19]);
1159
0
  output[13] = WRAPLOW(step1[13] + step1[18]);
1160
0
  output[14] = WRAPLOW(step1[14] + step1[17]);
1161
0
  output[15] = WRAPLOW(step1[15] + step1[16]);
1162
0
  output[16] = WRAPLOW(step1[15] - step1[16]);
1163
0
  output[17] = WRAPLOW(step1[14] - step1[17]);
1164
0
  output[18] = WRAPLOW(step1[13] - step1[18]);
1165
0
  output[19] = WRAPLOW(step1[12] - step1[19]);
1166
0
  output[20] = WRAPLOW(step1[11] - step1[20]);
1167
0
  output[21] = WRAPLOW(step1[10] - step1[21]);
1168
0
  output[22] = WRAPLOW(step1[9] - step1[22]);
1169
0
  output[23] = WRAPLOW(step1[8] - step1[23]);
1170
0
  output[24] = WRAPLOW(step1[7] - step1[24]);
1171
0
  output[25] = WRAPLOW(step1[6] - step1[25]);
1172
0
  output[26] = WRAPLOW(step1[5] - step1[26]);
1173
0
  output[27] = WRAPLOW(step1[4] - step1[27]);
1174
0
  output[28] = WRAPLOW(step1[3] - step1[28]);
1175
0
  output[29] = WRAPLOW(step1[2] - step1[29]);
1176
0
  output[30] = WRAPLOW(step1[1] - step1[30]);
1177
0
  output[31] = WRAPLOW(step1[0] - step1[31]);
1178
0
}
1179
1180
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
1181
0
                              int stride) {
1182
0
  int i, j;
1183
0
  tran_low_t out[32 * 32];
1184
0
  tran_low_t *outptr = out;
1185
0
  tran_low_t temp_in[32], temp_out[32];
1186
1187
  // Rows
1188
0
  for (i = 0; i < 32; ++i) {
1189
0
    int16_t zero_coeff = 0;
1190
0
    for (j = 0; j < 32; ++j) zero_coeff |= input[j];
1191
1192
0
    if (zero_coeff)
1193
0
      idct32_c(input, outptr);
1194
0
    else
1195
0
      memset(outptr, 0, sizeof(tran_low_t) * 32);
1196
0
    input += 32;
1197
0
    outptr += 32;
1198
0
  }
1199
1200
  // Columns
1201
0
  for (i = 0; i < 32; ++i) {
1202
0
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1203
0
    idct32_c(temp_in, temp_out);
1204
0
    for (j = 0; j < 32; ++j) {
1205
0
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1206
0
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
1207
0
    }
1208
0
  }
1209
0
}
1210
1211
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
1212
0
                             int stride) {
1213
0
  int i, j;
1214
0
  tran_low_t out[32 * 32] = { 0 };
1215
0
  tran_low_t *outptr = out;
1216
0
  tran_low_t temp_in[32], temp_out[32];
1217
1218
  // Rows
1219
  // Only upper-left 16x16 has non-zero coeff
1220
0
  for (i = 0; i < 16; ++i) {
1221
0
    idct32_c(input, outptr);
1222
0
    input += 32;
1223
0
    outptr += 32;
1224
0
  }
1225
1226
  // Columns
1227
0
  for (i = 0; i < 32; ++i) {
1228
0
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1229
0
    idct32_c(temp_in, temp_out);
1230
0
    for (j = 0; j < 32; ++j) {
1231
0
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1232
0
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
1233
0
    }
1234
0
  }
1235
0
}
1236
1237
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
1238
0
                            int stride) {
1239
0
  int i, j;
1240
0
  tran_low_t out[32 * 32] = { 0 };
1241
0
  tran_low_t *outptr = out;
1242
0
  tran_low_t temp_in[32], temp_out[32];
1243
1244
  // Rows
1245
  // Only upper-left 8x8 has non-zero coeff
1246
0
  for (i = 0; i < 8; ++i) {
1247
0
    idct32_c(input, outptr);
1248
0
    input += 32;
1249
0
    outptr += 32;
1250
0
  }
1251
1252
  // Columns
1253
0
  for (i = 0; i < 32; ++i) {
1254
0
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1255
0
    idct32_c(temp_in, temp_out);
1256
0
    for (j = 0; j < 32; ++j) {
1257
0
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1258
0
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
1259
0
    }
1260
0
  }
1261
0
}
1262
1263
0
void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
1264
0
  int i, j;
1265
0
  tran_high_t a1;
1266
0
  tran_low_t out =
1267
0
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
1268
1269
0
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
1270
0
  a1 = ROUND_POWER_OF_TWO(out, 6);
1271
1272
0
  for (j = 0; j < 32; ++j) {
1273
0
    for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
1274
0
    dest += stride;
1275
0
  }
1276
0
}
1277
1278
#if CONFIG_VP9_HIGHBITDEPTH
1279
1280
// 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
1281
// transform amplify bits + 1 bit for contingency in rounding and quantizing
1282
0
#define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
1283
1284
static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
1285
0
                                              int size) {
1286
0
  int i;
1287
0
  for (i = 0; i < size; ++i)
1288
0
    if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
1289
0
  return 0;
1290
0
}
1291
1292
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
1293
0
                                 int stride, int bd) {
1294
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1295
     0.5 shifts per pixel. */
1296
0
  int i;
1297
0
  tran_low_t output[16];
1298
0
  tran_high_t a1, b1, c1, d1, e1;
1299
0
  const tran_low_t *ip = input;
1300
0
  tran_low_t *op = output;
1301
1302
0
  for (i = 0; i < 4; i++) {
1303
0
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
1304
0
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
1305
0
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
1306
0
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
1307
0
    a1 += c1;
1308
0
    d1 -= b1;
1309
0
    e1 = (a1 - d1) >> 1;
1310
0
    b1 = e1 - b1;
1311
0
    c1 = e1 - c1;
1312
0
    a1 -= b1;
1313
0
    d1 += c1;
1314
0
    op[0] = HIGHBD_WRAPLOW(a1, bd);
1315
0
    op[1] = HIGHBD_WRAPLOW(b1, bd);
1316
0
    op[2] = HIGHBD_WRAPLOW(c1, bd);
1317
0
    op[3] = HIGHBD_WRAPLOW(d1, bd);
1318
0
    ip += 4;
1319
0
    op += 4;
1320
0
  }
1321
1322
0
  ip = output;
1323
0
  for (i = 0; i < 4; i++) {
1324
0
    a1 = ip[4 * 0];
1325
0
    c1 = ip[4 * 1];
1326
0
    d1 = ip[4 * 2];
1327
0
    b1 = ip[4 * 3];
1328
0
    a1 += c1;
1329
0
    d1 -= b1;
1330
0
    e1 = (a1 - d1) >> 1;
1331
0
    b1 = e1 - b1;
1332
0
    c1 = e1 - c1;
1333
0
    a1 -= b1;
1334
0
    d1 += c1;
1335
0
    dest[stride * 0] =
1336
0
        highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
1337
0
    dest[stride * 1] =
1338
0
        highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
1339
0
    dest[stride * 2] =
1340
0
        highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
1341
0
    dest[stride * 3] =
1342
0
        highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
1343
1344
0
    ip++;
1345
0
    dest++;
1346
0
  }
1347
0
}
1348
1349
void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
1350
0
                                int stride, int bd) {
1351
0
  int i;
1352
0
  tran_high_t a1, e1;
1353
0
  tran_low_t tmp[4];
1354
0
  const tran_low_t *ip = input;
1355
0
  tran_low_t *op = tmp;
1356
0
  (void)bd;
1357
1358
0
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
1359
0
  e1 = a1 >> 1;
1360
0
  a1 -= e1;
1361
0
  op[0] = HIGHBD_WRAPLOW(a1, bd);
1362
0
  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
1363
1364
0
  ip = tmp;
1365
0
  for (i = 0; i < 4; i++) {
1366
0
    e1 = ip[0] >> 1;
1367
0
    a1 = ip[0] - e1;
1368
0
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
1369
0
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
1370
0
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
1371
0
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
1372
0
    ip++;
1373
0
    dest++;
1374
0
  }
1375
0
}
1376
1377
0
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
1378
0
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1379
0
  tran_low_t x0 = input[0];
1380
0
  tran_low_t x1 = input[1];
1381
0
  tran_low_t x2 = input[2];
1382
0
  tran_low_t x3 = input[3];
1383
0
  (void)bd;
1384
1385
0
  if (detect_invalid_highbd_input(input, 4)) {
1386
#if CONFIG_COEFFICIENT_RANGE_CHECKING
1387
    assert(0 && "invalid highbd txfm input");
1388
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
1389
0
    memset(output, 0, sizeof(*output) * 4);
1390
0
    return;
1391
0
  }
1392
1393
0
  if (!(x0 | x1 | x2 | x3)) {
1394
0
    memset(output, 0, 4 * sizeof(*output));
1395
0
    return;
1396
0
  }
1397
1398
0
  s0 = (tran_high_t)sinpi_1_9 * x0;
1399
0
  s1 = (tran_high_t)sinpi_2_9 * x0;
1400
0
  s2 = (tran_high_t)sinpi_3_9 * x1;
1401
0
  s3 = (tran_high_t)sinpi_4_9 * x2;
1402
0
  s4 = (tran_high_t)sinpi_1_9 * x2;
1403
0
  s5 = (tran_high_t)sinpi_2_9 * x3;
1404
0
  s6 = (tran_high_t)sinpi_4_9 * x3;
1405
0
  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
1406
1407
0
  s0 = s0 + s3 + s5;
1408
0
  s1 = s1 - s4 - s6;
1409
0
  s3 = s2;
1410
0
  s2 = sinpi_3_9 * s7;
1411
1412
  // 1-D transform scaling factor is sqrt(2).
1413
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
1414
  // + 1b (addition) = 29b.
1415
  // Hence the output bit depth is 15b.
1416
0
  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
1417
0
  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
1418
0
  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1419
0
  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
1420
0
}
1421
1422
0
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
1423
0
  tran_low_t step[4];
1424
0
  tran_high_t temp1, temp2;
1425
0
  (void)bd;
1426
1427
0
  if (detect_invalid_highbd_input(input, 4)) {
1428
#if CONFIG_COEFFICIENT_RANGE_CHECKING
1429
    assert(0 && "invalid highbd txfm input");
1430
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
1431
0
    memset(output, 0, sizeof(*output) * 4);
1432
0
    return;
1433
0
  }
1434
1435
  // stage 1
1436
0
  temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64;
1437
0
  temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64;
1438
0
  step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1439
0
  step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1440
0
  temp1 =
1441
0
      input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64;
1442
0
  temp2 =
1443
0
      input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64;
1444
0
  step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1445
0
  step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1446
1447
  // stage 2
1448
0
  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
1449
0
  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
1450
0
  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
1451
0
  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
1452
0
}
1453
1454
void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
1455
0
                                 int stride, int bd) {
1456
0
  int i, j;
1457
0
  tran_low_t out[4 * 4];
1458
0
  tran_low_t *outptr = out;
1459
0
  tran_low_t temp_in[4], temp_out[4];
1460
1461
  // Rows
1462
0
  for (i = 0; i < 4; ++i) {
1463
0
    vpx_highbd_idct4_c(input, outptr, bd);
1464
0
    input += 4;
1465
0
    outptr += 4;
1466
0
  }
1467
1468
  // Columns
1469
0
  for (i = 0; i < 4; ++i) {
1470
0
    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
1471
0
    vpx_highbd_idct4_c(temp_in, temp_out, bd);
1472
0
    for (j = 0; j < 4; ++j) {
1473
0
      dest[j * stride + i] = highbd_clip_pixel_add(
1474
0
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1475
0
    }
1476
0
  }
1477
0
}
1478
1479
void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
1480
0
                                int stride, int bd) {
1481
0
  int i;
1482
0
  tran_high_t a1;
1483
0
  tran_low_t out = HIGHBD_WRAPLOW(
1484
0
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
1485
1486
0
  out =
1487
0
      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
1488
0
  a1 = ROUND_POWER_OF_TWO(out, 4);
1489
1490
0
  for (i = 0; i < 4; i++) {
1491
0
    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1492
0
    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1493
0
    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1494
0
    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1495
0
    dest += stride;
1496
0
  }
1497
0
}
1498
1499
0
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
1500
0
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1501
0
  tran_low_t x0 = input[7];
1502
0
  tran_low_t x1 = input[0];
1503
0
  tran_low_t x2 = input[5];
1504
0
  tran_low_t x3 = input[2];
1505
0
  tran_low_t x4 = input[3];
1506
0
  tran_low_t x5 = input[4];
1507
0
  tran_low_t x6 = input[1];
1508
0
  tran_low_t x7 = input[6];
1509
0
  (void)bd;
1510
1511
0
  if (detect_invalid_highbd_input(input, 8)) {
1512
#if CONFIG_COEFFICIENT_RANGE_CHECKING
1513
    assert(0 && "invalid highbd txfm input");
1514
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
1515
0
    memset(output, 0, sizeof(*output) * 8);
1516
0
    return;
1517
0
  }
1518
1519
0
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
1520
0
    memset(output, 0, 8 * sizeof(*output));
1521
0
    return;
1522
0
  }
1523
1524
  // stage 1
1525
0
  s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1;
1526
0
  s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1;
1527
0
  s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3;
1528
0
  s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3;
1529
0
  s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5;
1530
0
  s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5;
1531
0
  s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7;
1532
0
  s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7;
1533
1534
0
  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
1535
0
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
1536
0
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
1537
0
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
1538
0
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
1539
0
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
1540
0
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
1541
0
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
1542
1543
  // stage 2
1544
0
  s0 = x0;
1545
0
  s1 = x1;
1546
0
  s2 = x2;
1547
0
  s3 = x3;
1548
0
  s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5;
1549
0
  s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5;
1550
0
  s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7;
1551
0
  s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7;
1552
1553
0
  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
1554
0
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
1555
0
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
1556
0
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
1557
0
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1558
0
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1559
0
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1560
0
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
1561
1562
  // stage 3
1563
0
  s2 = (tran_high_t)cospi_16_64 * (x2 + x3);
1564
0
  s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
1565
0
  s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
1566
0
  s7 = (tran_high_t)cospi_16_64 * (x6 - x7);
1567
1568
0
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1569
0
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
1570
0
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
1571
0
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
1572
1573
0
  output[0] = HIGHBD_WRAPLOW(x0, bd);
1574
0
  output[1] = HIGHBD_WRAPLOW(-x4, bd);
1575
0
  output[2] = HIGHBD_WRAPLOW(x6, bd);
1576
0
  output[3] = HIGHBD_WRAPLOW(-x2, bd);
1577
0
  output[4] = HIGHBD_WRAPLOW(x3, bd);
1578
0
  output[5] = HIGHBD_WRAPLOW(-x7, bd);
1579
0
  output[6] = HIGHBD_WRAPLOW(x5, bd);
1580
0
  output[7] = HIGHBD_WRAPLOW(-x1, bd);
1581
0
}
1582
1583
0
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
1584
0
  tran_low_t step1[8], step2[8];
1585
0
  tran_high_t temp1, temp2;
1586
1587
0
  if (detect_invalid_highbd_input(input, 8)) {
1588
#if CONFIG_COEFFICIENT_RANGE_CHECKING
1589
    assert(0 && "invalid highbd txfm input");
1590
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
1591
0
    memset(output, 0, sizeof(*output) * 8);
1592
0
    return;
1593
0
  }
1594
1595
  // stage 1
1596
0
  step1[0] = input[0];
1597
0
  step1[2] = input[4];
1598
0
  step1[1] = input[2];
1599
0
  step1[3] = input[6];
1600
0
  temp1 =
1601
0
      input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64;
1602
0
  temp2 =
1603
0
      input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64;
1604
0
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1605
0
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1606
0
  temp1 =
1607
0
      input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64;
1608
0
  temp2 =
1609
0
      input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64;
1610
0
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1611
0
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1612
1613
  // stage 2 & stage 3 - even half
1614
0
  vpx_highbd_idct4_c(step1, step1, bd);
1615
1616
  // stage 2 - odd half
1617
0
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
1618
0
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
1619
0
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
1620
0
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
1621
1622
  // stage 3 - odd half
1623
0
  step1[4] = step2[4];
1624
0
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
1625
0
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
1626
0
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1627
0
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1628
0
  step1[7] = step2[7];
1629
1630
  // stage 4
1631
0
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
1632
0
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
1633
0
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
1634
0
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
1635
0
  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
1636
0
  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
1637
0
  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
1638
0
  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
1639
0
}
1640
1641
void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
1642
0
                                 int stride, int bd) {
1643
0
  int i, j;
1644
0
  tran_low_t out[8 * 8];
1645
0
  tran_low_t *outptr = out;
1646
0
  tran_low_t temp_in[8], temp_out[8];
1647
1648
  // First transform rows
1649
0
  for (i = 0; i < 8; ++i) {
1650
0
    vpx_highbd_idct8_c(input, outptr, bd);
1651
0
    input += 8;
1652
0
    outptr += 8;
1653
0
  }
1654
1655
  // Then transform columns
1656
0
  for (i = 0; i < 8; ++i) {
1657
0
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
1658
0
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
1659
0
    for (j = 0; j < 8; ++j) {
1660
0
      dest[j * stride + i] = highbd_clip_pixel_add(
1661
0
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1662
0
    }
1663
0
  }
1664
0
}
1665
1666
void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
1667
0
                                 int stride, int bd) {
1668
0
  int i, j;
1669
0
  tran_low_t out[8 * 8] = { 0 };
1670
0
  tran_low_t *outptr = out;
1671
0
  tran_low_t temp_in[8], temp_out[8];
1672
1673
  // First transform rows
1674
  // Only first 4 row has non-zero coefs
1675
0
  for (i = 0; i < 4; ++i) {
1676
0
    vpx_highbd_idct8_c(input, outptr, bd);
1677
0
    input += 8;
1678
0
    outptr += 8;
1679
0
  }
1680
1681
  // Then transform columns
1682
0
  for (i = 0; i < 8; ++i) {
1683
0
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
1684
0
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
1685
0
    for (j = 0; j < 8; ++j) {
1686
0
      dest[j * stride + i] = highbd_clip_pixel_add(
1687
0
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1688
0
    }
1689
0
  }
1690
0
}
1691
1692
void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
1693
0
                                int stride, int bd) {
1694
0
  int i, j;
1695
0
  tran_high_t a1;
1696
0
  tran_low_t out = HIGHBD_WRAPLOW(
1697
0
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
1698
1699
0
  out =
1700
0
      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
1701
0
  a1 = ROUND_POWER_OF_TWO(out, 5);
1702
0
  for (j = 0; j < 8; ++j) {
1703
0
    for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
1704
0
    dest += stride;
1705
0
  }
1706
0
}
1707
1708
0
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
1709
0
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
1710
0
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
1711
0
  tran_low_t x0 = input[15];
1712
0
  tran_low_t x1 = input[0];
1713
0
  tran_low_t x2 = input[13];
1714
0
  tran_low_t x3 = input[2];
1715
0
  tran_low_t x4 = input[11];
1716
0
  tran_low_t x5 = input[4];
1717
0
  tran_low_t x6 = input[9];
1718
0
  tran_low_t x7 = input[6];
1719
0
  tran_low_t x8 = input[7];
1720
0
  tran_low_t x9 = input[8];
1721
0
  tran_low_t x10 = input[5];
1722
0
  tran_low_t x11 = input[10];
1723
0
  tran_low_t x12 = input[3];
1724
0
  tran_low_t x13 = input[12];
1725
0
  tran_low_t x14 = input[1];
1726
0
  tran_low_t x15 = input[14];
1727
0
  (void)bd;
1728
1729
0
  if (detect_invalid_highbd_input(input, 16)) {
1730
#if CONFIG_COEFFICIENT_RANGE_CHECKING
1731
    assert(0 && "invalid highbd txfm input");
1732
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
1733
0
    memset(output, 0, sizeof(*output) * 16);
1734
0
    return;
1735
0
  }
1736
1737
0
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
1738
0
        x13 | x14 | x15)) {
1739
0
    memset(output, 0, 16 * sizeof(*output));
1740
0
    return;
1741
0
  }
1742
1743
  // stage 1
1744
0
  s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64;
1745
0
  s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64;
1746
0
  s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64;
1747
0
  s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64;
1748
0
  s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64;
1749
0
  s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64;
1750
0
  s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64;
1751
0
  s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64;
1752
0
  s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64;
1753
0
  s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64;
1754
0
  s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64;
1755
0
  s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64;
1756
0
  s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64;
1757
0
  s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64;
1758
0
  s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64;
1759
0
  s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64;
1760
1761
0
  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
1762
0
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
1763
0
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
1764
0
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
1765
0
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
1766
0
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
1767
0
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
1768
0
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
1769
0
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
1770
0
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
1771
0
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
1772
0
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
1773
0
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
1774
0
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
1775
0
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
1776
0
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
1777
1778
  // stage 2
1779
0
  s0 = x0;
1780
0
  s1 = x1;
1781
0
  s2 = x2;
1782
0
  s3 = x3;
1783
0
  s4 = x4;
1784
0
  s5 = x5;
1785
0
  s6 = x6;
1786
0
  s7 = x7;
1787
0
  s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64;
1788
0
  s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64;
1789
0
  s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64;
1790
0
  s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64;
1791
0
  s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64;
1792
0
  s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64;
1793
0
  s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64;
1794
0
  s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64;
1795
1796
0
  x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
1797
0
  x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
1798
0
  x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
1799
0
  x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
1800
0
  x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
1801
0
  x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
1802
0
  x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
1803
0
  x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
1804
0
  x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
1805
0
  x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
1806
0
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
1807
0
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
1808
0
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
1809
0
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
1810
0
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
1811
0
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
1812
1813
  // stage 3
1814
0
  s0 = x0;
1815
0
  s1 = x1;
1816
0
  s2 = x2;
1817
0
  s3 = x3;
1818
0
  s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64;
1819
0
  s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64;
1820
0
  s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64;
1821
0
  s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64;
1822
0
  s8 = x8;
1823
0
  s9 = x9;
1824
0
  s10 = x10;
1825
0
  s11 = x11;
1826
0
  s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64;
1827
0
  s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64;
1828
0
  s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64;
1829
0
  s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64;
1830
1831
0
  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
1832
0
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
1833
0
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
1834
0
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
1835
0
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1836
0
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1837
0
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1838
0
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
1839
0
  x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
1840
0
  x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
1841
0
  x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
1842
0
  x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
1843
0
  x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
1844
0
  x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
1845
0
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
1846
0
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
1847
1848
  // stage 4
1849
0
  s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3);
1850
0
  s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
1851
0
  s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
1852
0
  s7 = (tran_high_t)cospi_16_64 * (-x6 + x7);
1853
0
  s10 = (tran_high_t)cospi_16_64 * (x10 + x11);
1854
0
  s11 = (tran_high_t)cospi_16_64 * (-x10 + x11);
1855
0
  s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15);
1856
0
  s15 = (tran_high_t)cospi_16_64 * (x14 - x15);
1857
1858
0
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1859
0
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
1860
0
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
1861
0
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
1862
0
  x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
1863
0
  x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
1864
0
  x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
1865
0
  x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
1866
1867
0
  output[0] = HIGHBD_WRAPLOW(x0, bd);
1868
0
  output[1] = HIGHBD_WRAPLOW(-x8, bd);
1869
0
  output[2] = HIGHBD_WRAPLOW(x12, bd);
1870
0
  output[3] = HIGHBD_WRAPLOW(-x4, bd);
1871
0
  output[4] = HIGHBD_WRAPLOW(x6, bd);
1872
0
  output[5] = HIGHBD_WRAPLOW(x14, bd);
1873
0
  output[6] = HIGHBD_WRAPLOW(x10, bd);
1874
0
  output[7] = HIGHBD_WRAPLOW(x2, bd);
1875
0
  output[8] = HIGHBD_WRAPLOW(x3, bd);
1876
0
  output[9] = HIGHBD_WRAPLOW(x11, bd);
1877
0
  output[10] = HIGHBD_WRAPLOW(x15, bd);
1878
0
  output[11] = HIGHBD_WRAPLOW(x7, bd);
1879
0
  output[12] = HIGHBD_WRAPLOW(x5, bd);
1880
0
  output[13] = HIGHBD_WRAPLOW(-x13, bd);
1881
0
  output[14] = HIGHBD_WRAPLOW(x9, bd);
1882
0
  output[15] = HIGHBD_WRAPLOW(-x1, bd);
1883
0
}
1884
1885
0
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
1886
0
  tran_low_t step1[16], step2[16];
1887
0
  tran_high_t temp1, temp2;
1888
0
  (void)bd;
1889
1890
0
  if (detect_invalid_highbd_input(input, 16)) {
1891
#if CONFIG_COEFFICIENT_RANGE_CHECKING
1892
    assert(0 && "invalid highbd txfm input");
1893
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
1894
0
    memset(output, 0, sizeof(*output) * 16);
1895
0
    return;
1896
0
  }
1897
1898
  // stage 1
1899
0
  step1[0] = input[0 / 2];
1900
0
  step1[1] = input[16 / 2];
1901
0
  step1[2] = input[8 / 2];
1902
0
  step1[3] = input[24 / 2];
1903
0
  step1[4] = input[4 / 2];
1904
0
  step1[5] = input[20 / 2];
1905
0
  step1[6] = input[12 / 2];
1906
0
  step1[7] = input[28 / 2];
1907
0
  step1[8] = input[2 / 2];
1908
0
  step1[9] = input[18 / 2];
1909
0
  step1[10] = input[10 / 2];
1910
0
  step1[11] = input[26 / 2];
1911
0
  step1[12] = input[6 / 2];
1912
0
  step1[13] = input[22 / 2];
1913
0
  step1[14] = input[14 / 2];
1914
0
  step1[15] = input[30 / 2];
1915
1916
  // stage 2
1917
0
  step2[0] = step1[0];
1918
0
  step2[1] = step1[1];
1919
0
  step2[2] = step1[2];
1920
0
  step2[3] = step1[3];
1921
0
  step2[4] = step1[4];
1922
0
  step2[5] = step1[5];
1923
0
  step2[6] = step1[6];
1924
0
  step2[7] = step1[7];
1925
1926
0
  temp1 =
1927
0
      step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
1928
0
  temp2 =
1929
0
      step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
1930
0
  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1931
0
  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1932
1933
0
  temp1 = step1[9] * (tran_high_t)cospi_14_64 -
1934
0
          step1[14] * (tran_high_t)cospi_18_64;
1935
0
  temp2 = step1[9] * (tran_high_t)cospi_18_64 +
1936
0
          step1[14] * (tran_high_t)cospi_14_64;
1937
0
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1938
0
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1939
1940
0
  temp1 = step1[10] * (tran_high_t)cospi_22_64 -
1941
0
          step1[13] * (tran_high_t)cospi_10_64;
1942
0
  temp2 = step1[10] * (tran_high_t)cospi_10_64 +
1943
0
          step1[13] * (tran_high_t)cospi_22_64;
1944
0
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1945
0
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1946
1947
0
  temp1 = step1[11] * (tran_high_t)cospi_6_64 -
1948
0
          step1[12] * (tran_high_t)cospi_26_64;
1949
0
  temp2 = step1[11] * (tran_high_t)cospi_26_64 +
1950
0
          step1[12] * (tran_high_t)cospi_6_64;
1951
0
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1952
0
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1953
1954
  // stage 3
1955
0
  step1[0] = step2[0];
1956
0
  step1[1] = step2[1];
1957
0
  step1[2] = step2[2];
1958
0
  step1[3] = step2[3];
1959
1960
0
  temp1 =
1961
0
      step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
1962
0
  temp2 =
1963
0
      step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
1964
0
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1965
0
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1966
0
  temp1 =
1967
0
      step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
1968
0
  temp2 =
1969
0
      step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
1970
0
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1971
0
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1972
1973
0
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
1974
0
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
1975
0
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
1976
0
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
1977
0
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
1978
0
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
1979
0
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
1980
0
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
1981
1982
  // stage 4
1983
0
  temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
1984
0
  temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
1985
0
  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1986
0
  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1987
0
  temp1 =
1988
0
      step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
1989
0
  temp2 =
1990
0
      step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
1991
0
  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1992
0
  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1993
0
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
1994
0
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
1995
0
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
1996
0
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
1997
1998
0
  step2[8] = step1[8];
1999
0
  step2[15] = step1[15];
2000
0
  temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
2001
0
          step1[14] * (tran_high_t)cospi_24_64;
2002
0
  temp2 =
2003
0
      step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
2004
0
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2005
0
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2006
0
  temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
2007
0
          step1[13] * (tran_high_t)cospi_8_64;
2008
0
  temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
2009
0
          step1[13] * (tran_high_t)cospi_24_64;
2010
0
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2011
0
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2012
0
  step2[11] = step1[11];
2013
0
  step2[12] = step1[12];
2014
2015
  // stage 5
2016
0
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2017
0
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2018
0
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2019
0
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2020
0
  step1[4] = step2[4];
2021
0
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
2022
0
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
2023
0
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2024
0
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2025
0
  step1[7] = step2[7];
2026
2027
0
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2028
0
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2029
0
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2030
0
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2031
0
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2032
0
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2033
0
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2034
0
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2035
2036
  // stage 6
2037
0
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2038
0
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2039
0
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2040
0
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2041
0
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2042
0
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2043
0
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2044
0
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2045
0
  step2[8] = step1[8];
2046
0
  step2[9] = step1[9];
2047
0
  temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
2048
0
  temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
2049
0
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2050
0
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2051
0
  temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
2052
0
  temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
2053
0
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2054
0
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2055
0
  step2[14] = step1[14];
2056
0
  step2[15] = step1[15];
2057
2058
  // stage 7
2059
0
  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2060
0
  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2061
0
  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2062
0
  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2063
0
  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2064
0
  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2065
0
  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2066
0
  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2067
0
  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2068
0
  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2069
0
  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2070
0
  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2071
0
  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2072
0
  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2073
0
  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2074
0
  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2075
0
}
2076
2077
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
2078
0
                                    int stride, int bd) {
2079
0
  int i, j;
2080
0
  tran_low_t out[16 * 16];
2081
0
  tran_low_t *outptr = out;
2082
0
  tran_low_t temp_in[16], temp_out[16];
2083
2084
  // First transform rows
2085
0
  for (i = 0; i < 16; ++i) {
2086
0
    vpx_highbd_idct16_c(input, outptr, bd);
2087
0
    input += 16;
2088
0
    outptr += 16;
2089
0
  }
2090
2091
  // Then transform columns
2092
0
  for (i = 0; i < 16; ++i) {
2093
0
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
2094
0
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
2095
0
    for (j = 0; j < 16; ++j) {
2096
0
      dest[j * stride + i] = highbd_clip_pixel_add(
2097
0
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2098
0
    }
2099
0
  }
2100
0
}
2101
2102
void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
2103
0
                                   int stride, int bd) {
2104
0
  int i, j;
2105
0
  tran_low_t out[16 * 16] = { 0 };
2106
0
  tran_low_t *outptr = out;
2107
0
  tran_low_t temp_in[16], temp_out[16];
2108
2109
  // First transform rows. Since all non-zero dct coefficients are in
2110
  // upper-left 8x8 area, we only need to calculate first 8 rows here.
2111
0
  for (i = 0; i < 8; ++i) {
2112
0
    vpx_highbd_idct16_c(input, outptr, bd);
2113
0
    input += 16;
2114
0
    outptr += 16;
2115
0
  }
2116
2117
  // Then transform columns
2118
0
  for (i = 0; i < 16; ++i) {
2119
0
    uint16_t *destT = dest;
2120
0
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
2121
0
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
2122
0
    for (j = 0; j < 16; ++j) {
2123
0
      destT[i] = highbd_clip_pixel_add(destT[i],
2124
0
                                       ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2125
0
      destT += stride;
2126
0
    }
2127
0
  }
2128
0
}
2129
2130
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
2131
0
                                   int stride, int bd) {
2132
0
  int i, j;
2133
0
  tran_low_t out[16 * 16] = { 0 };
2134
0
  tran_low_t *outptr = out;
2135
0
  tran_low_t temp_in[16], temp_out[16];
2136
2137
  // First transform rows. Since all non-zero dct coefficients are in
2138
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
2139
0
  for (i = 0; i < 4; ++i) {
2140
0
    vpx_highbd_idct16_c(input, outptr, bd);
2141
0
    input += 16;
2142
0
    outptr += 16;
2143
0
  }
2144
2145
  // Then transform columns
2146
0
  for (i = 0; i < 16; ++i) {
2147
0
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
2148
0
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
2149
0
    for (j = 0; j < 16; ++j) {
2150
0
      dest[j * stride + i] = highbd_clip_pixel_add(
2151
0
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2152
0
    }
2153
0
  }
2154
0
}
2155
2156
void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
2157
0
                                  int stride, int bd) {
2158
0
  int i, j;
2159
0
  tran_high_t a1;
2160
0
  tran_low_t out = HIGHBD_WRAPLOW(
2161
0
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
2162
2163
0
  out =
2164
0
      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
2165
0
  a1 = ROUND_POWER_OF_TWO(out, 6);
2166
0
  for (j = 0; j < 16; ++j) {
2167
0
    for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2168
0
    dest += stride;
2169
0
  }
2170
0
}
2171
2172
static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
2173
0
                            int bd) {
2174
0
  tran_low_t step1[32], step2[32];
2175
0
  tran_high_t temp1, temp2;
2176
0
  (void)bd;
2177
2178
0
  if (detect_invalid_highbd_input(input, 32)) {
2179
#if CONFIG_COEFFICIENT_RANGE_CHECKING
2180
    assert(0 && "invalid highbd txfm input");
2181
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
2182
0
    memset(output, 0, sizeof(*output) * 32);
2183
0
    return;
2184
0
  }
2185
2186
  // stage 1
2187
0
  step1[0] = input[0];
2188
0
  step1[1] = input[16];
2189
0
  step1[2] = input[8];
2190
0
  step1[3] = input[24];
2191
0
  step1[4] = input[4];
2192
0
  step1[5] = input[20];
2193
0
  step1[6] = input[12];
2194
0
  step1[7] = input[28];
2195
0
  step1[8] = input[2];
2196
0
  step1[9] = input[18];
2197
0
  step1[10] = input[10];
2198
0
  step1[11] = input[26];
2199
0
  step1[12] = input[6];
2200
0
  step1[13] = input[22];
2201
0
  step1[14] = input[14];
2202
0
  step1[15] = input[30];
2203
2204
0
  temp1 =
2205
0
      input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64;
2206
0
  temp2 =
2207
0
      input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64;
2208
0
  step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2209
0
  step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2210
2211
0
  temp1 = input[17] * (tran_high_t)cospi_15_64 -
2212
0
          input[15] * (tran_high_t)cospi_17_64;
2213
0
  temp2 = input[17] * (tran_high_t)cospi_17_64 +
2214
0
          input[15] * (tran_high_t)cospi_15_64;
2215
0
  step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2216
0
  step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2217
2218
0
  temp1 =
2219
0
      input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64;
2220
0
  temp2 =
2221
0
      input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64;
2222
0
  step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2223
0
  step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2224
2225
0
  temp1 =
2226
0
      input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64;
2227
0
  temp2 =
2228
0
      input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64;
2229
0
  step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2230
0
  step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2231
2232
0
  temp1 =
2233
0
      input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64;
2234
0
  temp2 =
2235
0
      input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64;
2236
0
  step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2237
0
  step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2238
2239
0
  temp1 = input[21] * (tran_high_t)cospi_11_64 -
2240
0
          input[11] * (tran_high_t)cospi_21_64;
2241
0
  temp2 = input[21] * (tran_high_t)cospi_21_64 +
2242
0
          input[11] * (tran_high_t)cospi_11_64;
2243
0
  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2244
0
  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2245
2246
0
  temp1 = input[13] * (tran_high_t)cospi_19_64 -
2247
0
          input[19] * (tran_high_t)cospi_13_64;
2248
0
  temp2 = input[13] * (tran_high_t)cospi_13_64 +
2249
0
          input[19] * (tran_high_t)cospi_19_64;
2250
0
  step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2251
0
  step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2252
2253
0
  temp1 =
2254
0
      input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64;
2255
0
  temp2 =
2256
0
      input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64;
2257
0
  step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2258
0
  step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2259
2260
  // stage 2
2261
0
  step2[0] = step1[0];
2262
0
  step2[1] = step1[1];
2263
0
  step2[2] = step1[2];
2264
0
  step2[3] = step1[3];
2265
0
  step2[4] = step1[4];
2266
0
  step2[5] = step1[5];
2267
0
  step2[6] = step1[6];
2268
0
  step2[7] = step1[7];
2269
2270
0
  temp1 =
2271
0
      step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
2272
0
  temp2 =
2273
0
      step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
2274
0
  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2275
0
  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2276
2277
0
  temp1 = step1[9] * (tran_high_t)cospi_14_64 -
2278
0
          step1[14] * (tran_high_t)cospi_18_64;
2279
0
  temp2 = step1[9] * (tran_high_t)cospi_18_64 +
2280
0
          step1[14] * (tran_high_t)cospi_14_64;
2281
0
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2282
0
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2283
2284
0
  temp1 = step1[10] * (tran_high_t)cospi_22_64 -
2285
0
          step1[13] * (tran_high_t)cospi_10_64;
2286
0
  temp2 = step1[10] * (tran_high_t)cospi_10_64 +
2287
0
          step1[13] * (tran_high_t)cospi_22_64;
2288
0
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2289
0
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2290
2291
0
  temp1 = step1[11] * (tran_high_t)cospi_6_64 -
2292
0
          step1[12] * (tran_high_t)cospi_26_64;
2293
0
  temp2 = step1[11] * (tran_high_t)cospi_26_64 +
2294
0
          step1[12] * (tran_high_t)cospi_6_64;
2295
0
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2296
0
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2297
2298
0
  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
2299
0
  step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
2300
0
  step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
2301
0
  step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
2302
0
  step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
2303
0
  step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
2304
0
  step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
2305
0
  step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
2306
0
  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
2307
0
  step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
2308
0
  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
2309
0
  step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
2310
0
  step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
2311
0
  step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
2312
0
  step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
2313
0
  step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
2314
2315
  // stage 3
2316
0
  step1[0] = step2[0];
2317
0
  step1[1] = step2[1];
2318
0
  step1[2] = step2[2];
2319
0
  step1[3] = step2[3];
2320
2321
0
  temp1 =
2322
0
      step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
2323
0
  temp2 =
2324
0
      step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
2325
0
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2326
0
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2327
0
  temp1 =
2328
0
      step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
2329
0
  temp2 =
2330
0
      step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
2331
0
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2332
0
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2333
2334
0
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
2335
0
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
2336
0
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
2337
0
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
2338
0
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
2339
0
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
2340
0
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
2341
0
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
2342
2343
0
  step1[16] = step2[16];
2344
0
  step1[31] = step2[31];
2345
0
  temp1 = -step2[17] * (tran_high_t)cospi_4_64 +
2346
0
          step2[30] * (tran_high_t)cospi_28_64;
2347
0
  temp2 = step2[17] * (tran_high_t)cospi_28_64 +
2348
0
          step2[30] * (tran_high_t)cospi_4_64;
2349
0
  step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2350
0
  step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2351
0
  temp1 = -step2[18] * (tran_high_t)cospi_28_64 -
2352
0
          step2[29] * (tran_high_t)cospi_4_64;
2353
0
  temp2 = -step2[18] * (tran_high_t)cospi_4_64 +
2354
0
          step2[29] * (tran_high_t)cospi_28_64;
2355
0
  step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2356
0
  step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2357
0
  step1[19] = step2[19];
2358
0
  step1[20] = step2[20];
2359
0
  temp1 = -step2[21] * (tran_high_t)cospi_20_64 +
2360
0
          step2[26] * (tran_high_t)cospi_12_64;
2361
0
  temp2 = step2[21] * (tran_high_t)cospi_12_64 +
2362
0
          step2[26] * (tran_high_t)cospi_20_64;
2363
0
  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2364
0
  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2365
0
  temp1 = -step2[22] * (tran_high_t)cospi_12_64 -
2366
0
          step2[25] * (tran_high_t)cospi_20_64;
2367
0
  temp2 = -step2[22] * (tran_high_t)cospi_20_64 +
2368
0
          step2[25] * (tran_high_t)cospi_12_64;
2369
0
  step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2370
0
  step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2371
0
  step1[23] = step2[23];
2372
0
  step1[24] = step2[24];
2373
0
  step1[27] = step2[27];
2374
0
  step1[28] = step2[28];
2375
2376
  // stage 4
2377
0
  temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
2378
0
  temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
2379
0
  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2380
0
  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2381
0
  temp1 =
2382
0
      step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
2383
0
  temp2 =
2384
0
      step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
2385
0
  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2386
0
  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2387
0
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
2388
0
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
2389
0
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
2390
0
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
2391
2392
0
  step2[8] = step1[8];
2393
0
  step2[15] = step1[15];
2394
0
  temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
2395
0
          step1[14] * (tran_high_t)cospi_24_64;
2396
0
  temp2 =
2397
0
      step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
2398
0
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2399
0
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2400
0
  temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
2401
0
          step1[13] * (tran_high_t)cospi_8_64;
2402
0
  temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
2403
0
          step1[13] * (tran_high_t)cospi_24_64;
2404
0
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2405
0
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2406
0
  step2[11] = step1[11];
2407
0
  step2[12] = step1[12];
2408
2409
0
  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
2410
0
  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
2411
0
  step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
2412
0
  step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
2413
0
  step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
2414
0
  step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
2415
0
  step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
2416
0
  step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
2417
2418
0
  step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
2419
0
  step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
2420
0
  step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
2421
0
  step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
2422
0
  step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
2423
0
  step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
2424
0
  step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
2425
0
  step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
2426
2427
  // stage 5
2428
0
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2429
0
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2430
0
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2431
0
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2432
0
  step1[4] = step2[4];
2433
0
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
2434
0
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
2435
0
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2436
0
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2437
0
  step1[7] = step2[7];
2438
2439
0
  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2440
0
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2441
0
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2442
0
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2443
0
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2444
0
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2445
0
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2446
0
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2447
2448
0
  step1[16] = step2[16];
2449
0
  step1[17] = step2[17];
2450
0
  temp1 = -step2[18] * (tran_high_t)cospi_8_64 +
2451
0
          step2[29] * (tran_high_t)cospi_24_64;
2452
0
  temp2 = step2[18] * (tran_high_t)cospi_24_64 +
2453
0
          step2[29] * (tran_high_t)cospi_8_64;
2454
0
  step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2455
0
  step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2456
0
  temp1 = -step2[19] * (tran_high_t)cospi_8_64 +
2457
0
          step2[28] * (tran_high_t)cospi_24_64;
2458
0
  temp2 = step2[19] * (tran_high_t)cospi_24_64 +
2459
0
          step2[28] * (tran_high_t)cospi_8_64;
2460
0
  step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2461
0
  step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2462
0
  temp1 = -step2[20] * (tran_high_t)cospi_24_64 -
2463
0
          step2[27] * (tran_high_t)cospi_8_64;
2464
0
  temp2 = -step2[20] * (tran_high_t)cospi_8_64 +
2465
0
          step2[27] * (tran_high_t)cospi_24_64;
2466
0
  step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2467
0
  step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2468
0
  temp1 = -step2[21] * (tran_high_t)cospi_24_64 -
2469
0
          step2[26] * (tran_high_t)cospi_8_64;
2470
0
  temp2 = -step2[21] * (tran_high_t)cospi_8_64 +
2471
0
          step2[26] * (tran_high_t)cospi_24_64;
2472
0
  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2473
0
  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2474
0
  step1[22] = step2[22];
2475
0
  step1[23] = step2[23];
2476
0
  step1[24] = step2[24];
2477
0
  step1[25] = step2[25];
2478
0
  step1[30] = step2[30];
2479
0
  step1[31] = step2[31];
2480
2481
  // stage 6
2482
0
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2483
0
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2484
0
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2485
0
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2486
0
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2487
0
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2488
0
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2489
0
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2490
0
  step2[8] = step1[8];
2491
0
  step2[9] = step1[9];
2492
0
  temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
2493
0
  temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
2494
0
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2495
0
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2496
0
  temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
2497
0
  temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
2498
0
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2499
0
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2500
0
  step2[14] = step1[14];
2501
0
  step2[15] = step1[15];
2502
2503
0
  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
2504
0
  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
2505
0
  step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
2506
0
  step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
2507
0
  step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
2508
0
  step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
2509
0
  step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
2510
0
  step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
2511
2512
0
  step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
2513
0
  step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
2514
0
  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
2515
0
  step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
2516
0
  step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
2517
0
  step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
2518
0
  step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
2519
0
  step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
2520
2521
  // stage 7
2522
0
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2523
0
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2524
0
  step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2525
0
  step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2526
0
  step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2527
0
  step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2528
0
  step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2529
0
  step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2530
0
  step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2531
0
  step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2532
0
  step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2533
0
  step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2534
0
  step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2535
0
  step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2536
0
  step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2537
0
  step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2538
2539
0
  step1[16] = step2[16];
2540
0
  step1[17] = step2[17];
2541
0
  step1[18] = step2[18];
2542
0
  step1[19] = step2[19];
2543
0
  temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
2544
0
  temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
2545
0
  step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2546
0
  step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2547
0
  temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
2548
0
  temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
2549
0
  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2550
0
  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2551
0
  temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
2552
0
  temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
2553
0
  step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2554
0
  step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2555
0
  temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
2556
0
  temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
2557
0
  step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2558
0
  step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2559
0
  step1[28] = step2[28];
2560
0
  step1[29] = step2[29];
2561
0
  step1[30] = step2[30];
2562
0
  step1[31] = step2[31];
2563
2564
  // final stage
2565
0
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
2566
0
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
2567
0
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
2568
0
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
2569
0
  output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
2570
0
  output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
2571
0
  output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
2572
0
  output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
2573
0
  output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
2574
0
  output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
2575
0
  output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
2576
0
  output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
2577
0
  output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
2578
0
  output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
2579
0
  output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
2580
0
  output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
2581
0
  output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
2582
0
  output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
2583
0
  output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
2584
0
  output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
2585
0
  output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
2586
0
  output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
2587
0
  output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
2588
0
  output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
2589
0
  output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
2590
0
  output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
2591
0
  output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
2592
0
  output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
2593
0
  output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
2594
0
  output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
2595
0
  output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
2596
0
  output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
2597
0
}
2598
2599
void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
2600
0
                                     int stride, int bd) {
2601
0
  int i, j;
2602
0
  tran_low_t out[32 * 32];
2603
0
  tran_low_t *outptr = out;
2604
0
  tran_low_t temp_in[32], temp_out[32];
2605
2606
  // Rows
2607
0
  for (i = 0; i < 32; ++i) {
2608
0
    tran_low_t zero_coeff = 0;
2609
0
    for (j = 0; j < 32; ++j) zero_coeff |= input[j];
2610
2611
0
    if (zero_coeff)
2612
0
      highbd_idct32_c(input, outptr, bd);
2613
0
    else
2614
0
      memset(outptr, 0, sizeof(tran_low_t) * 32);
2615
0
    input += 32;
2616
0
    outptr += 32;
2617
0
  }
2618
2619
  // Columns
2620
0
  for (i = 0; i < 32; ++i) {
2621
0
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2622
0
    highbd_idct32_c(temp_in, temp_out, bd);
2623
0
    for (j = 0; j < 32; ++j) {
2624
0
      dest[j * stride + i] = highbd_clip_pixel_add(
2625
0
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2626
0
    }
2627
0
  }
2628
0
}
2629
2630
void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
2631
0
                                    int stride, int bd) {
2632
0
  int i, j;
2633
0
  tran_low_t out[32 * 32] = { 0 };
2634
0
  tran_low_t *outptr = out;
2635
0
  tran_low_t temp_in[32], temp_out[32];
2636
2637
  // Rows
2638
  // Only upper-left 16x16 has non-zero coeff
2639
0
  for (i = 0; i < 16; ++i) {
2640
0
    highbd_idct32_c(input, outptr, bd);
2641
0
    input += 32;
2642
0
    outptr += 32;
2643
0
  }
2644
2645
  // Columns
2646
0
  for (i = 0; i < 32; ++i) {
2647
0
    uint16_t *destT = dest;
2648
0
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2649
0
    highbd_idct32_c(temp_in, temp_out, bd);
2650
0
    for (j = 0; j < 32; ++j) {
2651
0
      destT[i] = highbd_clip_pixel_add(destT[i],
2652
0
                                       ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2653
0
      destT += stride;
2654
0
    }
2655
0
  }
2656
0
}
2657
2658
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
2659
0
                                   int stride, int bd) {
2660
0
  int i, j;
2661
0
  tran_low_t out[32 * 32] = { 0 };
2662
0
  tran_low_t *outptr = out;
2663
0
  tran_low_t temp_in[32], temp_out[32];
2664
2665
  // Rows
2666
  // Only upper-left 8x8 has non-zero coeff
2667
0
  for (i = 0; i < 8; ++i) {
2668
0
    highbd_idct32_c(input, outptr, bd);
2669
0
    input += 32;
2670
0
    outptr += 32;
2671
0
  }
2672
2673
  // Columns
2674
0
  for (i = 0; i < 32; ++i) {
2675
0
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2676
0
    highbd_idct32_c(temp_in, temp_out, bd);
2677
0
    for (j = 0; j < 32; ++j) {
2678
0
      dest[j * stride + i] = highbd_clip_pixel_add(
2679
0
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2680
0
    }
2681
0
  }
2682
0
}
2683
2684
void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
2685
0
                                  int stride, int bd) {
2686
0
  int i, j;
2687
0
  int a1;
2688
0
  tran_low_t out = HIGHBD_WRAPLOW(
2689
0
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
2690
2691
0
  out =
2692
0
      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
2693
0
  a1 = ROUND_POWER_OF_TWO(out, 6);
2694
2695
0
  for (j = 0; j < 32; ++j) {
2696
0
    for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2697
0
    dest += stride;
2698
0
  }
2699
0
}
2700
2701
#endif  // CONFIG_VP9_HIGHBITDEPTH