Coverage Report

Created: 2026-01-16 07:48

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/ffmpeg/libavcodec/jrevdct.c
Line
Count
Source
1
/*
2
 * This file is part of the Independent JPEG Group's software.
3
 *
4
 * The authors make NO WARRANTY or representation, either express or implied,
5
 * with respect to this software, its quality, accuracy, merchantability, or
6
 * fitness for a particular purpose.  This software is provided "AS IS", and
7
 * you, its user, assume the entire risk as to its quality and accuracy.
8
 *
9
 * This software is copyright (C) 1991, 1992, Thomas G. Lane.
10
 * All Rights Reserved except as specified below.
11
 *
12
 * Permission is hereby granted to use, copy, modify, and distribute this
13
 * software (or portions thereof) for any purpose, without fee, subject to
14
 * these conditions:
15
 * (1) If any part of the source code for this software is distributed, then
16
 * this README file must be included, with this copyright and no-warranty
17
 * notice unaltered; and any additions, deletions, or changes to the original
18
 * files must be clearly indicated in accompanying documentation.
19
 * (2) If only executable code is distributed, then the accompanying
20
 * documentation must state that "this software is based in part on the work
21
 * of the Independent JPEG Group".
22
 * (3) Permission for use of this software is granted only if the user accepts
23
 * full responsibility for any undesirable consequences; the authors accept
24
 * NO LIABILITY for damages of any kind.
25
 *
26
 * These conditions apply to any software derived from or based on the IJG
27
 * code, not just to the unmodified library.  If you use our work, you ought
28
 * to acknowledge us.
29
 *
30
 * Permission is NOT granted for the use of any IJG author's name or company
31
 * name in advertising or publicity relating to this software or products
32
 * derived from it.  This software may be referred to only as "the Independent
33
 * JPEG Group's software".
34
 *
35
 * We specifically permit and encourage the use of this software as the basis
36
 * of commercial products, provided that all warranty or liability claims are
37
 * assumed by the product vendor.
38
 *
39
 * This file contains the basic inverse-DCT transformation subroutine.
40
 *
41
 * This implementation is based on an algorithm described in
42
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
43
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
44
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
45
 * The primary algorithm described there uses 11 multiplies and 29 adds.
46
 * We use their alternate method with 12 multiplies and 32 adds.
47
 * The advantage of this method is that no data path contains more than one
48
 * multiplication; this allows a very simple and accurate implementation in
49
 * scaled fixed-point arithmetic, with a minimal number of shifts.
50
 *
51
 * I've made lots of modifications to attempt to take advantage of the
52
 * sparse nature of the DCT matrices we're getting.  Although the logic
53
 * is cumbersome, it's straightforward and the resulting code is much
54
 * faster.
55
 *
56
 * A better way to do this would be to pass in the DCT block as a sparse
57
 * matrix, perhaps with the difference cases encoded.
58
 */
59
60
/**
61
 * @file
62
 * Independent JPEG Group's LLM idct.
63
 */
64
65
#include <stddef.h>
66
#include <stdint.h>
67
68
#include "libavutil/intreadwrite.h"
69
70
#include "dct.h"
71
#include "idctdsp.h"
72
73
#define EIGHT_BIT_SAMPLES
74
75
2.71G
#define DCTSIZE 8
76
#define DCTSIZE2 64
77
78
#define GLOBAL
79
80
1.46G
#define RIGHT_SHIFT(x, n) ((x) >> (n))
81
82
typedef int16_t DCTBLOCK[DCTSIZE2];
83
84
754M
#define CONST_BITS 13
85
86
/*
87
 * This routine is specialized to the case DCTSIZE = 8.
88
 */
89
90
#if DCTSIZE != 8
91
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
92
#endif
93
94
95
/*
96
 * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
97
 * on each column.  Direct algorithms are also available, but they are
98
 * much more complex and seem not to be any faster when reduced to code.
99
 *
100
 * The poop on this scaling stuff is as follows:
101
 *
102
 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
103
 * larger than the true IDCT outputs.  The final outputs are therefore
104
 * a factor of N larger than desired; since N=8 this can be cured by
105
 * a simple right shift at the end of the algorithm.  The advantage of
106
 * this arrangement is that we save two multiplications per 1-D IDCT,
107
 * because the y0 and y4 inputs need not be divided by sqrt(N).
108
 *
109
 * We have to do addition and subtraction of the integer inputs, which
110
 * is no problem, and multiplication by fractional constants, which is
111
 * a problem to do in integer arithmetic.  We multiply all the constants
112
 * by CONST_SCALE and convert them to integer constants (thus retaining
113
 * CONST_BITS bits of precision in the constants).  After doing a
114
 * multiplication we have to divide the product by CONST_SCALE, with proper
115
 * rounding, to produce the correct output.  This division can be done
116
 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
117
 * as long as possible so that partial sums can be added together with
118
 * full fractional precision.
119
 *
120
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
121
 * they are represented to better-than-integral precision.  These outputs
122
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
123
 * with the recommended scaling.  (To scale up 12-bit sample data further, an
124
 * intermediate int32 array would be needed.)
125
 *
126
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
127
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
128
 * shows that the values given below are the most effective.
129
 */
130
131
#ifdef EIGHT_BIT_SAMPLES
132
272M
#define PASS1_BITS  2
133
#else
134
#define PASS1_BITS  1   /* lose a little precision to avoid overflow */
135
#endif
136
137
351M
#define ONE         ((int32_t) 1)
138
139
351M
#define CONST_SCALE (ONE << CONST_BITS)
140
141
/* Convert a positive real constant to an integer scaled by CONST_SCALE.
142
 * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
143
 * you will pay a significant penalty in run time.  In that case, figure
144
 * the correct integer constant values and insert them by hand.
145
 */
146
147
/* Actually FIX is no longer used, we precomputed them all */
148
#define FIX(x)  ((int32_t) ((x) * CONST_SCALE + 0.5))
149
150
/* Descale and correctly round an int32_t value that's scaled by N bits.
151
 * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
152
 * the fudge factor is correct for either sign of X.
153
 */
154
155
1.46G
#define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
156
157
/* Multiply an int32_t variable by an int32_t constant to yield an int32_t result.
158
 * For 8-bit samples with the recommended scaling, all the variable
159
 * and constant values involved are no more than 16 bits wide, so a
160
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
161
 * this provides a useful speedup on many machines.
162
 * There is no way to specify a 16x16->32 multiply in portable C, but
163
 * some C compilers will do the right thing if you provide the correct
164
 * combination of casts.
165
 * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
166
 */
167
168
#ifdef EIGHT_BIT_SAMPLES
169
#ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
170
#define MULTIPLY(var,const)  (((int16_t) (var)) * ((int16_t) (const)))
171
#endif
172
#ifdef SHORTxLCONST_32          /* known to work with Microsoft C 6.0 */
173
#define MULTIPLY(var,const)  (((int16_t) (var)) * ((int32_t) (const)))
174
#endif
175
#endif
176
177
#ifndef MULTIPLY                /* default definition */
178
540M
#define MULTIPLY(var,const)  ((var) * (const))
179
#endif
180
181
182
/*
183
  Unlike our decoder where we approximate the FIXes, we need to use exact
184
ones here or successive P-frames will drift too much with Reference frame coding
185
*/
186
#define FIX_0_211164243 1730
187
#define FIX_0_275899380 2260
188
#define FIX_0_298631336 2446
189
#define FIX_0_390180644 3196
190
#define FIX_0_509795579 4176
191
#define FIX_0_541196100 4433
192
#define FIX_0_601344887 4926
193
#define FIX_0_765366865 6270
194
#define FIX_0_785694958 6436
195
#define FIX_0_899976223 7373
196
#define FIX_1_061594337 8697
197
#define FIX_1_111140466 9102
198
#define FIX_1_175875602 9633
199
#define FIX_1_306562965 10703
200
#define FIX_1_387039845 11363
201
#define FIX_1_451774981 11893
202
#define FIX_1_501321110 12299
203
#define FIX_1_662939225 13623
204
#define FIX_1_847759065 15137
205
#define FIX_1_961570560 16069
206
#define FIX_2_053119869 16819
207
#define FIX_2_172734803 17799
208
#define FIX_2_562915447 20995
209
#define FIX_3_072711026 25172
210
211
/*
212
 * Perform the inverse DCT on one block of coefficients.
213
 */
214
215
void ff_j_rev_dct(DCTBLOCK data)
216
19.6M
{
217
19.6M
  int32_t tmp0, tmp1, tmp2, tmp3;
218
19.6M
  int32_t tmp10, tmp11, tmp12, tmp13;
219
19.6M
  int32_t z1, z2, z3, z4, z5;
220
19.6M
  int32_t d0, d1, d2, d3, d4, d5, d6, d7;
221
19.6M
  register int16_t *dataptr;
222
19.6M
  int rowctr;
223
224
  /* Pass 1: process rows. */
225
  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
226
  /* furthermore, we scale the results by 2**PASS1_BITS. */
227
228
19.6M
  dataptr = data;
229
230
177M
  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
231
    /* Due to quantization, we will usually find that many of the input
232
     * coefficients are zero, especially the AC terms.  We can exploit this
233
     * by short-circuiting the IDCT calculation for any row in which all
234
     * the AC terms are zero.  In that case each output is equal to the
235
     * DC coefficient (with scale factor as needed).
236
     * With typical images and quantization tables, half or more of the
237
     * row DCT calculations can be simplified this way.
238
     */
239
240
157M
    register uint8_t *idataptr = (uint8_t*)dataptr;
241
242
    /* WARNING: we do the same permutation as MMX idct to simplify the
243
       video core */
244
157M
    d0 = dataptr[0];
245
157M
    d2 = dataptr[1];
246
157M
    d4 = dataptr[2];
247
157M
    d6 = dataptr[3];
248
157M
    d1 = dataptr[4];
249
157M
    d3 = dataptr[5];
250
157M
    d5 = dataptr[6];
251
157M
    d7 = dataptr[7];
252
253
157M
    if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) {
254
      /* AC terms all zero */
255
139M
      if (d0) {
256
          /* Compute a 32 bit value to assign. */
257
8.62M
          int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS));
258
8.62M
          register unsigned v = (dcval & 0xffff) | ((uint32_t)dcval << 16);
259
260
8.62M
          AV_WN32A(&idataptr[ 0], v);
261
8.62M
          AV_WN32A(&idataptr[ 4], v);
262
8.62M
          AV_WN32A(&idataptr[ 8], v);
263
8.62M
          AV_WN32A(&idataptr[12], v);
264
8.62M
      }
265
266
139M
      dataptr += DCTSIZE;       /* advance pointer to next row */
267
139M
      continue;
268
139M
    }
269
270
    /* Even part: reverse the even part of the forward DCT. */
271
    /* The rotator is sqrt(2)*c(-6). */
272
18.1M
{
273
18.1M
    if (d6) {
274
4.36M
            if (d2) {
275
                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
276
3.43M
                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
277
3.43M
                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
278
3.43M
                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
279
280
3.43M
                    tmp0 = (d0 + d4) * CONST_SCALE;
281
3.43M
                    tmp1 = (d0 - d4) * CONST_SCALE;
282
283
3.43M
                    tmp10 = tmp0 + tmp3;
284
3.43M
                    tmp13 = tmp0 - tmp3;
285
3.43M
                    tmp11 = tmp1 + tmp2;
286
3.43M
                    tmp12 = tmp1 - tmp2;
287
3.43M
            } else {
288
                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
289
932k
                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
290
932k
                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
291
292
932k
                    tmp0 = (d0 + d4) * CONST_SCALE;
293
932k
                    tmp1 = (d0 - d4) * CONST_SCALE;
294
295
932k
                    tmp10 = tmp0 + tmp3;
296
932k
                    tmp13 = tmp0 - tmp3;
297
932k
                    tmp11 = tmp1 + tmp2;
298
932k
                    tmp12 = tmp1 - tmp2;
299
932k
            }
300
13.8M
    } else {
301
13.8M
            if (d2) {
302
                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
303
7.30M
                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
304
7.30M
                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
305
306
7.30M
                    tmp0 = (d0 + d4) * CONST_SCALE;
307
7.30M
                    tmp1 = (d0 - d4) * CONST_SCALE;
308
309
7.30M
                    tmp10 = tmp0 + tmp3;
310
7.30M
                    tmp13 = tmp0 - tmp3;
311
7.30M
                    tmp11 = tmp1 + tmp2;
312
7.30M
                    tmp12 = tmp1 - tmp2;
313
7.30M
            } else {
314
                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
315
6.51M
                    tmp10 = tmp13 = (d0 + d4) * CONST_SCALE;
316
6.51M
                    tmp11 = tmp12 = (d0 - d4) * CONST_SCALE;
317
6.51M
            }
318
13.8M
      }
319
320
    /* Odd part per figure 8; the matrix is unitary and hence its
321
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
322
     */
323
324
18.1M
    if (d7) {
325
3.99M
        if (d5) {
326
3.05M
            if (d3) {
327
2.69M
                if (d1) {
328
                    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
329
2.44M
                    z1 = d7 + d1;
330
2.44M
                    z2 = d5 + d3;
331
2.44M
                    z3 = d7 + d3;
332
2.44M
                    z4 = d5 + d1;
333
2.44M
                    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
334
335
2.44M
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
336
2.44M
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
337
2.44M
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
338
2.44M
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
339
2.44M
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
340
2.44M
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
341
2.44M
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
342
2.44M
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
343
344
2.44M
                    z3 += z5;
345
2.44M
                    z4 += z5;
346
347
2.44M
                    tmp0 += z1 + z3;
348
2.44M
                    tmp1 += z2 + z4;
349
2.44M
                    tmp2 += z2 + z3;
350
2.44M
                    tmp3 += z1 + z4;
351
2.44M
                } else {
352
                    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
353
251k
                    z2 = d5 + d3;
354
251k
                    z3 = d7 + d3;
355
251k
                    z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
356
357
251k
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
358
251k
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
359
251k
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
360
251k
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
361
251k
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
362
251k
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
363
251k
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
364
365
251k
                    z3 += z5;
366
251k
                    z4 += z5;
367
368
251k
                    tmp0 += z1 + z3;
369
251k
                    tmp1 += z2 + z4;
370
251k
                    tmp2 += z2 + z3;
371
251k
                    tmp3 = z1 + z4;
372
251k
                }
373
2.69M
            } else {
374
360k
                if (d1) {
375
                    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
376
149k
                    z1 = d7 + d1;
377
149k
                    z4 = d5 + d1;
378
149k
                    z5 = MULTIPLY(d7 + z4, FIX_1_175875602);
379
380
149k
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
381
149k
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
382
149k
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
383
149k
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
384
149k
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
385
149k
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
386
149k
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
387
388
149k
                    z3 += z5;
389
149k
                    z4 += z5;
390
391
149k
                    tmp0 += z1 + z3;
392
149k
                    tmp1 += z2 + z4;
393
149k
                    tmp2 = z2 + z3;
394
149k
                    tmp3 += z1 + z4;
395
211k
                } else {
396
                    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
397
211k
                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
398
211k
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
399
211k
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
400
211k
                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
401
211k
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
402
211k
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
403
211k
                    z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
404
405
211k
                    z3 += z5;
406
211k
                    z4 += z5;
407
408
211k
                    tmp0 += z3;
409
211k
                    tmp1 += z4;
410
211k
                    tmp2 = z2 + z3;
411
211k
                    tmp3 = z1 + z4;
412
211k
                }
413
360k
            }
414
3.05M
        } else {
415
942k
            if (d3) {
416
267k
                if (d1) {
417
                    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
418
157k
                    z1 = d7 + d1;
419
157k
                    z3 = d7 + d3;
420
157k
                    z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
421
422
157k
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
423
157k
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
424
157k
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
425
157k
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
426
157k
                    z2 = MULTIPLY(-d3, FIX_2_562915447);
427
157k
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
428
157k
                    z4 = MULTIPLY(-d1, FIX_0_390180644);
429
430
157k
                    z3 += z5;
431
157k
                    z4 += z5;
432
433
157k
                    tmp0 += z1 + z3;
434
157k
                    tmp1 = z2 + z4;
435
157k
                    tmp2 += z2 + z3;
436
157k
                    tmp3 += z1 + z4;
437
157k
                } else {
438
                    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
439
110k
                    z3 = d7 + d3;
440
441
110k
                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
442
110k
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
443
110k
                    tmp2 = MULTIPLY(d3, FIX_0_509795579);
444
110k
                    z2 = MULTIPLY(-d3, FIX_2_562915447);
445
110k
                    z5 = MULTIPLY(z3, FIX_1_175875602);
446
110k
                    z3 = MULTIPLY(-z3, FIX_0_785694958);
447
448
110k
                    tmp0 += z3;
449
110k
                    tmp1 = z2 + z5;
450
110k
                    tmp2 += z3;
451
110k
                    tmp3 = z1 + z5;
452
110k
                }
453
674k
            } else {
454
674k
                if (d1) {
455
                    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
456
215k
                    z1 = d7 + d1;
457
215k
                    z5 = MULTIPLY(z1, FIX_1_175875602);
458
459
215k
                    z1 = MULTIPLY(z1, FIX_0_275899380);
460
215k
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
461
215k
                    tmp0 = MULTIPLY(-d7, FIX_1_662939225);
462
215k
                    z4 = MULTIPLY(-d1, FIX_0_390180644);
463
215k
                    tmp3 = MULTIPLY(d1, FIX_1_111140466);
464
465
215k
                    tmp0 += z1;
466
215k
                    tmp1 = z4 + z5;
467
215k
                    tmp2 = z3 + z5;
468
215k
                    tmp3 += z1;
469
459k
                } else {
470
                    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
471
459k
                    tmp0 = MULTIPLY(-d7, FIX_1_387039845);
472
459k
                    tmp1 = MULTIPLY(d7, FIX_1_175875602);
473
459k
                    tmp2 = MULTIPLY(-d7, FIX_0_785694958);
474
459k
                    tmp3 = MULTIPLY(d7, FIX_0_275899380);
475
459k
                }
476
674k
            }
477
942k
        }
478
14.1M
    } else {
479
14.1M
        if (d5) {
480
2.90M
            if (d3) {
481
2.03M
                if (d1) {
482
                    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
483
1.67M
                    z2 = d5 + d3;
484
1.67M
                    z4 = d5 + d1;
485
1.67M
                    z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
486
487
1.67M
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
488
1.67M
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
489
1.67M
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
490
1.67M
                    z1 = MULTIPLY(-d1, FIX_0_899976223);
491
1.67M
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
492
1.67M
                    z3 = MULTIPLY(-d3, FIX_1_961570560);
493
1.67M
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
494
495
1.67M
                    z3 += z5;
496
1.67M
                    z4 += z5;
497
498
1.67M
                    tmp0 = z1 + z3;
499
1.67M
                    tmp1 += z2 + z4;
500
1.67M
                    tmp2 += z2 + z3;
501
1.67M
                    tmp3 += z1 + z4;
502
1.67M
                } else {
503
                    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
504
362k
                    z2 = d5 + d3;
505
506
362k
                    z5 = MULTIPLY(z2, FIX_1_175875602);
507
362k
                    tmp1 = MULTIPLY(d5, FIX_1_662939225);
508
362k
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
509
362k
                    z2 = MULTIPLY(-z2, FIX_1_387039845);
510
362k
                    tmp2 = MULTIPLY(d3, FIX_1_111140466);
511
362k
                    z3 = MULTIPLY(-d3, FIX_1_961570560);
512
513
362k
                    tmp0 = z3 + z5;
514
362k
                    tmp1 += z2;
515
362k
                    tmp2 += z2;
516
362k
                    tmp3 = z4 + z5;
517
362k
                }
518
2.03M
            } else {
519
874k
                if (d1) {
520
                    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
521
385k
                    z4 = d5 + d1;
522
523
385k
                    z5 = MULTIPLY(z4, FIX_1_175875602);
524
385k
                    z1 = MULTIPLY(-d1, FIX_0_899976223);
525
385k
                    tmp3 = MULTIPLY(d1, FIX_0_601344887);
526
385k
                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
527
385k
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
528
385k
                    z4 = MULTIPLY(z4, FIX_0_785694958);
529
530
385k
                    tmp0 = z1 + z5;
531
385k
                    tmp1 += z4;
532
385k
                    tmp2 = z2 + z5;
533
385k
                    tmp3 += z4;
534
488k
                } else {
535
                    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
536
488k
                    tmp0 = MULTIPLY(d5, FIX_1_175875602);
537
488k
                    tmp1 = MULTIPLY(d5, FIX_0_275899380);
538
488k
                    tmp2 = MULTIPLY(-d5, FIX_1_387039845);
539
488k
                    tmp3 = MULTIPLY(d5, FIX_0_785694958);
540
488k
                }
541
874k
            }
542
11.2M
        } else {
543
11.2M
            if (d3) {
544
4.12M
                if (d1) {
545
                    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
546
2.63M
                    z5 = d1 + d3;
547
2.63M
                    tmp3 = MULTIPLY(d1, FIX_0_211164243);
548
2.63M
                    tmp2 = MULTIPLY(-d3, FIX_1_451774981);
549
2.63M
                    z1 = MULTIPLY(d1, FIX_1_061594337);
550
2.63M
                    z2 = MULTIPLY(-d3, FIX_2_172734803);
551
2.63M
                    z4 = MULTIPLY(z5, FIX_0_785694958);
552
2.63M
                    z5 = MULTIPLY(z5, FIX_1_175875602);
553
554
2.63M
                    tmp0 = z1 - z4;
555
2.63M
                    tmp1 = z2 + z4;
556
2.63M
                    tmp2 += z5;
557
2.63M
                    tmp3 += z5;
558
2.63M
                } else {
559
                    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
560
1.49M
                    tmp0 = MULTIPLY(-d3, FIX_0_785694958);
561
1.49M
                    tmp1 = MULTIPLY(-d3, FIX_1_387039845);
562
1.49M
                    tmp2 = MULTIPLY(-d3, FIX_0_275899380);
563
1.49M
                    tmp3 = MULTIPLY(d3, FIX_1_175875602);
564
1.49M
                }
565
7.14M
            } else {
566
7.14M
                if (d1) {
567
                    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
568
5.46M
                    tmp0 = MULTIPLY(d1, FIX_0_275899380);
569
5.46M
                    tmp1 = MULTIPLY(d1, FIX_0_785694958);
570
5.46M
                    tmp2 = MULTIPLY(d1, FIX_1_175875602);
571
5.46M
                    tmp3 = MULTIPLY(d1, FIX_1_387039845);
572
5.46M
                } else {
573
                    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
574
1.68M
                    tmp0 = tmp1 = tmp2 = tmp3 = 0;
575
1.68M
                }
576
7.14M
            }
577
11.2M
        }
578
14.1M
    }
579
18.1M
}
580
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
581
582
18.1M
    dataptr[0] = (int16_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
583
18.1M
    dataptr[7] = (int16_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
584
18.1M
    dataptr[1] = (int16_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
585
18.1M
    dataptr[6] = (int16_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
586
18.1M
    dataptr[2] = (int16_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
587
18.1M
    dataptr[5] = (int16_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
588
18.1M
    dataptr[3] = (int16_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
589
18.1M
    dataptr[4] = (int16_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
590
591
18.1M
    dataptr += DCTSIZE;         /* advance pointer to next row */
592
18.1M
  }
593
594
  /* Pass 2: process columns. */
595
  /* Note that we must descale the results by a factor of 8 == 2**3, */
596
  /* and also undo the PASS1_BITS scaling. */
597
598
19.6M
  dataptr = data;
599
177M
  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
600
    /* Columns of zeroes can be exploited in the same way as we did with rows.
601
     * However, the row calculation has created many nonzero AC terms, so the
602
     * simplification applies less often (typically 5% to 10% of the time).
603
     * On machines with very fast multiplication, it's possible that the
604
     * test takes more time than it's worth.  In that case this section
605
     * may be commented out.
606
     */
607
608
157M
    d0 = dataptr[DCTSIZE*0];
609
157M
    d1 = dataptr[DCTSIZE*1];
610
157M
    d2 = dataptr[DCTSIZE*2];
611
157M
    d3 = dataptr[DCTSIZE*3];
612
157M
    d4 = dataptr[DCTSIZE*4];
613
157M
    d5 = dataptr[DCTSIZE*5];
614
157M
    d6 = dataptr[DCTSIZE*6];
615
157M
    d7 = dataptr[DCTSIZE*7];
616
617
    /* Even part: reverse the even part of the forward DCT. */
618
    /* The rotator is sqrt(2)*c(-6). */
619
157M
    if (d6) {
620
12.9M
            if (d2) {
621
                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
622
12.1M
                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
623
12.1M
                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
624
12.1M
                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
625
626
12.1M
                    tmp0 = (d0 + d4) * CONST_SCALE;
627
12.1M
                    tmp1 = (d0 - d4) * CONST_SCALE;
628
629
12.1M
                    tmp10 = tmp0 + tmp3;
630
12.1M
                    tmp13 = tmp0 - tmp3;
631
12.1M
                    tmp11 = tmp1 + tmp2;
632
12.1M
                    tmp12 = tmp1 - tmp2;
633
12.1M
            } else {
634
                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
635
805k
                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
636
805k
                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
637
638
805k
                    tmp0 = (d0 + d4) * CONST_SCALE;
639
805k
                    tmp1 = (d0 - d4) * CONST_SCALE;
640
641
805k
                    tmp10 = tmp0 + tmp3;
642
805k
                    tmp13 = tmp0 - tmp3;
643
805k
                    tmp11 = tmp1 + tmp2;
644
805k
                    tmp12 = tmp1 - tmp2;
645
805k
            }
646
144M
    } else {
647
144M
            if (d2) {
648
                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
649
14.8M
                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
650
14.8M
                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
651
652
14.8M
                    tmp0 = (d0 + d4) * CONST_SCALE;
653
14.8M
                    tmp1 = (d0 - d4) * CONST_SCALE;
654
655
14.8M
                    tmp10 = tmp0 + tmp3;
656
14.8M
                    tmp13 = tmp0 - tmp3;
657
14.8M
                    tmp11 = tmp1 + tmp2;
658
14.8M
                    tmp12 = tmp1 - tmp2;
659
129M
            } else {
660
                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
661
129M
                    tmp10 = tmp13 = (d0 + d4) * CONST_SCALE;
662
129M
                    tmp11 = tmp12 = (d0 - d4) * CONST_SCALE;
663
129M
            }
664
144M
    }
665
666
    /* Odd part per figure 8; the matrix is unitary and hence its
667
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
668
     */
669
157M
    if (d7) {
670
9.45M
        if (d5) {
671
7.62M
            if (d3) {
672
7.25M
                if (d1) {
673
                    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
674
7.04M
                    z1 = d7 + d1;
675
7.04M
                    z2 = d5 + d3;
676
7.04M
                    z3 = d7 + d3;
677
7.04M
                    z4 = d5 + d1;
678
7.04M
                    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
679
680
7.04M
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
681
7.04M
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
682
7.04M
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
683
7.04M
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
684
7.04M
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
685
7.04M
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
686
7.04M
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
687
7.04M
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
688
689
7.04M
                    z3 += z5;
690
7.04M
                    z4 += z5;
691
692
7.04M
                    tmp0 += z1 + z3;
693
7.04M
                    tmp1 += z2 + z4;
694
7.04M
                    tmp2 += z2 + z3;
695
7.04M
                    tmp3 += z1 + z4;
696
7.04M
                } else {
697
                    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
698
211k
                    z2 = d5 + d3;
699
211k
                    z3 = d7 + d3;
700
211k
                    z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
701
702
211k
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
703
211k
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
704
211k
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
705
211k
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
706
211k
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
707
211k
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
708
211k
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
709
710
211k
                    z3 += z5;
711
211k
                    z4 += z5;
712
713
211k
                    tmp0 += z1 + z3;
714
211k
                    tmp1 += z2 + z4;
715
211k
                    tmp2 += z2 + z3;
716
211k
                    tmp3 = z1 + z4;
717
211k
                }
718
7.25M
            } else {
719
375k
                if (d1) {
720
                    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
721
266k
                    z1 = d7 + d1;
722
266k
                    z3 = d7;
723
266k
                    z4 = d5 + d1;
724
266k
                    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
725
726
266k
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
727
266k
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
728
266k
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
729
266k
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
730
266k
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
731
266k
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
732
266k
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
733
734
266k
                    z3 += z5;
735
266k
                    z4 += z5;
736
737
266k
                    tmp0 += z1 + z3;
738
266k
                    tmp1 += z2 + z4;
739
266k
                    tmp2 = z2 + z3;
740
266k
                    tmp3 += z1 + z4;
741
266k
                } else {
742
                    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
743
109k
                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
744
109k
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
745
109k
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
746
109k
                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
747
109k
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
748
109k
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
749
109k
                    z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
750
751
109k
                    z3 += z5;
752
109k
                    z4 += z5;
753
754
109k
                    tmp0 += z3;
755
109k
                    tmp1 += z4;
756
109k
                    tmp2 = z2 + z3;
757
109k
                    tmp3 = z1 + z4;
758
109k
                }
759
375k
            }
760
7.62M
        } else {
761
1.82M
            if (d3) {
762
780k
                if (d1) {
763
                    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
764
500k
                    z1 = d7 + d1;
765
500k
                    z3 = d7 + d3;
766
500k
                    z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
767
768
500k
                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
769
500k
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
770
500k
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
771
500k
                    z1 = MULTIPLY(-z1, FIX_0_899976223);
772
500k
                    z2 = MULTIPLY(-d3, FIX_2_562915447);
773
500k
                    z3 = MULTIPLY(-z3, FIX_1_961570560);
774
500k
                    z4 = MULTIPLY(-d1, FIX_0_390180644);
775
776
500k
                    z3 += z5;
777
500k
                    z4 += z5;
778
779
500k
                    tmp0 += z1 + z3;
780
500k
                    tmp1 = z2 + z4;
781
500k
                    tmp2 += z2 + z3;
782
500k
                    tmp3 += z1 + z4;
783
500k
                } else {
784
                    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
785
279k
                    z3 = d7 + d3;
786
787
279k
                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
788
279k
                    z1 = MULTIPLY(-d7, FIX_0_899976223);
789
279k
                    tmp2 = MULTIPLY(d3, FIX_0_509795579);
790
279k
                    z2 = MULTIPLY(-d3, FIX_2_562915447);
791
279k
                    z5 = MULTIPLY(z3, FIX_1_175875602);
792
279k
                    z3 = MULTIPLY(-z3, FIX_0_785694958);
793
794
279k
                    tmp0 += z3;
795
279k
                    tmp1 = z2 + z5;
796
279k
                    tmp2 += z3;
797
279k
                    tmp3 = z1 + z5;
798
279k
                }
799
1.04M
            } else {
800
1.04M
                if (d1) {
801
                    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
802
623k
                    z1 = d7 + d1;
803
623k
                    z5 = MULTIPLY(z1, FIX_1_175875602);
804
805
623k
                    z1 = MULTIPLY(z1, FIX_0_275899380);
806
623k
                    z3 = MULTIPLY(-d7, FIX_1_961570560);
807
623k
                    tmp0 = MULTIPLY(-d7, FIX_1_662939225);
808
623k
                    z4 = MULTIPLY(-d1, FIX_0_390180644);
809
623k
                    tmp3 = MULTIPLY(d1, FIX_1_111140466);
810
811
623k
                    tmp0 += z1;
812
623k
                    tmp1 = z4 + z5;
813
623k
                    tmp2 = z3 + z5;
814
623k
                    tmp3 += z1;
815
623k
                } else {
816
                    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
817
423k
                    tmp0 = MULTIPLY(-d7, FIX_1_387039845);
818
423k
                    tmp1 = MULTIPLY(d7, FIX_1_175875602);
819
423k
                    tmp2 = MULTIPLY(-d7, FIX_0_785694958);
820
423k
                    tmp3 = MULTIPLY(d7, FIX_0_275899380);
821
423k
                }
822
1.04M
            }
823
1.82M
        }
824
148M
    } else {
825
148M
        if (d5) {
826
7.28M
            if (d3) {
827
6.45M
                if (d1) {
828
                    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
829
5.91M
                    z2 = d5 + d3;
830
5.91M
                    z4 = d5 + d1;
831
5.91M
                    z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
832
833
5.91M
                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
834
5.91M
                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
835
5.91M
                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
836
5.91M
                    z1 = MULTIPLY(-d1, FIX_0_899976223);
837
5.91M
                    z2 = MULTIPLY(-z2, FIX_2_562915447);
838
5.91M
                    z3 = MULTIPLY(-d3, FIX_1_961570560);
839
5.91M
                    z4 = MULTIPLY(-z4, FIX_0_390180644);
840
841
5.91M
                    z3 += z5;
842
5.91M
                    z4 += z5;
843
844
5.91M
                    tmp0 = z1 + z3;
845
5.91M
                    tmp1 += z2 + z4;
846
5.91M
                    tmp2 += z2 + z3;
847
5.91M
                    tmp3 += z1 + z4;
848
5.91M
                } else {
849
                    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
850
534k
                    z2 = d5 + d3;
851
852
534k
                    z5 = MULTIPLY(z2, FIX_1_175875602);
853
534k
                    tmp1 = MULTIPLY(d5, FIX_1_662939225);
854
534k
                    z4 = MULTIPLY(-d5, FIX_0_390180644);
855
534k
                    z2 = MULTIPLY(-z2, FIX_1_387039845);
856
534k
                    tmp2 = MULTIPLY(d3, FIX_1_111140466);
857
534k
                    z3 = MULTIPLY(-d3, FIX_1_961570560);
858
859
534k
                    tmp0 = z3 + z5;
860
534k
                    tmp1 += z2;
861
534k
                    tmp2 += z2;
862
534k
                    tmp3 = z4 + z5;
863
534k
                }
864
6.45M
            } else {
865
836k
                if (d1) {
866
                    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
867
580k
                    z4 = d5 + d1;
868
869
580k
                    z5 = MULTIPLY(z4, FIX_1_175875602);
870
580k
                    z1 = MULTIPLY(-d1, FIX_0_899976223);
871
580k
                    tmp3 = MULTIPLY(d1, FIX_0_601344887);
872
580k
                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
873
580k
                    z2 = MULTIPLY(-d5, FIX_2_562915447);
874
580k
                    z4 = MULTIPLY(z4, FIX_0_785694958);
875
876
580k
                    tmp0 = z1 + z5;
877
580k
                    tmp1 += z4;
878
580k
                    tmp2 = z2 + z5;
879
580k
                    tmp3 += z4;
880
580k
                } else {
881
                    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
882
255k
                    tmp0 = MULTIPLY(d5, FIX_1_175875602);
883
255k
                    tmp1 = MULTIPLY(d5, FIX_0_275899380);
884
255k
                    tmp2 = MULTIPLY(-d5, FIX_1_387039845);
885
255k
                    tmp3 = MULTIPLY(d5, FIX_0_785694958);
886
255k
                }
887
836k
            }
888
140M
        } else {
889
140M
            if (d3) {
890
8.06M
                if (d1) {
891
                    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
892
6.46M
                    z5 = d1 + d3;
893
6.46M
                    tmp3 = MULTIPLY(d1, FIX_0_211164243);
894
6.46M
                    tmp2 = MULTIPLY(-d3, FIX_1_451774981);
895
6.46M
                    z1 = MULTIPLY(d1, FIX_1_061594337);
896
6.46M
                    z2 = MULTIPLY(-d3, FIX_2_172734803);
897
6.46M
                    z4 = MULTIPLY(z5, FIX_0_785694958);
898
6.46M
                    z5 = MULTIPLY(z5, FIX_1_175875602);
899
900
6.46M
                    tmp0 = z1 - z4;
901
6.46M
                    tmp1 = z2 + z4;
902
6.46M
                    tmp2 += z5;
903
6.46M
                    tmp3 += z5;
904
6.46M
                } else {
905
                    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
906
1.59M
                    tmp0 = MULTIPLY(-d3, FIX_0_785694958);
907
1.59M
                    tmp1 = MULTIPLY(-d3, FIX_1_387039845);
908
1.59M
                    tmp2 = MULTIPLY(-d3, FIX_0_275899380);
909
1.59M
                    tmp3 = MULTIPLY(d3, FIX_1_175875602);
910
1.59M
                }
911
132M
            } else {
912
132M
                if (d1) {
913
                    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
914
20.1M
                    tmp0 = MULTIPLY(d1, FIX_0_275899380);
915
20.1M
                    tmp1 = MULTIPLY(d1, FIX_0_785694958);
916
20.1M
                    tmp2 = MULTIPLY(d1, FIX_1_175875602);
917
20.1M
                    tmp3 = MULTIPLY(d1, FIX_1_387039845);
918
112M
                } else {
919
                    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
920
112M
                    tmp0 = tmp1 = tmp2 = tmp3 = 0;
921
112M
                }
922
132M
            }
923
140M
        }
924
148M
    }
925
926
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
927
928
157M
    dataptr[DCTSIZE*0] = (int16_t) DESCALE(tmp10 + tmp3,
929
157M
                                           CONST_BITS+PASS1_BITS+3);
930
157M
    dataptr[DCTSIZE*7] = (int16_t) DESCALE(tmp10 - tmp3,
931
157M
                                           CONST_BITS+PASS1_BITS+3);
932
157M
    dataptr[DCTSIZE*1] = (int16_t) DESCALE(tmp11 + tmp2,
933
157M
                                           CONST_BITS+PASS1_BITS+3);
934
157M
    dataptr[DCTSIZE*6] = (int16_t) DESCALE(tmp11 - tmp2,
935
157M
                                           CONST_BITS+PASS1_BITS+3);
936
157M
    dataptr[DCTSIZE*2] = (int16_t) DESCALE(tmp12 + tmp1,
937
157M
                                           CONST_BITS+PASS1_BITS+3);
938
157M
    dataptr[DCTSIZE*5] = (int16_t) DESCALE(tmp12 - tmp1,
939
157M
                                           CONST_BITS+PASS1_BITS+3);
940
157M
    dataptr[DCTSIZE*3] = (int16_t) DESCALE(tmp13 + tmp0,
941
157M
                                           CONST_BITS+PASS1_BITS+3);
942
157M
    dataptr[DCTSIZE*4] = (int16_t) DESCALE(tmp13 - tmp0,
943
157M
                                           CONST_BITS+PASS1_BITS+3);
944
945
157M
    dataptr++;                  /* advance pointer to next column */
946
157M
  }
947
19.6M
}
948
949
#undef DCTSIZE
950
31.2M
#define DCTSIZE 4
951
624M
#define DCTSTRIDE 8
952
953
void ff_j_rev_dct4(DCTBLOCK data)
954
15.6M
{
955
15.6M
  int32_t tmp0, tmp1, tmp2, tmp3;
956
15.6M
  int32_t tmp10, tmp11, tmp12, tmp13;
957
15.6M
  int32_t z1;
958
15.6M
  int32_t d0, d2, d4, d6;
959
15.6M
  register int16_t *dataptr;
960
15.6M
  int rowctr;
961
962
  /* Pass 1: process rows. */
963
  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
964
  /* furthermore, we scale the results by 2**PASS1_BITS. */
965
966
15.6M
  data[0] += 4;
967
968
15.6M
  dataptr = data;
969
970
78.1M
  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
971
    /* Due to quantization, we will usually find that many of the input
972
     * coefficients are zero, especially the AC terms.  We can exploit this
973
     * by short-circuiting the IDCT calculation for any row in which all
974
     * the AC terms are zero.  In that case each output is equal to the
975
     * DC coefficient (with scale factor as needed).
976
     * With typical images and quantization tables, half or more of the
977
     * row DCT calculations can be simplified this way.
978
     */
979
980
62.4M
    register uint8_t *idataptr = (uint8_t*)dataptr;
981
982
62.4M
    d0 = dataptr[0];
983
62.4M
    d2 = dataptr[1];
984
62.4M
    d4 = dataptr[2];
985
62.4M
    d6 = dataptr[3];
986
987
62.4M
    if ((d2 | d4 | d6) == 0) {
988
      /* AC terms all zero */
989
48.4M
      if (d0) {
990
          /* Compute a 32 bit value to assign. */
991
14.2M
          int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS));
992
14.2M
          register unsigned v = (dcval & 0xffff) | ((uint32_t)dcval << 16);
993
994
14.2M
          AV_WN32A(&idataptr[0], v);
995
14.2M
          AV_WN32A(&idataptr[4], v);
996
14.2M
      }
997
998
48.4M
      dataptr += DCTSTRIDE;     /* advance pointer to next row */
999
48.4M
      continue;
1000
48.4M
    }
1001
1002
    /* Even part: reverse the even part of the forward DCT. */
1003
    /* The rotator is sqrt(2)*c(-6). */
1004
14.0M
    if (d6) {
1005
6.62M
            if (d2) {
1006
                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1007
5.21M
                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1008
5.21M
                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1009
5.21M
                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1010
1011
5.21M
                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
1012
5.21M
                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
1013
1014
5.21M
                    tmp10 = tmp0 + tmp3;
1015
5.21M
                    tmp13 = tmp0 - tmp3;
1016
5.21M
                    tmp11 = tmp1 + tmp2;
1017
5.21M
                    tmp12 = tmp1 - tmp2;
1018
5.21M
            } else {
1019
                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1020
1.41M
                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1021
1.41M
                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
1022
1023
1.41M
                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
1024
1.41M
                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
1025
1026
1.41M
                    tmp10 = tmp0 + tmp3;
1027
1.41M
                    tmp13 = tmp0 - tmp3;
1028
1.41M
                    tmp11 = tmp1 + tmp2;
1029
1.41M
                    tmp12 = tmp1 - tmp2;
1030
1.41M
            }
1031
7.45M
    } else {
1032
7.45M
            if (d2) {
1033
                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1034
6.26M
                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
1035
6.26M
                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
1036
1037
6.26M
                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
1038
6.26M
                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
1039
1040
6.26M
                    tmp10 = tmp0 + tmp3;
1041
6.26M
                    tmp13 = tmp0 - tmp3;
1042
6.26M
                    tmp11 = tmp1 + tmp2;
1043
6.26M
                    tmp12 = tmp1 - tmp2;
1044
6.26M
            } else {
1045
                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1046
1.18M
                    tmp10 = tmp13 = (d0 + d4) * (1 << CONST_BITS);
1047
1.18M
                    tmp11 = tmp12 = (d0 - d4) * (1 << CONST_BITS);
1048
1.18M
            }
1049
7.45M
      }
1050
1051
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1052
1053
14.0M
    dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
1054
14.0M
    dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
1055
14.0M
    dataptr[2] = (int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
1056
14.0M
    dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
1057
1058
14.0M
    dataptr += DCTSTRIDE;       /* advance pointer to next row */
1059
14.0M
  }
1060
1061
  /* Pass 2: process columns. */
1062
  /* Note that we must descale the results by a factor of 8 == 2**3, */
1063
  /* and also undo the PASS1_BITS scaling. */
1064
1065
15.6M
  dataptr = data;
1066
78.1M
  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
1067
    /* Columns of zeroes can be exploited in the same way as we did with rows.
1068
     * However, the row calculation has created many nonzero AC terms, so the
1069
     * simplification applies less often (typically 5% to 10% of the time).
1070
     * On machines with very fast multiplication, it's possible that the
1071
     * test takes more time than it's worth.  In that case this section
1072
     * may be commented out.
1073
     */
1074
1075
62.4M
    d0 = dataptr[DCTSTRIDE*0];
1076
62.4M
    d2 = dataptr[DCTSTRIDE*1];
1077
62.4M
    d4 = dataptr[DCTSTRIDE*2];
1078
62.4M
    d6 = dataptr[DCTSTRIDE*3];
1079
1080
    /* Even part: reverse the even part of the forward DCT. */
1081
    /* The rotator is sqrt(2)*c(-6). */
1082
62.4M
    if (d6) {
1083
10.5M
            if (d2) {
1084
                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1085
9.04M
                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1086
9.04M
                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1087
9.04M
                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1088
1089
9.04M
                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
1090
9.04M
                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
1091
1092
9.04M
                    tmp10 = tmp0 + tmp3;
1093
9.04M
                    tmp13 = tmp0 - tmp3;
1094
9.04M
                    tmp11 = tmp1 + tmp2;
1095
9.04M
                    tmp12 = tmp1 - tmp2;
1096
9.04M
            } else {
1097
                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1098
1.49M
                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1099
1.49M
                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
1100
1101
1.49M
                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
1102
1.49M
                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
1103
1104
1.49M
                    tmp10 = tmp0 + tmp3;
1105
1.49M
                    tmp13 = tmp0 - tmp3;
1106
1.49M
                    tmp11 = tmp1 + tmp2;
1107
1.49M
                    tmp12 = tmp1 - tmp2;
1108
1.49M
            }
1109
51.9M
    } else {
1110
51.9M
            if (d2) {
1111
                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1112
15.0M
                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
1113
15.0M
                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
1114
1115
15.0M
                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
1116
15.0M
                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
1117
1118
15.0M
                    tmp10 = tmp0 + tmp3;
1119
15.0M
                    tmp13 = tmp0 - tmp3;
1120
15.0M
                    tmp11 = tmp1 + tmp2;
1121
15.0M
                    tmp12 = tmp1 - tmp2;
1122
36.8M
            } else {
1123
                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1124
36.8M
                    tmp10 = tmp13 = (d0 + d4) * (1 << CONST_BITS);
1125
36.8M
                    tmp11 = tmp12 = (d0 - d4) * (1 << CONST_BITS);
1126
36.8M
            }
1127
51.9M
    }
1128
1129
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1130
1131
62.4M
    dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3);
1132
62.4M
    dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3);
1133
62.4M
    dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3);
1134
62.4M
    dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3);
1135
1136
62.4M
    dataptr++;                  /* advance pointer to next column */
1137
62.4M
  }
1138
15.6M
}
1139
1140
5.20M
void ff_j_rev_dct2(DCTBLOCK data){
1141
5.20M
  int d00, d01, d10, d11;
1142
1143
5.20M
  data[0] += 4;
1144
5.20M
  d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE];
1145
5.20M
  d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE];
1146
5.20M
  d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE];
1147
5.20M
  d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE];
1148
1149
5.20M
  data[0+0*DCTSTRIDE]= (d00 + d10)>>3;
1150
5.20M
  data[1+0*DCTSTRIDE]= (d01 + d11)>>3;
1151
5.20M
  data[0+1*DCTSTRIDE]= (d00 - d10)>>3;
1152
5.20M
  data[1+1*DCTSTRIDE]= (d01 - d11)>>3;
1153
5.20M
}
1154
1155
2.60k
void ff_j_rev_dct1(DCTBLOCK data){
1156
2.60k
  data[0] = (data[0] + 4)>>3;
1157
2.60k
}
1158
1159
#undef FIX
1160
#undef CONST_BITS
1161
1162
void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t block[64])
1163
18.1M
{
1164
18.1M
    ff_j_rev_dct(block);
1165
18.1M
    ff_put_pixels_clamped_c(block, dest, line_size);
1166
18.1M
}
1167
1168
void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t block[64])
1169
1.51M
{
1170
1.51M
    ff_j_rev_dct(block);
1171
1.51M
    ff_add_pixels_clamped_c(block, dest, line_size);
1172
1.51M
}