Coverage Report

Created: 2022-10-31 07:00

/src/ghostpdl/obj/jidctint.c
Line
Count
Source
1
/*
2
 * jidctint.c
3
 *
4
 * Copyright (C) 1991-1998, Thomas G. Lane.
5
 * Modification developed 2002-2018 by Guido Vollbeding.
6
 * This file is part of the Independent JPEG Group's software.
7
 * For conditions of distribution and use, see the accompanying README file.
8
 *
9
 * This file contains a slow-but-accurate integer implementation of the
10
 * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
11
 * must also perform dequantization of the input coefficients.
12
 *
13
 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
14
 * on each row (or vice versa, but it's more convenient to emit a row at
15
 * a time).  Direct algorithms are also available, but they are much more
16
 * complex and seem not to be any faster when reduced to code.
17
 *
18
 * This implementation is based on an algorithm described in
19
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
20
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
21
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
22
 * The primary algorithm described there uses 11 multiplies and 29 adds.
23
 * We use their alternate method with 12 multiplies and 32 adds.
24
 * The advantage of this method is that no data path contains more than one
25
 * multiplication; this allows a very simple and accurate implementation in
26
 * scaled fixed-point arithmetic, with a minimal number of shifts.
27
 *
28
 * We also provide IDCT routines with various output sample block sizes for
29
 * direct resolution reduction or enlargement and for direct resolving the
30
 * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
31
 * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block.
32
 *
33
 * For N<8 we simply take the corresponding low-frequency coefficients of
34
 * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
35
 * to yield the downscaled outputs.
36
 * This can be seen as direct low-pass downsampling from the DCT domain
37
 * point of view rather than the usual spatial domain point of view,
38
 * yielding significant computational savings and results at least
39
 * as good as common bilinear (averaging) spatial downsampling.
40
 *
41
 * For N>8 we apply a partial NxN IDCT on the 8 input coefficients as
42
 * lower frequencies and higher frequencies assumed to be zero.
43
 * It turns out that the computational effort is similar to the 8x8 IDCT
44
 * regarding the output size.
45
 * Furthermore, the scaling and descaling is the same for all IDCT sizes.
46
 *
47
 * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
48
 * since there would be too many additional constants to pre-calculate.
49
 */
50
51
#define JPEG_INTERNALS
52
#include "jinclude.h"
53
#include "jpeglib.h"
54
#include "jdct.h"   /* Private declarations for DCT subsystem */
55
56
#ifdef DCT_ISLOW_SUPPORTED
57
58
59
/*
60
 * This module is specialized to the case DCTSIZE = 8.
61
 */
62
63
#if DCTSIZE != 8
64
  Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
65
#endif
66
67
68
/*
69
 * The poop on this scaling stuff is as follows:
70
 *
71
 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
72
 * larger than the true IDCT outputs.  The final outputs are therefore
73
 * a factor of N larger than desired; since N=8 this can be cured by
74
 * a simple right shift at the end of the algorithm.  The advantage of
75
 * this arrangement is that we save two multiplications per 1-D IDCT,
76
 * because the y0 and y4 inputs need not be divided by sqrt(N).
77
 *
78
 * We have to do addition and subtraction of the integer inputs, which
79
 * is no problem, and multiplication by fractional constants, which is
80
 * a problem to do in integer arithmetic.  We multiply all the constants
81
 * by CONST_SCALE and convert them to integer constants (thus retaining
82
 * CONST_BITS bits of precision in the constants).  After doing a
83
 * multiplication we have to divide the product by CONST_SCALE, with proper
84
 * rounding, to produce the correct output.  This division can be done
85
 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
86
 * as long as possible so that partial sums can be added together with
87
 * full fractional precision.
88
 *
89
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
90
 * they are represented to better-than-integral precision.  These outputs
91
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
92
 * with the recommended scaling.  (To scale up 12-bit sample data further, an
93
 * intermediate INT32 array would be needed.)
94
 *
95
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
96
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
97
 * shows that the values given below are the most effective.
98
 */
99
100
/* Fixed-point scaling parameters, selected by sample depth.
 * CONST_BITS is 13 in both branches; PASS1_BITS is 2 when
 * BITS_IN_JSAMPLE is 8 and 1 for every other sample depth.
 */
#if BITS_IN_JSAMPLE == 8
101
342M
#define CONST_BITS  13
102
639M
#define PASS1_BITS  2
103
#else
104
#define CONST_BITS  13
105
#define PASS1_BITS  1   /* lose a little precision to avoid overflow */
106
#endif
107
108
/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
109
 * causing a lot of useless floating-point operations at run time.
110
 * To get around this we use the following pre-calculated constants.
111
 * If you change CONST_BITS you may want to add appropriate values.
112
 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
113
 */
114
115
/* Multiplier constants used by the 8x8 kernel.
 * When CONST_BITS is 13 the literals below are used directly; each one
 * matches its constant scaled by 2**13 and rounded (for example
 * 0.541196100 * 8192 is about 4433).  For any other CONST_BITS the names
 * expand to the FIX() macro instead.
 */
#if CONST_BITS == 13
116
#define FIX_0_298631336  ((INT32)  2446)  /* FIX(0.298631336) */
117
#define FIX_0_390180644  ((INT32)  3196)  /* FIX(0.390180644) */
118
#define FIX_0_541196100  ((INT32)  4433)  /* FIX(0.541196100) */
119
#define FIX_0_765366865  ((INT32)  6270)  /* FIX(0.765366865) */
120
#define FIX_0_899976223  ((INT32)  7373)  /* FIX(0.899976223) */
121
#define FIX_1_175875602  ((INT32)  9633)  /* FIX(1.175875602) */
122
#define FIX_1_501321110  ((INT32)  12299) /* FIX(1.501321110) */
123
#define FIX_1_847759065  ((INT32)  15137) /* FIX(1.847759065) */
124
#define FIX_1_961570560  ((INT32)  16069) /* FIX(1.961570560) */
125
#define FIX_2_053119869  ((INT32)  16819) /* FIX(2.053119869) */
126
#define FIX_2_562915447  ((INT32)  20995) /* FIX(2.562915447) */
127
#define FIX_3_072711026  ((INT32)  25172) /* FIX(3.072711026) */
128
#else
129
#define FIX_0_298631336  FIX(0.298631336)
130
#define FIX_0_390180644  FIX(0.390180644)
131
#define FIX_0_541196100  FIX(0.541196100)
132
#define FIX_0_765366865  FIX(0.765366865)
133
#define FIX_0_899976223  FIX(0.899976223)
134
#define FIX_1_175875602  FIX(1.175875602)
135
#define FIX_1_501321110  FIX(1.501321110)
136
#define FIX_1_847759065  FIX(1.847759065)
137
#define FIX_1_961570560  FIX(1.961570560)
138
#define FIX_2_053119869  FIX(2.053119869)
139
#define FIX_2_562915447  FIX(2.562915447)
140
#define FIX_3_072711026  FIX(3.072711026)
141
#endif
142
143
144
/* Clamp a DC value to the range [-1024, 1023] (see bug 697186).
 *
 * dcval must be a modifiable lvalue; it is evaluated more than once, so
 * it must not have side effects.  The body is wrapped in do { } while (0)
 * so that "CLAMP_DC(x);" behaves as a single statement everywhere,
 * including as the unbraced body of an if that is followed by an else.
 */
#define CLAMP_DC(dcval)        \
  do {                         \
    if ((dcval) < -1024)       \
      (dcval) = -1024;         \
    else if ((dcval) > 1023)   \
      (dcval) = 1023;          \
  } while (0)
152
153
/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
154
 * For 8-bit samples with the recommended scaling, all the variable
155
 * and constant values involved are no more than 16 bits wide, so a
156
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
157
 * For 12-bit samples, a full 32-bit multiplication will be needed.
158
 */
159
160
/* MULTIPLY(var,const): product of a variable and a fixed-point constant.
 * Forwards to MULTIPLY16C16 when BITS_IN_JSAMPLE is 8; otherwise it is a
 * plain, fully parenthesized multiplication.
 */
#if BITS_IN_JSAMPLE == 8
161
1.71G
#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
162
#else
163
#define MULTIPLY(var,const)  ((var) * (const))
164
#endif
165
166
167
/* Dequantize a coefficient by multiplying it by the multiplier-table
168
 * entry; produce an int result.  In this module, both inputs and result
169
 * are 16 bits or less, so either int or short multiply will work.
170
 */
171
172
612M
/* DEQUANTIZE(coef,quantval): cast coef to ISLOW_MULT_TYPE, then multiply
 * it by quantval (the matching multiplier-table entry at the call sites).
 */
#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
173
174
175
/*
176
 * Perform dequantization and inverse DCT on one block of coefficients.
177
 *
178
 * Optimized algorithm with 12 multiplications in the 1-D kernel.
179
 * cK represents sqrt(2) * cos(K*pi/16).
180
 */
181
182
GLOBAL(void)
183
jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
184
     JCOEFPTR coef_block,
185
     JSAMPARRAY output_buf, JDIMENSION output_col)
186
26.6M
{
187
26.6M
  INT32 tmp0, tmp1, tmp2, tmp3;
188
26.6M
  INT32 tmp10, tmp11, tmp12, tmp13;
189
26.6M
  INT32 z1, z2, z3;
190
26.6M
  JCOEFPTR inptr;
191
26.6M
  ISLOW_MULT_TYPE * quantptr;
192
26.6M
  int * wsptr;
193
26.6M
  JSAMPROW outptr;
194
26.6M
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
195
26.6M
  int ctr;
196
26.6M
  int workspace[DCTSIZE2];  /* buffers data between passes */
197
  SHIFT_TEMPS
198
199
  /* Pass 1: process columns from input, store into work array.
200
   * Note results are scaled up by sqrt(8) compared to a true IDCT;
201
   * furthermore, we scale the results by 2**PASS1_BITS.
202
   */
203
204
26.6M
  inptr = coef_block;
205
26.6M
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
206
26.6M
  wsptr = workspace;
207
239M
  for (ctr = DCTSIZE; ctr > 0; ctr--) {
208
    /* Due to quantization, we will usually find that many of the input
209
     * coefficients are zero, especially the AC terms.  We can exploit this
210
     * by short-circuiting the IDCT calculation for any column in which all
211
     * the AC terms are zero.  In that case each output is equal to the
212
     * DC coefficient (with scale factor as needed).
213
     * With typical images and quantization tables, half or more of the
214
     * column DCT calculations can be simplified this way.
215
     */
216
217
213M
    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
218
213M
  inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
219
213M
  inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
220
213M
  inptr[DCTSIZE*7] == 0) {
221
      /* AC terms all zero */
222
156M
      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
223
156M
      if (ctr == DCTSIZE)
224
16.0M
        CLAMP_DC(dcval);
225
156M
      dcval <<= PASS1_BITS;
226
227
156M
      wsptr[DCTSIZE*0] = dcval;
228
156M
      wsptr[DCTSIZE*1] = dcval;
229
156M
      wsptr[DCTSIZE*2] = dcval;
230
156M
      wsptr[DCTSIZE*3] = dcval;
231
156M
      wsptr[DCTSIZE*4] = dcval;
232
156M
      wsptr[DCTSIZE*5] = dcval;
233
156M
      wsptr[DCTSIZE*6] = dcval;
234
156M
      wsptr[DCTSIZE*7] = dcval;
235
236
156M
      inptr++;      /* advance pointers to next column */
237
156M
      quantptr++;
238
156M
      wsptr++;
239
156M
      continue;
240
156M
    }
241
242
    /* Even part: reverse the even part of the forward DCT.
243
     * The rotator is c(-6).
244
     */
245
246
57.0M
    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
247
57.0M
    if (ctr == DCTSIZE)
248
10.5M
      CLAMP_DC(z2);
249
57.0M
    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
250
57.0M
    z2 <<= CONST_BITS;
251
57.0M
    z3 <<= CONST_BITS;
252
    /* Add fudge factor here for final descale. */
253
57.0M
    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
254
255
57.0M
    tmp0 = z2 + z3;
256
57.0M
    tmp1 = z2 - z3;
257
258
57.0M
    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
259
57.0M
    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
260
261
57.0M
    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);       /* c6 */
262
57.0M
    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);     /* c2-c6 */
263
57.0M
    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);     /* c2+c6 */
264
265
57.0M
    tmp10 = tmp0 + tmp2;
266
57.0M
    tmp13 = tmp0 - tmp2;
267
57.0M
    tmp11 = tmp1 + tmp3;
268
57.0M
    tmp12 = tmp1 - tmp3;
269
270
    /* Odd part per figure 8; the matrix is unitary and hence its
271
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
272
     */
273
274
57.0M
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
275
57.0M
    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
276
57.0M
    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
277
57.0M
    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
278
279
57.0M
    z2 = tmp0 + tmp2;
280
57.0M
    z3 = tmp1 + tmp3;
281
282
57.0M
    z1 = MULTIPLY(z2 + z3, FIX_1_175875602);       /*  c3 */
283
57.0M
    z2 = MULTIPLY(z2, - FIX_1_961570560);          /* -c3-c5 */
284
57.0M
    z3 = MULTIPLY(z3, - FIX_0_390180644);          /* -c3+c5 */
285
57.0M
    z2 += z1;
286
57.0M
    z3 += z1;
287
288
57.0M
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
289
57.0M
    tmp0 = MULTIPLY(tmp0, FIX_0_298631336);        /* -c1+c3+c5-c7 */
290
57.0M
    tmp3 = MULTIPLY(tmp3, FIX_1_501321110);        /*  c1+c3-c5-c7 */
291
57.0M
    tmp0 += z1 + z2;
292
57.0M
    tmp3 += z1 + z3;
293
294
57.0M
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
295
57.0M
    tmp1 = MULTIPLY(tmp1, FIX_2_053119869);        /*  c1+c3-c5+c7 */
296
57.0M
    tmp2 = MULTIPLY(tmp2, FIX_3_072711026);        /*  c1+c3+c5-c7 */
297
57.0M
    tmp1 += z1 + z3;
298
57.0M
    tmp2 += z1 + z2;
299
300
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
301
302
57.0M
    wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
303
57.0M
    wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
304
57.0M
    wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
305
57.0M
    wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
306
57.0M
    wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
307
57.0M
    wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
308
57.0M
    wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
309
57.0M
    wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
310
311
57.0M
    inptr++;      /* advance pointers to next column */
312
57.0M
    quantptr++;
313
57.0M
    wsptr++;
314
57.0M
  }
315
316
  /* Pass 2: process rows from work array, store into output array.
317
   * Note that we must descale the results by a factor of 8 == 2**3,
318
   * and also undo the PASS1_BITS scaling.
319
   */
320
321
26.6M
  wsptr = workspace;
322
239M
  for (ctr = 0; ctr < DCTSIZE; ctr++) {
323
213M
    outptr = output_buf[ctr] + output_col;
324
325
    /* Add range center and fudge factor for final descale and range-limit. */
326
213M
    z2 = (INT32) wsptr[0] +
327
213M
     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
328
213M
      (ONE << (PASS1_BITS+2)));
329
330
    /* Rows of zeroes can be exploited in the same way as we did with columns.
331
     * However, the column calculation has created many nonzero AC terms, so
332
     * the simplification applies less often (typically 5% to 10% of the time).
333
     * On machines with very fast multiplication, it's possible that the
334
     * test takes more time than it's worth.  In that case this section
335
     * may be commented out.
336
     */
337
338
213M
#ifndef NO_ZERO_ROW_TEST
339
213M
    if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
340
213M
  wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
341
      /* AC terms all zero */
342
127M
      JSAMPLE dcval = range_limit[(int) RIGHT_SHIFT(z2, PASS1_BITS+3)
343
127M
          & RANGE_MASK];
344
345
127M
      outptr[0] = dcval;
346
127M
      outptr[1] = dcval;
347
127M
      outptr[2] = dcval;
348
127M
      outptr[3] = dcval;
349
127M
      outptr[4] = dcval;
350
127M
      outptr[5] = dcval;
351
127M
      outptr[6] = dcval;
352
127M
      outptr[7] = dcval;
353
354
127M
      wsptr += DCTSIZE;   /* advance pointer to next row */
355
127M
      continue;
356
127M
    }
357
85.6M
#endif
358
359
    /* Even part: reverse the even part of the forward DCT.
360
     * The rotator is c(-6).
361
     */
362
363
85.6M
    z3 = (INT32) wsptr[4];
364
365
85.6M
    tmp0 = (z2 + z3) << CONST_BITS;
366
85.6M
    tmp1 = (z2 - z3) << CONST_BITS;
367
368
85.6M
    z2 = (INT32) wsptr[2];
369
85.6M
    z3 = (INT32) wsptr[6];
370
371
85.6M
    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);       /* c6 */
372
85.6M
    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);     /* c2-c6 */
373
85.6M
    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);     /* c2+c6 */
374
375
85.6M
    tmp10 = tmp0 + tmp2;
376
85.6M
    tmp13 = tmp0 - tmp2;
377
85.6M
    tmp11 = tmp1 + tmp3;
378
85.6M
    tmp12 = tmp1 - tmp3;
379
380
    /* Odd part per figure 8; the matrix is unitary and hence its
381
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
382
     */
383
384
85.6M
    tmp0 = (INT32) wsptr[7];
385
85.6M
    tmp1 = (INT32) wsptr[5];
386
85.6M
    tmp2 = (INT32) wsptr[3];
387
85.6M
    tmp3 = (INT32) wsptr[1];
388
389
85.6M
    z2 = tmp0 + tmp2;
390
85.6M
    z3 = tmp1 + tmp3;
391
392
85.6M
    z1 = MULTIPLY(z2 + z3, FIX_1_175875602);       /*  c3 */
393
85.6M
    z2 = MULTIPLY(z2, - FIX_1_961570560);          /* -c3-c5 */
394
85.6M
    z3 = MULTIPLY(z3, - FIX_0_390180644);          /* -c3+c5 */
395
85.6M
    z2 += z1;
396
85.6M
    z3 += z1;
397
398
85.6M
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
399
85.6M
    tmp0 = MULTIPLY(tmp0, FIX_0_298631336);        /* -c1+c3+c5-c7 */
400
85.6M
    tmp3 = MULTIPLY(tmp3, FIX_1_501321110);        /*  c1+c3-c5-c7 */
401
85.6M
    tmp0 += z1 + z2;
402
85.6M
    tmp3 += z1 + z3;
403
404
85.6M
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
405
85.6M
    tmp1 = MULTIPLY(tmp1, FIX_2_053119869);        /*  c1+c3-c5+c7 */
406
85.6M
    tmp2 = MULTIPLY(tmp2, FIX_3_072711026);        /*  c1+c3+c5-c7 */
407
85.6M
    tmp1 += z1 + z3;
408
85.6M
    tmp2 += z1 + z2;
409
410
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
411
412
85.6M
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
413
85.6M
                CONST_BITS+PASS1_BITS+3)
414
85.6M
          & RANGE_MASK];
415
85.6M
    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
416
85.6M
                CONST_BITS+PASS1_BITS+3)
417
85.6M
          & RANGE_MASK];
418
85.6M
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
419
85.6M
                CONST_BITS+PASS1_BITS+3)
420
85.6M
          & RANGE_MASK];
421
85.6M
    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
422
85.6M
                CONST_BITS+PASS1_BITS+3)
423
85.6M
          & RANGE_MASK];
424
85.6M
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
425
85.6M
                CONST_BITS+PASS1_BITS+3)
426
85.6M
          & RANGE_MASK];
427
85.6M
    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
428
85.6M
                CONST_BITS+PASS1_BITS+3)
429
85.6M
          & RANGE_MASK];
430
85.6M
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
431
85.6M
                CONST_BITS+PASS1_BITS+3)
432
85.6M
          & RANGE_MASK];
433
85.6M
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
434
85.6M
                CONST_BITS+PASS1_BITS+3)
435
85.6M
          & RANGE_MASK];
436
437
85.6M
    wsptr += DCTSIZE;   /* advance pointer to next row */
438
85.6M
  }
439
26.6M
}
440
441
#ifdef IDCT_SCALING_SUPPORTED
442
443
444
/*
445
 * Perform dequantization and inverse DCT on one block of coefficients,
446
 * producing a reduced-size 7x7 output block.
447
 *
448
 * Optimized algorithm with 12 multiplications in the 1-D kernel.
449
 * cK represents sqrt(2) * cos(K*pi/14).
450
 */
451
452
GLOBAL(void)
453
jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
454
         JCOEFPTR coef_block,
455
         JSAMPARRAY output_buf, JDIMENSION output_col)
456
{
457
  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
458
  INT32 z1, z2, z3;
459
  JCOEFPTR inptr;
460
  ISLOW_MULT_TYPE * quantptr;
461
  int * wsptr;
462
  JSAMPROW outptr;
463
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
464
  int ctr;
465
  int workspace[7*7]; /* buffers data between passes */
466
  SHIFT_TEMPS
467
468
  /* Pass 1: process columns from input, store into work array. */
469
470
  inptr = coef_block;
471
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
472
  wsptr = workspace;
473
  for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
474
    /* Even part */
475
476
    tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
477
    if (ctr == 0)
478
      CLAMP_DC(tmp13);
479
    tmp13 <<= CONST_BITS;
480
    /* Add fudge factor here for final descale. */
481
    tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
482
483
    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
484
    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
485
    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
486
487
    tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
488
    tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
489
    tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
490
    tmp0 = z1 + z3;
491
    z2 -= tmp0;
492
    tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
493
    tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
494
    tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
495
    tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
496
497
    /* Odd part */
498
499
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
500
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
501
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
502
503
    tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
504
    tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
505
    tmp0 = tmp1 - tmp2;
506
    tmp1 += tmp2;
507
    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
508
    tmp1 += tmp2;
509
    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
510
    tmp0 += z2;
511
    tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
512
513
    /* Final output stage */
514
515
    wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
516
    wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
517
    wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
518
    wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
519
    wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
520
    wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
521
    wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
522
  }
523
524
  /* Pass 2: process 7 rows from work array, store into output array. */
525
526
  wsptr = workspace;
527
  for (ctr = 0; ctr < 7; ctr++) {
528
    outptr = output_buf[ctr] + output_col;
529
530
    /* Even part */
531
532
    /* Add range center and fudge factor for final descale and range-limit. */
533
    tmp13 = (INT32) wsptr[0] +
534
        ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
535
         (ONE << (PASS1_BITS+2)));
536
    tmp13 <<= CONST_BITS;
537
538
    z1 = (INT32) wsptr[2];
539
    z2 = (INT32) wsptr[4];
540
    z3 = (INT32) wsptr[6];
541
542
    tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
543
    tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
544
    tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
545
    tmp0 = z1 + z3;
546
    z2 -= tmp0;
547
    tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
548
    tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
549
    tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
550
    tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
551
552
    /* Odd part */
553
554
    z1 = (INT32) wsptr[1];
555
    z2 = (INT32) wsptr[3];
556
    z3 = (INT32) wsptr[5];
557
558
    tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
559
    tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
560
    tmp0 = tmp1 - tmp2;
561
    tmp1 += tmp2;
562
    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
563
    tmp1 += tmp2;
564
    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
565
    tmp0 += z2;
566
    tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
567
568
    /* Final output stage */
569
570
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
571
                CONST_BITS+PASS1_BITS+3)
572
          & RANGE_MASK];
573
    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
574
                CONST_BITS+PASS1_BITS+3)
575
          & RANGE_MASK];
576
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
577
                CONST_BITS+PASS1_BITS+3)
578
          & RANGE_MASK];
579
    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
580
                CONST_BITS+PASS1_BITS+3)
581
          & RANGE_MASK];
582
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
583
                CONST_BITS+PASS1_BITS+3)
584
          & RANGE_MASK];
585
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
586
                CONST_BITS+PASS1_BITS+3)
587
          & RANGE_MASK];
588
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
589
                CONST_BITS+PASS1_BITS+3)
590
          & RANGE_MASK];
591
592
    wsptr += 7;   /* advance pointer to next row */
593
  }
594
}
595
596
597
/*
598
 * Perform dequantization and inverse DCT on one block of coefficients,
599
 * producing a reduced-size 6x6 output block.
600
 *
601
 * Optimized algorithm with 3 multiplications in the 1-D kernel.
602
 * cK represents sqrt(2) * cos(K*pi/12).
603
 */
604
605
GLOBAL(void)
606
jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
607
         JCOEFPTR coef_block,
608
         JSAMPARRAY output_buf, JDIMENSION output_col)
609
{
610
  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
611
  INT32 z1, z2, z3;
612
  JCOEFPTR inptr;
613
  ISLOW_MULT_TYPE * quantptr;
614
  int * wsptr;
615
  JSAMPROW outptr;
616
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
617
  int ctr;
618
  int workspace[6*6]; /* buffers data between passes */
619
  SHIFT_TEMPS
620
621
  /* Pass 1: process columns from input, store into work array. */
622
623
  inptr = coef_block;
624
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
625
  wsptr = workspace;
626
  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
627
    /* Even part */
628
629
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
630
    if (ctr == 0)
631
      CLAMP_DC(tmp0);
632
    tmp0 <<= CONST_BITS;
633
    /* Add fudge factor here for final descale. */
634
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
635
    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
636
    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
637
    tmp1 = tmp0 + tmp10;
638
    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
639
    tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
640
    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
641
    tmp10 = tmp1 + tmp0;
642
    tmp12 = tmp1 - tmp0;
643
644
    /* Odd part */
645
646
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
647
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
648
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
649
    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
650
    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
651
    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
652
    tmp1 = (z1 - z2 - z3) << PASS1_BITS;
653
654
    /* Final output stage */
655
656
    wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
657
    wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
658
    wsptr[6*1] = (int) (tmp11 + tmp1);
659
    wsptr[6*4] = (int) (tmp11 - tmp1);
660
    wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
661
    wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
662
  }
663
664
  /* Pass 2: process 6 rows from work array, store into output array. */
665
666
  wsptr = workspace;
667
  for (ctr = 0; ctr < 6; ctr++) {
668
    outptr = output_buf[ctr] + output_col;
669
670
    /* Even part */
671
672
    /* Add range center and fudge factor for final descale and range-limit. */
673
    tmp0 = (INT32) wsptr[0] +
674
       ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
675
        (ONE << (PASS1_BITS+2)));
676
    tmp0 <<= CONST_BITS;
677
    tmp2 = (INT32) wsptr[4];
678
    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
679
    tmp1 = tmp0 + tmp10;
680
    tmp11 = tmp0 - tmp10 - tmp10;
681
    tmp10 = (INT32) wsptr[2];
682
    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
683
    tmp10 = tmp1 + tmp0;
684
    tmp12 = tmp1 - tmp0;
685
686
    /* Odd part */
687
688
    z1 = (INT32) wsptr[1];
689
    z2 = (INT32) wsptr[3];
690
    z3 = (INT32) wsptr[5];
691
    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
692
    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
693
    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
694
    tmp1 = (z1 - z2 - z3) << CONST_BITS;
695
696
    /* Final output stage */
697
698
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
699
                CONST_BITS+PASS1_BITS+3)
700
          & RANGE_MASK];
701
    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
702
                CONST_BITS+PASS1_BITS+3)
703
          & RANGE_MASK];
704
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
705
                CONST_BITS+PASS1_BITS+3)
706
          & RANGE_MASK];
707
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
708
                CONST_BITS+PASS1_BITS+3)
709
          & RANGE_MASK];
710
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
711
                CONST_BITS+PASS1_BITS+3)
712
          & RANGE_MASK];
713
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
714
                CONST_BITS+PASS1_BITS+3)
715
          & RANGE_MASK];
716
717
    wsptr += 6;   /* advance pointer to next row */
718
  }
719
}
720
721
722
/*
723
 * Perform dequantization and inverse DCT on one block of coefficients,
724
 * producing a reduced-size 5x5 output block.
725
 *
726
 * Optimized algorithm with 5 multiplications in the 1-D kernel.
727
 * cK represents sqrt(2) * cos(K*pi/10).
728
 */
729
730
GLOBAL(void)
731
jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
732
         JCOEFPTR coef_block,
733
         JSAMPARRAY output_buf, JDIMENSION output_col)
734
{
735
  INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
736
  INT32 z1, z2, z3;
737
  JCOEFPTR inptr;
738
  ISLOW_MULT_TYPE * quantptr;
739
  int * wsptr;
740
  JSAMPROW outptr;
741
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
742
  int ctr;
743
  int workspace[5*5]; /* buffers data between passes */
744
  SHIFT_TEMPS
745
746
  /* Pass 1: process columns from input, store into work array. */
747
748
  inptr = coef_block;
749
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
750
  wsptr = workspace;
751
  for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
752
    /* Even part */
753
754
    tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
755
    if (ctr == 0)
756
      CLAMP_DC(tmp12);
757
    tmp12 <<= CONST_BITS;
758
    /* Add fudge factor here for final descale. */
759
    tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
760
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
761
    tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
762
    z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
763
    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
764
    z3 = tmp12 + z2;
765
    tmp10 = z3 + z1;
766
    tmp11 = z3 - z1;
767
    tmp12 -= z2 << 2;
768
769
    /* Odd part */
770
771
    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
772
    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
773
774
    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
775
    tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
776
    tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
777
778
    /* Final output stage */
779
780
    wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
781
    wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
782
    wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
783
    wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
784
    wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
785
  }
786
787
  /* Pass 2: process 5 rows from work array, store into output array. */
788
789
  wsptr = workspace;
790
  for (ctr = 0; ctr < 5; ctr++) {
791
    outptr = output_buf[ctr] + output_col;
792
793
    /* Even part */
794
795
    /* Add range center and fudge factor for final descale and range-limit. */
796
    tmp12 = (INT32) wsptr[0] +
797
        ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
798
         (ONE << (PASS1_BITS+2)));
799
    tmp12 <<= CONST_BITS;
800
    tmp0 = (INT32) wsptr[2];
801
    tmp1 = (INT32) wsptr[4];
802
    z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
803
    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
804
    z3 = tmp12 + z2;
805
    tmp10 = z3 + z1;
806
    tmp11 = z3 - z1;
807
    tmp12 -= z2 << 2;
808
809
    /* Odd part */
810
811
    z2 = (INT32) wsptr[1];
812
    z3 = (INT32) wsptr[3];
813
814
    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
815
    tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
816
    tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
817
818
    /* Final output stage */
819
820
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
821
                CONST_BITS+PASS1_BITS+3)
822
          & RANGE_MASK];
823
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
824
                CONST_BITS+PASS1_BITS+3)
825
          & RANGE_MASK];
826
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
827
                CONST_BITS+PASS1_BITS+3)
828
          & RANGE_MASK];
829
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
830
                CONST_BITS+PASS1_BITS+3)
831
          & RANGE_MASK];
832
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
833
                CONST_BITS+PASS1_BITS+3)
834
          & RANGE_MASK];
835
836
    wsptr += 5;   /* advance pointer to next row */
837
  }
838
}
839
840
841
/*
842
 * Perform dequantization and inverse DCT on one block of coefficients,
843
 * producing a reduced-size 4x4 output block.
844
 *
845
 * Optimized algorithm with 3 multiplications in the 1-D kernel.
846
 * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
847
 */
848
849
GLOBAL(void)
850
jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
851
         JCOEFPTR coef_block,
852
         JSAMPARRAY output_buf, JDIMENSION output_col)
853
{
854
  INT32 tmp0, tmp2, tmp10, tmp12;
855
  INT32 z1, z2, z3;
856
  JCOEFPTR inptr;
857
  ISLOW_MULT_TYPE * quantptr;
858
  int * wsptr;
859
  JSAMPROW outptr;
860
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
861
  int ctr;
862
  int workspace[4*4]; /* buffers data between passes */
863
  SHIFT_TEMPS
864
865
  /* Pass 1: process columns from input, store into work array. */
866
867
  inptr = coef_block;
868
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
869
  wsptr = workspace;
870
  for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
871
    /* Even part */
872
873
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
874
    if (ctr == 0)
875
      CLAMP_DC(tmp0);
876
    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
877
    
878
    tmp10 = (tmp0 + tmp2) << PASS1_BITS;
879
    tmp12 = (tmp0 - tmp2) << PASS1_BITS;
880
881
    /* Odd part */
882
    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
883
884
    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
885
    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
886
887
    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
888
    /* Add fudge factor here for final descale. */
889
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
890
    tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
891
           CONST_BITS-PASS1_BITS);
892
    tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
893
           CONST_BITS-PASS1_BITS);
894
895
    /* Final output stage */
896
897
    wsptr[4*0] = (int) (tmp10 + tmp0);
898
    wsptr[4*3] = (int) (tmp10 - tmp0);
899
    wsptr[4*1] = (int) (tmp12 + tmp2);
900
    wsptr[4*2] = (int) (tmp12 - tmp2);
901
  }
902
903
  /* Pass 2: process 4 rows from work array, store into output array. */
904
905
  wsptr = workspace;
906
  for (ctr = 0; ctr < 4; ctr++) {
907
    outptr = output_buf[ctr] + output_col;
908
909
    /* Even part */
910
911
    /* Add range center and fudge factor for final descale and range-limit. */
912
    tmp0 = (INT32) wsptr[0] +
913
       ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
914
        (ONE << (PASS1_BITS+2)));
915
    tmp2 = (INT32) wsptr[2];
916
917
    tmp10 = (tmp0 + tmp2) << CONST_BITS;
918
    tmp12 = (tmp0 - tmp2) << CONST_BITS;
919
920
    /* Odd part */
921
    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
922
923
    z2 = (INT32) wsptr[1];
924
    z3 = (INT32) wsptr[3];
925
926
    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
927
    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
928
    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
929
930
    /* Final output stage */
931
932
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
933
                CONST_BITS+PASS1_BITS+3)
934
          & RANGE_MASK];
935
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
936
                CONST_BITS+PASS1_BITS+3)
937
          & RANGE_MASK];
938
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
939
                CONST_BITS+PASS1_BITS+3)
940
          & RANGE_MASK];
941
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
942
                CONST_BITS+PASS1_BITS+3)
943
          & RANGE_MASK];
944
945
    wsptr += 4;   /* advance pointer to next row */
946
  }
947
}
948
949
950
/*
951
 * Perform dequantization and inverse DCT on one block of coefficients,
952
 * producing a reduced-size 3x3 output block.
953
 *
954
 * Optimized algorithm with 2 multiplications in the 1-D kernel.
955
 * cK represents sqrt(2) * cos(K*pi/6).
956
 */
957
958
GLOBAL(void)
959
jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
960
         JCOEFPTR coef_block,
961
         JSAMPARRAY output_buf, JDIMENSION output_col)
962
{
963
  INT32 tmp0, tmp2, tmp10, tmp12;
964
  JCOEFPTR inptr;
965
  ISLOW_MULT_TYPE * quantptr;
966
  int * wsptr;
967
  JSAMPROW outptr;
968
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
969
  int ctr;
970
  int workspace[3*3]; /* buffers data between passes */
971
  SHIFT_TEMPS
972
973
  /* Pass 1: process columns from input, store into work array. */
974
975
  inptr = coef_block;
976
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
977
  wsptr = workspace;
978
  for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
979
    /* Even part */
980
981
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
982
    if (ctr == 0)
983
      CLAMP_DC(tmp0);
984
    tmp0 <<= CONST_BITS;
985
    /* Add fudge factor here for final descale. */
986
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
987
    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
988
    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
989
    tmp10 = tmp0 + tmp12;
990
    tmp2 = tmp0 - tmp12 - tmp12;
991
992
    /* Odd part */
993
994
    tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
995
    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
996
997
    /* Final output stage */
998
999
    wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
1000
    wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
1001
    wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
1002
  }
1003
1004
  /* Pass 2: process 3 rows from work array, store into output array. */
1005
1006
  wsptr = workspace;
1007
  for (ctr = 0; ctr < 3; ctr++) {
1008
    outptr = output_buf[ctr] + output_col;
1009
1010
    /* Even part */
1011
1012
    /* Add range center and fudge factor for final descale and range-limit. */
1013
    tmp0 = (INT32) wsptr[0] +
1014
       ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1015
        (ONE << (PASS1_BITS+2)));
1016
    tmp0 <<= CONST_BITS;
1017
    tmp2 = (INT32) wsptr[2];
1018
    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
1019
    tmp10 = tmp0 + tmp12;
1020
    tmp2 = tmp0 - tmp12 - tmp12;
1021
1022
    /* Odd part */
1023
1024
    tmp12 = (INT32) wsptr[1];
1025
    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
1026
1027
    /* Final output stage */
1028
1029
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
1030
                CONST_BITS+PASS1_BITS+3)
1031
          & RANGE_MASK];
1032
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
1033
                CONST_BITS+PASS1_BITS+3)
1034
          & RANGE_MASK];
1035
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
1036
                CONST_BITS+PASS1_BITS+3)
1037
          & RANGE_MASK];
1038
1039
    wsptr += 3;   /* advance pointer to next row */
1040
  }
1041
}
1042
1043
1044
/*
1045
 * Perform dequantization and inverse DCT on one block of coefficients,
1046
 * producing a reduced-size 2x2 output block.
1047
 *
1048
 * Multiplication-less algorithm.
1049
 */
1050
1051
GLOBAL(void)
1052
jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1053
         JCOEFPTR coef_block,
1054
         JSAMPARRAY output_buf, JDIMENSION output_col)
1055
{
1056
  DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1057
  ISLOW_MULT_TYPE * quantptr;
1058
  JSAMPROW outptr;
1059
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1060
  ISHIFT_TEMPS
1061
1062
  /* Pass 1: process columns from input. */
1063
1064
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1065
1066
  /* Column 0 */
1067
  tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
1068
  CLAMP_DC(tmp4);
1069
  tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
1070
  /* Add range center and fudge factor for final descale and range-limit. */
1071
  tmp4 += (((DCTELEM) RANGE_CENTER) << 3) + (1 << 2);
1072
1073
  tmp0 = tmp4 + tmp5;
1074
  tmp2 = tmp4 - tmp5;
1075
1076
  /* Column 1 */
1077
  tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]);
1078
  tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]);
1079
1080
  tmp1 = tmp4 + tmp5;
1081
  tmp3 = tmp4 - tmp5;
1082
1083
  /* Pass 2: process 2 rows, store into output array. */
1084
1085
  /* Row 0 */
1086
  outptr = output_buf[0] + output_col;
1087
1088
  outptr[0] = range_limit[(int) IRIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
1089
  outptr[1] = range_limit[(int) IRIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
1090
1091
  /* Row 1 */
1092
  outptr = output_buf[1] + output_col;
1093
1094
  outptr[0] = range_limit[(int) IRIGHT_SHIFT(tmp2 + tmp3, 3) & RANGE_MASK];
1095
  outptr[1] = range_limit[(int) IRIGHT_SHIFT(tmp2 - tmp3, 3) & RANGE_MASK];
1096
}
1097
1098
1099
/*
1100
 * Perform dequantization and inverse DCT on one block of coefficients,
1101
 * producing a reduced-size 1x1 output block.
1102
 *
1103
 * We hardly need an inverse DCT routine for this: just take the
1104
 * average pixel value, which is one-eighth of the DC coefficient.
1105
 */
1106
1107
GLOBAL(void)
1108
jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1109
         JCOEFPTR coef_block,
1110
         JSAMPARRAY output_buf, JDIMENSION output_col)
1111
{
1112
  int dcval;
1113
  ISLOW_MULT_TYPE * quantptr;
1114
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1115
  SHIFT_TEMPS
1116
1117
  /* 1x1 is trivial: just take the DC coefficient divided by 8. */
1118
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1119
  dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
1120
  CLAMP_DC(dcval);
1121
  dcval = (int) DESCALE((INT32) dcval, 3);
1122
1123
  output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
1124
}
1125
1126
1127
/*
1128
 * Perform dequantization and inverse DCT on one block of coefficients,
1129
 * producing a 9x9 output block.
1130
 *
1131
 * Optimized algorithm with 10 multiplications in the 1-D kernel.
1132
 * cK represents sqrt(2) * cos(K*pi/18).
 *
 * Pass 1 runs the 9-point kernel down each of the 8 input columns and
 * stores 9 values per column in the 8-wide workspace; pass 2 runs the
 * same kernel along each of the 9 workspace rows (8 inputs each) and
 * writes 9 samples to output_buf[row], starting at output_col.
 * compptr->dct_table supplies the dequantization multipliers and cinfo
 * the range-limit table.
1133
 */
1134
1135
GLOBAL(void)
1136
jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1137
         JCOEFPTR coef_block,
1138
         JSAMPARRAY output_buf, JDIMENSION output_col)
1139
{
1140
  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
1141
  INT32 z1, z2, z3, z4;
1142
  JCOEFPTR inptr;
1143
  ISLOW_MULT_TYPE * quantptr;
1144
  int * wsptr;
1145
  JSAMPROW outptr;
1146
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1147
  int ctr;
1148
  int workspace[8*9]; /* buffers data between passes */
1149
  SHIFT_TEMPS
1150
1151
  /* Pass 1: process columns from input, store into work array. */
1152
1153
  inptr = coef_block;
1154
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1155
  wsptr = workspace;
1156
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { /* all 8 input columns */
1157
    /* Even part */
1158
1159
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1160
    if (ctr == 0) /* column 0 carries the DC coefficient */
1161
      CLAMP_DC(tmp0); /* macro defined outside this chunk; presumably bounds the DC term -- confirm */
1162
    tmp0 <<= CONST_BITS;
1163
    /* Add fudge factor here for final descale. */
1164
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
1165
1166
    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1167
    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1168
    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1169
1170
    tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
1171
    tmp1 = tmp0 + tmp3;
1172
    tmp2 = tmp0 - tmp3 - tmp3;
1173
1174
    tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1175
    tmp11 = tmp2 + tmp0;
1176
    tmp14 = tmp2 - tmp0 - tmp0;
1177
1178
    tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1179
    tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1180
    tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1181
1182
    tmp10 = tmp1 + tmp0 - tmp3;
1183
    tmp12 = tmp1 - tmp0 + tmp2;
1184
    tmp13 = tmp1 - tmp2 + tmp3;
1185
1186
    /* Odd part */
1187
1188
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1189
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1190
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1191
    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1192
1193
    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
1194
1195
    tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1196
    tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1197
    tmp0 = tmp2 + tmp3 - z2;
1198
    tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1199
    tmp2 += z2 - tmp1;
1200
    tmp3 += z2 + tmp1;
1201
    tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1202
1203
    /* Final output stage: descale and store output row k of this column at wsptr[8*k] */
1204
1205
    wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
1206
    wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
1207
    wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
1208
    wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
1209
    wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
1210
    wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
1211
    wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
1212
    wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
1213
    wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS); /* middle row has no odd contribution */
1214
  }
1215
1216
  /* Pass 2: process 9 rows from work array, store into output array. */
1217
1218
  wsptr = workspace;
1219
  for (ctr = 0; ctr < 9; ctr++) { /* one output row of 9 samples per iteration */
1220
    outptr = output_buf[ctr] + output_col;
1221
1222
    /* Even part */
1223
1224
    /* Add range center and fudge factor for final descale and range-limit. */
1225
    tmp0 = (INT32) wsptr[0] +
1226
       ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1227
        (ONE << (PASS1_BITS+2)));
1228
    tmp0 <<= CONST_BITS;
1229
1230
    z1 = (INT32) wsptr[2];
1231
    z2 = (INT32) wsptr[4];
1232
    z3 = (INT32) wsptr[6];
1233
1234
    tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
1235
    tmp1 = tmp0 + tmp3;
1236
    tmp2 = tmp0 - tmp3 - tmp3;
1237
1238
    tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1239
    tmp11 = tmp2 + tmp0;
1240
    tmp14 = tmp2 - tmp0 - tmp0;
1241
1242
    tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1243
    tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1244
    tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1245
1246
    tmp10 = tmp1 + tmp0 - tmp3;
1247
    tmp12 = tmp1 - tmp0 + tmp2;
1248
    tmp13 = tmp1 - tmp2 + tmp3;
1249
1250
    /* Odd part */
1251
1252
    z1 = (INT32) wsptr[1];
1253
    z2 = (INT32) wsptr[3];
1254
    z3 = (INT32) wsptr[5];
1255
    z4 = (INT32) wsptr[7];
1256
1257
    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
1258
1259
    tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1260
    tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1261
    tmp0 = tmp2 + tmp3 - z2;
1262
    tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1263
    tmp2 += z2 - tmp1;
1264
    tmp3 += z2 + tmp1;
1265
    tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1266
1267
    /* Final output stage: descale, mask with RANGE_MASK and map through range_limit */
1268
1269
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
1270
                CONST_BITS+PASS1_BITS+3)
1271
          & RANGE_MASK];
1272
    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
1273
                CONST_BITS+PASS1_BITS+3)
1274
          & RANGE_MASK];
1275
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
1276
                CONST_BITS+PASS1_BITS+3)
1277
          & RANGE_MASK];
1278
    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
1279
                CONST_BITS+PASS1_BITS+3)
1280
          & RANGE_MASK];
1281
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
1282
                CONST_BITS+PASS1_BITS+3)
1283
          & RANGE_MASK];
1284
    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
1285
                CONST_BITS+PASS1_BITS+3)
1286
          & RANGE_MASK];
1287
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
1288
                CONST_BITS+PASS1_BITS+3)
1289
          & RANGE_MASK];
1290
    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
1291
                CONST_BITS+PASS1_BITS+3)
1292
          & RANGE_MASK];
1293
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
1294
                CONST_BITS+PASS1_BITS+3)
1295
          & RANGE_MASK];
1296
1297
    wsptr += 8;   /* advance pointer to next row */
1298
  }
1299
}
1300
1301
1302
/*
1303
 * Perform dequantization and inverse DCT on one block of coefficients,
1304
 * producing a 10x10 output block.
1305
 *
1306
 * Optimized algorithm with 12 multiplications in the 1-D kernel.
1307
 * cK represents sqrt(2) * cos(K*pi/20).
 *
 * Pass 1 runs the 10-point kernel down each of the 8 input columns and
 * stores 10 values per column in the 8-wide workspace; pass 2 runs the
 * kernel along each of the 10 workspace rows (8 inputs each) and writes
 * 10 samples to output_buf[row], starting at output_col.
 * compptr->dct_table supplies the dequantization multipliers and cinfo
 * the range-limit table.
1308
 */
1309
1310
GLOBAL(void)
1311
jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1312
     JCOEFPTR coef_block,
1313
     JSAMPARRAY output_buf, JDIMENSION output_col)
1314
{
1315
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1316
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
1317
  INT32 z1, z2, z3, z4, z5;
1318
  JCOEFPTR inptr;
1319
  ISLOW_MULT_TYPE * quantptr;
1320
  int * wsptr;
1321
  JSAMPROW outptr;
1322
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1323
  int ctr;
1324
  int workspace[8*10];  /* buffers data between passes */
1325
  SHIFT_TEMPS
1326
1327
  /* Pass 1: process columns from input, store into work array. */
1328
1329
  inptr = coef_block;
1330
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1331
  wsptr = workspace;
1332
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) { /* all 8 input columns */
1333
    /* Even part */
1334
1335
    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1336
    if (ctr == 0) /* column 0 carries the DC coefficient */
1337
      CLAMP_DC(z3); /* macro defined outside this chunk; presumably bounds the DC term -- confirm */
1338
    z3 <<= CONST_BITS;
1339
    /* Add fudge factor here for final descale. */
1340
    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1341
    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1342
    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1343
    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1344
    tmp10 = z3 + z1;
1345
    tmp11 = z3 - z2;
1346
1347
    tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
1348
      CONST_BITS-PASS1_BITS); /* descaled here, so it is stored below without a further shift */
1349
1350
    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1351
    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1352
1353
    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1354
    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1355
    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1356
1357
    tmp20 = tmp10 + tmp12;
1358
    tmp24 = tmp10 - tmp12;
1359
    tmp21 = tmp11 + tmp13;
1360
    tmp23 = tmp11 - tmp13;
1361
1362
    /* Odd part */
1363
1364
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1365
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1366
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1367
    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1368
1369
    tmp11 = z2 + z4;
1370
    tmp13 = z2 - z4;
1371
1372
    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1373
    z5 = z3 << CONST_BITS; /* z3 itself is kept unscaled for tmp12 below */
1374
1375
    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1376
    z4 = z5 + tmp12;
1377
1378
    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1379
    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1380
1381
    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1382
    z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
1383
1384
    tmp12 = (z1 - tmp13 - z3) << PASS1_BITS; /* shifted by PASS1_BITS only; stored without a shift, like tmp22 */
1385
1386
    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1387
    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1388
1389
    /* Final output stage: store output row k of this column at wsptr[8*k] */
1390
1391
    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1392
    wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1393
    wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1394
    wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1395
    wsptr[8*2] = (int) (tmp22 + tmp12);
1396
    wsptr[8*7] = (int) (tmp22 - tmp12);
1397
    wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1398
    wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1399
    wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1400
    wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1401
  }
1402
1403
  /* Pass 2: process 10 rows from work array, store into output array. */
1404
1405
  wsptr = workspace;
1406
  for (ctr = 0; ctr < 10; ctr++) { /* one output row of 10 samples per iteration */
1407
    outptr = output_buf[ctr] + output_col;
1408
1409
    /* Even part */
1410
1411
    /* Add range center and fudge factor for final descale and range-limit. */
1412
    z3 = (INT32) wsptr[0] +
1413
     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1414
      (ONE << (PASS1_BITS+2)));
1415
    z3 <<= CONST_BITS;
1416
    z4 = (INT32) wsptr[4];
1417
    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1418
    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1419
    tmp10 = z3 + z1;
1420
    tmp11 = z3 - z2;
1421
1422
    tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
1423
1424
    z2 = (INT32) wsptr[2];
1425
    z3 = (INT32) wsptr[6];
1426
1427
    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1428
    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1429
    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1430
1431
    tmp20 = tmp10 + tmp12;
1432
    tmp24 = tmp10 - tmp12;
1433
    tmp21 = tmp11 + tmp13;
1434
    tmp23 = tmp11 - tmp13;
1435
1436
    /* Odd part */
1437
1438
    z1 = (INT32) wsptr[1];
1439
    z2 = (INT32) wsptr[3];
1440
    z3 = (INT32) wsptr[5];
1441
    z3 <<= CONST_BITS; /* unlike pass 1, z3 is scaled in place here */
1442
    z4 = (INT32) wsptr[7];
1443
1444
    tmp11 = z2 + z4;
1445
    tmp13 = z2 - z4;
1446
1447
    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1448
1449
    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1450
    z4 = z3 + tmp12;
1451
1452
    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1453
    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1454
1455
    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1456
    z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
1457
1458
    tmp12 = ((z1 - tmp13) << CONST_BITS) - z3; /* z3 already carries the CONST_BITS shift */
1459
1460
    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1461
    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1462
1463
    /* Final output stage: descale, mask with RANGE_MASK and map through range_limit */
1464
1465
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1466
                CONST_BITS+PASS1_BITS+3)
1467
          & RANGE_MASK];
1468
    outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1469
                CONST_BITS+PASS1_BITS+3)
1470
          & RANGE_MASK];
1471
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1472
                CONST_BITS+PASS1_BITS+3)
1473
          & RANGE_MASK];
1474
    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1475
                CONST_BITS+PASS1_BITS+3)
1476
          & RANGE_MASK];
1477
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1478
                CONST_BITS+PASS1_BITS+3)
1479
          & RANGE_MASK];
1480
    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1481
                CONST_BITS+PASS1_BITS+3)
1482
          & RANGE_MASK];
1483
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1484
                CONST_BITS+PASS1_BITS+3)
1485
          & RANGE_MASK];
1486
    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1487
                CONST_BITS+PASS1_BITS+3)
1488
          & RANGE_MASK];
1489
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1490
                CONST_BITS+PASS1_BITS+3)
1491
          & RANGE_MASK];
1492
    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1493
                CONST_BITS+PASS1_BITS+3)
1494
          & RANGE_MASK];
1495
1496
    wsptr += 8;   /* advance pointer to next row */
1497
  }
1498
}
1499
1500
1501
/*
1502
 * Perform dequantization and inverse DCT on one block of coefficients,
1503
 * producing an 11x11 output block.
1504
 *
1505
 * Optimized algorithm with 24 multiplications in the 1-D kernel.
1506
 * cK represents sqrt(2) * cos(K*pi/22).
1507
 */
1508
1509
GLOBAL(void)
1510
jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1511
     JCOEFPTR coef_block,
1512
     JSAMPARRAY output_buf, JDIMENSION output_col)
1513
{
1514
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1515
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1516
  INT32 z1, z2, z3, z4;
1517
  JCOEFPTR inptr;
1518
  ISLOW_MULT_TYPE * quantptr;
1519
  int * wsptr;
1520
  JSAMPROW outptr;
1521
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1522
  int ctr;
1523
  int workspace[8*11];  /* buffers data between passes */
1524
  SHIFT_TEMPS
1525
1526
  /* Pass 1: process columns from input, store into work array. */
1527
1528
  inptr = coef_block;
1529
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1530
  wsptr = workspace;
1531
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1532
    /* Even part */
1533
1534
    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1535
    if (ctr == 0)
1536
      CLAMP_DC(tmp10);
1537
    tmp10 <<= CONST_BITS;
1538
    /* Add fudge factor here for final descale. */
1539
    tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
1540
1541
    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1542
    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1543
    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1544
1545
    tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
1546
    tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
1547
    z4 = z1 + z3;
1548
    tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
1549
    z4 -= z2;
1550
    tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
1551
    tmp21 = tmp20 + tmp23 + tmp25 -
1552
      MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
1553
    tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1554
    tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1555
    tmp24 += tmp25;
1556
    tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
1557
    tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
1558
       MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
1559
    tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
1560
1561
    /* Odd part */
1562
1563
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1564
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1565
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1566
    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1567
1568
    tmp11 = z1 + z2;
1569
    tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1570
    tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
1571
    tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
1572
    tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1573
    tmp10 = tmp11 + tmp12 + tmp13 -
1574
      MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
1575
    z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1576
    tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
1577
    tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
1578
    z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
1579
    tmp11 += z1;
1580
    tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
1581
    tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
1582
       MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
1583
       MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
1584
1585
    /* Final output stage */
1586
1587
    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1588
    wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1589
    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1590
    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1591
    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1592
    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1593
    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1594
    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1595
    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1596
    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1597
    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
1598
  }
1599
1600
  /* Pass 2: process 11 rows from work array, store into output array. */
1601
1602
  wsptr = workspace;
1603
  for (ctr = 0; ctr < 11; ctr++) {
1604
    outptr = output_buf[ctr] + output_col;
1605
1606
    /* Even part */
1607
1608
    /* Add range center and fudge factor for final descale and range-limit. */
1609
    tmp10 = (INT32) wsptr[0] +
1610
        ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1611
         (ONE << (PASS1_BITS+2)));
1612
    tmp10 <<= CONST_BITS;
1613
1614
    z1 = (INT32) wsptr[2];
1615
    z2 = (INT32) wsptr[4];
1616
    z3 = (INT32) wsptr[6];
1617
1618
    tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
1619
    tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
1620
    z4 = z1 + z3;
1621
    tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
1622
    z4 -= z2;
1623
    tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
1624
    tmp21 = tmp20 + tmp23 + tmp25 -
1625
      MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
1626
    tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1627
    tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1628
    tmp24 += tmp25;
1629
    tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
1630
    tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
1631
       MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
1632
    tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
1633
1634
    /* Odd part */
1635
1636
    z1 = (INT32) wsptr[1];
1637
    z2 = (INT32) wsptr[3];
1638
    z3 = (INT32) wsptr[5];
1639
    z4 = (INT32) wsptr[7];
1640
1641
    tmp11 = z1 + z2;
1642
    tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1643
    tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
1644
    tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
1645
    tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1646
    tmp10 = tmp11 + tmp12 + tmp13 -
1647
      MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
1648
    z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1649
    tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
1650
    tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
1651
    z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
1652
    tmp11 += z1;
1653
    tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
1654
    tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
1655
       MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
1656
       MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
1657
1658
    /* Final output stage */
1659
1660
    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1661
                 CONST_BITS+PASS1_BITS+3)
1662
           & RANGE_MASK];
1663
    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1664
                 CONST_BITS+PASS1_BITS+3)
1665
           & RANGE_MASK];
1666
    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1667
                 CONST_BITS+PASS1_BITS+3)
1668
           & RANGE_MASK];
1669
    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1670
                 CONST_BITS+PASS1_BITS+3)
1671
           & RANGE_MASK];
1672
    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1673
                 CONST_BITS+PASS1_BITS+3)
1674
           & RANGE_MASK];
1675
    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1676
                 CONST_BITS+PASS1_BITS+3)
1677
           & RANGE_MASK];
1678
    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1679
                 CONST_BITS+PASS1_BITS+3)
1680
           & RANGE_MASK];
1681
    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1682
                 CONST_BITS+PASS1_BITS+3)
1683
           & RANGE_MASK];
1684
    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1685
                 CONST_BITS+PASS1_BITS+3)
1686
           & RANGE_MASK];
1687
    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1688
                 CONST_BITS+PASS1_BITS+3)
1689
           & RANGE_MASK];
1690
    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
1691
                 CONST_BITS+PASS1_BITS+3)
1692
           & RANGE_MASK];
1693
1694
    wsptr += 8;   /* advance pointer to next row */
1695
  }
1696
}
1697
1698
1699
/*
1700
 * Perform dequantization and inverse DCT on one block of coefficients,
1701
 * producing a 12x12 output block.
1702
 *
1703
 * Optimized algorithm with 15 multiplications in the 1-D kernel.
1704
 * cK represents sqrt(2) * cos(K*pi/24).
 *
 * Structure: pass 1 walks the 8 columns of coef_block, multiplies each
 * coefficient by the matching entry of compptr->dct_table and stores 12
 * intermediate values per column in a local workspace.  Pass 2 walks the
 * 12 workspace rows (8 values each) and writes 12 range-limited samples
 * per row to output_buf[row] + output_col.
 *
 * Parameters as used here:
 *   cinfo       only passed to IDCT_range_limit() to get the sample table.
 *   compptr     supplies dct_table, read as an array of ISLOW_MULT_TYPE.
 *   coef_block  input coefficients, read at inptr[DCTSIZE*r] for r = 0..7.
 *   output_buf  row pointers; rows 0..11 are written.
 *   output_col  offset added to every row pointer before writing.
1705
 */
1706
1707
GLOBAL(void)
1708
jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1709
     JCOEFPTR coef_block,
1710
     JSAMPARRAY output_buf, JDIMENSION output_col)
1711
{
1712
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1713
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1714
  INT32 z1, z2, z3, z4;
1715
  JCOEFPTR inptr;
1716
  ISLOW_MULT_TYPE * quantptr;
1717
  int * wsptr;
1718
  JSAMPROW outptr;
1719
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1720
  int ctr;
1721
  int workspace[8*12];  /* buffers data between passes */
1722
  SHIFT_TEMPS
1723
1724
  /* Pass 1: process columns from input, store into work array.
   * Each iteration handles one of the 8 input columns and writes 12 values
   * into the workspace at a stride of 8 ints (wsptr[8*k], k = 0..11).
   */
1725
1726
  inptr = coef_block;
1727
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1728
  wsptr = workspace;
1729
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1730
    /* Even part */
1731
1732
    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1733
    /* CLAMP_DC (defined outside this view) is applied to coef_block[0] only. */
    if (ctr == 0)
1734
      CLAMP_DC(z3);
1735
    /* NOTE(review): z3 may be negative here; left-shifting a negative signed
     * value is undefined in ISO C -- confirm the supported compilers cope.
     * The same pattern recurs on the other "<<= CONST_BITS" lines below.
     */
    z3 <<= CONST_BITS;
1736
    /* Add fudge factor here for final descale. */
1737
    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1738
1739
    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1740
    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1741
1742
    tmp10 = z3 + z4;
1743
    tmp11 = z3 - z4;
1744
1745
    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1746
    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1747
    z1 <<= CONST_BITS;
1748
    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1749
    z2 <<= CONST_BITS;
1750
1751
    /* tmp12 is reused as scratch for three different combinations below. */
    tmp12 = z1 - z2;
1752
1753
    tmp21 = z3 + tmp12;
1754
    tmp24 = z3 - tmp12;
1755
1756
    tmp12 = z4 + z2;
1757
1758
    tmp20 = tmp10 + tmp12;
1759
    tmp25 = tmp10 - tmp12;
1760
1761
    tmp12 = z4 - z1 - z2;
1762
1763
    tmp22 = tmp11 + tmp12;
1764
    tmp23 = tmp11 - tmp12;
1765
1766
    /* Odd part */
1767
1768
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1769
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1770
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1771
    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1772
1773
    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
1774
    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
1775
1776
    tmp10 = z1 + z3;
1777
    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
1778
    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
1779
    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
1780
    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
1781
    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1782
    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1783
    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
1784
       MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
1785
1786
    /* tmp11 and tmp14 are recomputed from the differences z1-z4 and z2-z3;
     * their earlier values have already been folded into tmp10..tmp15.
     */
    z1 -= z4;
1787
    z2 -= z3;
1788
    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
1789
    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
1790
    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
1791
1792
    /* Final output stage: workspace rows k and 11-k receive the sum and the
     * difference of even-part term tmp2k and odd-part term tmp1k, each
     * shifted right by CONST_BITS-PASS1_BITS.
     */
1793
1794
    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1795
    wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1796
    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1797
    wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1798
    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1799
    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1800
    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1801
    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1802
    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1803
    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1804
    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
1805
    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
1806
  }
1807
1808
  /* Pass 2: process 12 rows from work array, store into output array.
   * Each row supplies 8 workspace values (wsptr[0..7]) and yields 12
   * output samples; the arithmetic mirrors pass 1 without dequantization.
   */
1809
1810
  wsptr = workspace;
1811
  for (ctr = 0; ctr < 12; ctr++) {
1812
    outptr = output_buf[ctr] + output_col;
1813
1814
    /* Even part */
1815
1816
    /* Add range center and fudge factor for final descale and range-limit. */
1817
    z3 = (INT32) wsptr[0] +
1818
     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
1819
      (ONE << (PASS1_BITS+2)));
1820
    z3 <<= CONST_BITS;
1821
1822
    z4 = (INT32) wsptr[4];
1823
    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1824
1825
    tmp10 = z3 + z4;
1826
    tmp11 = z3 - z4;
1827
1828
    z1 = (INT32) wsptr[2];
1829
    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1830
    z1 <<= CONST_BITS;
1831
    z2 = (INT32) wsptr[6];
1832
    z2 <<= CONST_BITS;
1833
1834
    tmp12 = z1 - z2;
1835
1836
    tmp21 = z3 + tmp12;
1837
    tmp24 = z3 - tmp12;
1838
1839
    tmp12 = z4 + z2;
1840
1841
    tmp20 = tmp10 + tmp12;
1842
    tmp25 = tmp10 - tmp12;
1843
1844
    tmp12 = z4 - z1 - z2;
1845
1846
    tmp22 = tmp11 + tmp12;
1847
    tmp23 = tmp11 - tmp12;
1848
1849
    /* Odd part */
1850
1851
    z1 = (INT32) wsptr[1];
1852
    z2 = (INT32) wsptr[3];
1853
    z3 = (INT32) wsptr[5];
1854
    z4 = (INT32) wsptr[7];
1855
1856
    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
1857
    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
1858
1859
    tmp10 = z1 + z3;
1860
    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
1861
    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
1862
    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
1863
    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
1864
    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1865
    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1866
    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
1867
       MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
1868
1869
    z1 -= z4;
1870
    z2 -= z3;
1871
    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
1872
    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
1873
    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
1874
1875
    /* Final output stage: each sum/difference is shifted right by
     * CONST_BITS+PASS1_BITS+3, masked with RANGE_MASK and mapped through
     * the range_limit table to produce the stored sample.
     */
1876
1877
    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1878
                 CONST_BITS+PASS1_BITS+3)
1879
           & RANGE_MASK];
1880
    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1881
                 CONST_BITS+PASS1_BITS+3)
1882
           & RANGE_MASK];
1883
    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1884
                 CONST_BITS+PASS1_BITS+3)
1885
           & RANGE_MASK];
1886
    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1887
                 CONST_BITS+PASS1_BITS+3)
1888
           & RANGE_MASK];
1889
    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1890
                 CONST_BITS+PASS1_BITS+3)
1891
           & RANGE_MASK];
1892
    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1893
                 CONST_BITS+PASS1_BITS+3)
1894
           & RANGE_MASK];
1895
    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1896
                 CONST_BITS+PASS1_BITS+3)
1897
           & RANGE_MASK];
1898
    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1899
                 CONST_BITS+PASS1_BITS+3)
1900
           & RANGE_MASK];
1901
    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1902
                 CONST_BITS+PASS1_BITS+3)
1903
           & RANGE_MASK];
1904
    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1905
                 CONST_BITS+PASS1_BITS+3)
1906
           & RANGE_MASK];
1907
    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
1908
                 CONST_BITS+PASS1_BITS+3)
1909
           & RANGE_MASK];
1910
    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
1911
                 CONST_BITS+PASS1_BITS+3)
1912
           & RANGE_MASK];
1913
1914
    wsptr += 8;   /* advance pointer to next row */
1915
  }
1916
}
1917
1918
1919
/*
1920
 * Perform dequantization and inverse DCT on one block of coefficients,
1921
 * producing a 13x13 output block.
1922
 *
1923
 * Optimized algorithm with 29 multiplications in the 1-D kernel.
1924
 * cK represents sqrt(2) * cos(K*pi/26).
 *
 * Structure: pass 1 walks the 8 columns of coef_block, multiplies each
 * coefficient by the matching entry of compptr->dct_table and stores 13
 * intermediate values per column in a local workspace.  Pass 2 walks the
 * 13 workspace rows (8 values each) and writes 13 range-limited samples
 * per row to output_buf[row] + output_col.
 *
 * Because the output length is odd, the centre output (index 6) is built
 * from the even part alone (tmp26); all other outputs come in pairs
 * k / 12-k formed as sum and difference of an even and an odd term.
 *
 * Parameters as used here:
 *   cinfo       only passed to IDCT_range_limit() to get the sample table.
 *   compptr     supplies dct_table, read as an array of ISLOW_MULT_TYPE.
 *   coef_block  input coefficients, read at inptr[DCTSIZE*r] for r = 0..7.
 *   output_buf  row pointers; rows 0..12 are written.
 *   output_col  offset added to every row pointer before writing.
1925
 */
1926
1927
GLOBAL(void)
1928
jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1929
     JCOEFPTR coef_block,
1930
     JSAMPARRAY output_buf, JDIMENSION output_col)
1931
{
1932
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1933
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
1934
  INT32 z1, z2, z3, z4;
1935
  JCOEFPTR inptr;
1936
  ISLOW_MULT_TYPE * quantptr;
1937
  int * wsptr;
1938
  JSAMPROW outptr;
1939
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1940
  int ctr;
1941
  int workspace[8*13];  /* buffers data between passes */
1942
  SHIFT_TEMPS
1943
1944
  /* Pass 1: process columns from input, store into work array.
   * Each iteration handles one of the 8 input columns and writes 13 values
   * into the workspace at a stride of 8 ints (wsptr[8*k], k = 0..12).
   */
1945
1946
  inptr = coef_block;
1947
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1948
  wsptr = workspace;
1949
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1950
    /* Even part */
1951
1952
    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1953
    /* CLAMP_DC (defined outside this view) is applied to coef_block[0] only. */
    if (ctr == 0)
1954
      CLAMP_DC(z1);
1955
    /* NOTE(review): z1 may be negative here; left-shifting a negative signed
     * value is undefined in ISO C -- confirm the supported compilers cope.
     */
    z1 <<= CONST_BITS;
1956
    /* Add fudge factor here for final descale. */
1957
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
1958
1959
    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1960
    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1961
    z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1962
1963
    tmp10 = z3 + z4;
1964
    tmp11 = z3 - z4;
1965
1966
    /* tmp12/tmp13 are scratch values, recomputed three times below; each
     * pair feeds two of the even-part outputs tmp20..tmp25.
     */
    tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
1967
    tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
1968
1969
    tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
1970
    tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
1971
1972
    tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
1973
    tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
1974
1975
    tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
1976
    tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
1977
1978
    tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
1979
    tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
1980
1981
    tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
1982
    tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
1983
1984
    tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
1985
1986
    /* Odd part */
1987
1988
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1989
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1990
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1991
    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1992
1993
    tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
1994
    tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
1995
    tmp15 = z1 + z4;
1996
    tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
1997
    tmp10 = tmp11 + tmp12 + tmp13 -
1998
      MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
1999
    /* tmp14 serves as a shared scratch product until its final value is
     * assigned further down.
     */
    tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
2000
    tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
2001
    tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
2002
    tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
2003
    tmp11 += tmp14;
2004
    tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
2005
    tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
2006
    tmp12 += tmp14;
2007
    tmp13 += tmp14;
2008
    tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
2009
    tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
2010
      MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
2011
    /* z1 is no longer needed as an input and is reused as scratch. */
    z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
2012
    tmp14 += z1;
2013
    tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
2014
       MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
2015
2016
    /* Final output stage: workspace rows k and 12-k receive the sum and the
     * difference of even-part term tmp2k and odd-part term tmp1k, each
     * shifted right by CONST_BITS-PASS1_BITS.  Row 6 takes tmp26 alone.
     */
2017
2018
    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2019
    wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2020
    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2021
    wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2022
    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2023
    wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2024
    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
2025
    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
2026
    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2027
    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2028
    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2029
    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2030
    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
2031
  }
2032
2033
  /* Pass 2: process 13 rows from work array, store into output array.
   * Each row supplies 8 workspace values (wsptr[0..7]) and yields 13
   * output samples; the arithmetic mirrors pass 1 without dequantization.
   */
2034
2035
  wsptr = workspace;
2036
  for (ctr = 0; ctr < 13; ctr++) {
2037
    outptr = output_buf[ctr] + output_col;
2038
2039
    /* Even part */
2040
2041
    /* Add range center and fudge factor for final descale and range-limit. */
2042
    z1 = (INT32) wsptr[0] +
2043
     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
2044
      (ONE << (PASS1_BITS+2)));
2045
    z1 <<= CONST_BITS;
2046
2047
    z2 = (INT32) wsptr[2];
2048
    z3 = (INT32) wsptr[4];
2049
    z4 = (INT32) wsptr[6];
2050
2051
    tmp10 = z3 + z4;
2052
    tmp11 = z3 - z4;
2053
2054
    tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
2055
    tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
2056
2057
    tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
2058
    tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
2059
2060
    tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
2061
    tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
2062
2063
    tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
2064
    tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
2065
2066
    tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
2067
    tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
2068
2069
    tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
2070
    tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
2071
2072
    tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
2073
2074
    /* Odd part */
2075
2076
    z1 = (INT32) wsptr[1];
2077
    z2 = (INT32) wsptr[3];
2078
    z3 = (INT32) wsptr[5];
2079
    z4 = (INT32) wsptr[7];
2080
2081
    tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
2082
    tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
2083
    tmp15 = z1 + z4;
2084
    tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
2085
    tmp10 = tmp11 + tmp12 + tmp13 -
2086
      MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
2087
    tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
2088
    tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
2089
    tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
2090
    tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
2091
    tmp11 += tmp14;
2092
    tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
2093
    tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
2094
    tmp12 += tmp14;
2095
    tmp13 += tmp14;
2096
    tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
2097
    tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
2098
      MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
2099
    z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
2100
    tmp14 += z1;
2101
    tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
2102
       MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
2103
2104
    /* Final output stage: each value is shifted right by
     * CONST_BITS+PASS1_BITS+3, masked with RANGE_MASK and mapped through
     * the range_limit table to produce the stored sample.
     */
2105
2106
    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2107
                 CONST_BITS+PASS1_BITS+3)
2108
           & RANGE_MASK];
2109
    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2110
                 CONST_BITS+PASS1_BITS+3)
2111
           & RANGE_MASK];
2112
    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2113
                 CONST_BITS+PASS1_BITS+3)
2114
           & RANGE_MASK];
2115
    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2116
                 CONST_BITS+PASS1_BITS+3)
2117
           & RANGE_MASK];
2118
    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2119
                 CONST_BITS+PASS1_BITS+3)
2120
           & RANGE_MASK];
2121
    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2122
                 CONST_BITS+PASS1_BITS+3)
2123
           & RANGE_MASK];
2124
    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2125
                 CONST_BITS+PASS1_BITS+3)
2126
           & RANGE_MASK];
2127
    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2128
                 CONST_BITS+PASS1_BITS+3)
2129
           & RANGE_MASK];
2130
    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2131
                 CONST_BITS+PASS1_BITS+3)
2132
           & RANGE_MASK];
2133
    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2134
                 CONST_BITS+PASS1_BITS+3)
2135
           & RANGE_MASK];
2136
    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2137
                 CONST_BITS+PASS1_BITS+3)
2138
           & RANGE_MASK];
2139
    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2140
                 CONST_BITS+PASS1_BITS+3)
2141
           & RANGE_MASK];
2142
    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
2143
                 CONST_BITS+PASS1_BITS+3)
2144
           & RANGE_MASK];
2145
2146
    wsptr += 8;   /* advance pointer to next row */
2147
  }
2148
}
2149
2150
2151
/*
2152
 * Perform dequantization and inverse DCT on one block of coefficients,
2153
 * producing a 14x14 output block.
2154
 *
2155
 * Optimized algorithm with 20 multiplications in the 1-D kernel.
2156
 * cK represents sqrt(2) * cos(K*pi/28).
 *
 * Structure: pass 1 walks the 8 columns of coef_block, multiplies each
 * coefficient by the matching entry of compptr->dct_table and stores 14
 * intermediate values per column in a local workspace.  Pass 2 walks the
 * 14 workspace rows (8 values each) and writes 14 range-limited samples
 * per row to output_buf[row] + output_col.
 *
 * Outputs 3 and 10 take a shortcut in pass 1: their even term (tmp23) is
 * descaled as soon as it is computed and their odd term (tmp13) is built
 * with a plain shift by PASS1_BITS, so they are stored without a further
 * RIGHT_SHIFT.
 *
 * Parameters as used here:
 *   cinfo       only passed to IDCT_range_limit() to get the sample table.
 *   compptr     supplies dct_table, read as an array of ISLOW_MULT_TYPE.
 *   coef_block  input coefficients, read at inptr[DCTSIZE*r] for r = 0..7.
 *   output_buf  row pointers; rows 0..13 are written.
 *   output_col  offset added to every row pointer before writing.
2157
 */
2158
2159
GLOBAL(void)
2160
jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2161
     JCOEFPTR coef_block,
2162
     JSAMPARRAY output_buf, JDIMENSION output_col)
2163
{
2164
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2165
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
2166
  INT32 z1, z2, z3, z4;
2167
  JCOEFPTR inptr;
2168
  ISLOW_MULT_TYPE * quantptr;
2169
  int * wsptr;
2170
  JSAMPROW outptr;
2171
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2172
  int ctr;
2173
  int workspace[8*14];  /* buffers data between passes */
2174
  SHIFT_TEMPS
2175
2176
  /* Pass 1: process columns from input, store into work array.
   * Each iteration handles one of the 8 input columns and writes 14 values
   * into the workspace at a stride of 8 ints (wsptr[8*k], k = 0..13).
   */
2177
2178
  inptr = coef_block;
2179
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2180
  wsptr = workspace;
2181
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2182
    /* Even part */
2183
2184
    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2185
    /* CLAMP_DC (defined outside this view) is applied to coef_block[0] only. */
    if (ctr == 0)
2186
      CLAMP_DC(z1);
2187
    /* NOTE(review): z1 may be negative here; left-shifting a negative signed
     * value is undefined in ISO C -- confirm the supported compilers cope.
     * The same applies to the other left shifts of signed values below.
     */
    z1 <<= CONST_BITS;
2188
    /* Add fudge factor here for final descale. */
2189
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
2190
    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2191
    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
2192
    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
2193
    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
2194
2195
    tmp10 = z1 + z2;
2196
    tmp11 = z1 + z3;
2197
    tmp12 = z1 - z4;
2198
2199
    /* tmp23 is descaled right away; it is stored below without RIGHT_SHIFT. */
    tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
2200
      CONST_BITS-PASS1_BITS);
2201
2202
    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2203
    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2204
2205
    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
2206
2207
    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2208
    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2209
    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
2210
      MULTIPLY(z2, FIX(1.378756276));      /* c2 */
2211
2212
    tmp20 = tmp10 + tmp13;
2213
    tmp26 = tmp10 - tmp13;
2214
    tmp21 = tmp11 + tmp14;
2215
    tmp25 = tmp11 - tmp14;
2216
    tmp22 = tmp12 + tmp15;
2217
    tmp24 = tmp12 - tmp15;
2218
2219
    /* Odd part */
2220
2221
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2222
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2223
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2224
    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2225
    /* tmp13 temporarily holds the scaled coefficient 7; z4 keeps the
     * unscaled value for the later "z1 += z4".
     */
    tmp13 = z4 << CONST_BITS;
2226
2227
    tmp14 = z1 + z3;
2228
    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
2229
    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
2230
    tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2231
    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
2232
    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
2233
    z1    -= z2;
2234
    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
2235
    tmp16 += tmp15;
2236
    z1    += z4;
2237
    /* z4 is reused as scratch from here on. */
    z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
2238
    tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
2239
    tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
2240
    z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
2241
    tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2242
    tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
2243
2244
    /* At this point z1 holds (coef1 - coef3 + coef7); the odd term for
     * outputs 3 and 10 needs no multiplication and is scaled directly to
     * match the already descaled tmp23.
     */
    tmp13 = (z1 - z3) << PASS1_BITS;
2245
2246
    /* Final output stage: workspace rows k and 13-k receive the sum and the
     * difference of even-part term tmp2k and odd-part term tmp1k.  All pairs
     * except 3/10 are shifted right by CONST_BITS-PASS1_BITS here.
     */
2247
2248
    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2249
    wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2250
    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2251
    wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2252
    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2253
    wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2254
    wsptr[8*3]  = (int) (tmp23 + tmp13);
2255
    wsptr[8*10] = (int) (tmp23 - tmp13);
2256
    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2257
    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2258
    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2259
    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2260
    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
2261
    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
2262
  }
2263
2264
  /* Pass 2: process 14 rows from work array, store into output array.
   * Each row supplies 8 workspace values (wsptr[0..7]) and yields 14
   * output samples.  Unlike pass 1, outputs 3 and 10 stay at full scale
   * and go through the same RIGHT_SHIFT as every other output.
   */
2265
2266
  wsptr = workspace;
2267
  for (ctr = 0; ctr < 14; ctr++) {
2268
    outptr = output_buf[ctr] + output_col;
2269
2270
    /* Even part */
2271
2272
    /* Add range center and fudge factor for final descale and range-limit. */
2273
    z1 = (INT32) wsptr[0] +
2274
     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
2275
      (ONE << (PASS1_BITS+2)));
2276
    z1 <<= CONST_BITS;
2277
    z4 = (INT32) wsptr[4];
2278
    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
2279
    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
2280
    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
2281
2282
    tmp10 = z1 + z2;
2283
    tmp11 = z1 + z3;
2284
    tmp12 = z1 - z4;
2285
2286
    tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
2287
2288
    z1 = (INT32) wsptr[2];
2289
    z2 = (INT32) wsptr[6];
2290
2291
    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
2292
2293
    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2294
    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2295
    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
2296
      MULTIPLY(z2, FIX(1.378756276));      /* c2 */
2297
2298
    tmp20 = tmp10 + tmp13;
2299
    tmp26 = tmp10 - tmp13;
2300
    tmp21 = tmp11 + tmp14;
2301
    tmp25 = tmp11 - tmp14;
2302
    tmp22 = tmp12 + tmp15;
2303
    tmp24 = tmp12 - tmp15;
2304
2305
    /* Odd part */
2306
2307
    z1 = (INT32) wsptr[1];
2308
    z2 = (INT32) wsptr[3];
2309
    z3 = (INT32) wsptr[5];
2310
    z4 = (INT32) wsptr[7];
2311
    /* Here z4 itself is scaled (pass 1 kept it unscaled and used tmp13). */
    z4 <<= CONST_BITS;
2312
2313
    tmp14 = z1 + z3;
2314
    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
2315
    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
2316
    tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2317
    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
2318
    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
2319
    z1    -= z2;
2320
    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
2321
    tmp16 += tmp15;
2322
    /* tmp13 is scratch until its final assignment below. */
    tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
2323
    tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
2324
    tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
2325
    tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
2326
    tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2327
    tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
2328
2329
    /* z1 holds (ws1 - ws3); adding the pre-scaled z4 gives the same
     * (in1 - in3 - in5 + in7) combination that pass 1 forms for tmp13.
     */
    tmp13 = ((z1 - z3) << CONST_BITS) + z4;
2330
2331
    /* Final output stage: each sum/difference is shifted right by
     * CONST_BITS+PASS1_BITS+3, masked with RANGE_MASK and mapped through
     * the range_limit table to produce the stored sample.
     */
2332
2333
    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2334
                 CONST_BITS+PASS1_BITS+3)
2335
           & RANGE_MASK];
2336
    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2337
                 CONST_BITS+PASS1_BITS+3)
2338
           & RANGE_MASK];
2339
    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2340
                 CONST_BITS+PASS1_BITS+3)
2341
           & RANGE_MASK];
2342
    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2343
                 CONST_BITS+PASS1_BITS+3)
2344
           & RANGE_MASK];
2345
    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2346
                 CONST_BITS+PASS1_BITS+3)
2347
           & RANGE_MASK];
2348
    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2349
                 CONST_BITS+PASS1_BITS+3)
2350
           & RANGE_MASK];
2351
    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2352
                 CONST_BITS+PASS1_BITS+3)
2353
           & RANGE_MASK];
2354
    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2355
                 CONST_BITS+PASS1_BITS+3)
2356
           & RANGE_MASK];
2357
    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2358
                 CONST_BITS+PASS1_BITS+3)
2359
           & RANGE_MASK];
2360
    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2361
                 CONST_BITS+PASS1_BITS+3)
2362
           & RANGE_MASK];
2363
    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2364
                 CONST_BITS+PASS1_BITS+3)
2365
           & RANGE_MASK];
2366
    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2367
                 CONST_BITS+PASS1_BITS+3)
2368
           & RANGE_MASK];
2369
    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
2370
                 CONST_BITS+PASS1_BITS+3)
2371
           & RANGE_MASK];
2372
    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
2373
                 CONST_BITS+PASS1_BITS+3)
2374
           & RANGE_MASK];
2375
2376
    wsptr += 8;   /* advance pointer to next row */
2377
  }
2378
}
2379
2380
2381
/*
2382
 * Perform dequantization and inverse DCT on one block of coefficients,
2383
 * producing a 15x15 output block.
2384
 *
2385
 * Optimized algorithm with 22 multiplications in the 1-D kernel.
2386
 * cK represents sqrt(2) * cos(K*pi/30).
 *
 * Arguments, as used by the code below:
 *   cinfo      - handed to IDCT_range_limit() to obtain the sample
 *                range-limit lookup table.
 *   compptr    - compptr->dct_table supplies the multipliers passed to
 *                DEQUANTIZE together with each input coefficient.
 *   coef_block - input coefficients; eight columns are read, each at
 *                offsets DCTSIZE*0 .. DCTSIZE*7.
 *   output_buf - array of output row pointers; rows 0..14 are written.
 *   output_col - offset added to every row pointer; samples 0..14 of
 *                each row are stored starting there.
 *
 * Pass 1 runs the 15-point kernel on each of the 8 input columns and
 * stores 15 intermediate rows of 8 values; pass 2 runs the same kernel
 * on each of those 15 rows and emits 15 range-limited samples per row.
2387
 */
2388
2389
GLOBAL(void)
2390
jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2391
     JCOEFPTR coef_block,
2392
     JSAMPARRAY output_buf, JDIMENSION output_col)
2393
{
2394
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2395
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2396
  INT32 z1, z2, z3, z4;
2397
  JCOEFPTR inptr;
2398
  ISLOW_MULT_TYPE * quantptr;
2399
  int * wsptr;
2400
  JSAMPROW outptr;
2401
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2402
  int ctr;
2403
  int workspace[8*15];  /* buffers data between passes: 15 rows of 8 ints */
2404
  SHIFT_TEMPS
2405
2406
  /* Pass 1: process columns from input, store into work array. */
2407
2408
  inptr = coef_block;
2409
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2410
  wsptr = workspace;
2411
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {  /* one input column per iteration */
2412
    /* Even part */
2413
2414
    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2415
    if (ctr == 0)  /* first column only: this is coef_block[0], the block's DC term */
2416
      CLAMP_DC(z1);  /* NOTE(review): macro defined outside this view; presumably bounds the DC value -- confirm */
2417
    z1 <<= CONST_BITS;
2418
    /* Add fudge factor here for final descale. */
2419
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
2420
2421
    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2422
    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2423
    z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2424
2425
    tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2426
    tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2427
2428
    tmp12 = z1 - tmp10;
2429
    tmp13 = z1 + tmp11;
2430
    z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
2431
2432
    z4 = z2 - z3;
2433
    z3 += z2;
    /* From here z3 holds the sum and z4 the difference of inputs 2 and 4;
     * tmp10/tmp11 are scratch and are overwritten three times below.
     */
2434
    tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2435
    tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2436
    z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
2437
2438
    tmp20 = tmp13 + tmp10 + tmp11;
2439
    tmp23 = tmp12 - tmp10 + tmp11 + z2;
2440
2441
    tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2442
    tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2443
2444
    tmp25 = tmp13 - tmp10 - tmp11;
2445
    tmp26 = tmp12 + tmp10 - tmp11 - z2;
2446
2447
    tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2448
    tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2449
2450
    tmp21 = tmp12 + tmp10 + tmp11;
2451
    tmp24 = tmp13 - tmp10 + tmp11;
2452
    tmp11 += tmp11;  /* double tmp11 for the two results below */
2453
    tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
2454
    tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
2455
2456
    /* Odd part */
2457
2458
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2459
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2460
    z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);  /* temporary: scaled into z3 on the next line */
2461
    z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
2462
    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);  /* z4 now holds input 7 */
2463
2464
    tmp13 = z2 - z4;
2465
    tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
2466
    tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
2467
    tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
2468
2469
    tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
2470
    tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
2471
    z2 = z1 - z4;
2472
    tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
2473
2474
    tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2475
    tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2476
    tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
2477
    z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
2478
    tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
2479
    tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
2480
2481
    /* Final output stage */
    /* Row k takes even+odd and row 14-k takes even-odd; the workspace
     * row stride is 8 ints.  Results are descaled by CONST_BITS-PASS1_BITS.
     */
2482
2483
    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2484
    wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2485
    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2486
    wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2487
    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2488
    wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2489
    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
2490
    wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
2491
    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2492
    wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2493
    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2494
    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2495
    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
2496
    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
2497
    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);  /* middle row: even part only */
2498
  }
2499
2500
  /* Pass 2: process 15 rows from work array, store into output array. */
2501
2502
  wsptr = workspace;
2503
  for (ctr = 0; ctr < 15; ctr++) {  /* one workspace row -> one output row */
2504
    outptr = output_buf[ctr] + output_col;
2505
2506
    /* Even part */
2507
2508
    /* Add range center and fudge factor for final descale and range-limit. */
2509
    z1 = (INT32) wsptr[0] +
2510
     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
2511
      (ONE << (PASS1_BITS+2)));
2512
    z1 <<= CONST_BITS;
2513
2514
    z2 = (INT32) wsptr[2];
2515
    z3 = (INT32) wsptr[4];
2516
    z4 = (INT32) wsptr[6];
2517
2518
    tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2519
    tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2520
2521
    tmp12 = z1 - tmp10;
2522
    tmp13 = z1 + tmp11;
2523
    z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
2524
2525
    z4 = z2 - z3;
2526
    z3 += z2;
2527
    tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2528
    tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2529
    z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
2530
2531
    tmp20 = tmp13 + tmp10 + tmp11;
2532
    tmp23 = tmp12 - tmp10 + tmp11 + z2;
2533
2534
    tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2535
    tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2536
2537
    tmp25 = tmp13 - tmp10 - tmp11;
2538
    tmp26 = tmp12 + tmp10 - tmp11 - z2;
2539
2540
    tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2541
    tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2542
2543
    tmp21 = tmp12 + tmp10 + tmp11;
2544
    tmp24 = tmp13 - tmp10 + tmp11;
2545
    tmp11 += tmp11;  /* double tmp11 for the two results below */
2546
    tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
2547
    tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
2548
2549
    /* Odd part */
2550
2551
    z1 = (INT32) wsptr[1];
2552
    z2 = (INT32) wsptr[3];
2553
    z4 = (INT32) wsptr[5];  /* temporary: scaled into z3 on the next line */
2554
    z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
2555
    z4 = (INT32) wsptr[7];  /* z4 now holds input 7 */
2556
2557
    tmp13 = z2 - z4;
2558
    tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
2559
    tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
2560
    tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
2561
2562
    tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
2563
    tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
2564
    z2 = z1 - z4;
2565
    tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
2566
2567
    tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2568
    tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2569
    tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
2570
    z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
2571
    tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
2572
    tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
2573
2574
    /* Final output stage */
    /* Each sum is descaled by CONST_BITS+PASS1_BITS+3, masked with
     * RANGE_MASK and used as an index into range_limit[]; sample k takes
     * even+odd and sample 14-k takes even-odd.
     */
2575
2576
    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2577
                 CONST_BITS+PASS1_BITS+3)
2578
           & RANGE_MASK];
2579
    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2580
                 CONST_BITS+PASS1_BITS+3)
2581
           & RANGE_MASK];
2582
    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2583
                 CONST_BITS+PASS1_BITS+3)
2584
           & RANGE_MASK];
2585
    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2586
                 CONST_BITS+PASS1_BITS+3)
2587
           & RANGE_MASK];
2588
    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2589
                 CONST_BITS+PASS1_BITS+3)
2590
           & RANGE_MASK];
2591
    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2592
                 CONST_BITS+PASS1_BITS+3)
2593
           & RANGE_MASK];
2594
    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2595
                 CONST_BITS+PASS1_BITS+3)
2596
           & RANGE_MASK];
2597
    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2598
                 CONST_BITS+PASS1_BITS+3)
2599
           & RANGE_MASK];
2600
    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2601
                 CONST_BITS+PASS1_BITS+3)
2602
           & RANGE_MASK];
2603
    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2604
                 CONST_BITS+PASS1_BITS+3)
2605
           & RANGE_MASK];
2606
    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2607
                 CONST_BITS+PASS1_BITS+3)
2608
           & RANGE_MASK];
2609
    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2610
                 CONST_BITS+PASS1_BITS+3)
2611
           & RANGE_MASK];
2612
    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
2613
                 CONST_BITS+PASS1_BITS+3)
2614
           & RANGE_MASK];
2615
    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
2616
                 CONST_BITS+PASS1_BITS+3)
2617
           & RANGE_MASK];
2618
    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,  /* middle sample: even part only */
2619
                 CONST_BITS+PASS1_BITS+3)
2620
           & RANGE_MASK];
2621
2622
    wsptr += 8;   /* advance pointer to next row */
2623
  }
2624
}
2625
2626
2627
/*
2628
 * Perform dequantization and inverse DCT on one block of coefficients,
2629
 * producing a 16x16 output block.
2630
 *
2631
 * Optimized algorithm with 28 multiplications in the 1-D kernel.
2632
 * cK represents sqrt(2) * cos(K*pi/32).
 *
 * Arguments, as used by the code below:
 *   cinfo      - handed to IDCT_range_limit() to obtain the sample
 *                range-limit lookup table.
 *   compptr    - compptr->dct_table supplies the multipliers passed to
 *                DEQUANTIZE together with each input coefficient.
 *   coef_block - input coefficients; eight columns are read, each at
 *                offsets DCTSIZE*0 .. DCTSIZE*7.
 *   output_buf - array of output row pointers; rows 0..15 are written.
 *   output_col - offset added to every row pointer; samples 0..15 of
 *                each row are stored starting there.
 *
 * Pass 1 runs the 16-point kernel on each of the 8 input columns and
 * stores 16 intermediate rows of 8 values; pass 2 runs the same kernel
 * on each of those 16 rows and emits 16 range-limited samples per row.
2633
 */
2634
2635
GLOBAL(void)
2636
jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2637
     JCOEFPTR coef_block,
2638
     JSAMPARRAY output_buf, JDIMENSION output_col)
2639
{
2640
  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
2641
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2642
  INT32 z1, z2, z3, z4;
2643
  JCOEFPTR inptr;
2644
  ISLOW_MULT_TYPE * quantptr;
2645
  int * wsptr;
2646
  JSAMPROW outptr;
2647
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2648
  int ctr;
2649
  int workspace[8*16];  /* buffers data between passes: 16 rows of 8 ints */
2650
  SHIFT_TEMPS
2651
2652
  /* Pass 1: process columns from input, store into work array. */
2653
2654
  inptr = coef_block;
2655
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2656
  wsptr = workspace;
2657
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {  /* one input column per iteration */
2658
    /* Even part */
2659
2660
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2661
    if (ctr == 0)  /* first column only: this is coef_block[0], the block's DC term */
2662
      CLAMP_DC(tmp0);  /* NOTE(review): macro defined outside this view; presumably bounds the DC value -- confirm */
2663
    tmp0 <<= CONST_BITS;
2664
    /* Add fudge factor here for final descale. */
2665
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
2666
2667
    z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2668
    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2669
    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2670
2671
    tmp10 = tmp0 + tmp1;
2672
    tmp11 = tmp0 - tmp1;
2673
    tmp12 = tmp0 + tmp2;
2674
    tmp13 = tmp0 - tmp2;
    /* tmp0..tmp2 are free again from here and are reused below. */
2675
2676
    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2677
    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2678
    z3 = z1 - z2;
2679
    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2680
    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2681
2682
    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2683
    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2684
    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2685
    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2686
    /* tmp20..tmp27 are the eight even-part results used by the output stage. */
2687
    tmp20 = tmp10 + tmp0;
2688
    tmp27 = tmp10 - tmp0;
2689
    tmp21 = tmp12 + tmp1;
2690
    tmp26 = tmp12 - tmp1;
2691
    tmp22 = tmp13 + tmp2;
2692
    tmp25 = tmp13 - tmp2;
2693
    tmp23 = tmp11 + tmp3;
2694
    tmp24 = tmp11 - tmp3;
2695
2696
    /* Odd part */
    /* Produces tmp0..tmp3 and tmp10..tmp13, overwriting the even-part
     * scratch values of the same names.
     */
2697
2698
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2699
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2700
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2701
    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2702
2703
    tmp11 = z1 + z3;  /* shared operand of the c5 and c11 products below */
2704
2705
    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
2706
    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
2707
    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
2708
    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
2709
    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
2710
    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
2711
    tmp0  = tmp1 + tmp2 + tmp3 -
2712
      MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
2713
    tmp13 = tmp10 + tmp11 + tmp12 -
2714
      MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
    /* Input 1 is not needed past this point; z1 and z2 become scratch. */
2715
    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
2716
    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
2717
    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
2718
    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
2719
    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
2720
    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
2721
    z2    += z4;
2722
    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
2723
    tmp1  += z1;
2724
    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
2725
    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
2726
    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
2727
    tmp12 += z2;
2728
    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
2729
    tmp2  += z2;
2730
    tmp3  += z2;
2731
    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
2732
    tmp10 += z2;
2733
    tmp11 += z2;
2734
2735
    /* Final output stage */
    /* Row k takes even+odd and row 15-k takes even-odd; the workspace
     * row stride is 8 ints.  Results are descaled by CONST_BITS-PASS1_BITS.
     */
2736
2737
    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
2738
    wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
2739
    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
2740
    wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
2741
    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
2742
    wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
2743
    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
2744
    wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
2745
    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
2746
    wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
2747
    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
2748
    wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
2749
    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
2750
    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
2751
    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
2752
    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
2753
  }
2754
2755
  /* Pass 2: process 16 rows from work array, store into output array. */
2756
2757
  wsptr = workspace;
2758
  for (ctr = 0; ctr < 16; ctr++) {  /* one workspace row -> one output row */
2759
    outptr = output_buf[ctr] + output_col;
2760
2761
    /* Even part */
2762
2763
    /* Add range center and fudge factor for final descale and range-limit. */
2764
    tmp0 = (INT32) wsptr[0] +
2765
       ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
2766
        (ONE << (PASS1_BITS+2)));
2767
    tmp0 <<= CONST_BITS;
2768
2769
    z1 = (INT32) wsptr[4];
2770
    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2771
    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2772
2773
    tmp10 = tmp0 + tmp1;
2774
    tmp11 = tmp0 - tmp1;
2775
    tmp12 = tmp0 + tmp2;
2776
    tmp13 = tmp0 - tmp2;
2777
2778
    z1 = (INT32) wsptr[2];
2779
    z2 = (INT32) wsptr[6];
2780
    z3 = z1 - z2;
2781
    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2782
    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2783
2784
    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2785
    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2786
    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2787
    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2788
2789
    tmp20 = tmp10 + tmp0;
2790
    tmp27 = tmp10 - tmp0;
2791
    tmp21 = tmp12 + tmp1;
2792
    tmp26 = tmp12 - tmp1;
2793
    tmp22 = tmp13 + tmp2;
2794
    tmp25 = tmp13 - tmp2;
2795
    tmp23 = tmp11 + tmp3;
2796
    tmp24 = tmp11 - tmp3;
2797
2798
    /* Odd part */
2799
2800
    z1 = (INT32) wsptr[1];
2801
    z2 = (INT32) wsptr[3];
2802
    z3 = (INT32) wsptr[5];
2803
    z4 = (INT32) wsptr[7];
2804
2805
    tmp11 = z1 + z3;  /* shared operand of the c5 and c11 products below */
2806
2807
    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
2808
    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
2809
    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
2810
    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
2811
    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
2812
    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
2813
    tmp0  = tmp1 + tmp2 + tmp3 -
2814
      MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
2815
    tmp13 = tmp10 + tmp11 + tmp12 -
2816
      MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
2817
    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
2818
    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
2819
    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
2820
    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
2821
    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
2822
    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
2823
    z2    += z4;
2824
    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
2825
    tmp1  += z1;
2826
    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
2827
    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
2828
    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
2829
    tmp12 += z2;
2830
    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
2831
    tmp2  += z2;
2832
    tmp3  += z2;
2833
    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
2834
    tmp10 += z2;
2835
    tmp11 += z2;
2836
2837
    /* Final output stage */
    /* Each sum is descaled by CONST_BITS+PASS1_BITS+3, masked with
     * RANGE_MASK and used as an index into range_limit[]; sample k takes
     * even+odd and sample 15-k takes even-odd.
     */
2838
2839
    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
2840
                 CONST_BITS+PASS1_BITS+3)
2841
           & RANGE_MASK];
2842
    outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
2843
                 CONST_BITS+PASS1_BITS+3)
2844
           & RANGE_MASK];
2845
    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
2846
                 CONST_BITS+PASS1_BITS+3)
2847
           & RANGE_MASK];
2848
    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
2849
                 CONST_BITS+PASS1_BITS+3)
2850
           & RANGE_MASK];
2851
    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
2852
                 CONST_BITS+PASS1_BITS+3)
2853
           & RANGE_MASK];
2854
    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
2855
                 CONST_BITS+PASS1_BITS+3)
2856
           & RANGE_MASK];
2857
    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
2858
                 CONST_BITS+PASS1_BITS+3)
2859
           & RANGE_MASK];
2860
    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
2861
                 CONST_BITS+PASS1_BITS+3)
2862
           & RANGE_MASK];
2863
    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
2864
                 CONST_BITS+PASS1_BITS+3)
2865
           & RANGE_MASK];
2866
    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
2867
                 CONST_BITS+PASS1_BITS+3)
2868
           & RANGE_MASK];
2869
    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
2870
                 CONST_BITS+PASS1_BITS+3)
2871
           & RANGE_MASK];
2872
    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
2873
                 CONST_BITS+PASS1_BITS+3)
2874
           & RANGE_MASK];
2875
    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
2876
                 CONST_BITS+PASS1_BITS+3)
2877
           & RANGE_MASK];
2878
    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
2879
                 CONST_BITS+PASS1_BITS+3)
2880
           & RANGE_MASK];
2881
    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
2882
                 CONST_BITS+PASS1_BITS+3)
2883
           & RANGE_MASK];
2884
    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
2885
                 CONST_BITS+PASS1_BITS+3)
2886
           & RANGE_MASK];
2887
2888
    wsptr += 8;   /* advance pointer to next row */
2889
  }
2890
}
2891
2892
2893
/*
2894
 * Perform dequantization and inverse DCT on one block of coefficients,
2895
 * producing a 16x8 output block.
2896
 *
2897
 * 8-point IDCT in pass 1 (columns), 16-point in pass 2 (rows).
2898
 */
2899
2900
GLOBAL(void)
2901
jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2902
    JCOEFPTR coef_block,
2903
    JSAMPARRAY output_buf, JDIMENSION output_col)
2904
{
2905
  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
2906
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2907
  INT32 z1, z2, z3, z4;
2908
  JCOEFPTR inptr;
2909
  ISLOW_MULT_TYPE * quantptr;
2910
  int * wsptr;
2911
  JSAMPROW outptr;
2912
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2913
  int ctr;
2914
  int workspace[8*8]; /* buffers data between passes */
2915
  SHIFT_TEMPS
2916
2917
  /* Pass 1: process columns from input, store into work array.
2918
   * Note results are scaled up by sqrt(8) compared to a true IDCT;
2919
   * furthermore, we scale the results by 2**PASS1_BITS.
2920
   * 8-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
2921
   */
2922
2923
  inptr = coef_block;
2924
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2925
  wsptr = workspace;
2926
  for (ctr = DCTSIZE; ctr > 0; ctr--) {
2927
    /* Due to quantization, we will usually find that many of the input
2928
     * coefficients are zero, especially the AC terms.  We can exploit this
2929
     * by short-circuiting the IDCT calculation for any column in which all
2930
     * the AC terms are zero.  In that case each output is equal to the
2931
     * DC coefficient (with scale factor as needed).
2932
     * With typical images and quantization tables, half or more of the
2933
     * column DCT calculations can be simplified this way.
2934
     */
2935
2936
    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
2937
  inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
2938
  inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
2939
  inptr[DCTSIZE*7] == 0) {
2940
      /* AC terms all zero */
2941
      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2942
      if (ctr == DCTSIZE)
2943
        CLAMP_DC(dcval);
2944
      dcval <<= PASS1_BITS;
2945
2946
      wsptr[DCTSIZE*0] = dcval;
2947
      wsptr[DCTSIZE*1] = dcval;
2948
      wsptr[DCTSIZE*2] = dcval;
2949
      wsptr[DCTSIZE*3] = dcval;
2950
      wsptr[DCTSIZE*4] = dcval;
2951
      wsptr[DCTSIZE*5] = dcval;
2952
      wsptr[DCTSIZE*6] = dcval;
2953
      wsptr[DCTSIZE*7] = dcval;
2954
2955
      inptr++;      /* advance pointers to next column */
2956
      quantptr++;
2957
      wsptr++;
2958
      continue;
2959
    }
2960
2961
    /* Even part: reverse the even part of the forward DCT.
2962
     * The rotator is c(-6).
2963
     */
2964
2965
    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2966
    if (ctr == DCTSIZE)
2967
      CLAMP_DC(z2);
2968
    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2969
    z2 <<= CONST_BITS;
2970
    z3 <<= CONST_BITS;
2971
    /* Add fudge factor here for final descale. */
2972
    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
2973
2974
    tmp0 = z2 + z3;
2975
    tmp1 = z2 - z3;
2976
2977
    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2978
    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2979
2980
    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);       /* c6 */
2981
    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);     /* c2-c6 */
2982
    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);     /* c2+c6 */
2983
2984
    tmp10 = tmp0 + tmp2;
2985
    tmp13 = tmp0 - tmp2;
2986
    tmp11 = tmp1 + tmp3;
2987
    tmp12 = tmp1 - tmp3;
2988
2989
    /* Odd part per figure 8; the matrix is unitary and hence its
2990
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
2991
     */
2992
2993
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2994
    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2995
    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2996
    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2997
2998
    z2 = tmp0 + tmp2;
2999
    z3 = tmp1 + tmp3;
3000
3001
    z1 = MULTIPLY(z2 + z3, FIX_1_175875602);       /*  c3 */
3002
    z2 = MULTIPLY(z2, - FIX_1_961570560);          /* -c3-c5 */
3003
    z3 = MULTIPLY(z3, - FIX_0_390180644);          /* -c3+c5 */
3004
    z2 += z1;
3005
    z3 += z1;
3006
3007
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
3008
    tmp0 = MULTIPLY(tmp0, FIX_0_298631336);        /* -c1+c3+c5-c7 */
3009
    tmp3 = MULTIPLY(tmp3, FIX_1_501321110);        /*  c1+c3-c5-c7 */
3010
    tmp0 += z1 + z2;
3011
    tmp3 += z1 + z3;
3012
3013
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
3014
    tmp1 = MULTIPLY(tmp1, FIX_2_053119869);        /*  c1+c3-c5+c7 */
3015
    tmp2 = MULTIPLY(tmp2, FIX_3_072711026);        /*  c1+c3+c5-c7 */
3016
    tmp1 += z1 + z3;
3017
    tmp2 += z1 + z2;
3018
3019
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
3020
3021
    wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
3022
    wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
3023
    wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
3024
    wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
3025
    wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
3026
    wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
3027
    wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
3028
    wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
3029
3030
    inptr++;      /* advance pointers to next column */
3031
    quantptr++;
3032
    wsptr++;
3033
  }
3034
3035
  /* Pass 2: process 8 rows from work array, store into output array.
3036
   * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
3037
   */
3038
3039
  wsptr = workspace;
3040
  for (ctr = 0; ctr < 8; ctr++) {
3041
    outptr = output_buf[ctr] + output_col;
3042
3043
    /* Even part */
3044
3045
    /* Add range center and fudge factor for final descale and range-limit. */
3046
    tmp0 = (INT32) wsptr[0] +
3047
       ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
3048
        (ONE << (PASS1_BITS+2)));
3049
    tmp0 <<= CONST_BITS;
3050
3051
    z1 = (INT32) wsptr[4];
3052
    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
3053
    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
3054
3055
    tmp10 = tmp0 + tmp1;
3056
    tmp11 = tmp0 - tmp1;
3057
    tmp12 = tmp0 + tmp2;
3058
    tmp13 = tmp0 - tmp2;
3059
3060
    z1 = (INT32) wsptr[2];
3061
    z2 = (INT32) wsptr[6];
3062
    z3 = z1 - z2;
3063
    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
3064
    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
3065
3066
    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
3067
    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
3068
    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
3069
    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
3070
3071
    tmp20 = tmp10 + tmp0;
3072
    tmp27 = tmp10 - tmp0;
3073
    tmp21 = tmp12 + tmp1;
3074
    tmp26 = tmp12 - tmp1;
3075
    tmp22 = tmp13 + tmp2;
3076
    tmp25 = tmp13 - tmp2;
3077
    tmp23 = tmp11 + tmp3;
3078
    tmp24 = tmp11 - tmp3;
3079
3080
    /* Odd part */
3081
3082
    z1 = (INT32) wsptr[1];
3083
    z2 = (INT32) wsptr[3];
3084
    z3 = (INT32) wsptr[5];
3085
    z4 = (INT32) wsptr[7];
3086
3087
    tmp11 = z1 + z3;
3088
3089
    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
3090
    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
3091
    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
3092
    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
3093
    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
3094
    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
3095
    tmp0  = tmp1 + tmp2 + tmp3 -
3096
      MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
3097
    tmp13 = tmp10 + tmp11 + tmp12 -
3098
      MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
3099
    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
3100
    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
3101
    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
3102
    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
3103
    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
3104
    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
3105
    z2    += z4;
3106
    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
3107
    tmp1  += z1;
3108
    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
3109
    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
3110
    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
3111
    tmp12 += z2;
3112
    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
3113
    tmp2  += z2;
3114
    tmp3  += z2;
3115
    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
3116
    tmp10 += z2;
3117
    tmp11 += z2;
3118
3119
    /* Final output stage */
3120
3121
    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
3122
                 CONST_BITS+PASS1_BITS+3)
3123
           & RANGE_MASK];
3124
    outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
3125
                 CONST_BITS+PASS1_BITS+3)
3126
           & RANGE_MASK];
3127
    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
3128
                 CONST_BITS+PASS1_BITS+3)
3129
           & RANGE_MASK];
3130
    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
3131
                 CONST_BITS+PASS1_BITS+3)
3132
           & RANGE_MASK];
3133
    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
3134
                 CONST_BITS+PASS1_BITS+3)
3135
           & RANGE_MASK];
3136
    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
3137
                 CONST_BITS+PASS1_BITS+3)
3138
           & RANGE_MASK];
3139
    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
3140
                 CONST_BITS+PASS1_BITS+3)
3141
           & RANGE_MASK];
3142
    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
3143
                 CONST_BITS+PASS1_BITS+3)
3144
           & RANGE_MASK];
3145
    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
3146
                 CONST_BITS+PASS1_BITS+3)
3147
           & RANGE_MASK];
3148
    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
3149
                 CONST_BITS+PASS1_BITS+3)
3150
           & RANGE_MASK];
3151
    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
3152
                 CONST_BITS+PASS1_BITS+3)
3153
           & RANGE_MASK];
3154
    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
3155
                 CONST_BITS+PASS1_BITS+3)
3156
           & RANGE_MASK];
3157
    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
3158
                 CONST_BITS+PASS1_BITS+3)
3159
           & RANGE_MASK];
3160
    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
3161
                 CONST_BITS+PASS1_BITS+3)
3162
           & RANGE_MASK];
3163
    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
3164
                 CONST_BITS+PASS1_BITS+3)
3165
           & RANGE_MASK];
3166
    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
3167
                 CONST_BITS+PASS1_BITS+3)
3168
           & RANGE_MASK];
3169
3170
    wsptr += 8;   /* advance pointer to next row */
3171
  }
3172
}
3173
3174
3175
/*
3176
 * Perform dequantization and inverse DCT on one block of coefficients,
3177
 * producing a 14x7 output block.
3178
 *
3179
 * 7-point IDCT in pass 1 (columns), 14-point in pass 2 (rows).
3180
 *
 * cinfo supplies the range-limit table (via IDCT_range_limit), compptr
 * supplies the multiplier table (dct_table) consumed by DEQUANTIZE, and
 * coef_block is the input coefficient block.  Seven rows of 14 samples
 * each are written to output_buf[0..6], starting at column output_col.
 * Only input rows 0..6 of each of the 8 input columns are read.
 */
3181
3182
GLOBAL(void)
3183
jpeg_idct_14x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3184
    JCOEFPTR coef_block,
3185
    JSAMPARRAY output_buf, JDIMENSION output_col)
3186
{
3187
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3188
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
3189
  INT32 z1, z2, z3, z4;
3190
  JCOEFPTR inptr;
3191
  ISLOW_MULT_TYPE * quantptr;
3192
  int * wsptr;
3193
  JSAMPROW outptr;
3194
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3195
  int ctr;
3196
  /* 7 rows of 8 entries: pass 1 fills one column per iteration (row
   * stride 8), pass 2 consumes one row of 8 entries per iteration.
   */
  int workspace[8*7]; /* buffers data between passes */
3197
  SHIFT_TEMPS
3198
3199
  /* Pass 1: process columns from input, store into work array.
3200
   * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
3201
   */
3202
3203
  inptr = coef_block;
3204
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3205
  wsptr = workspace;
3206
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3207
    /* Even part */
3208
3209
    tmp23 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3210
    /* Only the first column's row-0 entry is coef_block[0].  CLAMP_DC is
     * defined elsewhere; presumably it bounds that value -- verify
     * against its definition.
     */
    if (ctr == 0)
3211
      CLAMP_DC(tmp23);
3212
    tmp23 <<= CONST_BITS;
3213
    /* Add fudge factor here for final descale. */
3214
    tmp23 += ONE << (CONST_BITS-PASS1_BITS-1);
3215
3216
    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3217
    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3218
    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
3219
3220
    tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
3221
    tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
3222
    tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
3223
    /* tmp10 is used here as scratch for z1 + z3; the odd part below
     * overwrites it.
     */
    tmp10 = z1 + z3;
3224
    z2 -= tmp10;
3225
    tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
3226
    tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
3227
    tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
3228
    tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
3229
3230
    /* Odd part */
3231
3232
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3233
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3234
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
3235
3236
    tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
3237
    tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
3238
    tmp10 = tmp11 - tmp12;
3239
    tmp11 += tmp12;
3240
    tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
3241
    tmp11 += tmp12;
3242
    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
3243
    tmp10 += z2;
3244
    tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
3245
3246
    /* Final output stage */
3247
    /* Rows k and 6-k are the sum and difference of even term tmp2k and
     * odd term tmp1k; the middle row 3 takes the even term tmp23 alone.
     */
3248
    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
3249
    wsptr[8*6] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
3250
    wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
3251
    wsptr[8*5] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
3252
    wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
3253
    wsptr[8*4] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
3254
    wsptr[8*3] = (int) RIGHT_SHIFT(tmp23, CONST_BITS-PASS1_BITS);
3255
  }
3256
3257
  /* Pass 2: process 7 rows from work array, store into output array.
3258
   * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
3259
   */
3260
3261
  wsptr = workspace;
3262
  for (ctr = 0; ctr < 7; ctr++) {
3263
    outptr = output_buf[ctr] + output_col;
3264
3265
    /* Even part */
3266
3267
    /* Add range center and fudge factor for final descale and range-limit. */
3268
    z1 = (INT32) wsptr[0] +
3269
     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
3270
      (ONE << (PASS1_BITS+2)));
3271
    z1 <<= CONST_BITS;
3272
    z4 = (INT32) wsptr[4];
3273
    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
3274
    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
3275
    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
3276
3277
    tmp10 = z1 + z2;
3278
    tmp11 = z1 + z3;
3279
    tmp12 = z1 - z4;
3280
3281
    tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
3282
3283
    z1 = (INT32) wsptr[2];
3284
    z2 = (INT32) wsptr[6];
3285
3286
    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
3287
3288
    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
3289
    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
3290
    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
3291
      MULTIPLY(z2, FIX(1.378756276));      /* c2 */
3292
3293
    tmp20 = tmp10 + tmp13;
3294
    tmp26 = tmp10 - tmp13;
3295
    tmp21 = tmp11 + tmp14;
3296
    tmp25 = tmp11 - tmp14;
3297
    tmp22 = tmp12 + tmp15;
3298
    tmp24 = tmp12 - tmp15;
3299
3300
    /* Odd part */
3301
3302
    z1 = (INT32) wsptr[1];
3303
    z2 = (INT32) wsptr[3];
3304
    z3 = (INT32) wsptr[5];
3305
    z4 = (INT32) wsptr[7];
3306
    /* z4 is pre-shifted so it can be added directly to the MULTIPLY
     * products below (assuming FIX() constants carry CONST_BITS
     * fractional bits -- FIX is defined elsewhere).
     */
    z4 <<= CONST_BITS;
3307
3308
    tmp14 = z1 + z3;
3309
    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
3310
    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
3311
    tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
3312
    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
3313
    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
3314
    /* From here on z1 holds wsptr[1] - wsptr[3]; tmp15 and the final
     * tmp13 below both rely on this reduced value.
     */
    z1    -= z2;
3315
    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
3316
    tmp16 += tmp15;
3317
    tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
3318
    tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
3319
    tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
3320
    tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
3321
    tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
3322
    tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
3323
3324
    /* No multiplication needed for this term: z1 is already
     * wsptr[1] - wsptr[3] and z4 is wsptr[7] << CONST_BITS.
     */
    tmp13 = ((z1 - z3) << CONST_BITS) + z4;
3325
3326
    /* Final output stage */
3327
    /* Outputs k and 13-k are the sum and difference of even term tmp2k
     * and odd term tmp1k, descaled and mapped through range_limit.
     */
3328
    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3329
                 CONST_BITS+PASS1_BITS+3)
3330
           & RANGE_MASK];
3331
    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3332
                 CONST_BITS+PASS1_BITS+3)
3333
           & RANGE_MASK];
3334
    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3335
                 CONST_BITS+PASS1_BITS+3)
3336
           & RANGE_MASK];
3337
    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3338
                 CONST_BITS+PASS1_BITS+3)
3339
           & RANGE_MASK];
3340
    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3341
                 CONST_BITS+PASS1_BITS+3)
3342
           & RANGE_MASK];
3343
    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3344
                 CONST_BITS+PASS1_BITS+3)
3345
           & RANGE_MASK];
3346
    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3347
                 CONST_BITS+PASS1_BITS+3)
3348
           & RANGE_MASK];
3349
    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3350
                 CONST_BITS+PASS1_BITS+3)
3351
           & RANGE_MASK];
3352
    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3353
                 CONST_BITS+PASS1_BITS+3)
3354
           & RANGE_MASK];
3355
    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3356
                 CONST_BITS+PASS1_BITS+3)
3357
           & RANGE_MASK];
3358
    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
3359
                 CONST_BITS+PASS1_BITS+3)
3360
           & RANGE_MASK];
3361
    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
3362
                 CONST_BITS+PASS1_BITS+3)
3363
           & RANGE_MASK];
3364
    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
3365
                 CONST_BITS+PASS1_BITS+3)
3366
           & RANGE_MASK];
3367
    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
3368
                 CONST_BITS+PASS1_BITS+3)
3369
           & RANGE_MASK];
3370
3371
    wsptr += 8;   /* advance pointer to next row */
3372
  }
3373
}
3374
3375
3376
/*
3377
 * Perform dequantization and inverse DCT on one block of coefficients,
3378
 * producing a 12x6 output block.
3379
 *
3380
 * 6-point IDCT in pass 1 (columns), 12-point in pass 2 (rows).
3381
 *
 * cinfo supplies the range-limit table (via IDCT_range_limit), compptr
 * supplies the multiplier table (dct_table) consumed by DEQUANTIZE, and
 * coef_block is the input coefficient block.  Six rows of 12 samples
 * each are written to output_buf[0..5], starting at column output_col.
 * Only input rows 0..5 of each of the 8 input columns are read.
 */
3382
3383
GLOBAL(void)
3384
jpeg_idct_12x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3385
    JCOEFPTR coef_block,
3386
    JSAMPARRAY output_buf, JDIMENSION output_col)
3387
{
3388
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3389
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
3390
  INT32 z1, z2, z3, z4;
3391
  JCOEFPTR inptr;
3392
  ISLOW_MULT_TYPE * quantptr;
3393
  int * wsptr;
3394
  JSAMPROW outptr;
3395
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3396
  int ctr;
3397
  /* 6 rows of 8 entries: pass 1 fills one column per iteration (row
   * stride 8), pass 2 consumes one row of 8 entries per iteration.
   */
  int workspace[8*6]; /* buffers data between passes */
3398
  SHIFT_TEMPS
3399
3400
  /* Pass 1: process columns from input, store into work array.
3401
   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3402
   */
3403
3404
  inptr = coef_block;
3405
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3406
  wsptr = workspace;
3407
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3408
    /* Even part */
3409
3410
    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3411
    /* Only the first column's row-0 entry is coef_block[0].  CLAMP_DC is
     * defined elsewhere; presumably it bounds that value -- verify
     * against its definition.
     */
    if (ctr == 0)
3412
      CLAMP_DC(tmp10);
3413
    tmp10 <<= CONST_BITS;
3414
    /* Add fudge factor here for final descale. */
3415
    tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
3416
    tmp12 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3417
    tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
3418
    tmp11 = tmp10 + tmp20;
3419
    /* tmp21 is descaled right away; rows 1 and 4 are therefore stored
     * below without a further RIGHT_SHIFT.
     */
    tmp21 = RIGHT_SHIFT(tmp10 - tmp20 - tmp20, CONST_BITS-PASS1_BITS);
3420
    /* tmp20 and tmp10 are reused as scratch from here on. */
    tmp20 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3421
    tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
3422
    tmp20 = tmp11 + tmp10;
3423
    tmp22 = tmp11 - tmp10;
3424
3425
    /* Odd part */
3426
3427
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3428
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3429
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
3430
    tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
3431
    tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
3432
    tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
3433
    /* Shifted by PASS1_BITS only, so it pairs with the already-descaled
     * tmp21 in the stores for rows 1 and 4.
     */
    tmp11 = (z1 - z2 - z3) << PASS1_BITS;
3434
3435
    /* Final output stage */
3436
3437
    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
3438
    wsptr[8*5] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
3439
    wsptr[8*1] = (int) (tmp21 + tmp11);
3440
    wsptr[8*4] = (int) (tmp21 - tmp11);
3441
    wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
3442
    wsptr[8*3] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
3443
  }
3444
3445
  /* Pass 2: process 6 rows from work array, store into output array.
3446
   * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
3447
   */
3448
3449
  wsptr = workspace;
3450
  for (ctr = 0; ctr < 6; ctr++) {
3451
    outptr = output_buf[ctr] + output_col;
3452
3453
    /* Even part */
3454
3455
    /* Add range center and fudge factor for final descale and range-limit. */
3456
    z3 = (INT32) wsptr[0] +
3457
     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
3458
      (ONE << (PASS1_BITS+2)));
3459
    z3 <<= CONST_BITS;
3460
3461
    z4 = (INT32) wsptr[4];
3462
    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
3463
3464
    tmp10 = z3 + z4;
3465
    tmp11 = z3 - z4;
3466
3467
    z1 = (INT32) wsptr[2];
3468
    /* z4 must be taken from the unshifted z1; z1 is scaled only on the
     * following line.
     */
    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
3469
    z1 <<= CONST_BITS;
3470
    z2 = (INT32) wsptr[6];
3471
    z2 <<= CONST_BITS;
3472
3473
    /* tmp12 is a scratch value, recomputed before each pair of tmp2x
     * results below.
     */
    tmp12 = z1 - z2;
3474
3475
    tmp21 = z3 + tmp12;
3476
    tmp24 = z3 - tmp12;
3477
3478
    tmp12 = z4 + z2;
3479
3480
    tmp20 = tmp10 + tmp12;
3481
    tmp25 = tmp10 - tmp12;
3482
3483
    tmp12 = z4 - z1 - z2;
3484
3485
    tmp22 = tmp11 + tmp12;
3486
    tmp23 = tmp11 - tmp12;
3487
3488
    /* Odd part */
3489
3490
    z1 = (INT32) wsptr[1];
3491
    z2 = (INT32) wsptr[3];
3492
    z3 = (INT32) wsptr[5];
3493
    z4 = (INT32) wsptr[7];
3494
3495
    /* tmp11 and tmp14 set here are intermediate values; both are
     * reassigned to their final odd-part terms at the end of this
     * section.
     */
    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
3496
    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
3497
3498
    tmp10 = z1 + z3;
3499
    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
3500
    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
3501
    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
3502
    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
3503
    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
3504
    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
3505
    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
3506
       MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
3507
3508
    /* z1, z2 and z3 are overwritten here; the remaining two odd terms
     * depend only on wsptr[1] - wsptr[7] and wsptr[3] - wsptr[5].
     */
    z1 -= z4;
3509
    z2 -= z3;
3510
    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
3511
    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
3512
    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
3513
3514
    /* Final output stage */
3515
    /* Outputs k and 11-k are the sum and difference of even term tmp2k
     * and odd term tmp1k, descaled and mapped through range_limit.
     */
3516
    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3517
                 CONST_BITS+PASS1_BITS+3)
3518
           & RANGE_MASK];
3519
    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3520
                 CONST_BITS+PASS1_BITS+3)
3521
           & RANGE_MASK];
3522
    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3523
                 CONST_BITS+PASS1_BITS+3)
3524
           & RANGE_MASK];
3525
    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3526
                 CONST_BITS+PASS1_BITS+3)
3527
           & RANGE_MASK];
3528
    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3529
                 CONST_BITS+PASS1_BITS+3)
3530
           & RANGE_MASK];
3531
    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3532
                 CONST_BITS+PASS1_BITS+3)
3533
           & RANGE_MASK];
3534
    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3535
                 CONST_BITS+PASS1_BITS+3)
3536
           & RANGE_MASK];
3537
    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3538
                 CONST_BITS+PASS1_BITS+3)
3539
           & RANGE_MASK];
3540
    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3541
                 CONST_BITS+PASS1_BITS+3)
3542
           & RANGE_MASK];
3543
    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3544
                 CONST_BITS+PASS1_BITS+3)
3545
           & RANGE_MASK];
3546
    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
3547
                 CONST_BITS+PASS1_BITS+3)
3548
           & RANGE_MASK];
3549
    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
3550
                 CONST_BITS+PASS1_BITS+3)
3551
           & RANGE_MASK];
3552
3553
    wsptr += 8;   /* advance pointer to next row */
3554
  }
3555
}
3556
3557
3558
/*
3559
 * Perform dequantization and inverse DCT on one block of coefficients,
3560
 * producing a 10x5 output block.
3561
 *
3562
 * 5-point IDCT in pass 1 (columns), 10-point in pass 2 (rows).
3563
 *
 * cinfo supplies the range-limit table (via IDCT_range_limit), compptr
 * supplies the multiplier table (dct_table) consumed by DEQUANTIZE, and
 * coef_block is the input coefficient block.  Five rows of 10 samples
 * each are written to output_buf[0..4], starting at column output_col.
 * Only input rows 0..4 of each of the 8 input columns are read.
 */
3564
3565
GLOBAL(void)
3566
jpeg_idct_10x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3567
    JCOEFPTR coef_block,
3568
    JSAMPARRAY output_buf, JDIMENSION output_col)
3569
{
3570
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
3571
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
3572
  INT32 z1, z2, z3, z4;
3573
  JCOEFPTR inptr;
3574
  ISLOW_MULT_TYPE * quantptr;
3575
  int * wsptr;
3576
  JSAMPROW outptr;
3577
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3578
  int ctr;
3579
  /* 5 rows of 8 entries: pass 1 fills one column per iteration (row
   * stride 8), pass 2 consumes one row of 8 entries per iteration.
   */
  int workspace[8*5]; /* buffers data between passes */
3580
  SHIFT_TEMPS
3581
3582
  /* Pass 1: process columns from input, store into work array.
3583
   * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
3584
   */
3585
3586
  inptr = coef_block;
3587
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3588
  wsptr = workspace;
3589
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3590
    /* Even part */
3591
3592
    tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3593
    /* Only the first column's row-0 entry is coef_block[0].  CLAMP_DC is
     * defined elsewhere; presumably it bounds that value -- verify
     * against its definition.
     */
    if (ctr == 0)
3594
      CLAMP_DC(tmp12);
3595
    tmp12 <<= CONST_BITS;
3596
    /* Add fudge factor here for final descale. */
3597
    tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
3598
    /* tmp13 and tmp14 briefly hold the row-2 and row-4 inputs; the odd
     * part below overwrites them with its results.
     */
    tmp13 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3599
    tmp14 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3600
    z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
3601
    z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
3602
    z3 = tmp12 + z2;
3603
    tmp10 = z3 + z1;
3604
    tmp11 = z3 - z1;
3605
    /* After this line tmp12 is the even term for the middle row (row 2),
     * which is stored below without any odd contribution.
     */
    tmp12 -= z2 << 2;
3606
3607
    /* Odd part */
3608
3609
    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3610
    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3611
3612
    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
3613
    tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
3614
    tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
3615
3616
    /* Final output stage */
3617
3618
    wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp13, CONST_BITS-PASS1_BITS);
3619
    wsptr[8*4] = (int) RIGHT_SHIFT(tmp10 - tmp13, CONST_BITS-PASS1_BITS);
3620
    wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp14, CONST_BITS-PASS1_BITS);
3621
    wsptr[8*3] = (int) RIGHT_SHIFT(tmp11 - tmp14, CONST_BITS-PASS1_BITS);
3622
    wsptr[8*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
3623
  }
3624
3625
  /* Pass 2: process 5 rows from work array, store into output array.
3626
   * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
3627
   */
3628
3629
  wsptr = workspace;
3630
  for (ctr = 0; ctr < 5; ctr++) {
3631
    outptr = output_buf[ctr] + output_col;
3632
3633
    /* Even part */
3634
3635
    /* Add range center and fudge factor for final descale and range-limit. */
3636
    z3 = (INT32) wsptr[0] +
3637
     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
3638
      (ONE << (PASS1_BITS+2)));
3639
    z3 <<= CONST_BITS;
3640
    z4 = (INT32) wsptr[4];
3641
    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
3642
    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
3643
    tmp10 = z3 + z1;
3644
    tmp11 = z3 - z2;
3645
3646
    tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
3647
3648
    z2 = (INT32) wsptr[2];
3649
    z3 = (INT32) wsptr[6];
3650
3651
    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
3652
    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
3653
    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
3654
3655
    tmp20 = tmp10 + tmp12;
3656
    tmp24 = tmp10 - tmp12;
3657
    tmp21 = tmp11 + tmp13;
3658
    tmp23 = tmp11 - tmp13;
3659
3660
    /* Odd part */
3661
3662
    z1 = (INT32) wsptr[1];
3663
    z2 = (INT32) wsptr[3];
3664
    z3 = (INT32) wsptr[5];
3665
    /* z3 is pre-shifted so it can be combined directly with the MULTIPLY
     * products below (assuming FIX() constants carry CONST_BITS
     * fractional bits -- FIX is defined elsewhere).
     */
    z3 <<= CONST_BITS;
3666
    z4 = (INT32) wsptr[7];
3667
3668
    /* tmp11 and tmp13 hold the sum and difference of wsptr[3] and
     * wsptr[7] until they are reassigned to final odd terms at the end
     * of this section.
     */
    tmp11 = z2 + z4;
3669
    tmp13 = z2 - z4;
3670
3671
    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
3672
3673
    /* z2 and z4 are reused as scratch from here on. */
    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
3674
    z4 = z3 + tmp12;
3675
3676
    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
3677
    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
3678
3679
    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
3680
    z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
3681
3682
    /* Must follow the z4 line above, which still needs the old tmp12;
     * tmp13 is still wsptr[3] - wsptr[7] here.
     */
    tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
3683
3684
    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
3685
    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
3686
3687
    /* Final output stage */
3688
    /* Outputs k and 9-k are the sum and difference of even term tmp2k
     * and odd term tmp1k, descaled and mapped through range_limit.
     */
3689
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3690
                CONST_BITS+PASS1_BITS+3)
3691
          & RANGE_MASK];
3692
    outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3693
                CONST_BITS+PASS1_BITS+3)
3694
          & RANGE_MASK];
3695
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3696
                CONST_BITS+PASS1_BITS+3)
3697
          & RANGE_MASK];
3698
    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3699
                CONST_BITS+PASS1_BITS+3)
3700
          & RANGE_MASK];
3701
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3702
                CONST_BITS+PASS1_BITS+3)
3703
          & RANGE_MASK];
3704
    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3705
                CONST_BITS+PASS1_BITS+3)
3706
          & RANGE_MASK];
3707
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3708
                CONST_BITS+PASS1_BITS+3)
3709
          & RANGE_MASK];
3710
    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3711
                CONST_BITS+PASS1_BITS+3)
3712
          & RANGE_MASK];
3713
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3714
                CONST_BITS+PASS1_BITS+3)
3715
          & RANGE_MASK];
3716
    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3717
                CONST_BITS+PASS1_BITS+3)
3718
          & RANGE_MASK];
3719
3720
    wsptr += 8;   /* advance pointer to next row */
3721
  }
3722
}
3723
3724
3725
/*
3726
 * Perform dequantization and inverse DCT on one block of coefficients,
3727
 * producing an 8x4 output block.
3728
 *
3729
 * 4-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
3730
 *
 * cinfo supplies the range-limit table (via IDCT_range_limit), compptr
 * supplies the multiplier table (dct_table) consumed by DEQUANTIZE, and
 * coef_block is the input coefficient block.  Four rows of 8 samples
 * each are written to output_buf[0..3], starting at column output_col.
 * Only input rows 0..3 of each of the 8 input columns are read.
 */
3731
3732
GLOBAL(void)
3733
jpeg_idct_8x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3734
         JCOEFPTR coef_block,
3735
         JSAMPARRAY output_buf, JDIMENSION output_col)
3736
{
3737
  INT32 tmp0, tmp1, tmp2, tmp3;
3738
  INT32 tmp10, tmp11, tmp12, tmp13;
3739
  INT32 z1, z2, z3;
3740
  JCOEFPTR inptr;
3741
  ISLOW_MULT_TYPE * quantptr;
3742
  int * wsptr;
3743
  JSAMPROW outptr;
3744
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3745
  int ctr;
3746
  /* 4 rows of 8 entries: pass 1 fills one column per iteration (row
   * stride 8), pass 2 consumes one row of 8 entries per iteration.
   */
  int workspace[8*4]; /* buffers data between passes */
3747
  SHIFT_TEMPS
3748
3749
  /* Pass 1: process columns from input, store into work array.
3750
   * 4-point IDCT kernel,
3751
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
3752
   */
3753
3754
  inptr = coef_block;
3755
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3756
  wsptr = workspace;
3757
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3758
    /* Even part */
3759
3760
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3761
    /* Only the first column's row-0 entry is coef_block[0].  CLAMP_DC is
     * defined elsewhere; presumably it bounds that value -- verify
     * against its definition.
     */
    if (ctr == 0)
3762
      CLAMP_DC(tmp0);
3763
    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3764
3765
    /* The even part needs no multiplication, so it is shifted by
     * PASS1_BITS only; the odd part is descaled to match below.
     */
    tmp10 = (tmp0 + tmp2) << PASS1_BITS;
3766
    tmp12 = (tmp0 - tmp2) << PASS1_BITS;
3767
3768
    /* Odd part */
3769
    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
3770
3771
    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3772
    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3773
3774
    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
3775
    /* Add fudge factor here for final descale. */
3776
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
3777
    /* tmp0 and tmp2 are reused for the descaled odd terms; because the
     * RIGHT_SHIFT happens here, the workspace stores below need none.
     */
    tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
3778
           CONST_BITS-PASS1_BITS);
3779
    tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
3780
           CONST_BITS-PASS1_BITS);
3781
3782
    /* Final output stage */
3783
3784
    wsptr[8*0] = (int) (tmp10 + tmp0);
3785
    wsptr[8*3] = (int) (tmp10 - tmp0);
3786
    wsptr[8*1] = (int) (tmp12 + tmp2);
3787
    wsptr[8*2] = (int) (tmp12 - tmp2);
3788
  }
3789
3790
  /* Pass 2: process rows from work array, store into output array.
3791
   * Note that we must descale the results by a factor of 8 == 2**3,
3792
   * and also undo the PASS1_BITS scaling.
3793
   * 8-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3794
   */
3795
3796
  wsptr = workspace;
3797
  for (ctr = 0; ctr < 4; ctr++) {
3798
    outptr = output_buf[ctr] + output_col;
3799
3800
    /* Even part: reverse the even part of the forward DCT.
3801
     * The rotator is c(-6).
3802
     */
3803
3804
    /* Add range center and fudge factor for final descale and range-limit. */
3805
    z2 = (INT32) wsptr[0] +
3806
     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
3807
      (ONE << (PASS1_BITS+2)));
3808
    z3 = (INT32) wsptr[4];
3809
3810
    tmp0 = (z2 + z3) << CONST_BITS;
3811
    tmp1 = (z2 - z3) << CONST_BITS;
3812
3813
    z2 = (INT32) wsptr[2];
3814
    z3 = (INT32) wsptr[6];
3815
3816
    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);       /* c6 */
3817
    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);     /* c2-c6 */
3818
    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);     /* c2+c6 */
3819
3820
    tmp10 = tmp0 + tmp2;
3821
    tmp13 = tmp0 - tmp2;
3822
    tmp11 = tmp1 + tmp3;
3823
    tmp12 = tmp1 - tmp3;
3824
3825
    /* Odd part per figure 8; the matrix is unitary and hence its
3826
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
3827
     */
3828
3829
    /* tmp0..tmp3 are reused: they first hold the raw odd inputs and are
     * then overwritten in place with the odd-part results.
     */
    tmp0 = (INT32) wsptr[7];
3830
    tmp1 = (INT32) wsptr[5];
3831
    tmp2 = (INT32) wsptr[3];
3832
    tmp3 = (INT32) wsptr[1];
3833
3834
    z2 = tmp0 + tmp2;
3835
    z3 = tmp1 + tmp3;
3836
3837
    z1 = MULTIPLY(z2 + z3, FIX_1_175875602);       /*  c3 */
3838
    z2 = MULTIPLY(z2, - FIX_1_961570560);          /* -c3-c5 */
3839
    z3 = MULTIPLY(z3, - FIX_0_390180644);          /* -c3+c5 */
3840
    z2 += z1;
3841
    z3 += z1;
3842
3843
    /* z1 must be computed from the raw tmp0 and tmp3 before those two
     * are overwritten on the following lines.
     */
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
3844
    tmp0 = MULTIPLY(tmp0, FIX_0_298631336);        /* -c1+c3+c5-c7 */
3845
    tmp3 = MULTIPLY(tmp3, FIX_1_501321110);        /*  c1+c3-c5-c7 */
3846
    tmp0 += z1 + z2;
3847
    tmp3 += z1 + z3;
3848
3849
    /* Likewise, z1 is taken from the raw tmp1 and tmp2 first. */
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
3850
    tmp1 = MULTIPLY(tmp1, FIX_2_053119869);        /*  c1+c3-c5+c7 */
3851
    tmp2 = MULTIPLY(tmp2, FIX_3_072711026);        /*  c1+c3+c5-c7 */
3852
    tmp1 += z1 + z3;
3853
    tmp2 += z1 + z2;
3854
3855
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
3856
    /* Outputs k and 7-k are the sum and difference of even term tmp1k
     * and odd term tmp(3-k), descaled and mapped through range_limit.
     */
3857
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
3858
                CONST_BITS+PASS1_BITS+3)
3859
          & RANGE_MASK];
3860
    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
3861
                CONST_BITS+PASS1_BITS+3)
3862
          & RANGE_MASK];
3863
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
3864
                CONST_BITS+PASS1_BITS+3)
3865
          & RANGE_MASK];
3866
    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
3867
                CONST_BITS+PASS1_BITS+3)
3868
          & RANGE_MASK];
3869
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
3870
                CONST_BITS+PASS1_BITS+3)
3871
          & RANGE_MASK];
3872
    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
3873
                CONST_BITS+PASS1_BITS+3)
3874
          & RANGE_MASK];
3875
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
3876
                CONST_BITS+PASS1_BITS+3)
3877
          & RANGE_MASK];
3878
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
3879
                CONST_BITS+PASS1_BITS+3)
3880
          & RANGE_MASK];
3881
3882
    wsptr += DCTSIZE;   /* advance pointer to next row */
3883
  }
3884
}
3885
3886
3887
/*
3888
 * Perform dequantization and inverse DCT on one block of coefficients,
3889
 * producing a 6x3 output block.
3890
 *
3891
 * 3-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
3892
 */
3893
3894
GLOBAL(void)
3895
jpeg_idct_6x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3896
         JCOEFPTR coef_block,
3897
         JSAMPARRAY output_buf, JDIMENSION output_col)
3898
{
3899
  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
3900
  INT32 z1, z2, z3;
3901
  JCOEFPTR inptr;
3902
  ISLOW_MULT_TYPE * quantptr;
3903
  int * wsptr;
3904
  JSAMPROW outptr;
3905
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3906
  int ctr;
3907
  int workspace[6*3]; /* buffers data between passes */
3908
  SHIFT_TEMPS
3909
3910
  /* Pass 1: process columns from input, store into work array.
3911
   * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
3912
   */
3913
3914
  inptr = coef_block;
3915
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3916
  wsptr = workspace;
3917
  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
3918
    /* Even part */
3919
3920
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3921
    if (ctr == 0)
3922
      CLAMP_DC(tmp0);
3923
    tmp0 <<= CONST_BITS;
3924
    /* Add fudge factor here for final descale. */
3925
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
3926
    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3927
    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
3928
    tmp10 = tmp0 + tmp12;
3929
    tmp2 = tmp0 - tmp12 - tmp12;
3930
3931
    /* Odd part */
3932
3933
    tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3934
    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
3935
3936
    /* Final output stage */
3937
3938
    wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
3939
    wsptr[6*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
3940
    wsptr[6*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
3941
  }
3942
  
3943
  /* Pass 2: process 3 rows from work array, store into output array.
3944
   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3945
   */
3946
3947
  wsptr = workspace;
3948
  for (ctr = 0; ctr < 3; ctr++) {
3949
    outptr = output_buf[ctr] + output_col;
3950
3951
    /* Even part */
3952
3953
    /* Add range center and fudge factor for final descale and range-limit. */
3954
    tmp0 = (INT32) wsptr[0] +
3955
       ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
3956
        (ONE << (PASS1_BITS+2)));
3957
    tmp0 <<= CONST_BITS;
3958
    tmp2 = (INT32) wsptr[4];
3959
    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
3960
    tmp1 = tmp0 + tmp10;
3961
    tmp11 = tmp0 - tmp10 - tmp10;
3962
    tmp10 = (INT32) wsptr[2];
3963
    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
3964
    tmp10 = tmp1 + tmp0;
3965
    tmp12 = tmp1 - tmp0;
3966
3967
    /* Odd part */
3968
3969
    z1 = (INT32) wsptr[1];
3970
    z2 = (INT32) wsptr[3];
3971
    z3 = (INT32) wsptr[5];
3972
    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
3973
    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
3974
    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
3975
    tmp1 = (z1 - z2 - z3) << CONST_BITS;
3976
3977
    /* Final output stage */
3978
3979
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
3980
                CONST_BITS+PASS1_BITS+3)
3981
          & RANGE_MASK];
3982
    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
3983
                CONST_BITS+PASS1_BITS+3)
3984
          & RANGE_MASK];
3985
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
3986
                CONST_BITS+PASS1_BITS+3)
3987
          & RANGE_MASK];
3988
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
3989
                CONST_BITS+PASS1_BITS+3)
3990
          & RANGE_MASK];
3991
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
3992
                CONST_BITS+PASS1_BITS+3)
3993
          & RANGE_MASK];
3994
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
3995
                CONST_BITS+PASS1_BITS+3)
3996
          & RANGE_MASK];
3997
3998
    wsptr += 6;   /* advance pointer to next row */
3999
  }
4000
}
4001
4002
4003
/*
4004
 * Perform dequantization and inverse DCT on one block of coefficients,
4005
 * producing a 4x2 output block.
4006
 *
4007
 * 2-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
4008
 */
4009
4010
GLOBAL(void)
4011
jpeg_idct_4x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4012
         JCOEFPTR coef_block,
4013
         JSAMPARRAY output_buf, JDIMENSION output_col)
4014
{
4015
  INT32 tmp0, tmp2, tmp10, tmp12;
4016
  INT32 z1, z2, z3;
4017
  JCOEFPTR inptr;
4018
  ISLOW_MULT_TYPE * quantptr;
4019
  INT32 * wsptr;
4020
  JSAMPROW outptr;
4021
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4022
  int ctr;
4023
  INT32 workspace[4*2]; /* buffers data between passes */
4024
  SHIFT_TEMPS
4025
4026
  /* Pass 1: process columns from input, store into work array. */
4027
4028
  inptr = coef_block;
4029
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4030
  wsptr = workspace;
4031
  for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
4032
    /* Even part */
4033
4034
    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4035
    if (ctr == 0)
4036
      CLAMP_DC(tmp10);
4037
4038
    /* Odd part */
4039
4040
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4041
4042
    /* Final output stage */
4043
4044
    wsptr[4*0] = tmp10 + tmp0;
4045
    wsptr[4*1] = tmp10 - tmp0;
4046
  }
4047
4048
  /* Pass 2: process 2 rows from work array, store into output array.
4049
   * 4-point IDCT kernel,
4050
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
4051
   */
4052
4053
  wsptr = workspace;
4054
  for (ctr = 0; ctr < 2; ctr++) {
4055
    outptr = output_buf[ctr] + output_col;
4056
4057
    /* Even part */
4058
4059
    /* Add range center and fudge factor for final descale and range-limit. */
4060
    tmp0 = wsptr[0] + ((((INT32) RANGE_CENTER) << 3) + (ONE << 2));
4061
    tmp2 = wsptr[2];
4062
4063
    tmp10 = (tmp0 + tmp2) << CONST_BITS;
4064
    tmp12 = (tmp0 - tmp2) << CONST_BITS;
4065
4066
    /* Odd part */
4067
    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
4068
4069
    z2 = wsptr[1];
4070
    z3 = wsptr[3];
4071
4072
    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
4073
    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
4074
    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
4075
4076
    /* Final output stage */
4077
4078
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
4079
                CONST_BITS+3)
4080
          & RANGE_MASK];
4081
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
4082
                CONST_BITS+3)
4083
          & RANGE_MASK];
4084
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
4085
                CONST_BITS+3)
4086
          & RANGE_MASK];
4087
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
4088
                CONST_BITS+3)
4089
          & RANGE_MASK];
4090
4091
    wsptr += 4;   /* advance pointer to next row */
4092
  }
4093
}
4094
4095
4096
/*
4097
 * Perform dequantization and inverse DCT on one block of coefficients,
4098
 * producing a 2x1 output block.
4099
 *
4100
 * 1-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
4101
 */
4102
4103
GLOBAL(void)
4104
jpeg_idct_2x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4105
         JCOEFPTR coef_block,
4106
         JSAMPARRAY output_buf, JDIMENSION output_col)
4107
{
4108
  INT32 tmp0, tmp1;
4109
  ISLOW_MULT_TYPE * quantptr;
4110
  JSAMPROW outptr;
4111
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4112
  SHIFT_TEMPS
4113
4114
  /* Pass 1: empty. */
4115
4116
  /* Pass 2: process 1 row from input, store into output array. */
4117
4118
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4119
  outptr = output_buf[0] + output_col;
4120
4121
  /* Even part */
4122
4123
  tmp0 = DEQUANTIZE(coef_block[0], quantptr[0]);
4124
  CLAMP_DC(tmp0);
4125
  /* Add fudge factor here for final descale. */
4126
  tmp0 += ONE << 2;
4127
4128
  /* Odd part */
4129
4130
  tmp1 = DEQUANTIZE(coef_block[1], quantptr[1]);
4131
4132
  /* Final output stage */
4133
4134
  outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
4135
  outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
4136
}
4137
4138
4139
/*
 * Perform dequantization and inverse DCT on one block of coefficients,
 * producing an 8x16 output block.
 *
 * 16-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
 *
 * Pass 1 dequantizes each of the 8 input columns (coefficient rows 0..7,
 * row stride DCTSIZE) with the multipliers in compptr->dct_table and
 * writes 16 intermediate values per column into a local workspace.
 * Pass 2 transforms each of the 16 workspace rows and stores 8
 * range-limited samples into output_buf[row] starting at output_col.
 */
4145
4146
GLOBAL(void)
4147
jpeg_idct_8x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4148
    JCOEFPTR coef_block,
4149
    JSAMPARRAY output_buf, JDIMENSION output_col)
4150
{
4151
  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
4152
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
4153
  INT32 z1, z2, z3, z4;
4154
  JCOEFPTR inptr;
4155
  ISLOW_MULT_TYPE * quantptr;
4156
  int * wsptr;
4157
  JSAMPROW outptr;
4158
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4159
  int ctr;
4160
  int workspace[8*16];  /* buffers data between passes */
4161
  SHIFT_TEMPS
4162
4163
  /* Pass 1: process columns from input, store into work array.
   * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
   * inptr, quantptr and wsptr all advance by one element per column;
   * the workspace is laid out as 16 rows of 8 values.
   */
4166
4167
  inptr = coef_block;
4168
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4169
  wsptr = workspace;
4170
  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
4171
    /* Even part */
4172
4173
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4174
    /* Only the first column's row-0 entry is the DC coefficient.
     * CLAMP_DC is defined outside this block; presumably it bounds the
     * DC value - confirm against its definition.
     */
    if (ctr == 0)
4175
      CLAMP_DC(tmp0);
4176
    tmp0 <<= CONST_BITS;
4177
    /* Add fudge factor here for final descale. */
4178
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
4179
4180
    z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4181
    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
4182
    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
4183
4184
    tmp10 = tmp0 + tmp1;
4185
    tmp11 = tmp0 - tmp1;
4186
    tmp12 = tmp0 + tmp2;
4187
    tmp13 = tmp0 - tmp2;
4188
4189
    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4190
    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4191
    z3 = z1 - z2;
4192
    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
4193
    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
4194
4195
    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
4196
    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
4197
    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
4198
    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
4199
4200
    /* tmp20..tmp27 hold the eight even-part results for this column. */
    tmp20 = tmp10 + tmp0;
4201
    tmp27 = tmp10 - tmp0;
4202
    tmp21 = tmp12 + tmp1;
4203
    tmp26 = tmp12 - tmp1;
4204
    tmp22 = tmp13 + tmp2;
4205
    tmp25 = tmp13 - tmp2;
4206
    tmp23 = tmp11 + tmp3;
4207
    tmp24 = tmp11 - tmp3;
4208
4209
    /* Odd part */
4210
4211
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4212
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4213
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4214
    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4215
4216
    tmp11 = z1 + z3;
4217
4218
    /* From here on tmp0..tmp3 and tmp10..tmp13 are reused to accumulate
     * the eight odd-part results; z1 and z2 are overwritten as scratch,
     * so the statement order below matters.
     */
    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
4219
    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
4220
    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
4221
    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
4222
    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
4223
    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
4224
    tmp0  = tmp1 + tmp2 + tmp3 -
4225
      MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
4226
    tmp13 = tmp10 + tmp11 + tmp12 -
4227
      MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
4228
    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
4229
    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
4230
    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
4231
    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
4232
    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
4233
    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
4234
    z2    += z4;
4235
    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
4236
    tmp1  += z1;
4237
    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
4238
    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
4239
    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
4240
    tmp12 += z2;
4241
    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
4242
    tmp2  += z2;
4243
    tmp3  += z2;
4244
    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
4245
    tmp10 += z2;
4246
    tmp11 += z2;
4247
4248
    /* Final output stage */
4249
4250
    /* Workspace rows k and 15-k receive the sum and the difference of
     * the same even/odd pair, descaled by CONST_BITS-PASS1_BITS.
     */
    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
4251
    wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
4252
    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
4253
    wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
4254
    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
4255
    wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
4256
    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
4257
    wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
4258
    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
4259
    wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
4260
    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
4261
    wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
4262
    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
4263
    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
4264
    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
4265
    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
4266
  }
4267
4268
  /* Pass 2: process rows from work array, store into output array.
   * Note that we must descale the results by a factor of 8 == 2**3,
   * and also undo the PASS1_BITS scaling.
   * 8-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
   */
4273
4274
  wsptr = workspace;
4275
  for (ctr = 0; ctr < 16; ctr++) {
4276
    outptr = output_buf[ctr] + output_col;
4277
4278
    /* Even part: reverse the even part of the forward DCT.
     * The rotator is c(-6).
     */
4281
4282
    /* Add range center and fudge factor for final descale and range-limit. */
4283
    z2 = (INT32) wsptr[0] +
4284
     ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
4285
      (ONE << (PASS1_BITS+2)));
4286
    z3 = (INT32) wsptr[4];
4287
4288
    tmp0 = (z2 + z3) << CONST_BITS;
4289
    tmp1 = (z2 - z3) << CONST_BITS;
4290
4291
    z2 = (INT32) wsptr[2];
4292
    z3 = (INT32) wsptr[6];
4293
4294
    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);       /* c6 */
4295
    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);     /* c2-c6 */
4296
    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);     /* c2+c6 */
4297
4298
    tmp10 = tmp0 + tmp2;
4299
    tmp13 = tmp0 - tmp2;
4300
    tmp11 = tmp1 + tmp3;
4301
    tmp12 = tmp1 - tmp3;
4302
4303
    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */
4306
4307
    tmp0 = (INT32) wsptr[7];
4308
    tmp1 = (INT32) wsptr[5];
4309
    tmp2 = (INT32) wsptr[3];
4310
    tmp3 = (INT32) wsptr[1];
4311
4312
    z2 = tmp0 + tmp2;
4313
    z3 = tmp1 + tmp3;
4314
4315
    z1 = MULTIPLY(z2 + z3, FIX_1_175875602);       /*  c3 */
4316
    z2 = MULTIPLY(z2, - FIX_1_961570560);          /* -c3-c5 */
4317
    z3 = MULTIPLY(z3, - FIX_0_390180644);          /* -c3+c5 */
4318
    z2 += z1;
4319
    z3 += z1;
4320
4321
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
4322
    tmp0 = MULTIPLY(tmp0, FIX_0_298631336);        /* -c1+c3+c5-c7 */
4323
    tmp3 = MULTIPLY(tmp3, FIX_1_501321110);        /*  c1+c3-c5-c7 */
4324
    tmp0 += z1 + z2;
4325
    tmp3 += z1 + z3;
4326
4327
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
4328
    tmp1 = MULTIPLY(tmp1, FIX_2_053119869);        /*  c1+c3-c5+c7 */
4329
    tmp2 = MULTIPLY(tmp2, FIX_3_072711026);        /*  c1+c3+c5-c7 */
4330
    tmp1 += z1 + z3;
4331
    tmp2 += z1 + z2;
4332
4333
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
4334
4335
    /* Each sample is descaled, masked with RANGE_MASK and looked up in
     * the range-limit table; samples k and 7-k share one even/odd pair.
     */
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
4336
                CONST_BITS+PASS1_BITS+3)
4337
          & RANGE_MASK];
4338
    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
4339
                CONST_BITS+PASS1_BITS+3)
4340
          & RANGE_MASK];
4341
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
4342
                CONST_BITS+PASS1_BITS+3)
4343
          & RANGE_MASK];
4344
    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
4345
                CONST_BITS+PASS1_BITS+3)
4346
          & RANGE_MASK];
4347
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
4348
                CONST_BITS+PASS1_BITS+3)
4349
          & RANGE_MASK];
4350
    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
4351
                CONST_BITS+PASS1_BITS+3)
4352
          & RANGE_MASK];
4353
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
4354
                CONST_BITS+PASS1_BITS+3)
4355
          & RANGE_MASK];
4356
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
4357
                CONST_BITS+PASS1_BITS+3)
4358
          & RANGE_MASK];
4359
4360
    wsptr += DCTSIZE;   /* advance pointer to next row */
4361
  }
4362
}
4363
4364
4365
/*
 * Perform dequantization and inverse DCT on one block of coefficients,
 * producing a 7x14 output block.
 *
 * 14-point IDCT in pass 1 (columns), 7-point in pass 2 (rows).
 *
 * Pass 1 dequantizes each of the first 7 input columns (coefficient
 * rows 0..7, row stride DCTSIZE) with the multipliers in
 * compptr->dct_table and writes 14 intermediate values per column into a
 * local workspace.  Pass 2 transforms each of the 14 workspace rows and
 * stores 7 range-limited samples into output_buf[row] starting at
 * output_col.
 */
4371
4372
GLOBAL(void)
4373
jpeg_idct_7x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4374
    JCOEFPTR coef_block,
4375
    JSAMPARRAY output_buf, JDIMENSION output_col)
4376
{
4377
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
4378
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
4379
  INT32 z1, z2, z3, z4;
4380
  JCOEFPTR inptr;
4381
  ISLOW_MULT_TYPE * quantptr;
4382
  int * wsptr;
4383
  JSAMPROW outptr;
4384
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4385
  int ctr;
4386
  int workspace[7*14];  /* buffers data between passes */
4387
  SHIFT_TEMPS
4388
4389
  /* Pass 1: process columns from input, store into work array.
   * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
   * inptr, quantptr and wsptr all advance by one element per column;
   * the workspace is laid out as 14 rows of 7 values.
   */
4392
4393
  inptr = coef_block;
4394
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4395
  wsptr = workspace;
4396
  for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
4397
    /* Even part */
4398
4399
    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4400
    /* Only the first column's row-0 entry is the DC coefficient.
     * CLAMP_DC is defined outside this block; presumably it bounds the
     * DC value - confirm against its definition.
     */
    if (ctr == 0)
4401
      CLAMP_DC(z1);
4402
    z1 <<= CONST_BITS;
4403
    /* Add fudge factor here for final descale. */
4404
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
4405
    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4406
    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
4407
    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
4408
    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
4409
4410
    tmp10 = z1 + z2;
4411
    tmp11 = z1 + z3;
4412
    tmp12 = z1 - z4;
4413
4414
    /* Unlike the other even terms, tmp23 is descaled right here, so it
     * is combined further down with an odd term that carries only the
     * PASS1_BITS scaling.
     */
    tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
4415
      CONST_BITS-PASS1_BITS);
4416
4417
    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4418
    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4419
4420
    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
4421
4422
    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
4423
    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
4424
    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
4425
      MULTIPLY(z2, FIX(1.378756276));      /* c2 */
4426
4427
    tmp20 = tmp10 + tmp13;
4428
    tmp26 = tmp10 - tmp13;
4429
    tmp21 = tmp11 + tmp14;
4430
    tmp25 = tmp11 - tmp14;
4431
    tmp22 = tmp12 + tmp15;
4432
    tmp24 = tmp12 - tmp15;
4433
4434
    /* Odd part */
4435
4436
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4437
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4438
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4439
    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4440
    tmp13 = z4 << CONST_BITS;
4441
4442
    /* tmp10..tmp16 are reused below for the odd-part results; z1 and z4
     * are modified in place, so the statement order matters.
     */
    tmp14 = z1 + z3;
4443
    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
4444
    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
4445
    tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
4446
    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
4447
    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
4448
    z1    -= z2;
4449
    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
4450
    tmp16 += tmp15;
4451
    z1    += z4;
4452
    z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
4453
    tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
4454
    tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
4455
    z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
4456
    tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
4457
    tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
4458
4459
    /* Odd partner of the already-descaled tmp23: scaled by PASS1_BITS
     * only, with no multiplication involved.
     */
    tmp13 = (z1 - z3) << PASS1_BITS;
4460
4461
    /* Final output stage */
4462
4463
    /* Workspace rows k and 13-k receive the sum and the difference of
     * the same even/odd pair; rows 3 and 10 need no further shift.
     */
    wsptr[7*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4464
    wsptr[7*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4465
    wsptr[7*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4466
    wsptr[7*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4467
    wsptr[7*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
4468
    wsptr[7*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
4469
    wsptr[7*3]  = (int) (tmp23 + tmp13);
4470
    wsptr[7*10] = (int) (tmp23 - tmp13);
4471
    wsptr[7*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4472
    wsptr[7*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4473
    wsptr[7*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
4474
    wsptr[7*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
4475
    wsptr[7*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
4476
    wsptr[7*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
4477
  }
4478
4479
  /* Pass 2: process 14 rows from work array, store into output array.
   * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
   */
4482
4483
  wsptr = workspace;
4484
  for (ctr = 0; ctr < 14; ctr++) {
4485
    outptr = output_buf[ctr] + output_col;
4486
4487
    /* Even part */
4488
4489
    /* Add range center and fudge factor for final descale and range-limit. */
4490
    tmp23 = (INT32) wsptr[0] +
4491
        ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
4492
         (ONE << (PASS1_BITS+2)));
4493
    tmp23 <<= CONST_BITS;
4494
4495
    z1 = (INT32) wsptr[2];
4496
    z2 = (INT32) wsptr[4];
4497
    z3 = (INT32) wsptr[6];
4498
4499
    /* tmp10 and z2 are overwritten part-way through this sequence, so
     * the statement order matters.
     */
    tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
4500
    tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
4501
    tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
4502
    tmp10 = z1 + z3;
4503
    z2 -= tmp10;
4504
    tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
4505
    tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
4506
    tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
4507
    tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
4508
4509
    /* Odd part */
4510
4511
    z1 = (INT32) wsptr[1];
4512
    z2 = (INT32) wsptr[3];
4513
    z3 = (INT32) wsptr[5];
4514
4515
    tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
4516
    tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
4517
    tmp10 = tmp11 - tmp12;
4518
    tmp11 += tmp12;
4519
    tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
4520
    tmp11 += tmp12;
4521
    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
4522
    tmp10 += z2;
4523
    tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
4524
4525
    /* Final output stage */
4526
4527
    /* Samples k and 6-k share one even/odd pair; the middle sample
     * (index 3) uses the even term tmp23 alone.
     */
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
4528
                CONST_BITS+PASS1_BITS+3)
4529
          & RANGE_MASK];
4530
    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
4531
                CONST_BITS+PASS1_BITS+3)
4532
          & RANGE_MASK];
4533
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
4534
                CONST_BITS+PASS1_BITS+3)
4535
          & RANGE_MASK];
4536
    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
4537
                CONST_BITS+PASS1_BITS+3)
4538
          & RANGE_MASK];
4539
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
4540
                CONST_BITS+PASS1_BITS+3)
4541
          & RANGE_MASK];
4542
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
4543
                CONST_BITS+PASS1_BITS+3)
4544
          & RANGE_MASK];
4545
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23,
4546
                CONST_BITS+PASS1_BITS+3)
4547
          & RANGE_MASK];
4548
4549
    wsptr += 7;   /* advance pointer to next row */
4550
  }
4551
}
4552
4553
4554
/*
 * Perform dequantization and inverse DCT on one block of coefficients,
 * producing a 6x12 output block.
 *
 * 12-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
 *
 * Pass 1 dequantizes each of the first 6 input columns (coefficient
 * rows 0..7, row stride DCTSIZE) with the multipliers in
 * compptr->dct_table and writes 12 intermediate values per column into a
 * local workspace.  Pass 2 transforms each of the 12 workspace rows and
 * stores 6 range-limited samples into output_buf[row] starting at
 * output_col.
 */
4560
4561
GLOBAL(void)
4562
jpeg_idct_6x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4563
    JCOEFPTR coef_block,
4564
    JSAMPARRAY output_buf, JDIMENSION output_col)
4565
{
4566
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
4567
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
4568
  INT32 z1, z2, z3, z4;
4569
  JCOEFPTR inptr;
4570
  ISLOW_MULT_TYPE * quantptr;
4571
  int * wsptr;
4572
  JSAMPROW outptr;
4573
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4574
  int ctr;
4575
  int workspace[6*12];  /* buffers data between passes */
4576
  SHIFT_TEMPS
4577
4578
  /* Pass 1: process columns from input, store into work array.
   * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
   * inptr, quantptr and wsptr all advance by one element per column;
   * the workspace is laid out as 12 rows of 6 values.
   */
4581
4582
  inptr = coef_block;
4583
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4584
  wsptr = workspace;
4585
  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
4586
    /* Even part */
4587
4588
    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4589
    /* Only the first column's row-0 entry is the DC coefficient.
     * CLAMP_DC is defined outside this block; presumably it bounds the
     * DC value - confirm against its definition.
     */
    if (ctr == 0)
4590
      CLAMP_DC(z3);
4591
    z3 <<= CONST_BITS;
4592
    /* Add fudge factor here for final descale. */
4593
    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
4594
4595
    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4596
    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
4597
4598
    tmp10 = z3 + z4;
4599
    tmp11 = z3 - z4;
4600
4601
    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4602
    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
4603
    z1 <<= CONST_BITS;
4604
    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4605
    z2 <<= CONST_BITS;
4606
4607
    /* tmp12 is a scratch value reassigned three times below; each
     * assignment feeds the pair of tmp2x results that follows it.
     */
    tmp12 = z1 - z2;
4608
4609
    tmp21 = z3 + tmp12;
4610
    tmp24 = z3 - tmp12;
4611
4612
    tmp12 = z4 + z2;
4613
4614
    tmp20 = tmp10 + tmp12;
4615
    tmp25 = tmp10 - tmp12;
4616
4617
    tmp12 = z4 - z1 - z2;
4618
4619
    tmp22 = tmp11 + tmp12;
4620
    tmp23 = tmp11 - tmp12;
4621
4622
    /* Odd part */
4623
4624
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4625
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4626
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4627
    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4628
4629
    /* tmp11 and tmp14 computed here are temporaries consumed by the
     * next group of statements; their final values are assigned at the
     * end of the odd part.
     */
    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
4630
    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
4631
4632
    tmp10 = z1 + z3;
4633
    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
4634
    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
4635
    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
4636
    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
4637
    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
4638
    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
4639
    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
4640
       MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
4641
4642
    z1 -= z4;
4643
    z2 -= z3;
4644
    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
4645
    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
4646
    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
4647
4648
    /* Final output stage */
4649
4650
    /* Workspace rows k and 11-k receive the sum and the difference of
     * the same even/odd pair, descaled by CONST_BITS-PASS1_BITS.
     */
    wsptr[6*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4651
    wsptr[6*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4652
    wsptr[6*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4653
    wsptr[6*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4654
    wsptr[6*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
4655
    wsptr[6*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
4656
    wsptr[6*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
4657
    wsptr[6*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
4658
    wsptr[6*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4659
    wsptr[6*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4660
    wsptr[6*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
4661
    wsptr[6*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
4662
  }
4663
4664
  /* Pass 2: process 12 rows from work array, store into output array.
   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
   */
4667
4668
  wsptr = workspace;
4669
  for (ctr = 0; ctr < 12; ctr++) {
4670
    outptr = output_buf[ctr] + output_col;
4671
4672
    /* Even part */
4673
4674
    /* Add range center and fudge factor for final descale and range-limit. */
4675
    tmp10 = (INT32) wsptr[0] +
4676
        ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
4677
         (ONE << (PASS1_BITS+2)));
4678
    tmp10 <<= CONST_BITS;
4679
    tmp12 = (INT32) wsptr[4];
4680
    tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
4681
    tmp11 = tmp10 + tmp20;
4682
    tmp21 = tmp10 - tmp20 - tmp20;
4683
    /* tmp20 and tmp10 are reused as scratch before receiving their
     * final even-part values.
     */
    tmp20 = (INT32) wsptr[2];
4684
    tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
4685
    tmp20 = tmp11 + tmp10;
4686
    tmp22 = tmp11 - tmp10;
4687
4688
    /* Odd part */
4689
4690
    z1 = (INT32) wsptr[1];
4691
    z2 = (INT32) wsptr[3];
4692
    z3 = (INT32) wsptr[5];
4693
    tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
4694
    tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
4695
    tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
4696
    tmp11 = (z1 - z2 - z3) << CONST_BITS;
4697
4698
    /* Final output stage */
4699
4700
    /* Each sample is descaled, masked with RANGE_MASK and looked up in
     * the range-limit table; samples k and 5-k share one even/odd pair.
     */
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
4701
                CONST_BITS+PASS1_BITS+3)
4702
          & RANGE_MASK];
4703
    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
4704
                CONST_BITS+PASS1_BITS+3)
4705
          & RANGE_MASK];
4706
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
4707
                CONST_BITS+PASS1_BITS+3)
4708
          & RANGE_MASK];
4709
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
4710
                CONST_BITS+PASS1_BITS+3)
4711
          & RANGE_MASK];
4712
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
4713
                CONST_BITS+PASS1_BITS+3)
4714
          & RANGE_MASK];
4715
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
4716
                CONST_BITS+PASS1_BITS+3)
4717
          & RANGE_MASK];
4718
4719
    wsptr += 6;   /* advance pointer to next row */
4720
  }
4721
}
4722
4723
4724
/*
4725
 * Perform dequantization and inverse DCT on one block of coefficients,
4726
 * producing a 5x10 output block.
4727
 *
4728
 * 10-point IDCT in pass 1 (columns), 5-point in pass 2 (rows).
4729
 */
4730
4731
GLOBAL(void)
4732
jpeg_idct_5x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4733
    JCOEFPTR coef_block,
4734
    JSAMPARRAY output_buf, JDIMENSION output_col)
4735
{
4736
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
4737
  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
4738
  INT32 z1, z2, z3, z4, z5;
4739
  JCOEFPTR inptr;
4740
  ISLOW_MULT_TYPE * quantptr;
4741
  int * wsptr;
4742
  JSAMPROW outptr;
4743
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4744
  int ctr;
4745
  int workspace[5*10];  /* buffers data between passes */
4746
  SHIFT_TEMPS
4747
4748
  /* Pass 1: process columns from input, store into work array.
4749
   * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
4750
   */
4751
4752
  inptr = coef_block;
4753
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4754
  wsptr = workspace;
4755
  for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
4756
    /* Even part */
4757
4758
    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4759
    if (ctr == 0)
4760
      CLAMP_DC(z3);
4761
    z3 <<= CONST_BITS;
4762
    /* Add fudge factor here for final descale. */
4763
    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
4764
    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4765
    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
4766
    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
4767
    tmp10 = z3 + z1;
4768
    tmp11 = z3 - z2;
4769
4770
    tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
4771
      CONST_BITS-PASS1_BITS);
4772
4773
    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4774
    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4775
4776
    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
4777
    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
4778
    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
4779
4780
    tmp20 = tmp10 + tmp12;
4781
    tmp24 = tmp10 - tmp12;
4782
    tmp21 = tmp11 + tmp13;
4783
    tmp23 = tmp11 - tmp13;
4784
4785
    /* Odd part */
4786
4787
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4788
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4789
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4790
    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4791
4792
    tmp11 = z2 + z4;
4793
    tmp13 = z2 - z4;
4794
4795
    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
4796
    z5 = z3 << CONST_BITS;
4797
4798
    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
4799
    z4 = z5 + tmp12;
4800
4801
    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
4802
    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
4803
4804
    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
4805
    z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
4806
4807
    tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
4808
4809
    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
4810
    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
4811
4812
    /* Final output stage */
4813
4814
    wsptr[5*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4815
    wsptr[5*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4816
    wsptr[5*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4817
    wsptr[5*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4818
    wsptr[5*2] = (int) (tmp22 + tmp12);
4819
    wsptr[5*7] = (int) (tmp22 - tmp12);
4820
    wsptr[5*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
4821
    wsptr[5*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
4822
    wsptr[5*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4823
    wsptr[5*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4824
  }
4825
4826
  /* Pass 2: process 10 rows from work array, store into output array.
4827
   * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
4828
   */
4829
4830
  wsptr = workspace;
4831
  for (ctr = 0; ctr < 10; ctr++) {
4832
    outptr = output_buf[ctr] + output_col;
4833
4834
    /* Even part */
4835
4836
    /* Add range center and fudge factor for final descale and range-limit. */
4837
    tmp12 = (INT32) wsptr[0] +
4838
        ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
4839
         (ONE << (PASS1_BITS+2)));
4840
    tmp12 <<= CONST_BITS;
4841
    tmp13 = (INT32) wsptr[2];
4842
    tmp14 = (INT32) wsptr[4];
4843
    z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
4844
    z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
4845
    z3 = tmp12 + z2;
4846
    tmp10 = z3 + z1;
4847
    tmp11 = z3 - z1;
4848
    tmp12 -= z2 << 2;
4849
4850
    /* Odd part */
4851
4852
    z2 = (INT32) wsptr[1];
4853
    z3 = (INT32) wsptr[3];
4854
4855
    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
4856
    tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
4857
    tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
4858
4859
    /* Final output stage */
4860
4861
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp13,
4862
                CONST_BITS+PASS1_BITS+3)
4863
          & RANGE_MASK];
4864
    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp13,
4865
                CONST_BITS+PASS1_BITS+3)
4866
          & RANGE_MASK];
4867
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp14,
4868
                CONST_BITS+PASS1_BITS+3)
4869
          & RANGE_MASK];
4870
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp14,
4871
                CONST_BITS+PASS1_BITS+3)
4872
          & RANGE_MASK];
4873
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
4874
                CONST_BITS+PASS1_BITS+3)
4875
          & RANGE_MASK];
4876
4877
    wsptr += 5;   /* advance pointer to next row */
4878
  }
4879
}
4880
4881
4882
/*
4883
 * Perform dequantization and inverse DCT on one block of coefficients,
4884
 * producing a 4x8 output block.
4885
 *
4886
 * 8-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
4887
 */
4888
4889
GLOBAL(void)
4890
jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4891
         JCOEFPTR coef_block,
4892
         JSAMPARRAY output_buf, JDIMENSION output_col)
4893
{
4894
  INT32 tmp0, tmp1, tmp2, tmp3;
4895
  INT32 tmp10, tmp11, tmp12, tmp13;
4896
  INT32 z1, z2, z3;
4897
  JCOEFPTR inptr;
4898
  ISLOW_MULT_TYPE * quantptr;
4899
  int * wsptr;
4900
  JSAMPROW outptr;
4901
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4902
  int ctr;
4903
  int workspace[4*8]; /* buffers data between passes */
4904
  SHIFT_TEMPS
4905
4906
  /* Pass 1: process columns from input, store into work array.
4907
   * Note results are scaled up by sqrt(8) compared to a true IDCT;
4908
   * furthermore, we scale the results by 2**PASS1_BITS.
4909
   * 8-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4910
   */
4911
4912
  inptr = coef_block;
4913
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4914
  wsptr = workspace;
4915
  for (ctr = 4; ctr > 0; ctr--) {
4916
    /* Due to quantization, we will usually find that many of the input
4917
     * coefficients are zero, especially the AC terms.  We can exploit this
4918
     * by short-circuiting the IDCT calculation for any column in which all
4919
     * the AC terms are zero.  In that case each output is equal to the
4920
     * DC coefficient (with scale factor as needed).
4921
     * With typical images and quantization tables, half or more of the
4922
     * column DCT calculations can be simplified this way.
4923
     */
4924
4925
    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
4926
  inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
4927
  inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
4928
  inptr[DCTSIZE*7] == 0) {
4929
      /* AC terms all zero */
4930
      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4931
      if (ctr == 4)
4932
        CLAMP_DC(dcval);
4933
      dcval <<= PASS1_BITS;
4934
4935
      wsptr[4*0] = dcval;
4936
      wsptr[4*1] = dcval;
4937
      wsptr[4*2] = dcval;
4938
      wsptr[4*3] = dcval;
4939
      wsptr[4*4] = dcval;
4940
      wsptr[4*5] = dcval;
4941
      wsptr[4*6] = dcval;
4942
      wsptr[4*7] = dcval;
4943
4944
      inptr++;      /* advance pointers to next column */
4945
      quantptr++;
4946
      wsptr++;
4947
      continue;
4948
    }
4949
4950
    /* Even part: reverse the even part of the forward DCT.
4951
     * The rotator is c(-6).
4952
     */
4953
4954
    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4955
    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4956
    z2 <<= CONST_BITS;
4957
    z3 <<= CONST_BITS;
4958
    /* Add fudge factor here for final descale. */
4959
    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
4960
4961
    tmp0 = z2 + z3;
4962
    tmp1 = z2 - z3;
4963
4964
    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4965
    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4966
4967
    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);       /* c6 */
4968
    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);     /* c2-c6 */
4969
    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);     /* c2+c6 */
4970
4971
    tmp10 = tmp0 + tmp2;
4972
    tmp13 = tmp0 - tmp2;
4973
    tmp11 = tmp1 + tmp3;
4974
    tmp12 = tmp1 - tmp3;
4975
4976
    /* Odd part per figure 8; the matrix is unitary and hence its
4977
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
4978
     */
4979
4980
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4981
    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4982
    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4983
    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4984
4985
    z2 = tmp0 + tmp2;
4986
    z3 = tmp1 + tmp3;
4987
4988
    z1 = MULTIPLY(z2 + z3, FIX_1_175875602);       /*  c3 */
4989
    z2 = MULTIPLY(z2, - FIX_1_961570560);          /* -c3-c5 */
4990
    z3 = MULTIPLY(z3, - FIX_0_390180644);          /* -c3+c5 */
4991
    z2 += z1;
4992
    z3 += z1;
4993
4994
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* -c3+c7 */
4995
    tmp0 = MULTIPLY(tmp0, FIX_0_298631336);        /* -c1+c3+c5-c7 */
4996
    tmp3 = MULTIPLY(tmp3, FIX_1_501321110);        /*  c1+c3-c5-c7 */
4997
    tmp0 += z1 + z2;
4998
    tmp3 += z1 + z3;
4999
5000
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* -c1-c3 */
5001
    tmp1 = MULTIPLY(tmp1, FIX_2_053119869);        /*  c1+c3-c5+c7 */
5002
    tmp2 = MULTIPLY(tmp2, FIX_3_072711026);        /*  c1+c3+c5-c7 */
5003
    tmp1 += z1 + z3;
5004
    tmp2 += z1 + z2;
5005
5006
    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
5007
5008
    wsptr[4*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
5009
    wsptr[4*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
5010
    wsptr[4*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
5011
    wsptr[4*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
5012
    wsptr[4*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
5013
    wsptr[4*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
5014
    wsptr[4*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
5015
    wsptr[4*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
5016
5017
    inptr++;      /* advance pointers to next column */
5018
    quantptr++;
5019
    wsptr++;
5020
  }
5021
5022
  /* Pass 2: process 8 rows from work array, store into output array.
5023
   * 4-point IDCT kernel,
5024
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
5025
   */
5026
5027
  wsptr = workspace;
5028
  for (ctr = 0; ctr < 8; ctr++) {
5029
    outptr = output_buf[ctr] + output_col;
5030
5031
    /* Even part */
5032
5033
    /* Add range center and fudge factor for final descale and range-limit. */
5034
    tmp0 = (INT32) wsptr[0] +
5035
       ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
5036
        (ONE << (PASS1_BITS+2)));
5037
    tmp2 = (INT32) wsptr[2];
5038
5039
    tmp10 = (tmp0 + tmp2) << CONST_BITS;
5040
    tmp12 = (tmp0 - tmp2) << CONST_BITS;
5041
5042
    /* Odd part */
5043
    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
5044
5045
    z2 = (INT32) wsptr[1];
5046
    z3 = (INT32) wsptr[3];
5047
5048
    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
5049
    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
5050
    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
5051
5052
    /* Final output stage */
5053
5054
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
5055
                CONST_BITS+PASS1_BITS+3)
5056
          & RANGE_MASK];
5057
    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
5058
                CONST_BITS+PASS1_BITS+3)
5059
          & RANGE_MASK];
5060
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
5061
                CONST_BITS+PASS1_BITS+3)
5062
          & RANGE_MASK];
5063
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
5064
                CONST_BITS+PASS1_BITS+3)
5065
          & RANGE_MASK];
5066
5067
    wsptr += 4;   /* advance pointer to next row */
5068
  }
5069
}
5070
5071
5072
/*
5073
 * Perform dequantization and inverse DCT on one block of coefficients,
5074
 * producing a 3x6 output block.
5075
 *
5076
 * 6-point IDCT in pass 1 (columns), 3-point in pass 2 (rows).
5077
 */
5078
5079
GLOBAL(void)
5080
jpeg_idct_3x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
5081
         JCOEFPTR coef_block,
5082
         JSAMPARRAY output_buf, JDIMENSION output_col)
5083
{
5084
  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
5085
  INT32 z1, z2, z3;
5086
  JCOEFPTR inptr;
5087
  ISLOW_MULT_TYPE * quantptr;
5088
  int * wsptr;
5089
  JSAMPROW outptr;
5090
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
5091
  int ctr;
5092
  int workspace[3*6]; /* buffers data between passes */
5093
  SHIFT_TEMPS
5094
5095
  /* Pass 1: process columns from input, store into work array.
5096
   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
5097
   */
5098
5099
  inptr = coef_block;
5100
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5101
  wsptr = workspace;
5102
  for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
5103
    /* Even part */
5104
5105
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
5106
    if (ctr == 0)
5107
      CLAMP_DC(tmp0);
5108
    tmp0 <<= CONST_BITS;
5109
    /* Add fudge factor here for final descale. */
5110
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
5111
    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
5112
    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
5113
    tmp1 = tmp0 + tmp10;
5114
    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
5115
    tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
5116
    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
5117
    tmp10 = tmp1 + tmp0;
5118
    tmp12 = tmp1 - tmp0;
5119
5120
    /* Odd part */
5121
5122
    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
5123
    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
5124
    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
5125
    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
5126
    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
5127
    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
5128
    tmp1 = (z1 - z2 - z3) << PASS1_BITS;
5129
5130
    /* Final output stage */
5131
5132
    wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
5133
    wsptr[3*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
5134
    wsptr[3*1] = (int) (tmp11 + tmp1);
5135
    wsptr[3*4] = (int) (tmp11 - tmp1);
5136
    wsptr[3*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
5137
    wsptr[3*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
5138
  }
5139
5140
  /* Pass 2: process 6 rows from work array, store into output array.
5141
   * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
5142
   */
5143
5144
  wsptr = workspace;
5145
  for (ctr = 0; ctr < 6; ctr++) {
5146
    outptr = output_buf[ctr] + output_col;
5147
5148
    /* Even part */
5149
5150
    /* Add range center and fudge factor for final descale and range-limit. */
5151
    tmp0 = (INT32) wsptr[0] +
5152
       ((((INT32) RANGE_CENTER) << (PASS1_BITS+3)) +
5153
        (ONE << (PASS1_BITS+2)));
5154
    tmp0 <<= CONST_BITS;
5155
    tmp2 = (INT32) wsptr[2];
5156
    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
5157
    tmp10 = tmp0 + tmp12;
5158
    tmp2 = tmp0 - tmp12 - tmp12;
5159
5160
    /* Odd part */
5161
5162
    tmp12 = (INT32) wsptr[1];
5163
    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
5164
5165
    /* Final output stage */
5166
5167
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
5168
                CONST_BITS+PASS1_BITS+3)
5169
          & RANGE_MASK];
5170
    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
5171
                CONST_BITS+PASS1_BITS+3)
5172
          & RANGE_MASK];
5173
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
5174
                CONST_BITS+PASS1_BITS+3)
5175
          & RANGE_MASK];
5176
5177
    wsptr += 3;   /* advance pointer to next row */
5178
  }
5179
}
5180
5181
5182
/*
5183
 * Perform dequantization and inverse DCT on one block of coefficients,
5184
 * producing a 2x4 output block.
5185
 *
5186
 * 4-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
5187
 */
5188
5189
GLOBAL(void)
5190
jpeg_idct_2x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
5191
         JCOEFPTR coef_block,
5192
         JSAMPARRAY output_buf, JDIMENSION output_col)
5193
{
5194
  INT32 tmp0, tmp2, tmp10, tmp12;
5195
  INT32 z1, z2, z3;
5196
  JCOEFPTR inptr;
5197
  ISLOW_MULT_TYPE * quantptr;
5198
  INT32 * wsptr;
5199
  JSAMPROW outptr;
5200
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
5201
  int ctr;
5202
  INT32 workspace[2*4]; /* buffers data between passes */
5203
  SHIFT_TEMPS
5204
5205
  /* Pass 1: process columns from input, store into work array.
5206
   * 4-point IDCT kernel,
5207
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
5208
   */
5209
5210
  inptr = coef_block;
5211
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5212
  wsptr = workspace;
5213
  for (ctr = 0; ctr < 2; ctr++, inptr++, quantptr++, wsptr++) {
5214
    /* Even part */
5215
5216
    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
5217
    if (ctr == 0)
5218
      CLAMP_DC(tmp0);
5219
    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
5220
5221
    tmp10 = (tmp0 + tmp2) << CONST_BITS;
5222
    tmp12 = (tmp0 - tmp2) << CONST_BITS;
5223
5224
    /* Odd part */
5225
    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
5226
5227
    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
5228
    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
5229
5230
    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
5231
    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
5232
    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
5233
5234
    /* Final output stage */
5235
5236
    wsptr[2*0] = tmp10 + tmp0;
5237
    wsptr[2*3] = tmp10 - tmp0;
5238
    wsptr[2*1] = tmp12 + tmp2;
5239
    wsptr[2*2] = tmp12 - tmp2;
5240
  }
5241
5242
  /* Pass 2: process 4 rows from work array, store into output array. */
5243
5244
  wsptr = workspace;
5245
  for (ctr = 0; ctr < 4; ctr++) {
5246
    outptr = output_buf[ctr] + output_col;
5247
5248
    /* Even part */
5249
5250
    /* Add range center and fudge factor for final descale and range-limit. */
5251
    tmp10 = wsptr[0] +
5252
        ((((INT32) RANGE_CENTER) << (CONST_BITS+3)) +
5253
         (ONE << (CONST_BITS+2)));
5254
5255
    /* Odd part */
5256
5257
    tmp0 = wsptr[1];
5258
5259
    /* Final output stage */
5260
5261
    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS+3)
5262
          & RANGE_MASK];
5263
    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS+3)
5264
          & RANGE_MASK];
5265
5266
    wsptr += 2;   /* advance pointer to next row */
5267
  }
5268
}
5269
5270
5271
/*
5272
 * Perform dequantization and inverse DCT on one block of coefficients,
5273
 * producing a 1x2 output block.
5274
 *
5275
 * 2-point IDCT in pass 1 (columns), 1-point in pass 2 (rows).
5276
 */
5277
5278
GLOBAL(void)
5279
jpeg_idct_1x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
5280
         JCOEFPTR coef_block,
5281
         JSAMPARRAY output_buf, JDIMENSION output_col)
5282
{
5283
  INT32 tmp0, tmp1;
5284
  ISLOW_MULT_TYPE * quantptr;
5285
  JSAMPROW outptr;
5286
  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
5287
  SHIFT_TEMPS
5288
5289
  /* Pass 1: empty. */
5290
5291
  /* Pass 2: process 1 row from input, store into output array. */
5292
5293
  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5294
  outptr = output_buf[0] + output_col;
5295
5296
  /* Even part */
5297
5298
  tmp0 = DEQUANTIZE(coef_block[0], quantptr[0]);
5299
  CLAMP_DC(tmp0);
5300
  /* Add fudge factor here for final descale. */
5301
  tmp0 += ONE << 2;
5302
5303
  /* Odd part */
5304
5305
  tmp1 = DEQUANTIZE(coef_block[1], quantptr[1]);
5306
5307
  /* Final output stage */
5308
5309
  outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
5310
  outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
5311
}
5312
5313
#endif /* IDCT_SCALING_SUPPORTED */
5314
#endif /* DCT_ISLOW_SUPPORTED */