Coverage Report

Created: 2025-06-24 07:01

/src/ghostpdl/obj/jfdctint.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * jfdctint.c
3
 *
4
 * Copyright (C) 1991-1996, Thomas G. Lane.
5
 * Modification developed 2003-2018 by Guido Vollbeding.
6
 * This file is part of the Independent JPEG Group's software.
7
 * For conditions of distribution and use, see the accompanying README file.
8
 *
9
 * This file contains a slow-but-accurate integer implementation of the
10
 * forward DCT (Discrete Cosine Transform).
11
 *
12
 * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
13
 * on each column.  Direct algorithms are also available, but they are
14
 * much more complex and seem not to be any faster when reduced to code.
15
 *
16
 * This implementation is based on an algorithm described in
17
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
18
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
19
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
20
 * The primary algorithm described there uses 11 multiplies and 29 adds.
21
 * We use their alternate method with 12 multiplies and 32 adds.
22
 * The advantage of this method is that no data path contains more than one
23
 * multiplication; this allows a very simple and accurate implementation in
24
 * scaled fixed-point arithmetic, with a minimal number of shifts.
25
 *
26
 * We also provide FDCT routines with various input sample block sizes for
27
 * direct resolution reduction or enlargement and for direct resolving the
28
 * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
29
 * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
30
 *
31
 * For N<8 we fill the remaining block coefficients with zero.
32
 * For N>8 we apply a partial N-point FDCT on the input samples, computing
33
 * just the lower 8 frequency coefficients and discarding the rest.
34
 *
35
 * We must scale the output coefficients of the N-point FDCT appropriately
36
 * to the standard 8-point FDCT level by 8/N per 1-D pass.  This scaling
37
 * is folded into the constant multipliers (pass 2) and/or final/initial
38
 * shifting.
39
 *
40
 * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
41
 * since there would be too many additional constants to pre-calculate.
42
 */
43
44
#define JPEG_INTERNALS
45
#include "jinclude.h"
46
#include "jpeglib.h"
47
#include "jdct.h"   /* Private declarations for DCT subsystem */
48
49
#ifdef DCT_ISLOW_SUPPORTED
50
51
52
/*
53
 * This module is specialized to the case DCTSIZE = 8.
54
 */
55
56
#if DCTSIZE != 8
57
  Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
58
#endif
59
60
61
/*
62
 * The poop on this scaling stuff is as follows:
63
 *
64
 * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
65
 * larger than the true DCT outputs.  The final outputs are therefore
66
 * a factor of N larger than desired; since N=8 this can be cured by
67
 * a simple right shift at the end of the algorithm.  The advantage of
68
 * this arrangement is that we save two multiplications per 1-D DCT,
69
 * because the y0 and y4 outputs need not be divided by sqrt(N).
70
 * In the IJG code, this factor of 8 is removed by the quantization step
71
 * (in jcdctmgr.c), NOT in this module.
72
 *
73
 * We have to do addition and subtraction of the integer inputs, which
74
 * is no problem, and multiplication by fractional constants, which is
75
 * a problem to do in integer arithmetic.  We multiply all the constants
76
 * by CONST_SCALE and convert them to integer constants (thus retaining
77
 * CONST_BITS bits of precision in the constants).  After doing a
78
 * multiplication we have to divide the product by CONST_SCALE, with proper
79
 * rounding, to produce the correct output.  This division can be done
80
 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
81
 * as long as possible so that partial sums can be added together with
82
 * full fractional precision.
83
 *
84
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
85
 * they are represented to better-than-integral precision.  These outputs
86
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
87
 * with the recommended scaling.  (For 12-bit sample data, the intermediate
88
 * array is INT32 anyway.)
89
 *
90
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
91
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
92
 * shows that the values given below are the most effective.
93
 */
94
95
/* Fixed-point scaling parameters, selected by sample depth:
 *   CONST_BITS - bits of fractional precision kept in the multiplier constants.
 *   PASS1_BITS - extra scaling bits carried by the pass-1 (row) outputs.
 * Only PASS1_BITS differs between the two branches.
 */
#if BITS_IN_JSAMPLE == 8
96
31.4M
#define CONST_BITS  13
97
59.2M
#define PASS1_BITS  2
98
#else
99
#define CONST_BITS  13
100
#define PASS1_BITS  1   /* lose a little precision to avoid overflow */
101
#endif
102
103
/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
104
 * causing a lot of useless floating-point operations at run time.
105
 * To get around this we use the following pre-calculated constants.
106
 * If you change CONST_BITS you may want to add appropriate values.
107
 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
108
 */
109
110
/* Multiplier constants for the 8-point transform.
 * In the CONST_BITS == 13 branch each literal is the named real value
 * scaled by 2^13 and rounded (e.g. 0.541196100 * 8192 ~= 4433); otherwise
 * the value is produced by the FIX() macro.
 */
#if CONST_BITS == 13
111
#define FIX_0_298631336  ((INT32)  2446)  /* FIX(0.298631336) */
112
#define FIX_0_390180644  ((INT32)  3196)  /* FIX(0.390180644) */
113
#define FIX_0_541196100  ((INT32)  4433)  /* FIX(0.541196100) */
114
#define FIX_0_765366865  ((INT32)  6270)  /* FIX(0.765366865) */
115
#define FIX_0_899976223  ((INT32)  7373)  /* FIX(0.899976223) */
116
#define FIX_1_175875602  ((INT32)  9633)  /* FIX(1.175875602) */
117
#define FIX_1_501321110  ((INT32)  12299) /* FIX(1.501321110) */
118
#define FIX_1_847759065  ((INT32)  15137) /* FIX(1.847759065) */
119
#define FIX_1_961570560  ((INT32)  16069) /* FIX(1.961570560) */
120
#define FIX_2_053119869  ((INT32)  16819) /* FIX(2.053119869) */
121
#define FIX_2_562915447  ((INT32)  20995) /* FIX(2.562915447) */
122
#define FIX_3_072711026  ((INT32)  25172) /* FIX(3.072711026) */
123
#else
124
#define FIX_0_298631336  FIX(0.298631336)
125
#define FIX_0_390180644  FIX(0.390180644)
126
#define FIX_0_541196100  FIX(0.541196100)
127
#define FIX_0_765366865  FIX(0.765366865)
128
#define FIX_0_899976223  FIX(0.899976223)
129
#define FIX_1_175875602  FIX(1.175875602)
130
#define FIX_1_501321110  FIX(1.501321110)
131
#define FIX_1_847759065  FIX(1.847759065)
132
#define FIX_1_961570560  FIX(1.961570560)
133
#define FIX_2_053119869  FIX(2.053119869)
134
#define FIX_2_562915447  FIX(2.562915447)
135
#define FIX_3_072711026  FIX(3.072711026)
136
#endif
137
138
139
/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
140
 * For 8-bit samples with the recommended scaling, all the variable
141
 * and constant values involved are no more than 16 bits wide, so a
142
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
143
 * For 12-bit samples, a full 32-bit multiplication will be needed.
144
 */
145
146
/* MULTIPLY(var,const): product of a variable and a fixed-point constant.
 * 8-bit sample builds route through MULTIPLY16C16; every other sample
 * depth uses a plain multiplication of the two operands.
 */
#if BITS_IN_JSAMPLE == 8
147
295M
#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
148
#else
149
#define MULTIPLY(var,const)  ((var) * (const))
150
#endif
151
152
153
/*
154
 * Perform the forward DCT on one block of samples.
155
 */
156
157
GLOBAL(void)
158
jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
159
982k
{
160
982k
  INT32 tmp0, tmp1, tmp2, tmp3;
161
982k
  INT32 tmp10, tmp11, tmp12, tmp13;
162
982k
  INT32 z1;
163
982k
  DCTELEM *dataptr;
164
982k
  JSAMPROW elemptr;
165
982k
  int ctr;
166
982k
  SHIFT_TEMPS
167
168
  /* Pass 1: process rows.
169
   * Note results are scaled up by sqrt(8) compared to a true DCT;
170
   * furthermore, we scale the results by 2**PASS1_BITS.
171
   * cK represents sqrt(2) * cos(K*pi/16).
172
   */
173
174
982k
  dataptr = data;
175
8.84M
  for (ctr = 0; ctr < DCTSIZE; ctr++) {
176
7.86M
    elemptr = sample_data[ctr] + start_col;
177
178
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
179
     * rotator "c1" should be "c6".
180
     */
181
182
7.86M
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
183
7.86M
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
184
7.86M
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
185
7.86M
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
186
187
7.86M
    tmp10 = tmp0 + tmp3;
188
7.86M
    tmp12 = tmp0 - tmp3;
189
7.86M
    tmp11 = tmp1 + tmp2;
190
7.86M
    tmp13 = tmp1 - tmp2;
191
192
7.86M
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
193
7.86M
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
194
7.86M
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
195
7.86M
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
196
197
    /* Apply unsigned->signed conversion. */
198
7.86M
    dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
199
7.86M
    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
200
201
7.86M
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
202
    /* Add fudge factor here for final descale. */
203
7.86M
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
204
205
7.86M
    dataptr[2] = (DCTELEM)
206
7.86M
      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
207
7.86M
      CONST_BITS-PASS1_BITS);
208
7.86M
    dataptr[6] = (DCTELEM)
209
7.86M
      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
210
7.86M
      CONST_BITS-PASS1_BITS);
211
212
    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
213
     * i0..i3 in the paper are tmp0..tmp3 here.
214
     */
215
216
7.86M
    tmp12 = tmp0 + tmp2;
217
7.86M
    tmp13 = tmp1 + tmp3;
218
219
7.86M
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
220
    /* Add fudge factor here for final descale. */
221
7.86M
    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
222
223
7.86M
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
224
7.86M
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
225
7.86M
    tmp12 += z1;
226
7.86M
    tmp13 += z1;
227
228
7.86M
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
229
7.86M
    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
230
7.86M
    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
231
7.86M
    tmp0 += z1 + tmp12;
232
7.86M
    tmp3 += z1 + tmp13;
233
234
7.86M
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
235
7.86M
    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
236
7.86M
    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
237
7.86M
    tmp1 += z1 + tmp13;
238
7.86M
    tmp2 += z1 + tmp12;
239
240
7.86M
    dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS);
241
7.86M
    dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS);
242
7.86M
    dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
243
7.86M
    dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS);
244
245
7.86M
    dataptr += DCTSIZE;   /* advance pointer to next row */
246
7.86M
  }
247
248
  /* Pass 2: process columns.
249
   * We remove the PASS1_BITS scaling, but leave the results scaled up
250
   * by an overall factor of 8.
251
   * cK represents sqrt(2) * cos(K*pi/16).
252
   */
253
254
982k
  dataptr = data;
255
8.84M
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
256
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
257
     * rotator "c1" should be "c6".
258
     */
259
260
7.86M
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
261
7.86M
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
262
7.86M
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
263
7.86M
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
264
265
    /* Add fudge factor here for final descale. */
266
7.86M
    tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
267
7.86M
    tmp12 = tmp0 - tmp3;
268
7.86M
    tmp11 = tmp1 + tmp2;
269
7.86M
    tmp13 = tmp1 - tmp2;
270
271
7.86M
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
272
7.86M
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
273
7.86M
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
274
7.86M
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
275
276
7.86M
    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
277
7.86M
    dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
278
279
7.86M
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
280
    /* Add fudge factor here for final descale. */
281
7.86M
    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
282
283
7.86M
    dataptr[DCTSIZE*2] = (DCTELEM)
284
7.86M
      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
285
7.86M
      CONST_BITS+PASS1_BITS);
286
7.86M
    dataptr[DCTSIZE*6] = (DCTELEM)
287
7.86M
      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
288
7.86M
      CONST_BITS+PASS1_BITS);
289
290
    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
291
     * i0..i3 in the paper are tmp0..tmp3 here.
292
     */
293
294
7.86M
    tmp12 = tmp0 + tmp2;
295
7.86M
    tmp13 = tmp1 + tmp3;
296
297
7.86M
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
298
    /* Add fudge factor here for final descale. */
299
7.86M
    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
300
301
7.86M
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
302
7.86M
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
303
7.86M
    tmp12 += z1;
304
7.86M
    tmp13 += z1;
305
306
7.86M
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
307
7.86M
    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
308
7.86M
    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
309
7.86M
    tmp0 += z1 + tmp12;
310
7.86M
    tmp3 += z1 + tmp13;
311
312
7.86M
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
313
7.86M
    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
314
7.86M
    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
315
7.86M
    tmp1 += z1 + tmp13;
316
7.86M
    tmp2 += z1 + tmp12;
317
318
7.86M
    dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS1_BITS);
319
7.86M
    dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS1_BITS);
320
7.86M
    dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS1_BITS);
321
7.86M
    dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS1_BITS);
322
323
7.86M
    dataptr++;      /* advance pointer to next column */
324
7.86M
  }
325
982k
}
326
327
#ifdef DCT_SCALING_SUPPORTED
328
329
330
/*
331
 * Perform the forward DCT on a 7x7 sample block.
332
 */
333
334
GLOBAL(void)
335
jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
336
0
{
337
0
  INT32 tmp0, tmp1, tmp2, tmp3;
338
0
  INT32 tmp10, tmp11, tmp12;
339
0
  INT32 z1, z2, z3;
340
0
  DCTELEM *dataptr;
341
0
  JSAMPROW elemptr;
342
0
  int ctr;
343
0
  SHIFT_TEMPS
344
345
  /* Pre-zero output coefficient block. */
346
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
347
348
  /* Pass 1: process rows.
349
   * Note results are scaled up by sqrt(8) compared to a true DCT;
350
   * furthermore, we scale the results by 2**PASS1_BITS.
351
   * cK represents sqrt(2) * cos(K*pi/14).
352
   */
353
354
0
  dataptr = data;
355
0
  for (ctr = 0; ctr < 7; ctr++) {
356
0
    elemptr = sample_data[ctr] + start_col;
357
358
    /* Even part */
359
360
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
361
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
362
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
363
0
    tmp3 = GETJSAMPLE(elemptr[3]);
364
365
0
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
366
0
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
367
0
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
368
369
0
    z1 = tmp0 + tmp2;
370
    /* Apply unsigned->signed conversion. */
371
0
    dataptr[0] = (DCTELEM)
372
0
      ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
373
0
    tmp3 += tmp3;
374
0
    z1 -= tmp3;
375
0
    z1 -= tmp3;
376
0
    z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
377
0
    z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
378
0
    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
379
0
    dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
380
0
    z1 -= z2;
381
0
    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
382
0
    dataptr[4] = (DCTELEM)
383
0
      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
384
0
        CONST_BITS-PASS1_BITS);
385
0
    dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
386
387
    /* Odd part */
388
389
0
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
390
0
    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
391
0
    tmp0 = tmp1 - tmp2;
392
0
    tmp1 += tmp2;
393
0
    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
394
0
    tmp1 += tmp2;
395
0
    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
396
0
    tmp0 += tmp3;
397
0
    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */
398
399
0
    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
400
0
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
401
0
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
402
403
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
404
0
  }
405
406
  /* Pass 2: process columns.
407
   * We remove the PASS1_BITS scaling, but leave the results scaled up
408
   * by an overall factor of 8.
409
   * We must also scale the output by (8/7)**2 = 64/49, which we fold
410
   * into the constant multipliers:
411
   * cK now represents sqrt(2) * cos(K*pi/14) * 64/49.
412
   */
413
414
0
  dataptr = data;
415
0
  for (ctr = 0; ctr < 7; ctr++) {
416
    /* Even part */
417
418
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
419
0
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
420
0
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
421
0
    tmp3 = dataptr[DCTSIZE*3];
422
423
0
    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
424
0
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
425
0
    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
426
427
0
    z1 = tmp0 + tmp2;
428
0
    dataptr[DCTSIZE*0] = (DCTELEM)
429
0
      DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
430
0
        CONST_BITS+PASS1_BITS);
431
0
    tmp3 += tmp3;
432
0
    z1 -= tmp3;
433
0
    z1 -= tmp3;
434
0
    z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
435
0
    z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
436
0
    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
437
0
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
438
0
    z1 -= z2;
439
0
    z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
440
0
    dataptr[DCTSIZE*4] = (DCTELEM)
441
0
      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
442
0
        CONST_BITS+PASS1_BITS);
443
0
    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
444
445
    /* Odd part */
446
447
0
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
448
0
    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
449
0
    tmp0 = tmp1 - tmp2;
450
0
    tmp1 += tmp2;
451
0
    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
452
0
    tmp1 += tmp2;
453
0
    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
454
0
    tmp0 += tmp3;
455
0
    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */
456
457
0
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
458
0
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
459
0
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
460
461
0
    dataptr++;      /* advance pointer to next column */
462
0
  }
463
0
}
464
465
466
/*
467
 * Perform the forward DCT on a 6x6 sample block.
468
 */
469
470
GLOBAL(void)
471
jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
472
0
{
473
0
  INT32 tmp0, tmp1, tmp2;
474
0
  INT32 tmp10, tmp11, tmp12;
475
0
  DCTELEM *dataptr;
476
0
  JSAMPROW elemptr;
477
0
  int ctr;
478
0
  SHIFT_TEMPS
479
480
  /* Pre-zero output coefficient block. */
481
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
482
483
  /* Pass 1: process rows.
484
   * Note results are scaled up by sqrt(8) compared to a true DCT;
485
   * furthermore, we scale the results by 2**PASS1_BITS.
486
   * cK represents sqrt(2) * cos(K*pi/12).
487
   */
488
489
0
  dataptr = data;
490
0
  for (ctr = 0; ctr < 6; ctr++) {
491
0
    elemptr = sample_data[ctr] + start_col;
492
493
    /* Even part */
494
495
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
496
0
    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
497
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
498
499
0
    tmp10 = tmp0 + tmp2;
500
0
    tmp12 = tmp0 - tmp2;
501
502
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
503
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
504
0
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
505
506
    /* Apply unsigned->signed conversion. */
507
0
    dataptr[0] = (DCTELEM)
508
0
      ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
509
0
    dataptr[2] = (DCTELEM)
510
0
      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
511
0
        CONST_BITS-PASS1_BITS);
512
0
    dataptr[4] = (DCTELEM)
513
0
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
514
0
        CONST_BITS-PASS1_BITS);
515
516
    /* Odd part */
517
518
0
    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
519
0
        CONST_BITS-PASS1_BITS);
520
521
0
    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
522
0
    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
523
0
    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
524
525
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
526
0
  }
527
528
  /* Pass 2: process columns.
529
   * We remove the PASS1_BITS scaling, but leave the results scaled up
530
   * by an overall factor of 8.
531
   * We must also scale the output by (8/6)**2 = 16/9, which we fold
532
   * into the constant multipliers:
533
   * cK now represents sqrt(2) * cos(K*pi/12) * 16/9.
534
   */
535
536
0
  dataptr = data;
537
0
  for (ctr = 0; ctr < 6; ctr++) {
538
    /* Even part */
539
540
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
541
0
    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
542
0
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
543
544
0
    tmp10 = tmp0 + tmp2;
545
0
    tmp12 = tmp0 - tmp2;
546
547
0
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
548
0
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
549
0
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
550
551
0
    dataptr[DCTSIZE*0] = (DCTELEM)
552
0
      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
553
0
        CONST_BITS+PASS1_BITS);
554
0
    dataptr[DCTSIZE*2] = (DCTELEM)
555
0
      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
556
0
        CONST_BITS+PASS1_BITS);
557
0
    dataptr[DCTSIZE*4] = (DCTELEM)
558
0
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
559
0
        CONST_BITS+PASS1_BITS);
560
561
    /* Odd part */
562
563
0
    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
564
565
0
    dataptr[DCTSIZE*1] = (DCTELEM)
566
0
      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
567
0
        CONST_BITS+PASS1_BITS);
568
0
    dataptr[DCTSIZE*3] = (DCTELEM)
569
0
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
570
0
        CONST_BITS+PASS1_BITS);
571
0
    dataptr[DCTSIZE*5] = (DCTELEM)
572
0
      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
573
0
        CONST_BITS+PASS1_BITS);
574
575
0
    dataptr++;      /* advance pointer to next column */
576
0
  }
577
0
}
578
579
580
/*
581
 * Perform the forward DCT on a 5x5 sample block.
582
 */
583
584
GLOBAL(void)
585
jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
586
0
{
587
0
  INT32 tmp0, tmp1, tmp2;
588
0
  INT32 tmp10, tmp11;
589
0
  DCTELEM *dataptr;
590
0
  JSAMPROW elemptr;
591
0
  int ctr;
592
0
  SHIFT_TEMPS
593
594
  /* Pre-zero output coefficient block. */
595
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
596
597
  /* Pass 1: process rows.
598
   * Note results are scaled up by sqrt(8) compared to a true DCT;
599
   * furthermore, we scale the results by 2**PASS1_BITS.
600
   * We scale the results further by 2 as part of output adaption
601
   * scaling for different DCT size.
602
   * cK represents sqrt(2) * cos(K*pi/10).
603
   */
604
605
0
  dataptr = data;
606
0
  for (ctr = 0; ctr < 5; ctr++) {
607
0
    elemptr = sample_data[ctr] + start_col;
608
609
    /* Even part */
610
611
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
612
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
613
0
    tmp2 = GETJSAMPLE(elemptr[2]);
614
615
0
    tmp10 = tmp0 + tmp1;
616
0
    tmp11 = tmp0 - tmp1;
617
618
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
619
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
620
621
    /* Apply unsigned->signed conversion. */
622
0
    dataptr[0] = (DCTELEM)
623
0
      ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
624
0
    tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
625
0
    tmp10 -= tmp2 << 2;
626
0
    tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
627
0
    dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
628
0
    dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
629
630
    /* Odd part */
631
632
0
    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
633
634
0
    dataptr[1] = (DCTELEM)
635
0
      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
636
0
        CONST_BITS-PASS1_BITS-1);
637
0
    dataptr[3] = (DCTELEM)
638
0
      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
639
0
        CONST_BITS-PASS1_BITS-1);
640
641
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
642
0
  }
643
644
  /* Pass 2: process columns.
645
   * We remove the PASS1_BITS scaling, but leave the results scaled up
646
   * by an overall factor of 8.
647
   * We must also scale the output by (8/5)**2 = 64/25, which we partially
648
   * fold into the constant multipliers (other part was done in pass 1):
649
   * cK now represents sqrt(2) * cos(K*pi/10) * 32/25.
650
   */
651
652
0
  dataptr = data;
653
0
  for (ctr = 0; ctr < 5; ctr++) {
654
    /* Even part */
655
656
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
657
0
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
658
0
    tmp2 = dataptr[DCTSIZE*2];
659
660
0
    tmp10 = tmp0 + tmp1;
661
0
    tmp11 = tmp0 - tmp1;
662
663
0
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
664
0
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
665
666
0
    dataptr[DCTSIZE*0] = (DCTELEM)
667
0
      DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
668
0
        CONST_BITS+PASS1_BITS);
669
0
    tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
670
0
    tmp10 -= tmp2 << 2;
671
0
    tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
672
0
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
673
0
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
674
675
    /* Odd part */
676
677
0
    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */
678
679
0
    dataptr[DCTSIZE*1] = (DCTELEM)
680
0
      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
681
0
        CONST_BITS+PASS1_BITS);
682
0
    dataptr[DCTSIZE*3] = (DCTELEM)
683
0
      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
684
0
        CONST_BITS+PASS1_BITS);
685
686
0
    dataptr++;      /* advance pointer to next column */
687
0
  }
688
0
}
689
690
691
/*
692
 * Perform the forward DCT on a 4x4 sample block.
693
 */
694
695
GLOBAL(void)
696
jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
697
0
{
698
0
  INT32 tmp0, tmp1;
699
0
  INT32 tmp10, tmp11;
700
0
  DCTELEM *dataptr;
701
0
  JSAMPROW elemptr;
702
0
  int ctr;
703
0
  SHIFT_TEMPS
704
705
  /* Pre-zero output coefficient block. */
706
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
707
708
  /* Pass 1: process rows.
709
   * Note results are scaled up by sqrt(8) compared to a true DCT;
710
   * furthermore, we scale the results by 2**PASS1_BITS.
711
   * We must also scale the output by (8/4)**2 = 2**2, which we add here.
712
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
713
   */
714
715
0
  dataptr = data;
716
0
  for (ctr = 0; ctr < 4; ctr++) {
717
0
    elemptr = sample_data[ctr] + start_col;
718
719
    /* Even part */
720
721
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
722
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
723
724
0
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
725
0
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
726
727
    /* Apply unsigned->signed conversion. */
728
0
    dataptr[0] = (DCTELEM)
729
0
      ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
730
0
    dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
731
732
    /* Odd part */
733
734
0
    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
735
    /* Add fudge factor here for final descale. */
736
0
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
737
738
0
    dataptr[1] = (DCTELEM)
739
0
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
740
0
      CONST_BITS-PASS1_BITS-2);
741
0
    dataptr[3] = (DCTELEM)
742
0
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
743
0
      CONST_BITS-PASS1_BITS-2);
744
745
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
746
0
  }
747
748
  /* Pass 2: process columns.
749
   * We remove the PASS1_BITS scaling, but leave the results scaled up
750
   * by an overall factor of 8.
751
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
752
   */
753
754
0
  dataptr = data;
755
0
  for (ctr = 0; ctr < 4; ctr++) {
756
    /* Even part */
757
758
    /* Add fudge factor here for final descale. */
759
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
760
0
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
761
762
0
    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
763
0
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
764
765
0
    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
766
0
    dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
767
768
    /* Odd part */
769
770
0
    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
771
    /* Add fudge factor here for final descale. */
772
0
    tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
773
774
0
    dataptr[DCTSIZE*1] = (DCTELEM)
775
0
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
776
0
      CONST_BITS+PASS1_BITS);
777
0
    dataptr[DCTSIZE*3] = (DCTELEM)
778
0
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
779
0
      CONST_BITS+PASS1_BITS);
780
781
0
    dataptr++;      /* advance pointer to next column */
782
0
  }
783
0
}
784
785
786
/*
787
 * Perform the forward DCT on a 3x3 sample block.
788
 */
789
790
GLOBAL(void)
791
jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
792
0
{
793
0
  INT32 tmp0, tmp1, tmp2;
794
0
  DCTELEM *dataptr;
795
0
  JSAMPROW elemptr;
796
0
  int ctr;
797
0
  SHIFT_TEMPS
798
799
  /* Pre-zero output coefficient block. */
800
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
801
802
  /* Pass 1: process rows.
803
   * Note results are scaled up by sqrt(8) compared to a true DCT;
804
   * furthermore, we scale the results by 2**PASS1_BITS.
805
   * We scale the results further by 2**2 as part of output adaption
806
   * scaling for different DCT size.
807
   * cK represents sqrt(2) * cos(K*pi/6).
808
   */
809
810
0
  dataptr = data;
811
0
  for (ctr = 0; ctr < 3; ctr++) {
812
0
    elemptr = sample_data[ctr] + start_col;
813
814
    /* Even part */
815
816
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
817
0
    tmp1 = GETJSAMPLE(elemptr[1]);
818
819
0
    tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
820
821
    /* Apply unsigned->signed conversion. */
822
0
    dataptr[0] = (DCTELEM)
823
0
      ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
824
0
    dataptr[2] = (DCTELEM)
825
0
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
826
0
        CONST_BITS-PASS1_BITS-2);
827
828
    /* Odd part */
829
830
0
    dataptr[1] = (DCTELEM)
831
0
      DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
832
0
        CONST_BITS-PASS1_BITS-2);
833
834
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
835
0
  }
836
837
  /* Pass 2: process columns.
838
   * We remove the PASS1_BITS scaling, but leave the results scaled up
839
   * by an overall factor of 8.
840
   * We must also scale the output by (8/3)**2 = 64/9, which we partially
841
   * fold into the constant multipliers (other part was done in pass 1):
842
   * cK now represents sqrt(2) * cos(K*pi/6) * 16/9.
843
   */
844
845
0
  dataptr = data;
846
0
  for (ctr = 0; ctr < 3; ctr++) {
847
    /* Even part */
848
849
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
850
0
    tmp1 = dataptr[DCTSIZE*1];
851
852
0
    tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
853
854
0
    dataptr[DCTSIZE*0] = (DCTELEM)
855
0
      DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
856
0
        CONST_BITS+PASS1_BITS);
857
0
    dataptr[DCTSIZE*2] = (DCTELEM)
858
0
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
859
0
        CONST_BITS+PASS1_BITS);
860
861
    /* Odd part */
862
863
0
    dataptr[DCTSIZE*1] = (DCTELEM)
864
0
      DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
865
0
        CONST_BITS+PASS1_BITS);
866
867
0
    dataptr++;      /* advance pointer to next column */
868
0
  }
869
0
}
870
871
872
/*
873
 * Perform the forward DCT on a 2x2 sample block.
874
 */
875
876
GLOBAL(void)
877
jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
878
0
{
879
0
  DCTELEM tmp0, tmp1, tmp2, tmp3;
880
0
  JSAMPROW elemptr;
881
882
  /* Pre-zero output coefficient block. */
883
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
884
885
  /* Pass 1: process rows.
886
   * Note results are scaled up by sqrt(8) compared to a true DCT.
887
   */
888
889
  /* Row 0 */
890
0
  elemptr = sample_data[0] + start_col;
891
892
0
  tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
893
0
  tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
894
895
  /* Row 1 */
896
0
  elemptr = sample_data[1] + start_col;
897
898
0
  tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
899
0
  tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
900
901
  /* Pass 2: process columns.
902
   * We leave the results scaled up by an overall factor of 8.
903
   * We must also scale the output by (8/2)**2 = 2**4.
904
   */
905
906
  /* Column 0 */
907
  /* Apply unsigned->signed conversion. */
908
0
  data[DCTSIZE*0] = (tmp0 + tmp2 - 4 * CENTERJSAMPLE) << 4;
909
0
  data[DCTSIZE*1] = (tmp0 - tmp2) << 4;
910
911
  /* Column 1 */
912
0
  data[DCTSIZE*0+1] = (tmp1 + tmp3) << 4;
913
0
  data[DCTSIZE*1+1] = (tmp1 - tmp3) << 4;
914
0
}
915
916
917
/*
918
 * Perform the forward DCT on a 1x1 sample block.
919
 */
920
921
GLOBAL(void)
922
jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
923
0
{
924
0
  DCTELEM dcval;
925
926
  /* Pre-zero output coefficient block. */
927
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
928
929
0
  dcval = GETJSAMPLE(sample_data[0][start_col]);
930
931
  /* We leave the result scaled up by an overall factor of 8. */
932
  /* We must also scale the output by (8/1)**2 = 2**6. */
933
  /* Apply unsigned->signed conversion. */
934
0
  data[0] = (dcval - CENTERJSAMPLE) << 6;
935
0
}
936
937
938
/*
939
 * Perform the forward DCT on a 9x9 sample block.
940
 */
941
942
GLOBAL(void)
943
jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
944
0
{
945
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
946
0
  INT32 tmp10, tmp11, tmp12, tmp13;
947
0
  INT32 z1, z2;
948
0
  DCTELEM workspace[8];
949
0
  DCTELEM *dataptr;
950
0
  DCTELEM *wsptr;
951
0
  JSAMPROW elemptr;
952
0
  int ctr;
953
0
  SHIFT_TEMPS
954
955
  /* Pass 1: process rows.
956
   * Note results are scaled up by sqrt(8) compared to a true DCT;
957
   * we scale the results further by 2 as part of output adaption
958
   * scaling for different DCT size.
959
   * cK represents sqrt(2) * cos(K*pi/18).
960
   */
961
962
0
  dataptr = data;
963
0
  ctr = 0;
964
0
  for (;;) {
965
0
    elemptr = sample_data[ctr] + start_col;
966
967
    /* Even part */
968
969
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
970
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
971
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
972
0
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
973
0
    tmp4 = GETJSAMPLE(elemptr[4]);
974
975
0
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
976
0
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
977
0
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
978
0
    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
979
980
0
    z1 = tmp0 + tmp2 + tmp3;
981
0
    z2 = tmp1 + tmp4;
982
    /* Apply unsigned->signed conversion. */
983
0
    dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
984
0
    dataptr[6] = (DCTELEM)
985
0
      DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)),  /* c6 */
986
0
        CONST_BITS-1);
987
0
    z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049));        /* c2 */
988
0
    z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
989
0
    dataptr[2] = (DCTELEM)
990
0
      DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441))    /* c4 */
991
0
        + z1 + z2, CONST_BITS-1);
992
0
    dataptr[4] = (DCTELEM)
993
0
      DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608))    /* c8 */
994
0
        + z1 - z2, CONST_BITS-1);
995
996
    /* Odd part */
997
998
0
    dataptr[3] = (DCTELEM)
999
0
      DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
1000
0
        CONST_BITS-1);
1001
1002
0
    tmp11 = MULTIPLY(tmp11, FIX(1.224744871));        /* c3 */
1003
0
    tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
1004
0
    tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
1005
1006
0
    dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);
1007
1008
0
    tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
1009
1010
0
    dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
1011
0
    dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);
1012
1013
0
    ctr++;
1014
1015
0
    if (ctr != DCTSIZE) {
1016
0
      if (ctr == 9)
1017
0
  break;     /* Done. */
1018
0
      dataptr += DCTSIZE; /* advance pointer to next row */
1019
0
    } else
1020
0
      dataptr = workspace; /* switch pointer to extended workspace */
1021
0
  }
1022
1023
  /* Pass 2: process columns.
1024
   * We leave the results scaled up by an overall factor of 8.
1025
   * We must also scale the output by (8/9)**2 = 64/81, which we partially
1026
   * fold into the constant multipliers and final/initial shifting:
1027
   * cK now represents sqrt(2) * cos(K*pi/18) * 128/81.
1028
   */
1029
1030
0
  dataptr = data;
1031
0
  wsptr = workspace;
1032
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1033
    /* Even part */
1034
1035
0
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*0];
1036
0
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*7];
1037
0
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*6];
1038
0
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*5];
1039
0
    tmp4 = dataptr[DCTSIZE*4];
1040
1041
0
    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*0];
1042
0
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*7];
1043
0
    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*6];
1044
0
    tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*5];
1045
1046
0
    z1 = tmp0 + tmp2 + tmp3;
1047
0
    z2 = tmp1 + tmp4;
1048
0
    dataptr[DCTSIZE*0] = (DCTELEM)
1049
0
      DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)),       /* 128/81 */
1050
0
        CONST_BITS+2);
1051
0
    dataptr[DCTSIZE*6] = (DCTELEM)
1052
0
      DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)),  /* c6 */
1053
0
        CONST_BITS+2);
1054
0
    z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287));        /* c2 */
1055
0
    z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
1056
0
    dataptr[DCTSIZE*2] = (DCTELEM)
1057
0
      DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190))    /* c4 */
1058
0
        + z1 + z2, CONST_BITS+2);
1059
0
    dataptr[DCTSIZE*4] = (DCTELEM)
1060
0
      DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096))    /* c8 */
1061
0
        + z1 - z2, CONST_BITS+2);
1062
1063
    /* Odd part */
1064
1065
0
    dataptr[DCTSIZE*3] = (DCTELEM)
1066
0
      DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
1067
0
        CONST_BITS+2);
1068
1069
0
    tmp11 = MULTIPLY(tmp11, FIX(1.935399303));        /* c3 */
1070
0
    tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
1071
0
    tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
1072
1073
0
    dataptr[DCTSIZE*1] = (DCTELEM)
1074
0
      DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);
1075
1076
0
    tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
1077
1078
0
    dataptr[DCTSIZE*5] = (DCTELEM)
1079
0
      DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
1080
0
    dataptr[DCTSIZE*7] = (DCTELEM)
1081
0
      DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);
1082
1083
0
    dataptr++;      /* advance pointer to next column */
1084
0
    wsptr++;      /* advance pointer to next column */
1085
0
  }
1086
0
}
1087
1088
1089
/*
1090
 * Perform the forward DCT on a 10x10 sample block.
1091
 */
1092
1093
GLOBAL(void)
1094
jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1095
0
{
1096
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
1097
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1098
0
  DCTELEM workspace[8*2];
1099
0
  DCTELEM *dataptr;
1100
0
  DCTELEM *wsptr;
1101
0
  JSAMPROW elemptr;
1102
0
  int ctr;
1103
0
  SHIFT_TEMPS
1104
1105
  /* Pass 1: process rows.
1106
   * Note results are scaled up by sqrt(8) compared to a true DCT;
1107
   * we scale the results further by 2 as part of output adaption
1108
   * scaling for different DCT size.
1109
   * cK represents sqrt(2) * cos(K*pi/20).
1110
   */
1111
1112
0
  dataptr = data;
1113
0
  ctr = 0;
1114
0
  for (;;) {
1115
0
    elemptr = sample_data[ctr] + start_col;
1116
1117
    /* Even part */
1118
1119
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
1120
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
1121
0
    tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
1122
0
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
1123
0
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
1124
1125
0
    tmp10 = tmp0 + tmp4;
1126
0
    tmp13 = tmp0 - tmp4;
1127
0
    tmp11 = tmp1 + tmp3;
1128
0
    tmp14 = tmp1 - tmp3;
1129
1130
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
1131
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
1132
0
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
1133
0
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
1134
0
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
1135
1136
    /* Apply unsigned->signed conversion. */
1137
0
    dataptr[0] = (DCTELEM)
1138
0
      ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
1139
0
    tmp12 += tmp12;
1140
0
    dataptr[4] = (DCTELEM)
1141
0
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
1142
0
        MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
1143
0
        CONST_BITS-1);
1144
0
    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
1145
0
    dataptr[2] = (DCTELEM)
1146
0
      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
1147
0
        CONST_BITS-1);
1148
0
    dataptr[6] = (DCTELEM)
1149
0
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
1150
0
        CONST_BITS-1);
1151
1152
    /* Odd part */
1153
1154
0
    tmp10 = tmp0 + tmp4;
1155
0
    tmp11 = tmp1 - tmp3;
1156
0
    dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
1157
0
    tmp2 <<= CONST_BITS;
1158
0
    dataptr[1] = (DCTELEM)
1159
0
      DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
1160
0
        MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
1161
0
        MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
1162
0
        MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
1163
0
        CONST_BITS-1);
1164
0
    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
1165
0
      MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
1166
0
    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
1167
0
      (tmp11 << (CONST_BITS - 1)) - tmp2;
1168
0
    dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
1169
0
    dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);
1170
1171
0
    ctr++;
1172
1173
0
    if (ctr != DCTSIZE) {
1174
0
      if (ctr == 10)
1175
0
  break;     /* Done. */
1176
0
      dataptr += DCTSIZE; /* advance pointer to next row */
1177
0
    } else
1178
0
      dataptr = workspace; /* switch pointer to extended workspace */
1179
0
  }
1180
1181
  /* Pass 2: process columns.
1182
   * We leave the results scaled up by an overall factor of 8.
1183
   * We must also scale the output by (8/10)**2 = 16/25, which we partially
1184
   * fold into the constant multipliers and final/initial shifting:
1185
   * cK now represents sqrt(2) * cos(K*pi/20) * 32/25.
1186
   */
1187
1188
0
  dataptr = data;
1189
0
  wsptr = workspace;
1190
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1191
    /* Even part */
1192
1193
0
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
1194
0
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
1195
0
    tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
1196
0
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
1197
0
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
1198
1199
0
    tmp10 = tmp0 + tmp4;
1200
0
    tmp13 = tmp0 - tmp4;
1201
0
    tmp11 = tmp1 + tmp3;
1202
0
    tmp14 = tmp1 - tmp3;
1203
1204
0
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
1205
0
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
1206
0
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
1207
0
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
1208
0
    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
1209
1210
0
    dataptr[DCTSIZE*0] = (DCTELEM)
1211
0
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
1212
0
        CONST_BITS+2);
1213
0
    tmp12 += tmp12;
1214
0
    dataptr[DCTSIZE*4] = (DCTELEM)
1215
0
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
1216
0
        MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
1217
0
        CONST_BITS+2);
1218
0
    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
1219
0
    dataptr[DCTSIZE*2] = (DCTELEM)
1220
0
      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
1221
0
        CONST_BITS+2);
1222
0
    dataptr[DCTSIZE*6] = (DCTELEM)
1223
0
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
1224
0
        CONST_BITS+2);
1225
1226
    /* Odd part */
1227
1228
0
    tmp10 = tmp0 + tmp4;
1229
0
    tmp11 = tmp1 - tmp3;
1230
0
    dataptr[DCTSIZE*5] = (DCTELEM)
1231
0
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
1232
0
        CONST_BITS+2);
1233
0
    tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
1234
0
    dataptr[DCTSIZE*1] = (DCTELEM)
1235
0
      DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
1236
0
        MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
1237
0
        MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
1238
0
        MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
1239
0
        CONST_BITS+2);
1240
0
    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
1241
0
      MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
1242
0
    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
1243
0
      MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
1244
0
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
1245
0
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);
1246
1247
0
    dataptr++;      /* advance pointer to next column */
1248
0
    wsptr++;      /* advance pointer to next column */
1249
0
  }
1250
0
}
1251
1252
1253
/*
1254
 * Perform the forward DCT on an 11x11 sample block.
1255
 */
1256
1257
GLOBAL(void)
1258
jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1259
0
{
1260
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1261
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1262
0
  INT32 z1, z2, z3;
1263
0
  DCTELEM workspace[8*3];
1264
0
  DCTELEM *dataptr;
1265
0
  DCTELEM *wsptr;
1266
0
  JSAMPROW elemptr;
1267
0
  int ctr;
1268
0
  SHIFT_TEMPS
1269
1270
  /* Pass 1: process rows.
1271
   * Note results are scaled up by sqrt(8) compared to a true DCT;
1272
   * we scale the results further by 2 as part of output adaption
1273
   * scaling for different DCT size.
1274
   * cK represents sqrt(2) * cos(K*pi/22).
1275
   */
1276
1277
0
  dataptr = data;
1278
0
  ctr = 0;
1279
0
  for (;;) {
1280
0
    elemptr = sample_data[ctr] + start_col;
1281
1282
    /* Even part */
1283
1284
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
1285
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
1286
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
1287
0
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
1288
0
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
1289
0
    tmp5 = GETJSAMPLE(elemptr[5]);
1290
1291
0
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
1292
0
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
1293
0
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
1294
0
    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
1295
0
    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
1296
1297
    /* Apply unsigned->signed conversion. */
1298
0
    dataptr[0] = (DCTELEM)
1299
0
      ((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
1300
0
    tmp5 += tmp5;
1301
0
    tmp0 -= tmp5;
1302
0
    tmp1 -= tmp5;
1303
0
    tmp2 -= tmp5;
1304
0
    tmp3 -= tmp5;
1305
0
    tmp4 -= tmp5;
1306
0
    z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) +       /* c2 */
1307
0
   MULTIPLY(tmp2 + tmp4, FIX(0.201263574));        /* c10 */
1308
0
    z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931));        /* c6 */
1309
0
    z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156));        /* c4 */
1310
0
    dataptr[2] = (DCTELEM)
1311
0
      DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
1312
0
        - MULTIPLY(tmp4, FIX(1.390975730)),        /* c4+c10 */
1313
0
        CONST_BITS-1);
1314
0
    dataptr[4] = (DCTELEM)
1315
0
      DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
1316
0
        - MULTIPLY(tmp2, FIX(1.356927976))         /* c2 */
1317
0
        + MULTIPLY(tmp4, FIX(0.587485545)),        /* c8 */
1318
0
        CONST_BITS-1);
1319
0
    dataptr[6] = (DCTELEM)
1320
0
      DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
1321
0
        - MULTIPLY(tmp2, FIX(0.788749120)),        /* c8+c10 */
1322
0
        CONST_BITS-1);
1323
1324
    /* Odd part */
1325
1326
0
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905));    /* c3 */
1327
0
    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298));    /* c5 */
1328
0
    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576));    /* c7 */
1329
0
    tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
1330
0
     + MULTIPLY(tmp14, FIX(0.398430003));          /* c9 */
1331
0
    tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576));  /* -c7 */
1332
0
    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907));  /* -c1 */
1333
0
    tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
1334
0
      - MULTIPLY(tmp14, FIX(1.068791298));         /* c5 */
1335
0
    tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003));   /* c9 */
1336
0
    tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
1337
0
      + MULTIPLY(tmp14, FIX(1.399818907));         /* c1 */
1338
0
    tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
1339
0
      - MULTIPLY(tmp14, FIX(1.286413905));         /* c3 */
1340
1341
0
    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
1342
0
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
1343
0
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
1344
0
    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);
1345
1346
0
    ctr++;
1347
1348
0
    if (ctr != DCTSIZE) {
1349
0
      if (ctr == 11)
1350
0
  break;     /* Done. */
1351
0
      dataptr += DCTSIZE; /* advance pointer to next row */
1352
0
    } else
1353
0
      dataptr = workspace; /* switch pointer to extended workspace */
1354
0
  }
1355
1356
  /* Pass 2: process columns.
1357
   * We leave the results scaled up by an overall factor of 8.
1358
   * We must also scale the output by (8/11)**2 = 64/121, which we partially
1359
   * fold into the constant multipliers and final/initial shifting:
1360
   * cK now represents sqrt(2) * cos(K*pi/22) * 128/121.
1361
   */
1362
1363
0
  dataptr = data;
1364
0
  wsptr = workspace;
1365
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1366
    /* Even part */
1367
1368
0
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*2];
1369
0
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*1];
1370
0
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*0];
1371
0
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*7];
1372
0
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*6];
1373
0
    tmp5 = dataptr[DCTSIZE*5];
1374
1375
0
    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*2];
1376
0
    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*1];
1377
0
    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*0];
1378
0
    tmp13 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*7];
1379
0
    tmp14 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*6];
1380
1381
0
    dataptr[DCTSIZE*0] = (DCTELEM)
1382
0
      DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
1383
0
           FIX(1.057851240)),                /* 128/121 */
1384
0
        CONST_BITS+2);
1385
0
    tmp5 += tmp5;
1386
0
    tmp0 -= tmp5;
1387
0
    tmp1 -= tmp5;
1388
0
    tmp2 -= tmp5;
1389
0
    tmp3 -= tmp5;
1390
0
    tmp4 -= tmp5;
1391
0
    z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) +       /* c2 */
1392
0
   MULTIPLY(tmp2 + tmp4, FIX(0.212906922));        /* c10 */
1393
0
    z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713));        /* c6 */
1394
0
    z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479));        /* c4 */
1395
0
    dataptr[DCTSIZE*2] = (DCTELEM)
1396
0
      DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
1397
0
        - MULTIPLY(tmp4, FIX(1.471445400)),        /* c4+c10 */
1398
0
        CONST_BITS+2);
1399
0
    dataptr[DCTSIZE*4] = (DCTELEM)
1400
0
      DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
1401
0
        - MULTIPLY(tmp2, FIX(1.435427942))         /* c2 */
1402
0
        + MULTIPLY(tmp4, FIX(0.621472312)),        /* c8 */
1403
0
        CONST_BITS+2);
1404
0
    dataptr[DCTSIZE*6] = (DCTELEM)
1405
0
      DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
1406
0
        - MULTIPLY(tmp2, FIX(0.834379234)),        /* c8+c10 */
1407
0
        CONST_BITS+2);
1408
1409
    /* Odd part */
1410
1411
0
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544));    /* c3 */
1412
0
    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199));    /* c5 */
1413
0
    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568));    /* c7 */
1414
0
    tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
1415
0
     + MULTIPLY(tmp14, FIX(0.421479672));          /* c9 */
1416
0
    tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568));  /* -c7 */
1417
0
    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167));  /* -c1 */
1418
0
    tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
1419
0
      - MULTIPLY(tmp14, FIX(1.130622199));         /* c5 */
1420
0
    tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672));   /* c9 */
1421
0
    tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
1422
0
      + MULTIPLY(tmp14, FIX(1.480800167));         /* c1 */
1423
0
    tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
1424
0
      - MULTIPLY(tmp14, FIX(1.360834544));         /* c3 */
1425
1426
0
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
1427
0
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
1428
0
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
1429
0
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
1430
1431
0
    dataptr++;      /* advance pointer to next column */
1432
0
    wsptr++;      /* advance pointer to next column */
1433
0
  }
1434
0
}
1435
1436
1437
/*
1438
 * Perform the forward DCT on a 12x12 sample block.
1439
 */
1440
1441
GLOBAL(void)
1442
jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1443
0
{
1444
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1445
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1446
0
  DCTELEM workspace[8*4];
1447
0
  DCTELEM *dataptr;
1448
0
  DCTELEM *wsptr;
1449
0
  JSAMPROW elemptr;
1450
0
  int ctr;
1451
0
  SHIFT_TEMPS
1452
1453
  /* Pass 1: process rows.
1454
   * Note results are scaled up by sqrt(8) compared to a true DCT.
1455
   * cK represents sqrt(2) * cos(K*pi/24).
1456
   */
1457
1458
0
  dataptr = data;
1459
0
  ctr = 0;
1460
0
  for (;;) {
1461
0
    elemptr = sample_data[ctr] + start_col;
1462
1463
    /* Even part */
1464
1465
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
1466
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
1467
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
1468
0
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
1469
0
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
1470
0
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
1471
1472
0
    tmp10 = tmp0 + tmp5;
1473
0
    tmp13 = tmp0 - tmp5;
1474
0
    tmp11 = tmp1 + tmp4;
1475
0
    tmp14 = tmp1 - tmp4;
1476
0
    tmp12 = tmp2 + tmp3;
1477
0
    tmp15 = tmp2 - tmp3;
1478
1479
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
1480
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
1481
0
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
1482
0
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
1483
0
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
1484
0
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
1485
1486
    /* Apply unsigned->signed conversion. */
1487
0
    dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
1488
0
    dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
1489
0
    dataptr[4] = (DCTELEM)
1490
0
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
1491
0
        CONST_BITS);
1492
0
    dataptr[2] = (DCTELEM)
1493
0
      DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
1494
0
        CONST_BITS);
1495
1496
    /* Odd part */
1497
1498
0
    tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
1499
0
    tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
1500
0
    tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
1501
0
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
1502
0
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
1503
0
    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
1504
0
      + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
1505
0
    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
1506
0
    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
1507
0
      + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
1508
0
    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
1509
0
      - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
1510
0
    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
1511
0
      - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */
1512
1513
0
    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
1514
0
    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
1515
0
    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
1516
0
    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
1517
1518
0
    ctr++;
1519
1520
0
    if (ctr != DCTSIZE) {
1521
0
      if (ctr == 12)
1522
0
  break;     /* Done. */
1523
0
      dataptr += DCTSIZE; /* advance pointer to next row */
1524
0
    } else
1525
0
      dataptr = workspace; /* switch pointer to extended workspace */
1526
0
  }
1527
1528
  /* Pass 2: process columns.
1529
   * We leave the results scaled up by an overall factor of 8.
1530
   * We must also scale the output by (8/12)**2 = 4/9, which we partially
1531
   * fold into the constant multipliers and final shifting:
1532
   * cK now represents sqrt(2) * cos(K*pi/24) * 8/9.
1533
   */
1534
1535
0
  dataptr = data;
1536
0
  wsptr = workspace;
1537
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1538
    /* Even part */
1539
1540
0
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
1541
0
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
1542
0
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
1543
0
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
1544
0
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
1545
0
    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
1546
1547
0
    tmp10 = tmp0 + tmp5;
1548
0
    tmp13 = tmp0 - tmp5;
1549
0
    tmp11 = tmp1 + tmp4;
1550
0
    tmp14 = tmp1 - tmp4;
1551
0
    tmp12 = tmp2 + tmp3;
1552
0
    tmp15 = tmp2 - tmp3;
1553
1554
0
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
1555
0
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
1556
0
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
1557
0
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
1558
0
    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
1559
0
    tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
1560
1561
0
    dataptr[DCTSIZE*0] = (DCTELEM)
1562
0
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
1563
0
        CONST_BITS+1);
1564
0
    dataptr[DCTSIZE*6] = (DCTELEM)
1565
0
      DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
1566
0
        CONST_BITS+1);
1567
0
    dataptr[DCTSIZE*4] = (DCTELEM)
1568
0
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
1569
0
        CONST_BITS+1);
1570
0
    dataptr[DCTSIZE*2] = (DCTELEM)
1571
0
      DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
1572
0
        MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
1573
0
        CONST_BITS+1);
1574
1575
    /* Odd part */
1576
1577
0
    tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
1578
0
    tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
1579
0
    tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
1580
0
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
1581
0
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
1582
0
    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
1583
0
      + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
1584
0
    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
1585
0
    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
1586
0
      + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
1587
0
    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
1588
0
      - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
1589
0
    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
1590
0
      - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
1591
1592
0
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
1593
0
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
1594
0
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
1595
0
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);
1596
1597
0
    dataptr++;      /* advance pointer to next column */
1598
0
    wsptr++;      /* advance pointer to next column */
1599
0
  }
1600
0
}
1601
1602
1603
/*
1604
 * Perform the forward DCT on a 13x13 sample block.
 *
 * Input:  13 rows, sample_data[0] .. sample_data[12], each read at sample
 *         offsets start_col .. start_col+12.
 * Output: coefficients 0..7 for each of DCTSIZE columns, stored in data[]
 *         with a row stride of DCTSIZE.
 *
 * Pass 1 applies a 13-point transform to every sample row and keeps only
 * coefficients 0..7.  The results of the first DCTSIZE rows go to data[],
 * those of the remaining rows go to a local workspace.  Pass 2 applies the
 * 13-point transform (with rescaled constants) down each column, in place.
 * NOTE(review): the row pairing in pass 2 implies DCTSIZE == 8; DCTSIZE is
 * defined outside this view.
1605
 */
1606
1607
GLOBAL(void)
1608
jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1609
0
{
1610
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1611
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1612
0
  INT32 z1, z2;
1613
0
  DCTELEM workspace[8*5];  /* pass-1 output of the sample rows after the first DCTSIZE */
1614
0
  DCTELEM *dataptr;
1615
0
  DCTELEM *wsptr;
1616
0
  JSAMPROW elemptr;
1617
0
  int ctr;
1618
0
  SHIFT_TEMPS
1619
1620
  /* Pass 1: process rows.
1621
   * Note results are scaled up by sqrt(8) compared to a true DCT.
1622
   * cK represents sqrt(2) * cos(K*pi/26).
1623
   */
1624
1625
0
  dataptr = data;
1626
0
  ctr = 0;
1627
0
  for (;;) {
1628
0
    elemptr = sample_data[ctr] + start_col;
1629
1630
    /* Even part */
1631
1632
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
1633
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
1634
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
1635
0
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
1636
0
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
1637
0
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
1638
0
    tmp6 = GETJSAMPLE(elemptr[6]);  /* centre sample, has no mirror partner */
1639
    /* Differences of the same mirrored pairs; consumed by the odd part. */
1640
0
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
1641
0
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
1642
0
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
1643
0
    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
1644
0
    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
1645
0
    tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
1646
1647
    /* Apply unsigned->signed conversion. */
1648
0
    dataptr[0] = (DCTELEM)
1649
0
      (tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
1650
0
    tmp6 += tmp6;  /* twice the centre sample; removed from every pair sum below */
1651
0
    tmp0 -= tmp6;
1652
0
    tmp1 -= tmp6;
1653
0
    tmp2 -= tmp6;
1654
0
    tmp3 -= tmp6;
1655
0
    tmp4 -= tmp6;
1656
0
    tmp5 -= tmp6;
1657
0
    dataptr[2] = (DCTELEM)
1658
0
      DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) +   /* c2 */
1659
0
        MULTIPLY(tmp1, FIX(1.058554052)) +   /* c6 */
1660
0
        MULTIPLY(tmp2, FIX(0.501487041)) -   /* c10 */
1661
0
        MULTIPLY(tmp3, FIX(0.170464608)) -   /* c12 */
1662
0
        MULTIPLY(tmp4, FIX(0.803364869)) -   /* c8 */
1663
0
        MULTIPLY(tmp5, FIX(1.252223920)),    /* c4 */
1664
0
        CONST_BITS);
    /* Coefficients 4 and 6 share all products: they are z1 + z2 and z1 - z2. */
1665
0
    z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
1666
0
   MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
1667
0
   MULTIPLY(tmp1 - tmp5, FIX(0.316450131));  /* (c8-c12)/2 */
1668
0
    z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
1669
0
   MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
1670
0
   MULTIPLY(tmp1 + tmp5, FIX(0.486914739));  /* (c8+c12)/2 */
1671
1672
0
    dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
1673
0
    dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
1674
1675
    /* Odd part */
1676
    /* From here on tmp0..tmp6 are reused as scratch for the odd outputs. */
1677
0
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651));   /* c3 */
1678
0
    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945));   /* c5 */
1679
0
    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) +  /* c7 */
1680
0
     MULTIPLY(tmp14 + tmp15, FIX(0.338443458));   /* c11 */
1681
0
    tmp0 = tmp1 + tmp2 + tmp3 -
1682
0
     MULTIPLY(tmp10, FIX(2.020082300)) +          /* c3+c5+c7-c1 */
1683
0
     MULTIPLY(tmp14, FIX(0.318774355));           /* c9-c11 */
1684
0
    tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) -  /* c7 */
1685
0
     MULTIPLY(tmp11 + tmp12, FIX(0.338443458));   /* c11 */
1686
0
    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
1687
0
    tmp1 += tmp4 + tmp5 +
1688
0
      MULTIPLY(tmp11, FIX(0.837223564)) -         /* c5+c9+c11-c3 */
1689
0
      MULTIPLY(tmp14, FIX(2.341699410));          /* c1+c7 */
1690
0
    tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
1691
0
    tmp2 += tmp4 + tmp6 -
1692
0
      MULTIPLY(tmp12, FIX(1.572116027)) +         /* c1+c5-c9-c11 */
1693
0
      MULTIPLY(tmp15, FIX(2.260109708));          /* c3+c7 */
1694
0
    tmp3 += tmp5 + tmp6 +
1695
0
      MULTIPLY(tmp13, FIX(2.205608352)) -         /* c3+c5+c9-c7 */
1696
0
      MULTIPLY(tmp15, FIX(1.742345811));          /* c1+c11 */
1697
1698
0
    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
1699
0
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
1700
0
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
1701
0
    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
1702
1703
0
    ctr++;
1704
1705
0
    if (ctr != DCTSIZE) {
1706
0
      if (ctr == 13)  /* all 13 sample rows have been transformed */
1707
0
  break;     /* Done. */
1708
0
      dataptr += DCTSIZE; /* advance pointer to next row */
1709
0
    } else
1710
0
      dataptr = workspace; /* switch pointer to extended workspace */
1711
0
  }
1712
1713
  /* Pass 2: process columns.
1714
   * We leave the results scaled up by an overall factor of 8.
1715
   * We must also scale the output by (8/13)**2 = 64/169, which we partially
1716
   * fold into the constant multipliers and final shifting:
1717
   * cK now represents sqrt(2) * cos(K*pi/26) * 128/169.
1718
   */
1719
1720
0
  dataptr = data;
1721
0
  wsptr = workspace;
1722
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* The column is transformed in place, so every input is loaded into a
     * temporary before the first store to dataptr[].  Workspace row k holds
     * the pass-1 result of sample row DCTSIZE+k.
     */
1723
    /* Even part */
1724
1725
0
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*4];
1726
0
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*3];
1727
0
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*2];
1728
0
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*1];
1729
0
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*0];
1730
0
    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*7];
1731
0
    tmp6 = dataptr[DCTSIZE*6];  /* centre row of the 13 */
1732
1733
0
    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*4];
1734
0
    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*3];
1735
0
    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*2];
1736
0
    tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*1];
1737
0
    tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*0];
1738
0
    tmp15 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*7];
1739
1740
0
    dataptr[DCTSIZE*0] = (DCTELEM)
1741
0
      DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
1742
0
           FIX(0.757396450)),          /* 128/169 */
1743
0
        CONST_BITS+1);
1744
0
    tmp6 += tmp6;  /* twice the centre value; removed from every pair sum below */
1745
0
    tmp0 -= tmp6;
1746
0
    tmp1 -= tmp6;
1747
0
    tmp2 -= tmp6;
1748
0
    tmp3 -= tmp6;
1749
0
    tmp4 -= tmp6;
1750
0
    tmp5 -= tmp6;
1751
0
    dataptr[DCTSIZE*2] = (DCTELEM)
1752
0
      DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) +   /* c2 */
1753
0
        MULTIPLY(tmp1, FIX(0.801745081)) +   /* c6 */
1754
0
        MULTIPLY(tmp2, FIX(0.379824504)) -   /* c10 */
1755
0
        MULTIPLY(tmp3, FIX(0.129109289)) -   /* c12 */
1756
0
        MULTIPLY(tmp4, FIX(0.608465700)) -   /* c8 */
1757
0
        MULTIPLY(tmp5, FIX(0.948429952)),    /* c4 */
1758
0
        CONST_BITS+1);
1759
0
    z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
1760
0
   MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
1761
0
   MULTIPLY(tmp1 - tmp5, FIX(0.239678205));  /* (c8-c12)/2 */
1762
0
    z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
1763
0
   MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
1764
0
   MULTIPLY(tmp1 + tmp5, FIX(0.368787494));  /* (c8+c12)/2 */
1765
1766
0
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
1767
0
    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);
1768
1769
    /* Odd part */
1770
1771
0
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908));   /* c3 */
1772
0
    tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751));   /* c5 */
1773
0
    tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) +  /* c7 */
1774
0
     MULTIPLY(tmp14 + tmp15, FIX(0.256335874));   /* c11 */
1775
0
    tmp0 = tmp1 + tmp2 + tmp3 -
1776
0
     MULTIPLY(tmp10, FIX(1.530003162)) +          /* c3+c5+c7-c1 */
1777
0
     MULTIPLY(tmp14, FIX(0.241438564));           /* c9-c11 */
1778
0
    tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) -  /* c7 */
1779
0
     MULTIPLY(tmp11 + tmp12, FIX(0.256335874));   /* c11 */
1780
0
    tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
1781
0
    tmp1 += tmp4 + tmp5 +
1782
0
      MULTIPLY(tmp11, FIX(0.634110155)) -         /* c5+c9+c11-c3 */
1783
0
      MULTIPLY(tmp14, FIX(1.773594819));          /* c1+c7 */
1784
0
    tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
1785
0
    tmp2 += tmp4 + tmp6 -
1786
0
      MULTIPLY(tmp12, FIX(1.190715098)) +         /* c1+c5-c9-c11 */
1787
0
      MULTIPLY(tmp15, FIX(1.711799069));          /* c3+c7 */
1788
0
    tmp3 += tmp5 + tmp6 +
1789
0
      MULTIPLY(tmp13, FIX(1.670519935)) -         /* c3+c5+c9-c7 */
1790
0
      MULTIPLY(tmp15, FIX(1.319646532));          /* c1+c11 */
1791
1792
0
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
1793
0
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
1794
0
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
1795
0
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);
1796
1797
0
    dataptr++;      /* advance pointer to next column */
1798
0
    wsptr++;      /* advance pointer to next column */
1799
0
  }
1800
0
}
1801
1802
1803
/*
1804
 * Perform the forward DCT on a 14x14 sample block.
 *
 * Input:  14 rows, sample_data[0] .. sample_data[13], each read at sample
 *         offsets start_col .. start_col+13.
 * Output: coefficients 0..7 for each of DCTSIZE columns, stored in data[]
 *         with a row stride of DCTSIZE.
 *
 * Pass 1 applies a 14-point transform to every sample row and keeps only
 * coefficients 0..7.  The results of the first DCTSIZE rows go to data[],
 * those of the remaining rows go to a local workspace.  Pass 2 applies the
 * 14-point transform (with rescaled constants) down each column, in place.
 * NOTE(review): the row pairing in pass 2 implies DCTSIZE == 8; DCTSIZE is
 * defined outside this view.
1805
 */
1806
1807
GLOBAL(void)
1808
jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1809
0
{
1810
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1811
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1812
0
  DCTELEM workspace[8*6];  /* pass-1 output of the sample rows after the first DCTSIZE */
1813
0
  DCTELEM *dataptr;
1814
0
  DCTELEM *wsptr;
1815
0
  JSAMPROW elemptr;
1816
0
  int ctr;
1817
0
  SHIFT_TEMPS
1818
1819
  /* Pass 1: process rows.
1820
   * Note results are scaled up by sqrt(8) compared to a true DCT.
1821
   * cK represents sqrt(2) * cos(K*pi/28).
1822
   */
1823
1824
0
  dataptr = data;
1825
0
  ctr = 0;
1826
0
  for (;;) {
1827
0
    elemptr = sample_data[ctr] + start_col;
1828
1829
    /* Even part */
1830
1831
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
1832
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
1833
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
1834
0
    tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);  /* middle of the 7 pair sums, kept as is */
1835
0
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
1836
0
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
1837
0
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
1838
1839
0
    tmp10 = tmp0 + tmp6;
1840
0
    tmp14 = tmp0 - tmp6;
1841
0
    tmp11 = tmp1 + tmp5;
1842
0
    tmp15 = tmp1 - tmp5;
1843
0
    tmp12 = tmp2 + tmp4;
1844
0
    tmp16 = tmp2 - tmp4;
1845
    /* tmp0..tmp6 are now reloaded with the pair differences for the odd part. */
1846
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
1847
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
1848
0
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
1849
0
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
1850
0
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
1851
0
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
1852
0
    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
1853
1854
    /* Apply unsigned->signed conversion. */
1855
0
    dataptr[0] = (DCTELEM)
1856
0
      (tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
1857
0
    tmp13 += tmp13;  /* twice the middle pair sum; removed from the other sums below */
1858
0
    dataptr[4] = (DCTELEM)
1859
0
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
1860
0
        MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
1861
0
        MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
1862
0
        CONST_BITS);
1863
1864
0
    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
1865
1866
0
    dataptr[2] = (DCTELEM)
1867
0
      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
1868
0
        + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
1869
0
        CONST_BITS);
1870
0
    dataptr[6] = (DCTELEM)
1871
0
      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
1872
0
        - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
1873
0
        CONST_BITS);
1874
1875
    /* Odd part */
1876
1877
0
    tmp10 = tmp1 + tmp2;
1878
0
    tmp11 = tmp5 - tmp4;
    /* c7 = sqrt(2) * cos(7*pi/28) = 1, so coefficient 7 needs no multiply. */
1879
0
    dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
    /* Bring the c7 (= 1) term to the scale of the sums that are descaled by
     * CONST_BITS below.  This is done by multiplication because tmp3 may be
     * negative, and left-shifting a negative value is undefined in C.
     */
1880
0
    tmp3 *= ((INT32) 1 << CONST_BITS);
1881
0
    tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
1882
0
    tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
1883
0
    tmp10 += tmp11 - tmp3;
1884
0
    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
1885
0
      MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
1886
0
    dataptr[5] = (DCTELEM)
1887
0
      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
1888
0
        + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
1889
0
        CONST_BITS);
1890
0
    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
1891
0
      MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
1892
0
    dataptr[3] = (DCTELEM)
1893
0
      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
1894
0
        - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
1895
0
        CONST_BITS);
    /* Coefficient 1 must weight tmp6 with c13.  tmp11 and tmp12 already
     * contribute c9-c11 for tmp6, so c9-c11-c13 is subtracted here, in the
     * same form pass 2 uses with its rescaled constants.  (Adding unscaled
     * tmp6 and subtracting (c3+c5-c1)*tmp6 would leave a weight that is
     * off by about 1.0, since c3+c5-c1 = 1 + c9-c11-c13.)
     */
1896
0
    dataptr[1] = (DCTELEM)
1897
0
      DESCALE(tmp11 + tmp12 + tmp3 -
1898
0
        MULTIPLY(tmp0, FIX(1.126980169)) -          /* c3+c5-c1 */
        MULTIPLY(tmp6, FIX(0.126980169)),           /* c9-c11-c13 */
1899
0
        CONST_BITS);
1900
1901
0
    ctr++;
1902
1903
0
    if (ctr != DCTSIZE) {
1904
0
      if (ctr == 14)  /* all 14 sample rows have been transformed */
1905
0
  break;     /* Done. */
1906
0
      dataptr += DCTSIZE; /* advance pointer to next row */
1907
0
    } else
1908
0
      dataptr = workspace; /* switch pointer to extended workspace */
1909
0
  }
1910
1911
  /* Pass 2: process columns.
1912
   * We leave the results scaled up by an overall factor of 8.
1913
   * We must also scale the output by (8/14)**2 = 16/49, which we partially
1914
   * fold into the constant multipliers and final shifting:
1915
   * cK now represents sqrt(2) * cos(K*pi/28) * 32/49.
1916
   */
1917
1918
0
  dataptr = data;
1919
0
  wsptr = workspace;
1920
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* The column is transformed in place, so every input is loaded into a
     * temporary before the first store to dataptr[].  Workspace row k holds
     * the pass-1 result of sample row DCTSIZE+k.
     */
1921
    /* Even part */
1922
1923
0
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
1924
0
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
1925
0
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
1926
0
    tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
1927
0
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
1928
0
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
1929
0
    tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
1930
1931
0
    tmp10 = tmp0 + tmp6;
1932
0
    tmp14 = tmp0 - tmp6;
1933
0
    tmp11 = tmp1 + tmp5;
1934
0
    tmp15 = tmp1 - tmp5;
1935
0
    tmp12 = tmp2 + tmp4;
1936
0
    tmp16 = tmp2 - tmp4;
1937
1938
0
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
1939
0
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
1940
0
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
1941
0
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
1942
0
    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
1943
0
    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
1944
0
    tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
1945
1946
0
    dataptr[DCTSIZE*0] = (DCTELEM)
1947
0
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
1948
0
           FIX(0.653061224)),                 /* 32/49 */
1949
0
        CONST_BITS+1);
1950
0
    tmp13 += tmp13;
1951
0
    dataptr[DCTSIZE*4] = (DCTELEM)
1952
0
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
1953
0
        MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
1954
0
        MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
1955
0
        CONST_BITS+1);
1956
1957
0
    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */
1958
1959
0
    dataptr[DCTSIZE*2] = (DCTELEM)
1960
0
      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
1961
0
        + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
1962
0
        CONST_BITS+1);
1963
0
    dataptr[DCTSIZE*6] = (DCTELEM)
1964
0
      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
1965
0
        - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
1966
0
        CONST_BITS+1);
1967
1968
    /* Odd part */
1969
1970
0
    tmp10 = tmp1 + tmp2;
1971
0
    tmp11 = tmp5 - tmp4;
1972
0
    dataptr[DCTSIZE*7] = (DCTELEM)
1973
0
      DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
1974
0
           FIX(0.653061224)),                 /* 32/49 */
1975
0
        CONST_BITS+1);
1976
0
    tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
1977
0
    tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
1978
0
    tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
1979
0
    tmp10 += tmp11 - tmp3;
1980
0
    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
1981
0
      MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
1982
0
    dataptr[DCTSIZE*5] = (DCTELEM)
1983
0
      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
1984
0
        + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
1985
0
        CONST_BITS+1);
1986
0
    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
1987
0
      MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
1988
0
    dataptr[DCTSIZE*3] = (DCTELEM)
1989
0
      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
1990
0
        - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
1991
0
        CONST_BITS+1);
1992
0
    dataptr[DCTSIZE*1] = (DCTELEM)
1993
0
      DESCALE(tmp11 + tmp12 + tmp3
1994
0
        - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
1995
0
        - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
1996
0
        CONST_BITS+1);
1997
1998
0
    dataptr++;      /* advance pointer to next column */
1999
0
    wsptr++;      /* advance pointer to next column */
2000
0
  }
2001
0
}
2002
2003
2004
/*
2005
 * Perform the forward DCT on a 15x15 sample block.
 *
 * Input:  15 rows, sample_data[0] .. sample_data[14], each read at sample
 *         offsets start_col .. start_col+14.
 * Output: coefficients 0..7 for each of DCTSIZE columns, stored in data[]
 *         with a row stride of DCTSIZE.
 *
 * Pass 1 applies a 15-point transform to every sample row and keeps only
 * coefficients 0..7.  The results of the first DCTSIZE rows go to data[],
 * those of the remaining rows go to a local workspace.  Pass 2 applies the
 * 15-point transform (with rescaled constants) down each column, in place.
 * NOTE(review): the row pairing in pass 2 implies DCTSIZE == 8; DCTSIZE is
 * defined outside this view.
2006
 */
2007
2008
GLOBAL(void)
2009
jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2010
0
{
2011
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2012
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2013
0
  INT32 z1, z2, z3;
2014
0
  DCTELEM workspace[8*7];  /* pass-1 output of the sample rows after the first DCTSIZE */
2015
0
  DCTELEM *dataptr;
2016
0
  DCTELEM *wsptr;
2017
0
  JSAMPROW elemptr;
2018
0
  int ctr;
2019
0
  SHIFT_TEMPS
2020
2021
  /* Pass 1: process rows.
2022
   * Note results are scaled up by sqrt(8) compared to a true DCT.
2023
   * cK represents sqrt(2) * cos(K*pi/30).
2024
   */
2025
2026
0
  dataptr = data;
2027
0
  ctr = 0;
2028
0
  for (;;) {
2029
0
    elemptr = sample_data[ctr] + start_col;
2030
2031
    /* Even part */
2032
2033
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
2034
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
2035
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
2036
0
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
2037
0
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
2038
0
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
2039
0
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
2040
0
    tmp7 = GETJSAMPLE(elemptr[7]);  /* centre sample, has no mirror partner */
2041
    /* Differences of the same mirrored pairs; consumed by the odd part. */
2042
0
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
2043
0
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
2044
0
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
2045
0
    tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
2046
0
    tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
2047
0
    tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
2048
0
    tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
2049
    /* z1 + z2 + z3 is the sum of all 15 samples of the row. */
2050
0
    z1 = tmp0 + tmp4 + tmp5;
2051
0
    z2 = tmp1 + tmp3 + tmp6;
2052
0
    z3 = tmp2 + tmp7;
2053
    /* Apply unsigned->signed conversion. */
2054
0
    dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
2055
0
    z3 += z3;  /* doubled before being removed from z1 and z2 below */
2056
0
    dataptr[6] = (DCTELEM)
2057
0
      DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
2058
0
        MULTIPLY(z2 - z3, FIX(0.437016024)),  /* c12 */
2059
0
        CONST_BITS);
    /* tmp2 is rebuilt here and then subtracted from the other pair sums. */
2060
0
    tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2061
0
    z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) -  /* c2+c14 */
2062
0
         MULTIPLY(tmp6 - tmp2, FIX(2.238241955));   /* c4+c8 */
2063
0
    z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) -  /* c8-c14 */
2064
0
   MULTIPLY(tmp0 - tmp2, FIX(0.091361227));   /* c2-c4 */
2065
0
    z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) +  /* c2 */
2066
0
   MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) +  /* c8 */
2067
0
   MULTIPLY(tmp1 - tmp4, FIX(0.790569415));   /* (c6+c12)/2 */
2068
2069
0
    dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
2070
0
    dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
2071
2072
    /* Odd part */
2073
    /* From here on tmp0..tmp4 are reused as scratch for the odd outputs. */
2074
0
    tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2075
0
        FIX(1.224744871));                         /* c5 */
2076
0
    tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
2077
0
     MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876));  /* c9 */
2078
0
    tmp12 = MULTIPLY(tmp12, FIX(1.224744871));                 /* c5 */
2079
0
    tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) +         /* c1 */
2080
0
     MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) +         /* c3 */
2081
0
     MULTIPLY(tmp13 + tmp15, FIX(0.575212477));          /* c11 */
2082
0
    tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) -                 /* c7-c11 */
2083
0
     MULTIPLY(tmp14, FIX(0.513743148)) +                 /* c3-c9 */
2084
0
     MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12;   /* c1+c13 */
2085
0
    tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) -               /* -(c1-c7) */
2086
0
     MULTIPLY(tmp11, FIX(2.176250899)) -                 /* c3+c9 */
2087
0
     MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12;   /* c11+c13 */
2088
2089
0
    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
2090
0
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
2091
0
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
2092
0
    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
2093
2094
0
    ctr++;
2095
2096
0
    if (ctr != DCTSIZE) {
2097
0
      if (ctr == 15)  /* all 15 sample rows have been transformed */
2098
0
  break;     /* Done. */
2099
0
      dataptr += DCTSIZE; /* advance pointer to next row */
2100
0
    } else
2101
0
      dataptr = workspace; /* switch pointer to extended workspace */
2102
0
  }
2103
2104
  /* Pass 2: process columns.
2105
   * We leave the results scaled up by an overall factor of 8.
2106
   * We must also scale the output by (8/15)**2 = 64/225, which we partially
2107
   * fold into the constant multipliers and final shifting:
2108
   * cK now represents sqrt(2) * cos(K*pi/30) * 256/225.
2109
   */
2110
2111
0
  dataptr = data;
2112
0
  wsptr = workspace;
2113
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* The column is transformed in place, so every input is loaded into a
     * temporary before the first store to dataptr[].  Workspace row k holds
     * the pass-1 result of sample row DCTSIZE+k.
     */
2114
    /* Even part */
2115
2116
0
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*6];
2117
0
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*5];
2118
0
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*4];
2119
0
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*3];
2120
0
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*2];
2121
0
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*1];
2122
0
    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*0];
2123
0
    tmp7 = dataptr[DCTSIZE*7];  /* centre row of the 15 */
2124
2125
0
    tmp10 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*6];
2126
0
    tmp11 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*5];
2127
0
    tmp12 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*4];
2128
0
    tmp13 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*3];
2129
0
    tmp14 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*2];
2130
0
    tmp15 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*1];
2131
0
    tmp16 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*0];
2132
2133
0
    z1 = tmp0 + tmp4 + tmp5;
2134
0
    z2 = tmp1 + tmp3 + tmp6;
2135
0
    z3 = tmp2 + tmp7;
2136
0
    dataptr[DCTSIZE*0] = (DCTELEM)
2137
0
      DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
2138
0
        CONST_BITS+2);
2139
0
    z3 += z3;
2140
0
    dataptr[DCTSIZE*6] = (DCTELEM)
2141
0
      DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
2142
0
        MULTIPLY(z2 - z3, FIX(0.497227121)),  /* c12 */
2143
0
        CONST_BITS+2);
    /* NOTE(review): in this pass tmp1 + tmp4 is a sum of signed pass-1
     * results and may be negative; >> on a negative signed value is
     * implementation-defined in C.  Confirm that all target compilers
     * shift arithmetically here.
     */
2144
0
    tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2145
0
    z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) -  /* c2+c14 */
2146
0
         MULTIPLY(tmp6 - tmp2, FIX(2.546621957));   /* c4+c8 */
2147
0
    z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) -  /* c8-c14 */
2148
0
   MULTIPLY(tmp0 - tmp2, FIX(0.103948774));   /* c2-c4 */
2149
0
    z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) +  /* c2 */
2150
0
   MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) +  /* c8 */
2151
0
   MULTIPLY(tmp1 - tmp4, FIX(0.899492312));   /* (c6+c12)/2 */
2152
2153
0
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
2154
0
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);
2155
2156
    /* Odd part */
2157
2158
0
    tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2159
0
        FIX(1.393487498));                         /* c5 */
2160
0
    tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
2161
0
     MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187));  /* c9 */
2162
0
    tmp12 = MULTIPLY(tmp12, FIX(1.393487498));                 /* c5 */
2163
0
    tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) +         /* c1 */
2164
0
     MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) +         /* c3 */
2165
0
     MULTIPLY(tmp13 + tmp15, FIX(0.654463974));          /* c11 */
2166
0
    tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) -                 /* c7-c11 */
2167
0
     MULTIPLY(tmp14, FIX(0.584525538)) +                 /* c3-c9 */
2168
0
     MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12;   /* c1+c13 */
2169
0
    tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) -               /* -(c1-c7) */
2170
0
     MULTIPLY(tmp11, FIX(2.476089912)) -                 /* c3+c9 */
2171
0
     MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12;   /* c11+c13 */
2172
2173
0
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
2174
0
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
2175
0
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
2176
0
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
2177
2178
0
    dataptr++;      /* advance pointer to next column */
2179
0
    wsptr++;      /* advance pointer to next column */
2180
0
  }
2181
0
}
2182
2183
2184
/*
2185
 * Perform the forward DCT on a 16x16 sample block.
2186
 */
2187
2188
GLOBAL(void)
2189
jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2190
261k
{
2191
261k
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2192
261k
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2193
261k
  DCTELEM workspace[DCTSIZE2];
2194
261k
  DCTELEM *dataptr;
2195
261k
  DCTELEM *wsptr;
2196
261k
  JSAMPROW elemptr;
2197
261k
  int ctr;
2198
261k
  SHIFT_TEMPS
2199
2200
  /* Pass 1: process rows.
2201
   * Note results are scaled up by sqrt(8) compared to a true DCT;
2202
   * furthermore, we scale the results by 2**PASS1_BITS.
2203
   * cK represents sqrt(2) * cos(K*pi/32).
2204
   */
2205
2206
261k
  dataptr = data;
2207
261k
  ctr = 0;
2208
4.17M
  for (;;) {
2209
4.17M
    elemptr = sample_data[ctr] + start_col;
2210
2211
    /* Even part */
2212
2213
4.17M
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2214
4.17M
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2215
4.17M
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2216
4.17M
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2217
4.17M
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2218
4.17M
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2219
4.17M
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2220
4.17M
    tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2221
2222
4.17M
    tmp10 = tmp0 + tmp7;
2223
4.17M
    tmp14 = tmp0 - tmp7;
2224
4.17M
    tmp11 = tmp1 + tmp6;
2225
4.17M
    tmp15 = tmp1 - tmp6;
2226
4.17M
    tmp12 = tmp2 + tmp5;
2227
4.17M
    tmp16 = tmp2 - tmp5;
2228
4.17M
    tmp13 = tmp3 + tmp4;
2229
4.17M
    tmp17 = tmp3 - tmp4;
2230
2231
4.17M
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2232
4.17M
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2233
4.17M
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2234
4.17M
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2235
4.17M
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2236
4.17M
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2237
4.17M
    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2238
4.17M
    tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2239
2240
    /* Apply unsigned->signed conversion. */
2241
4.17M
    dataptr[0] = (DCTELEM)
2242
4.17M
      ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
2243
4.17M
    dataptr[4] = (DCTELEM)
2244
4.17M
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2245
4.17M
        MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2246
4.17M
        CONST_BITS-PASS1_BITS);
2247
2248
4.17M
    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2249
4.17M
      MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2250
2251
4.17M
    dataptr[2] = (DCTELEM)
2252
4.17M
      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2253
4.17M
        + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
2254
4.17M
        CONST_BITS-PASS1_BITS);
2255
4.17M
    dataptr[6] = (DCTELEM)
2256
4.17M
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2257
4.17M
        - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2258
4.17M
        CONST_BITS-PASS1_BITS);
2259
2260
    /* Odd part */
2261
2262
4.17M
    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
2263
4.17M
      MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
2264
4.17M
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
2265
4.17M
      MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
2266
4.17M
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
2267
4.17M
      MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
2268
4.17M
    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
2269
4.17M
      MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
2270
4.17M
    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
2271
4.17M
      MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
2272
4.17M
    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
2273
4.17M
      MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
2274
4.17M
    tmp10 = tmp11 + tmp12 + tmp13 -
2275
4.17M
      MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
2276
4.17M
      MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
2277
4.17M
    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2278
4.17M
       - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
2279
4.17M
    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2280
4.17M
       + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
2281
4.17M
    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2282
4.17M
       + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
2283
2284
4.17M
    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2285
4.17M
    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2286
4.17M
    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2287
4.17M
    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2288
2289
4.17M
    ctr++;
2290
2291
4.17M
    if (ctr != DCTSIZE) {
2292
3.91M
      if (ctr == DCTSIZE * 2)
2293
261k
  break;      /* Done. */
2294
3.65M
      dataptr += DCTSIZE; /* advance pointer to next row */
2295
3.65M
    } else
2296
261k
      dataptr = workspace; /* switch pointer to extended workspace */
2297
4.17M
  }
2298
2299
  /* Pass 2: process columns.
2300
   * We remove the PASS1_BITS scaling, but leave the results scaled up
2301
   * by an overall factor of 8.
2302
   * We must also scale the output by (8/16)**2 = 1/2**2.
2303
   * cK represents sqrt(2) * cos(K*pi/32).
2304
   */
2305
2306
261k
  dataptr = data;
2307
261k
  wsptr = workspace;
2308
2.34M
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2309
    /* Even part */
2310
2311
2.08M
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
2312
2.08M
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
2313
2.08M
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
2314
2.08M
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
2315
2.08M
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
2316
2.08M
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
2317
2.08M
    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
2318
2.08M
    tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
2319
2320
2.08M
    tmp10 = tmp0 + tmp7;
2321
2.08M
    tmp14 = tmp0 - tmp7;
2322
2.08M
    tmp11 = tmp1 + tmp6;
2323
2.08M
    tmp15 = tmp1 - tmp6;
2324
2.08M
    tmp12 = tmp2 + tmp5;
2325
2.08M
    tmp16 = tmp2 - tmp5;
2326
2.08M
    tmp13 = tmp3 + tmp4;
2327
2.08M
    tmp17 = tmp3 - tmp4;
2328
2329
2.08M
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
2330
2.08M
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
2331
2.08M
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
2332
2.08M
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
2333
2.08M
    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
2334
2.08M
    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
2335
2.08M
    tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
2336
2.08M
    tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
2337
2338
2.08M
    dataptr[DCTSIZE*0] = (DCTELEM)
2339
2.08M
      DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
2340
2.08M
    dataptr[DCTSIZE*4] = (DCTELEM)
2341
2.08M
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2342
2.08M
        MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2343
2.08M
        CONST_BITS+PASS1_BITS+2);
2344
2345
2.08M
    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2346
2.08M
      MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2347
2348
2.08M
    dataptr[DCTSIZE*2] = (DCTELEM)
2349
2.08M
      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2350
2.08M
        + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+10 */
2351
2.08M
        CONST_BITS+PASS1_BITS+2);
2352
2.08M
    dataptr[DCTSIZE*6] = (DCTELEM)
2353
2.08M
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2354
2.08M
        - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2355
2.08M
        CONST_BITS+PASS1_BITS+2);
2356
2357
    /* Odd part */
2358
2359
2.08M
    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
2360
2.08M
      MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
2361
2.08M
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
2362
2.08M
      MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
2363
2.08M
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
2364
2.08M
      MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
2365
2.08M
    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
2366
2.08M
      MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
2367
2.08M
    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
2368
2.08M
      MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
2369
2.08M
    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
2370
2.08M
      MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
2371
2.08M
    tmp10 = tmp11 + tmp12 + tmp13 -
2372
2.08M
      MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
2373
2.08M
      MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
2374
2.08M
    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2375
2.08M
       - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
2376
2.08M
    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2377
2.08M
       + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
2378
2.08M
    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2379
2.08M
       + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
2380
2381
2.08M
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
2382
2.08M
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
2383
2.08M
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
2384
2.08M
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);
2385
2386
2.08M
    dataptr++;      /* advance pointer to next column */
2387
2.08M
    wsptr++;      /* advance pointer to next column */
2388
2.08M
  }
2389
261k
}
2390
2391
2392
/*
2393
 * Perform the forward DCT on a 16x8 sample block.
2394
 *
2395
 * 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
2396
 */
2397
2398
GLOBAL(void)
2399
jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2400
0
{
2401
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2402
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2403
0
  INT32 z1;
2404
0
  DCTELEM *dataptr;
2405
0
  JSAMPROW elemptr;
2406
0
  int ctr;
2407
0
  SHIFT_TEMPS
2408
2409
  /* Pass 1: process rows.
2410
   * Note results are scaled up by sqrt(8) compared to a true DCT;
2411
   * furthermore, we scale the results by 2**PASS1_BITS.
2412
   * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
2413
   */
2414
2415
0
  dataptr = data;
2416
0
  ctr = 0;
2417
0
  for (ctr = 0; ctr < DCTSIZE; ctr++) {
2418
0
    elemptr = sample_data[ctr] + start_col;
2419
2420
    /* Even part */
2421
2422
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2423
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2424
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2425
0
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2426
0
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2427
0
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2428
0
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2429
0
    tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2430
2431
0
    tmp10 = tmp0 + tmp7;
2432
0
    tmp14 = tmp0 - tmp7;
2433
0
    tmp11 = tmp1 + tmp6;
2434
0
    tmp15 = tmp1 - tmp6;
2435
0
    tmp12 = tmp2 + tmp5;
2436
0
    tmp16 = tmp2 - tmp5;
2437
0
    tmp13 = tmp3 + tmp4;
2438
0
    tmp17 = tmp3 - tmp4;
2439
2440
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2441
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2442
0
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2443
0
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2444
0
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2445
0
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2446
0
    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2447
0
    tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2448
2449
    /* Apply unsigned->signed conversion. */
2450
0
    dataptr[0] = (DCTELEM)
2451
0
      ((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
2452
0
    dataptr[4] = (DCTELEM)
2453
0
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2454
0
        MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
2455
0
        CONST_BITS-PASS1_BITS);
2456
2457
0
    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
2458
0
      MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
2459
2460
0
    dataptr[2] = (DCTELEM)
2461
0
      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
2462
0
        + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
2463
0
        CONST_BITS-PASS1_BITS);
2464
0
    dataptr[6] = (DCTELEM)
2465
0
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
2466
0
        - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
2467
0
        CONST_BITS-PASS1_BITS);
2468
2469
    /* Odd part */
2470
2471
0
    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
2472
0
      MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
2473
0
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
2474
0
      MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
2475
0
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
2476
0
      MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
2477
0
    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
2478
0
      MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
2479
0
    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
2480
0
      MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
2481
0
    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
2482
0
      MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
2483
0
    tmp10 = tmp11 + tmp12 + tmp13 -
2484
0
      MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
2485
0
      MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
2486
0
    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2487
0
       - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
2488
0
    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2489
0
       + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
2490
0
    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2491
0
       + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
2492
2493
0
    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2494
0
    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2495
0
    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2496
0
    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2497
2498
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
2499
0
  }
2500
2501
  /* Pass 2: process columns.
2502
   * We remove the PASS1_BITS scaling, but leave the results scaled up
2503
   * by an overall factor of 8.
2504
   * We must also scale the output by 8/16 = 1/2.
2505
   * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
2506
   */
2507
2508
0
  dataptr = data;
2509
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2510
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
2511
     * rotator "c1" should be "c6".
2512
     */
2513
2514
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
2515
0
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
2516
0
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
2517
0
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
2518
2519
0
    tmp10 = tmp0 + tmp3;
2520
0
    tmp12 = tmp0 - tmp3;
2521
0
    tmp11 = tmp1 + tmp2;
2522
0
    tmp13 = tmp1 - tmp2;
2523
2524
0
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
2525
0
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
2526
0
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
2527
0
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
2528
2529
0
    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
2530
0
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
2531
2532
0
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);   /* c6 */
2533
0
    dataptr[DCTSIZE*2] = (DCTELEM)
2534
0
      DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
2535
0
        CONST_BITS+PASS1_BITS+1);
2536
0
    dataptr[DCTSIZE*6] = (DCTELEM)
2537
0
      DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
2538
0
        CONST_BITS+PASS1_BITS+1);
2539
2540
    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
2541
     * i0..i3 in the paper are tmp0..tmp3 here.
2542
     */
2543
2544
0
    tmp12 = tmp0 + tmp2;
2545
0
    tmp13 = tmp1 + tmp3;
2546
2547
0
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);   /*  c3 */
2548
0
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);      /* -c3+c5 */
2549
0
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);      /* -c3-c5 */
2550
0
    tmp12 += z1;
2551
0
    tmp13 += z1;
2552
2553
0
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);   /* -c3+c7 */
2554
0
    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);          /*  c1+c3-c5-c7 */
2555
0
    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);          /* -c1+c3+c5-c7 */
2556
0
    tmp0 += z1 + tmp12;
2557
0
    tmp3 += z1 + tmp13;
2558
2559
0
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);   /* -c1-c3 */
2560
0
    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);          /*  c1+c3+c5-c7 */
2561
0
    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);          /*  c1+c3-c5+c7 */
2562
0
    tmp1 += z1 + tmp13;
2563
0
    tmp2 += z1 + tmp12;
2564
2565
0
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
2566
0
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
2567
0
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
2568
0
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+PASS1_BITS+1);
2569
2570
0
    dataptr++;      /* advance pointer to next column */
2571
0
  }
2572
0
}
2573
2574
2575
/*
2576
 * Perform the forward DCT on a 14x7 sample block.
2577
 *
2578
 * 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
2579
 */
2580
2581
GLOBAL(void)
2582
jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2583
0
{
2584
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
2585
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2586
0
  INT32 z1, z2, z3;
2587
0
  DCTELEM *dataptr;
2588
0
  JSAMPROW elemptr;
2589
0
  int ctr;
2590
0
  SHIFT_TEMPS
2591
2592
  /* Zero bottom row of output coefficient block. */
2593
0
  MEMZERO(&data[DCTSIZE*7], SIZEOF(DCTELEM) * DCTSIZE);
2594
2595
  /* Pass 1: process rows.
2596
   * Note results are scaled up by sqrt(8) compared to a true DCT;
2597
   * furthermore, we scale the results by 2**PASS1_BITS.
2598
   * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
2599
   */
2600
2601
0
  dataptr = data;
2602
0
  for (ctr = 0; ctr < 7; ctr++) {
2603
0
    elemptr = sample_data[ctr] + start_col;
2604
2605
    /* Even part */
2606
2607
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
2608
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
2609
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
2610
0
    tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
2611
0
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
2612
0
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
2613
0
    tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
2614
2615
0
    tmp10 = tmp0 + tmp6;
2616
0
    tmp14 = tmp0 - tmp6;
2617
0
    tmp11 = tmp1 + tmp5;
2618
0
    tmp15 = tmp1 - tmp5;
2619
0
    tmp12 = tmp2 + tmp4;
2620
0
    tmp16 = tmp2 - tmp4;
2621
2622
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
2623
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
2624
0
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
2625
0
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
2626
0
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
2627
0
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
2628
0
    tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
2629
2630
    /* Apply unsigned->signed conversion. */
2631
0
    dataptr[0] = (DCTELEM)
2632
0
      ((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
2633
0
    tmp13 += tmp13;
2634
0
    dataptr[4] = (DCTELEM)
2635
0
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
2636
0
        MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
2637
0
        MULTIPLY(tmp12 - tmp13, FIX(0.881747734)),  /* c8 */
2638
0
        CONST_BITS-PASS1_BITS);
2639
2640
0
    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686));    /* c6 */
2641
2642
0
    dataptr[2] = (DCTELEM)
2643
0
      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590))   /* c2-c6 */
2644
0
        + MULTIPLY(tmp16, FIX(0.613604268)),        /* c10 */
2645
0
        CONST_BITS-PASS1_BITS);
2646
0
    dataptr[6] = (DCTELEM)
2647
0
      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954))   /* c6+c10 */
2648
0
        - MULTIPLY(tmp16, FIX(1.378756276)),        /* c2 */
2649
0
        CONST_BITS-PASS1_BITS);
2650
2651
    /* Odd part */
2652
2653
0
    tmp10 = tmp1 + tmp2;
2654
0
    tmp11 = tmp5 - tmp4;
2655
0
    dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
2656
0
    tmp3 <<= CONST_BITS;
2657
0
    tmp10 = MULTIPLY(tmp10, - FIX(0.158341681));          /* -c13 */
2658
0
    tmp11 = MULTIPLY(tmp11, FIX(1.405321284));            /* c1 */
2659
0
    tmp10 += tmp11 - tmp3;
2660
0
    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) +     /* c5 */
2661
0
      MULTIPLY(tmp4 + tmp6, FIX(0.752406978));      /* c9 */
2662
0
    dataptr[5] = (DCTELEM)
2663
0
      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
2664
0
        + MULTIPLY(tmp4, FIX(1.119999435)),         /* c1+c11-c9 */
2665
0
        CONST_BITS-PASS1_BITS);
2666
0
    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) +     /* c3 */
2667
0
      MULTIPLY(tmp5 - tmp6, FIX(0.467085129));      /* c11 */
2668
0
    dataptr[3] = (DCTELEM)
2669
0
      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
2670
0
        - MULTIPLY(tmp5, FIX(3.069855259)),         /* c1+c5+c11 */
2671
0
        CONST_BITS-PASS1_BITS);
2672
0
    dataptr[1] = (DCTELEM)
2673
0
      DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
2674
0
        MULTIPLY(tmp0 + tmp6, FIX(1.126980169)),    /* c3+c5-c1 */
2675
0
        CONST_BITS-PASS1_BITS);
2676
2677
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
2678
0
  }
2679
2680
  /* Pass 2: process columns.
2681
   * We remove the PASS1_BITS scaling, but leave the results scaled up
2682
   * by an overall factor of 8.
2683
   * We must also scale the output by (8/14)*(8/7) = 32/49, which we
2684
   * partially fold into the constant multipliers and final shifting:
2685
   * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14) * 64/49.
2686
   */
2687
2688
0
  dataptr = data;
2689
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2690
    /* Even part */
2691
2692
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*6];
2693
0
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*5];
2694
0
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*4];
2695
0
    tmp3 = dataptr[DCTSIZE*3];
2696
2697
0
    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*6];
2698
0
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*5];
2699
0
    tmp12 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*4];
2700
2701
0
    z1 = tmp0 + tmp2;
2702
0
    dataptr[DCTSIZE*0] = (DCTELEM)
2703
0
      DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
2704
0
        CONST_BITS+PASS1_BITS+1);
2705
0
    tmp3 += tmp3;
2706
0
    z1 -= tmp3;
2707
0
    z1 -= tmp3;
2708
0
    z1 = MULTIPLY(z1, FIX(0.461784020));                /* (c2+c6-c4)/2 */
2709
0
    z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084));       /* (c2+c4-c6)/2 */
2710
0
    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446));       /* c6 */
2711
0
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
2712
0
    z1 -= z2;
2713
0
    z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509));       /* c4 */
2714
0
    dataptr[DCTSIZE*4] = (DCTELEM)
2715
0
      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
2716
0
        CONST_BITS+PASS1_BITS+1);
2717
0
    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);
2718
2719
    /* Odd part */
2720
2721
0
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677));   /* (c3+c1-c5)/2 */
2722
0
    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464));   /* (c3+c5-c1)/2 */
2723
0
    tmp0 = tmp1 - tmp2;
2724
0
    tmp1 += tmp2;
2725
0
    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
2726
0
    tmp1 += tmp2;
2727
0
    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310));   /* c5 */
2728
0
    tmp0 += tmp3;
2729
0
    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355));   /* c3+c1-c5 */
2730
2731
0
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
2732
0
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
2733
0
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
2734
2735
0
    dataptr++;      /* advance pointer to next column */
2736
0
  }
2737
0
}
2738
2739
2740
/*
2741
 * Perform the forward DCT on a 12x6 sample block.
2742
 *
2743
 * 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
2744
 */
2745
2746
GLOBAL(void)
2747
jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2748
0
{
2749
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
2750
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2751
0
  DCTELEM *dataptr;
2752
0
  JSAMPROW elemptr;
2753
0
  int ctr;
2754
0
  SHIFT_TEMPS
2755
2756
  /* Zero 2 bottom rows of output coefficient block. */
2757
0
  MEMZERO(&data[DCTSIZE*6], SIZEOF(DCTELEM) * DCTSIZE * 2);
2758
2759
  /* Pass 1: process rows.
2760
   * Note results are scaled up by sqrt(8) compared to a true DCT;
2761
   * furthermore, we scale the results by 2**PASS1_BITS.
2762
   * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
2763
   */
2764
2765
0
  dataptr = data;
2766
0
  for (ctr = 0; ctr < 6; ctr++) {
2767
0
    elemptr = sample_data[ctr] + start_col;
2768
2769
    /* Even part */
2770
2771
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
2772
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
2773
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
2774
0
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
2775
0
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
2776
0
    tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
2777
2778
0
    tmp10 = tmp0 + tmp5;
2779
0
    tmp13 = tmp0 - tmp5;
2780
0
    tmp11 = tmp1 + tmp4;
2781
0
    tmp14 = tmp1 - tmp4;
2782
0
    tmp12 = tmp2 + tmp3;
2783
0
    tmp15 = tmp2 - tmp3;
2784
2785
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
2786
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
2787
0
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
2788
0
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
2789
0
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
2790
0
    tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
2791
2792
    /* Apply unsigned->signed conversion. */
2793
0
    dataptr[0] = (DCTELEM)
2794
0
      ((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
2795
0
    dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
2796
0
    dataptr[4] = (DCTELEM)
2797
0
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
2798
0
        CONST_BITS-PASS1_BITS);
2799
0
    dataptr[2] = (DCTELEM)
2800
0
      DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
2801
0
        CONST_BITS-PASS1_BITS);
2802
2803
    /* Odd part */
2804
2805
0
    tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100);    /* c9 */
2806
0
    tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865);   /* c3-c9 */
2807
0
    tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065);   /* c3+c9 */
2808
0
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054));   /* c5 */
2809
0
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669));   /* c7 */
2810
0
    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
2811
0
      + MULTIPLY(tmp5, FIX(0.184591911));        /* c11 */
2812
0
    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
2813
0
    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
2814
0
      + MULTIPLY(tmp5, FIX(0.860918669));        /* c7 */
2815
0
    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
2816
0
      - MULTIPLY(tmp5, FIX(1.121971054));        /* c5 */
2817
0
    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
2818
0
      - MULTIPLY(tmp2 + tmp5, FIX_0_541196100);  /* c9 */
2819
2820
0
    dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2821
0
    dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2822
0
    dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2823
0
    dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2824
2825
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
2826
0
  }
2827
2828
  /* Pass 2: process columns.
2829
   * We remove the PASS1_BITS scaling, but leave the results scaled up
2830
   * by an overall factor of 8.
2831
   * We must also scale the output by (8/12)*(8/6) = 8/9, which we
2832
   * partially fold into the constant multipliers and final shifting:
2833
   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
2834
   */
2835
2836
0
  dataptr = data;
2837
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2838
    /* Even part */
2839
2840
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
2841
0
    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
2842
0
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
2843
2844
0
    tmp10 = tmp0 + tmp2;
2845
0
    tmp12 = tmp0 - tmp2;
2846
2847
0
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
2848
0
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
2849
0
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
2850
2851
0
    dataptr[DCTSIZE*0] = (DCTELEM)
2852
0
      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
2853
0
        CONST_BITS+PASS1_BITS+1);
2854
0
    dataptr[DCTSIZE*2] = (DCTELEM)
2855
0
      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
2856
0
        CONST_BITS+PASS1_BITS+1);
2857
0
    dataptr[DCTSIZE*4] = (DCTELEM)
2858
0
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
2859
0
        CONST_BITS+PASS1_BITS+1);
2860
2861
    /* Odd part */
2862
2863
0
    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
2864
2865
0
    dataptr[DCTSIZE*1] = (DCTELEM)
2866
0
      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
2867
0
        CONST_BITS+PASS1_BITS+1);
2868
0
    dataptr[DCTSIZE*3] = (DCTELEM)
2869
0
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
2870
0
        CONST_BITS+PASS1_BITS+1);
2871
0
    dataptr[DCTSIZE*5] = (DCTELEM)
2872
0
      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
2873
0
        CONST_BITS+PASS1_BITS+1);
2874
2875
0
    dataptr++;      /* advance pointer to next column */
2876
0
  }
2877
0
}
2878
2879
2880
/*
2881
 * Perform the forward DCT on a 10x5 sample block.
2882
 *
2883
 * 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
2884
 */
2885
2886
GLOBAL(void)
2887
jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2888
0
{
2889
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
2890
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
2891
0
  DCTELEM *dataptr;
2892
0
  JSAMPROW elemptr;
2893
0
  int ctr;
2894
0
  SHIFT_TEMPS
2895
2896
  /* Zero 3 bottom rows of output coefficient block. */
2897
0
  MEMZERO(&data[DCTSIZE*5], SIZEOF(DCTELEM) * DCTSIZE * 3);
2898
2899
  /* Pass 1: process rows.
2900
   * Note results are scaled up by sqrt(8) compared to a true DCT;
2901
   * furthermore, we scale the results by 2**PASS1_BITS.
2902
   * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
2903
   */
2904
2905
0
  dataptr = data;
2906
0
  for (ctr = 0; ctr < 5; ctr++) {
2907
0
    elemptr = sample_data[ctr] + start_col;
2908
2909
    /* Even part */
2910
2911
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
2912
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
2913
0
    tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
2914
0
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
2915
0
    tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
2916
2917
0
    tmp10 = tmp0 + tmp4;
2918
0
    tmp13 = tmp0 - tmp4;
2919
0
    tmp11 = tmp1 + tmp3;
2920
0
    tmp14 = tmp1 - tmp3;
2921
2922
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
2923
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
2924
0
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
2925
0
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
2926
0
    tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
2927
2928
    /* Apply unsigned->signed conversion. */
2929
0
    dataptr[0] = (DCTELEM)
2930
0
      ((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
2931
0
    tmp12 += tmp12;
2932
0
    dataptr[4] = (DCTELEM)
2933
0
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
2934
0
        MULTIPLY(tmp11 - tmp12, FIX(0.437016024)),  /* c8 */
2935
0
        CONST_BITS-PASS1_BITS);
2936
0
    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876));    /* c6 */
2937
0
    dataptr[2] = (DCTELEM)
2938
0
      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)),  /* c2-c6 */
2939
0
        CONST_BITS-PASS1_BITS);
2940
0
    dataptr[6] = (DCTELEM)
2941
0
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)),  /* c2+c6 */
2942
0
        CONST_BITS-PASS1_BITS);
2943
2944
    /* Odd part */
2945
2946
0
    tmp10 = tmp0 + tmp4;
2947
0
    tmp11 = tmp1 - tmp3;
2948
0
    dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
2949
0
    tmp2 <<= CONST_BITS;
2950
0
    dataptr[1] = (DCTELEM)
2951
0
      DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) +          /* c1 */
2952
0
        MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 +   /* c3 */
2953
0
        MULTIPLY(tmp3, FIX(0.642039522)) +          /* c7 */
2954
0
        MULTIPLY(tmp4, FIX(0.221231742)),           /* c9 */
2955
0
        CONST_BITS-PASS1_BITS);
2956
0
    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) -     /* (c3+c7)/2 */
2957
0
      MULTIPLY(tmp1 + tmp3, FIX(0.587785252));      /* (c1-c9)/2 */
2958
0
    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) +   /* (c3-c7)/2 */
2959
0
      (tmp11 << (CONST_BITS - 1)) - tmp2;
2960
0
    dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
2961
0
    dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
2962
2963
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
2964
0
  }
2965
2966
  /* Pass 2: process columns.
2967
   * We remove the PASS1_BITS scaling, but leave the results scaled up
2968
   * by an overall factor of 8.
2969
   * We must also scale the output by (8/10)*(8/5) = 32/25, which we
2970
   * fold into the constant multipliers:
2971
   * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10) * 32/25.
2972
   */
2973
2974
0
  dataptr = data;
2975
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2976
    /* Even part */
2977
2978
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*4];
2979
0
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*3];
2980
0
    tmp2 = dataptr[DCTSIZE*2];
2981
2982
0
    tmp10 = tmp0 + tmp1;
2983
0
    tmp11 = tmp0 - tmp1;
2984
2985
0
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*4];
2986
0
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*3];
2987
2988
0
    dataptr[DCTSIZE*0] = (DCTELEM)
2989
0
      DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)),        /* 32/25 */
2990
0
        CONST_BITS+PASS1_BITS);
2991
0
    tmp11 = MULTIPLY(tmp11, FIX(1.011928851));          /* (c2+c4)/2 */
2992
0
    tmp10 -= tmp2 << 2;
2993
0
    tmp10 = MULTIPLY(tmp10, FIX(0.452548340));          /* (c2-c4)/2 */
2994
0
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
2995
0
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
2996
2997
    /* Odd part */
2998
2999
0
    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961));    /* c3 */
3000
3001
0
    dataptr[DCTSIZE*1] = (DCTELEM)
3002
0
      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
3003
0
        CONST_BITS+PASS1_BITS);
3004
0
    dataptr[DCTSIZE*3] = (DCTELEM)
3005
0
      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
3006
0
        CONST_BITS+PASS1_BITS);
3007
3008
0
    dataptr++;      /* advance pointer to next column */
3009
0
  }
3010
0
}
3011
3012
3013
/*
3014
 * Perform the forward DCT on an 8x4 sample block.
3015
 *
3016
 * 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
3017
 */
3018
3019
GLOBAL(void)
3020
jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3021
0
{
3022
0
  INT32 tmp0, tmp1, tmp2, tmp3;
3023
0
  INT32 tmp10, tmp11, tmp12, tmp13;
3024
0
  INT32 z1;
3025
0
  DCTELEM *dataptr;
3026
0
  JSAMPROW elemptr;
3027
0
  int ctr;
3028
0
  SHIFT_TEMPS
3029
3030
  /* Zero 4 bottom rows of output coefficient block. */
3031
0
  MEMZERO(&data[DCTSIZE*4], SIZEOF(DCTELEM) * DCTSIZE * 4);
3032
3033
  /* Pass 1: process rows.
3034
   * Note results are scaled up by sqrt(8) compared to a true DCT;
3035
   * furthermore, we scale the results by 2**PASS1_BITS.
3036
   * We must also scale the output by 8/4 = 2, which we add here.
3037
   * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3038
   */
3039
3040
0
  dataptr = data;
3041
0
  for (ctr = 0; ctr < 4; ctr++) {
3042
0
    elemptr = sample_data[ctr] + start_col;
3043
3044
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
3045
     * rotator "c1" should be "c6".
3046
     */
3047
3048
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3049
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3050
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3051
0
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3052
3053
0
    tmp10 = tmp0 + tmp3;
3054
0
    tmp12 = tmp0 - tmp3;
3055
0
    tmp11 = tmp1 + tmp2;
3056
0
    tmp13 = tmp1 - tmp2;
3057
3058
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3059
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3060
0
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3061
0
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3062
3063
    /* Apply unsigned->signed conversion. */
3064
0
    dataptr[0] = (DCTELEM)
3065
0
      ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
3066
0
    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
3067
3068
0
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
3069
    /* Add fudge factor here for final descale. */
3070
0
    z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3071
3072
0
    dataptr[2] = (DCTELEM)
3073
0
      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
3074
0
      CONST_BITS-PASS1_BITS-1);
3075
0
    dataptr[6] = (DCTELEM)
3076
0
      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
3077
0
      CONST_BITS-PASS1_BITS-1);
3078
3079
    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3080
     * i0..i3 in the paper are tmp0..tmp3 here.
3081
     */
3082
3083
0
    tmp12 = tmp0 + tmp2;
3084
0
    tmp13 = tmp1 + tmp3;
3085
3086
0
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
3087
    /* Add fudge factor here for final descale. */
3088
0
    z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3089
3090
0
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
3091
0
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
3092
0
    tmp12 += z1;
3093
0
    tmp13 += z1;
3094
3095
0
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
3096
0
    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
3097
0
    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
3098
0
    tmp0 += z1 + tmp12;
3099
0
    tmp3 += z1 + tmp13;
3100
3101
0
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
3102
0
    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
3103
0
    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
3104
0
    tmp1 += z1 + tmp13;
3105
0
    tmp2 += z1 + tmp12;
3106
3107
0
    dataptr[1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS-PASS1_BITS-1);
3108
0
    dataptr[3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS-PASS1_BITS-1);
3109
0
    dataptr[5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS-1);
3110
0
    dataptr[7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS-PASS1_BITS-1);
3111
3112
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
3113
0
  }
3114
3115
  /* Pass 2: process columns.
3116
   * We remove the PASS1_BITS scaling, but leave the results scaled up
3117
   * by an overall factor of 8.
3118
   * 4-point FDCT kernel,
3119
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
3120
   */
3121
3122
0
  dataptr = data;
3123
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3124
    /* Even part */
3125
3126
    /* Add fudge factor here for final descale. */
3127
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3] + (ONE << (PASS1_BITS-1));
3128
0
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
3129
3130
0
    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
3131
0
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
3132
3133
0
    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3134
0
    dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3135
3136
    /* Odd part */
3137
3138
0
    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
3139
    /* Add fudge factor here for final descale. */
3140
0
    tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
3141
3142
0
    dataptr[DCTSIZE*1] = (DCTELEM)
3143
0
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3144
0
      CONST_BITS+PASS1_BITS);
3145
0
    dataptr[DCTSIZE*3] = (DCTELEM)
3146
0
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3147
0
      CONST_BITS+PASS1_BITS);
3148
3149
0
    dataptr++;      /* advance pointer to next column */
3150
0
  }
3151
0
}
3152
3153
3154
/*
3155
 * Perform the forward DCT on a 6x3 sample block.
3156
 *
3157
 * 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
3158
 */
3159
3160
GLOBAL(void)
3161
jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3162
0
{
3163
0
  INT32 tmp0, tmp1, tmp2;
3164
0
  INT32 tmp10, tmp11, tmp12;
3165
0
  DCTELEM *dataptr;
3166
0
  JSAMPROW elemptr;
3167
0
  int ctr;
3168
0
  SHIFT_TEMPS
3169
3170
  /* Pre-zero output coefficient block. */
3171
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3172
3173
  /* Pass 1: process rows.
3174
   * Note results are scaled up by sqrt(8) compared to a true DCT;
3175
   * furthermore, we scale the results by 2**PASS1_BITS.
3176
   * We scale the results further by 2 as part of output adaption
3177
   * scaling for different DCT size.
3178
   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3179
   */
3180
3181
0
  dataptr = data;
3182
0
  for (ctr = 0; ctr < 3; ctr++) {
3183
0
    elemptr = sample_data[ctr] + start_col;
3184
3185
    /* Even part */
3186
3187
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3188
0
    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3189
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3190
3191
0
    tmp10 = tmp0 + tmp2;
3192
0
    tmp12 = tmp0 - tmp2;
3193
3194
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3195
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3196
0
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3197
3198
    /* Apply unsigned->signed conversion. */
3199
0
    dataptr[0] = (DCTELEM)
3200
0
      ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
3201
0
    dataptr[2] = (DCTELEM)
3202
0
      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
3203
0
        CONST_BITS-PASS1_BITS-1);
3204
0
    dataptr[4] = (DCTELEM)
3205
0
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3206
0
        CONST_BITS-PASS1_BITS-1);
3207
3208
    /* Odd part */
3209
3210
0
    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
3211
0
        CONST_BITS-PASS1_BITS-1);
3212
3213
0
    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
3214
0
    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
3215
0
    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
3216
3217
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
3218
0
  }
3219
3220
  /* Pass 2: process columns.
3221
   * We remove the PASS1_BITS scaling, but leave the results scaled up
3222
   * by an overall factor of 8.
3223
   * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
3224
   * fold into the constant multipliers (other part was done in pass 1):
3225
   * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6) * 16/9.
3226
   */
3227
3228
0
  dataptr = data;
3229
0
  for (ctr = 0; ctr < 6; ctr++) {
3230
    /* Even part */
3231
3232
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*2];
3233
0
    tmp1 = dataptr[DCTSIZE*1];
3234
3235
0
    tmp2 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*2];
3236
3237
0
    dataptr[DCTSIZE*0] = (DCTELEM)
3238
0
      DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),        /* 16/9 */
3239
0
        CONST_BITS+PASS1_BITS);
3240
0
    dataptr[DCTSIZE*2] = (DCTELEM)
3241
0
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
3242
0
        CONST_BITS+PASS1_BITS);
3243
3244
    /* Odd part */
3245
3246
0
    dataptr[DCTSIZE*1] = (DCTELEM)
3247
0
      DESCALE(MULTIPLY(tmp2, FIX(2.177324216)),               /* c1 */
3248
0
        CONST_BITS+PASS1_BITS);
3249
3250
0
    dataptr++;      /* advance pointer to next column */
3251
0
  }
3252
0
}
3253
3254
3255
/*
3256
 * Perform the forward DCT on a 4x2 sample block.
3257
 *
3258
 * 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
3259
 */
3260
3261
GLOBAL(void)
3262
jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3263
0
{
3264
0
  DCTELEM tmp0, tmp2, tmp10, tmp12, tmp4, tmp5;
3265
0
  INT32 tmp1, tmp3, tmp11, tmp13;
3266
0
  INT32 z1, z2, z3;
3267
0
  JSAMPROW elemptr;
3268
0
  SHIFT_TEMPS
3269
3270
  /* Pre-zero output coefficient block. */
3271
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3272
3273
  /* Pass 1: process rows.
3274
   * Note results are scaled up by sqrt(8) compared to a true DCT.
3275
   * 4-point FDCT kernel,
3276
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
3277
   */
3278
3279
  /* Row 0 */
3280
0
  elemptr = sample_data[0] + start_col;
3281
3282
  /* Even part */
3283
3284
0
  tmp4 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
3285
0
  tmp5 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
3286
3287
0
  tmp0 = tmp4 + tmp5;
3288
0
  tmp2 = tmp4 - tmp5;
3289
3290
  /* Odd part */
3291
3292
0
  z2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
3293
0
  z3 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
3294
3295
0
  z1 = MULTIPLY(z2 + z3, FIX_0_541196100);    /* c6 */
3296
  /* Add fudge factor here for final descale. */
3297
0
  z1 += ONE << (CONST_BITS-3-1);
3298
0
  tmp1 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
3299
0
  tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
3300
3301
  /* Row 1 */
3302
0
  elemptr = sample_data[1] + start_col;
3303
3304
  /* Even part */
3305
3306
0
  tmp4 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
3307
0
  tmp5 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
3308
3309
0
  tmp10 = tmp4 + tmp5;
3310
0
  tmp12 = tmp4 - tmp5;
3311
3312
  /* Odd part */
3313
3314
0
  z2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
3315
0
  z3 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
3316
3317
0
  z1 = MULTIPLY(z2 + z3, FIX_0_541196100);    /* c6 */
3318
0
  tmp11 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
3319
0
  tmp13 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
3320
3321
  /* Pass 2: process columns.
3322
   * We leave the results scaled up by an overall factor of 8.
3323
   * We must also scale the output by (8/4)*(8/2) = 2**3.
3324
   */
3325
3326
  /* Column 0 */
3327
  /* Apply unsigned->signed conversion. */
3328
0
  data[DCTSIZE*0] = (tmp0 + tmp10 - 8 * CENTERJSAMPLE) << 3;
3329
0
  data[DCTSIZE*1] = (tmp0 - tmp10) << 3;
3330
3331
  /* Column 1 */
3332
0
  data[DCTSIZE*0+1] = (DCTELEM) RIGHT_SHIFT(tmp1 + tmp11, CONST_BITS-3);
3333
0
  data[DCTSIZE*1+1] = (DCTELEM) RIGHT_SHIFT(tmp1 - tmp11, CONST_BITS-3);
3334
3335
  /* Column 2 */
3336
0
  data[DCTSIZE*0+2] = (tmp2 + tmp12) << 3;
3337
0
  data[DCTSIZE*1+2] = (tmp2 - tmp12) << 3;
3338
3339
  /* Column 3 */
3340
0
  data[DCTSIZE*0+3] = (DCTELEM) RIGHT_SHIFT(tmp3 + tmp13, CONST_BITS-3);
3341
0
  data[DCTSIZE*1+3] = (DCTELEM) RIGHT_SHIFT(tmp3 - tmp13, CONST_BITS-3);
3342
0
}
3343
3344
3345
/*
3346
 * Perform the forward DCT on a 2x1 sample block.
3347
 *
3348
 * 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
3349
 */
3350
3351
GLOBAL(void)
3352
jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3353
0
{
3354
0
  DCTELEM tmp0, tmp1;
3355
0
  JSAMPROW elemptr;
3356
3357
  /* Pre-zero output coefficient block. */
3358
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3359
3360
0
  elemptr = sample_data[0] + start_col;
3361
3362
0
  tmp0 = GETJSAMPLE(elemptr[0]);
3363
0
  tmp1 = GETJSAMPLE(elemptr[1]);
3364
3365
  /* We leave the results scaled up by an overall factor of 8.
3366
   * We must also scale the output by (8/2)*(8/1) = 2**5.
3367
   */
3368
3369
  /* Even part */
3370
3371
  /* Apply unsigned->signed conversion. */
3372
0
  data[0] = (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5;
3373
3374
  /* Odd part */
3375
3376
0
  data[1] = (tmp0 - tmp1) << 5;
3377
0
}
3378
3379
3380
/*
3381
 * Perform the forward DCT on an 8x16 sample block.
3382
 *
3383
 * 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
3384
 */
3385
3386
GLOBAL(void)
3387
jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3388
0
{
3389
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3390
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
3391
0
  INT32 z1;
3392
0
  DCTELEM workspace[DCTSIZE2];
3393
0
  DCTELEM *dataptr;
3394
0
  DCTELEM *wsptr;
3395
0
  JSAMPROW elemptr;
3396
0
  int ctr;
3397
0
  SHIFT_TEMPS
3398
3399
  /* Pass 1: process rows.
3400
   * Note results are scaled up by sqrt(8) compared to a true DCT;
3401
   * furthermore, we scale the results by 2**PASS1_BITS.
3402
   * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3403
   */
3404
3405
0
  dataptr = data;
3406
0
  ctr = 0;
3407
0
  for (;;) {
3408
0
    elemptr = sample_data[ctr] + start_col;
3409
3410
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
3411
     * rotator "c1" should be "c6".
3412
     */
3413
3414
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3415
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3416
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3417
0
    tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3418
3419
0
    tmp10 = tmp0 + tmp3;
3420
0
    tmp12 = tmp0 - tmp3;
3421
0
    tmp11 = tmp1 + tmp2;
3422
0
    tmp13 = tmp1 - tmp2;
3423
3424
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3425
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3426
0
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3427
0
    tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3428
3429
    /* Apply unsigned->signed conversion. */
3430
0
    dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
3431
0
    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
3432
3433
0
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);   /* c6 */
3434
0
    dataptr[2] = (DCTELEM)
3435
0
      DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
3436
0
        CONST_BITS-PASS1_BITS);
3437
0
    dataptr[6] = (DCTELEM)
3438
0
      DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
3439
0
        CONST_BITS-PASS1_BITS);
3440
3441
    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3442
     * i0..i3 in the paper are tmp0..tmp3 here.
3443
     */
3444
3445
0
    tmp12 = tmp0 + tmp2;
3446
0
    tmp13 = tmp1 + tmp3;
3447
3448
0
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);   /*  c3 */
3449
0
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);      /* -c3+c5 */
3450
0
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);      /* -c3-c5 */
3451
0
    tmp12 += z1;
3452
0
    tmp13 += z1;
3453
3454
0
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);   /* -c3+c7 */
3455
0
    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);          /*  c1+c3-c5-c7 */
3456
0
    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);          /* -c1+c3+c5-c7 */
3457
0
    tmp0 += z1 + tmp12;
3458
0
    tmp3 += z1 + tmp13;
3459
3460
0
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);   /* -c1-c3 */
3461
0
    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);          /*  c1+c3+c5-c7 */
3462
0
    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);          /*  c1+c3-c5+c7 */
3463
0
    tmp1 += z1 + tmp13;
3464
0
    tmp2 += z1 + tmp12;
3465
3466
0
    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
3467
0
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
3468
0
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
3469
0
    dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-PASS1_BITS);
3470
3471
0
    ctr++;
3472
3473
0
    if (ctr != DCTSIZE) {
3474
0
      if (ctr == DCTSIZE * 2)
3475
0
  break;     /* Done. */
3476
0
      dataptr += DCTSIZE; /* advance pointer to next row */
3477
0
    } else
3478
0
      dataptr = workspace; /* switch pointer to extended workspace */
3479
0
  }
3480
3481
  /* Pass 2: process columns.
3482
   * We remove the PASS1_BITS scaling, but leave the results scaled up
3483
   * by an overall factor of 8.
3484
   * We must also scale the output by 8/16 = 1/2.
3485
   * 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
3486
   */
3487
3488
0
  dataptr = data;
3489
0
  wsptr = workspace;
3490
0
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3491
    /* Even part */
3492
3493
0
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*7];
3494
0
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*6];
3495
0
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*5];
3496
0
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*4];
3497
0
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*3];
3498
0
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*2];
3499
0
    tmp6 = dataptr[DCTSIZE*6] + wsptr[DCTSIZE*1];
3500
0
    tmp7 = dataptr[DCTSIZE*7] + wsptr[DCTSIZE*0];
3501
3502
0
    tmp10 = tmp0 + tmp7;
3503
0
    tmp14 = tmp0 - tmp7;
3504
0
    tmp11 = tmp1 + tmp6;
3505
0
    tmp15 = tmp1 - tmp6;
3506
0
    tmp12 = tmp2 + tmp5;
3507
0
    tmp16 = tmp2 - tmp5;
3508
0
    tmp13 = tmp3 + tmp4;
3509
0
    tmp17 = tmp3 - tmp4;
3510
3511
0
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*7];
3512
0
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*6];
3513
0
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*5];
3514
0
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*4];
3515
0
    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*3];
3516
0
    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*2];
3517
0
    tmp6 = dataptr[DCTSIZE*6] - wsptr[DCTSIZE*1];
3518
0
    tmp7 = dataptr[DCTSIZE*7] - wsptr[DCTSIZE*0];
3519
3520
0
    dataptr[DCTSIZE*0] = (DCTELEM)
3521
0
      DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
3522
0
    dataptr[DCTSIZE*4] = (DCTELEM)
3523
0
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
3524
0
        MULTIPLY(tmp11 - tmp12, FIX_0_541196100),   /* c12[16] = c6[8] */
3525
0
        CONST_BITS+PASS1_BITS+1);
3526
3527
0
    tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) +   /* c14[16] = c7[8] */
3528
0
      MULTIPLY(tmp14 - tmp16, FIX(1.387039845));    /* c2[16] = c1[8] */
3529
3530
0
    dataptr[DCTSIZE*2] = (DCTELEM)
3531
0
      DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982))   /* c6+c14 */
3532
0
        + MULTIPLY(tmp16, FIX(2.172734804)),        /* c2+c10 */
3533
0
        CONST_BITS+PASS1_BITS+1);
3534
0
    dataptr[DCTSIZE*6] = (DCTELEM)
3535
0
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243))   /* c2-c6 */
3536
0
        - MULTIPLY(tmp17, FIX(1.061594338)),        /* c10+c14 */
3537
0
        CONST_BITS+PASS1_BITS+1);
3538
3539
    /* Odd part */
3540
3541
0
    tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) +         /* c3 */
3542
0
      MULTIPLY(tmp6 - tmp7, FIX(0.410524528));          /* c13 */
3543
0
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) +         /* c5 */
3544
0
      MULTIPLY(tmp5 + tmp7, FIX(0.666655658));          /* c11 */
3545
0
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) +         /* c7 */
3546
0
      MULTIPLY(tmp4 - tmp7, FIX(0.897167586));          /* c9 */
3547
0
    tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) +         /* c15 */
3548
0
      MULTIPLY(tmp6 - tmp5, FIX(1.407403738));          /* c1 */
3549
0
    tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) +       /* -c11 */
3550
0
      MULTIPLY(tmp4 + tmp6, - FIX(1.247225013));        /* -c5 */
3551
0
    tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) +       /* -c3 */
3552
0
      MULTIPLY(tmp5 - tmp4, FIX(0.410524528));          /* c13 */
3553
0
    tmp10 = tmp11 + tmp12 + tmp13 -
3554
0
      MULTIPLY(tmp0, FIX(2.286341144)) +                /* c7+c5+c3-c1 */
3555
0
      MULTIPLY(tmp7, FIX(0.779653625));                 /* c15+c13-c11+c9 */
3556
0
    tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
3557
0
       - MULTIPLY(tmp6, FIX(1.663905119));              /* c7+c13+c1-c5 */
3558
0
    tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
3559
0
       + MULTIPLY(tmp5, FIX(1.227391138));              /* c9-c11+c1-c13 */
3560
0
    tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
3561
0
       + MULTIPLY(tmp4, FIX(2.167985692));              /* c1+c13+c5-c9 */
3562
3563
0
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
3564
0
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
3565
0
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
3566
0
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);
3567
3568
0
    dataptr++;      /* advance pointer to next column */
3569
0
    wsptr++;      /* advance pointer to next column */
3570
0
  }
3571
0
}
3572
3573
3574
/*
3575
 * Perform the forward DCT on a 7x14 sample block.
3576
 *
3577
 * 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
3578
 */
3579
3580
GLOBAL(void)
3581
jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3582
0
{
3583
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
3584
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3585
0
  INT32 z1, z2, z3;
3586
0
  DCTELEM workspace[8*6];
3587
0
  DCTELEM *dataptr;
3588
0
  DCTELEM *wsptr;
3589
0
  JSAMPROW elemptr;
3590
0
  int ctr;
3591
0
  SHIFT_TEMPS
3592
3593
  /* Pre-zero output coefficient block. */
3594
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3595
3596
  /* Pass 1: process rows.
3597
   * Note results are scaled up by sqrt(8) compared to a true DCT;
3598
   * furthermore, we scale the results by 2**PASS1_BITS.
3599
   * 7-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
3600
   */
3601
3602
0
  dataptr = data;
3603
0
  ctr = 0;
3604
0
  for (;;) {
3605
0
    elemptr = sample_data[ctr] + start_col;
3606
3607
    /* Even part */
3608
3609
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
3610
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
3611
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
3612
0
    tmp3 = GETJSAMPLE(elemptr[3]);
3613
3614
0
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
3615
0
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
3616
0
    tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
3617
3618
0
    z1 = tmp0 + tmp2;
3619
    /* Apply unsigned->signed conversion. */
3620
0
    dataptr[0] = (DCTELEM)
3621
0
      ((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
3622
0
    tmp3 += tmp3;
3623
0
    z1 -= tmp3;
3624
0
    z1 -= tmp3;
3625
0
    z1 = MULTIPLY(z1, FIX(0.353553391));                /* (c2+c6-c4)/2 */
3626
0
    z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002));       /* (c2+c4-c6)/2 */
3627
0
    z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123));       /* c6 */
3628
0
    dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
3629
0
    z1 -= z2;
3630
0
    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734));       /* c4 */
3631
0
    dataptr[4] = (DCTELEM)
3632
0
      DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
3633
0
        CONST_BITS-PASS1_BITS);
3634
0
    dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
3635
3636
    /* Odd part */
3637
3638
0
    tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347));   /* (c3+c1-c5)/2 */
3639
0
    tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339));   /* (c3+c5-c1)/2 */
3640
0
    tmp0 = tmp1 - tmp2;
3641
0
    tmp1 += tmp2;
3642
0
    tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
3643
0
    tmp1 += tmp2;
3644
0
    tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268));   /* c5 */
3645
0
    tmp0 += tmp3;
3646
0
    tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693));   /* c3+c1-c5 */
3647
3648
0
    dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
3649
0
    dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
3650
0
    dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
3651
3652
0
    ctr++;
3653
3654
0
    if (ctr != DCTSIZE) {
3655
0
      if (ctr == 14)
3656
0
  break;     /* Done. */
3657
0
      dataptr += DCTSIZE; /* advance pointer to next row */
3658
0
    } else
3659
0
      dataptr = workspace; /* switch pointer to extended workspace */
3660
0
  }
3661
3662
  /* Pass 2: process columns.
3663
   * We remove the PASS1_BITS scaling, but leave the results scaled up
3664
   * by an overall factor of 8.
3665
   * We must also scale the output by (8/7)*(8/14) = 32/49, which we
3666
   * fold into the constant multipliers:
3667
   * 14-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/28) * 32/49.
3668
   */
3669
3670
0
  dataptr = data;
3671
0
  wsptr = workspace;
3672
0
  for (ctr = 0; ctr < 7; ctr++) {
3673
    /* Even part */
3674
3675
0
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*5];
3676
0
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*4];
3677
0
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*3];
3678
0
    tmp13 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*2];
3679
0
    tmp4 = dataptr[DCTSIZE*4] + wsptr[DCTSIZE*1];
3680
0
    tmp5 = dataptr[DCTSIZE*5] + wsptr[DCTSIZE*0];
3681
0
    tmp6 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
3682
3683
0
    tmp10 = tmp0 + tmp6;
3684
0
    tmp14 = tmp0 - tmp6;
3685
0
    tmp11 = tmp1 + tmp5;
3686
0
    tmp15 = tmp1 - tmp5;
3687
0
    tmp12 = tmp2 + tmp4;
3688
0
    tmp16 = tmp2 - tmp4;
3689
3690
0
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*5];
3691
0
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*4];
3692
0
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*3];
3693
0
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*2];
3694
0
    tmp4 = dataptr[DCTSIZE*4] - wsptr[DCTSIZE*1];
3695
0
    tmp5 = dataptr[DCTSIZE*5] - wsptr[DCTSIZE*0];
3696
0
    tmp6 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];
3697
3698
0
    dataptr[DCTSIZE*0] = (DCTELEM)
3699
0
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
3700
0
           FIX(0.653061224)),                 /* 32/49 */
3701
0
        CONST_BITS+PASS1_BITS);
3702
0
    tmp13 += tmp13;
3703
0
    dataptr[DCTSIZE*4] = (DCTELEM)
3704
0
      DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
3705
0
        MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
3706
0
        MULTIPLY(tmp12 - tmp13, FIX(0.575835255)),  /* c8 */
3707
0
        CONST_BITS+PASS1_BITS);
3708
3709
0
    tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570));    /* c6 */
3710
3711
0
    dataptr[DCTSIZE*2] = (DCTELEM)
3712
0
      DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691))   /* c2-c6 */
3713
0
        + MULTIPLY(tmp16, FIX(0.400721155)),        /* c10 */
3714
0
        CONST_BITS+PASS1_BITS);
3715
0
    dataptr[DCTSIZE*6] = (DCTELEM)
3716
0
      DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725))   /* c6+c10 */
3717
0
        - MULTIPLY(tmp16, FIX(0.900412262)),        /* c2 */
3718
0
        CONST_BITS+PASS1_BITS);
3719
3720
    /* Odd part */
3721
3722
0
    tmp10 = tmp1 + tmp2;
3723
0
    tmp11 = tmp5 - tmp4;
3724
0
    dataptr[DCTSIZE*7] = (DCTELEM)
3725
0
      DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
3726
0
           FIX(0.653061224)),                 /* 32/49 */
3727
0
        CONST_BITS+PASS1_BITS);
3728
0
    tmp3  = MULTIPLY(tmp3 , FIX(0.653061224));            /* 32/49 */
3729
0
    tmp10 = MULTIPLY(tmp10, - FIX(0.103406812));          /* -c13 */
3730
0
    tmp11 = MULTIPLY(tmp11, FIX(0.917760839));            /* c1 */
3731
0
    tmp10 += tmp11 - tmp3;
3732
0
    tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) +     /* c5 */
3733
0
      MULTIPLY(tmp4 + tmp6, FIX(0.491367823));      /* c9 */
3734
0
    dataptr[DCTSIZE*5] = (DCTELEM)
3735
0
      DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
3736
0
        + MULTIPLY(tmp4, FIX(0.731428202)),         /* c1+c11-c9 */
3737
0
        CONST_BITS+PASS1_BITS);
3738
0
    tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) +     /* c3 */
3739
0
      MULTIPLY(tmp5 - tmp6, FIX(0.305035186));      /* c11 */
3740
0
    dataptr[DCTSIZE*3] = (DCTELEM)
3741
0
      DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
3742
0
        - MULTIPLY(tmp5, FIX(2.004803435)),         /* c1+c5+c11 */
3743
0
        CONST_BITS+PASS1_BITS);
3744
0
    dataptr[DCTSIZE*1] = (DCTELEM)
3745
0
      DESCALE(tmp11 + tmp12 + tmp3
3746
0
        - MULTIPLY(tmp0, FIX(0.735987049))          /* c3+c5-c1 */
3747
0
        - MULTIPLY(tmp6, FIX(0.082925825)),         /* c9-c11-c13 */
3748
0
        CONST_BITS+PASS1_BITS);
3749
3750
0
    dataptr++;      /* advance pointer to next column */
3751
0
    wsptr++;      /* advance pointer to next column */
3752
0
  }
3753
0
}
3754
3755
3756
/*
3757
 * Perform the forward DCT on a 6x12 sample block.
3758
 *
3759
 * 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
 *
 * data        - output coefficient block.  All DCTSIZE2 entries are zeroed
 *               first; pass 2 then writes rows 0..7 of columns 0..5.
 * sample_data - array of sample rows; rows 0..11 are read.
 * start_col   - index of the first sample used in each row; samples
 *               start_col .. start_col+5 are read.
 *
 * Pass 1 produces 12 rows of intermediate values.  The first DCTSIZE rows
 * are kept in data itself, the remaining rows in a local workspace
 * (sized 8*4, i.e. this presumes DCTSIZE is 8).
3760
 */
3761
3762
GLOBAL(void)
3763
jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3764
0
{
3765
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3766
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
  /* Pass-1 results for sample rows 8..11, stored with row stride DCTSIZE. */
3767
0
  DCTELEM workspace[8*4];
3768
0
  DCTELEM *dataptr;
3769
0
  DCTELEM *wsptr;
3770
0
  JSAMPROW elemptr;
3771
0
  int ctr;
3772
0
  SHIFT_TEMPS
3773
3774
  /* Pre-zero output coefficient block. */
3775
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3776
3777
  /* Pass 1: process rows.
3778
   * Note results are scaled up by sqrt(8) compared to a true DCT;
3779
   * furthermore, we scale the results by 2**PASS1_BITS.
3780
   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3781
   */
3782
3783
0
  dataptr = data;
3784
0
  ctr = 0;
  /* One iteration per sample row; the only exit is the break at ctr == 12. */
3785
0
  for (;;) {
3786
0
    elemptr = sample_data[ctr] + start_col;
3787
3788
    /* Even part */
3789
3790
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3791
0
    tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3792
0
    tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3793
3794
0
    tmp10 = tmp0 + tmp2;
3795
0
    tmp12 = tmp0 - tmp2;
3796
    /* tmp0..tmp2 are reused from here on for the mirrored sample differences. */
3797
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3798
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3799
0
    tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3800
3801
    /* Apply unsigned->signed conversion. */
    /* Only the DC term needs the CENTERJSAMPLE offset (one per summed
     * sample); in all other outputs of this row the sample offsets cancel.
     */
3802
0
    dataptr[0] = (DCTELEM)
3803
0
      ((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
3804
0
    dataptr[2] = (DCTELEM)
3805
0
      DESCALE(MULTIPLY(tmp12, FIX(1.224744871)),                 /* c2 */
3806
0
        CONST_BITS-PASS1_BITS);
3807
0
    dataptr[4] = (DCTELEM)
3808
0
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3809
0
        CONST_BITS-PASS1_BITS);
3810
3811
    /* Odd part */
3812
3813
0
    tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)),     /* c5 */
3814
0
        CONST_BITS-PASS1_BITS);
3815
3816
0
    dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
3817
0
    dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
3818
0
    dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
3819
3820
0
    ctr++;
3821
    /* Rows 0..DCTSIZE-1 go to data; once ctr reaches DCTSIZE the output
     * pointer is redirected to workspace for the remaining rows, and the
     * loop ends after row 11.
     */
3822
0
    if (ctr != DCTSIZE) {
3823
0
      if (ctr == 12)
3824
0
  break;     /* Done. */
3825
0
      dataptr += DCTSIZE; /* advance pointer to next row */
3826
0
    } else
3827
0
      dataptr = workspace; /* switch pointer to extended workspace */
3828
0
  }
3829
3830
  /* Pass 2: process columns.
3831
   * We remove the PASS1_BITS scaling, but leave the results scaled up
3832
   * by an overall factor of 8.
3833
   * We must also scale the output by (8/6)*(8/12) = 8/9, which we
3834
   * fold into the constant multipliers:
3835
   * 12-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/24) * 8/9.
3836
   */
3837
3838
0
  dataptr = data;
3839
0
  wsptr = workspace;
  /* One iteration per column (6 columns were produced by pass 1). */
3840
0
  for (ctr = 0; ctr < 6; ctr++) {
3841
    /* Even part */
3842
    /* Intermediate row k is paired with row 11-k; rows 8..11 are read from
     * wsptr[DCTSIZE*0] .. wsptr[DCTSIZE*3].
     */
3843
0
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*3];
3844
0
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*2];
3845
0
    tmp2 = dataptr[DCTSIZE*2] + wsptr[DCTSIZE*1];
3846
0
    tmp3 = dataptr[DCTSIZE*3] + wsptr[DCTSIZE*0];
3847
0
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*7];
3848
0
    tmp5 = dataptr[DCTSIZE*5] + dataptr[DCTSIZE*6];
3849
3850
0
    tmp10 = tmp0 + tmp5;
3851
0
    tmp13 = tmp0 - tmp5;
3852
0
    tmp11 = tmp1 + tmp4;
3853
0
    tmp14 = tmp1 - tmp4;
3854
0
    tmp12 = tmp2 + tmp3;
3855
0
    tmp15 = tmp2 - tmp3;
3856
    /* tmp0..tmp5 now become the differences of the same row pairs; they
     * must be taken before any dataptr[] row is overwritten below.
     */
3857
0
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*3];
3858
0
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*2];
3859
0
    tmp2 = dataptr[DCTSIZE*2] - wsptr[DCTSIZE*1];
3860
0
    tmp3 = dataptr[DCTSIZE*3] - wsptr[DCTSIZE*0];
3861
0
    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*7];
3862
0
    tmp5 = dataptr[DCTSIZE*5] - dataptr[DCTSIZE*6];
3863
3864
0
    dataptr[DCTSIZE*0] = (DCTELEM)
3865
0
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
3866
0
        CONST_BITS+PASS1_BITS);
3867
0
    dataptr[DCTSIZE*6] = (DCTELEM)
3868
0
      DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
3869
0
        CONST_BITS+PASS1_BITS);
3870
0
    dataptr[DCTSIZE*4] = (DCTELEM)
3871
0
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)),         /* c4 */
3872
0
        CONST_BITS+PASS1_BITS);
3873
0
    dataptr[DCTSIZE*2] = (DCTELEM)
3874
0
      DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) +        /* 8/9 */
3875
0
        MULTIPLY(tmp13 + tmp15, FIX(1.214244803)),         /* c2 */
3876
0
        CONST_BITS+PASS1_BITS);
3877
3878
    /* Odd part */
    /* tmp10..tmp15 are reused as accumulators for the four odd outputs;
     * each is descaled once when it is stored.
     */
3879
3880
0
    tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200));   /* c9 */
3881
0
    tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102));  /* c3-c9 */
3882
0
    tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502));  /* c3+c9 */
3883
0
    tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603));   /* c5 */
3884
0
    tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039));   /* c7 */
3885
0
    tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
3886
0
      + MULTIPLY(tmp5, FIX(0.164081699));        /* c11 */
3887
0
    tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
3888
0
    tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
3889
0
      + MULTIPLY(tmp5, FIX(0.765261039));        /* c7 */
3890
0
    tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
3891
0
      - MULTIPLY(tmp5, FIX(0.997307603));        /* c5 */
3892
0
    tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
3893
0
      - MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
3894
3895
0
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
3896
0
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
3897
0
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
3898
0
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
3899
3900
0
    dataptr++;      /* advance pointer to next column */
3901
0
    wsptr++;      /* advance pointer to next column */
3902
0
  }
3903
0
}
3904
3905
3906
/*
3907
 * Perform the forward DCT on a 5x10 sample block.
3908
 *
3909
 * 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
 *
 * data        - output coefficient block.  All DCTSIZE2 entries are zeroed
 *               first; pass 2 then writes rows 0..7 of columns 0..4.
 * sample_data - array of sample rows; rows 0..9 are read.
 * start_col   - index of the first sample used in each row; samples
 *               start_col .. start_col+4 are read.
 *
 * Pass 1 produces 10 rows of intermediate values.  The first DCTSIZE rows
 * are kept in data itself, the remaining rows in a local workspace
 * (sized 8*2, i.e. this presumes DCTSIZE is 8).
3910
 */
3911
3912
GLOBAL(void)
3913
jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3914
0
{
3915
0
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
3916
0
  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
  /* Pass-1 results for sample rows 8 and 9, stored with row stride DCTSIZE. */
3917
0
  DCTELEM workspace[8*2];
3918
0
  DCTELEM *dataptr;
3919
0
  DCTELEM *wsptr;
3920
0
  JSAMPROW elemptr;
3921
0
  int ctr;
3922
0
  SHIFT_TEMPS
3923
3924
  /* Pre-zero output coefficient block. */
3925
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3926
3927
  /* Pass 1: process rows.
3928
   * Note results are scaled up by sqrt(8) compared to a true DCT;
3929
   * furthermore, we scale the results by 2**PASS1_BITS.
3930
   * 5-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
3931
   */
3932
3933
0
  dataptr = data;
3934
0
  ctr = 0;
  /* One iteration per sample row; the only exit is the break at ctr == 10. */
3935
0
  for (;;) {
3936
0
    elemptr = sample_data[ctr] + start_col;
3937
3938
    /* Even part */
3939
3940
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
3941
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
3942
0
    tmp2 = GETJSAMPLE(elemptr[2]);
3943
3944
0
    tmp10 = tmp0 + tmp1;
3945
0
    tmp11 = tmp0 - tmp1;
3946
    /* tmp0 and tmp1 are reused from here on for the mirrored differences. */
3947
0
    tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
3948
0
    tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
3949
3950
    /* Apply unsigned->signed conversion. */
    /* Only the DC term needs the CENTERJSAMPLE offset (one per summed
     * sample); in all other outputs of this row the sample offsets cancel.
     */
3951
0
    dataptr[0] = (DCTELEM)
3952
0
      ((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
3953
0
    tmp11 = MULTIPLY(tmp11, FIX(0.790569415));          /* (c2+c4)/2 */
    /* tmp10 becomes (sum of the four outer samples) - 4 * centre sample. */
3954
0
    tmp10 -= tmp2 << 2;
3955
0
    tmp10 = MULTIPLY(tmp10, FIX(0.353553391));          /* (c2-c4)/2 */
3956
0
    dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
3957
0
    dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
3958
3959
    /* Odd part */
3960
3961
0
    tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876));    /* c3 */
3962
3963
0
    dataptr[1] = (DCTELEM)
3964
0
      DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
3965
0
        CONST_BITS-PASS1_BITS);
3966
0
    dataptr[3] = (DCTELEM)
3967
0
      DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
3968
0
        CONST_BITS-PASS1_BITS);
3969
3970
0
    ctr++;
3971
    /* Rows 0..DCTSIZE-1 go to data; once ctr reaches DCTSIZE the output
     * pointer is redirected to workspace for the remaining rows, and the
     * loop ends after row 9.
     */
3972
0
    if (ctr != DCTSIZE) {
3973
0
      if (ctr == 10)
3974
0
  break;     /* Done. */
3975
0
      dataptr += DCTSIZE; /* advance pointer to next row */
3976
0
    } else
3977
0
      dataptr = workspace; /* switch pointer to extended workspace */
3978
0
  }
3979
3980
  /* Pass 2: process columns.
3981
   * We remove the PASS1_BITS scaling, but leave the results scaled up
3982
   * by an overall factor of 8.
3983
   * We must also scale the output by (8/5)*(8/10) = 32/25, which we
3984
   * fold into the constant multipliers:
3985
   * 10-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/20) * 32/25.
3986
   */
3987
3988
0
  dataptr = data;
3989
0
  wsptr = workspace;
  /* One iteration per column (5 columns were produced by pass 1). */
3990
0
  for (ctr = 0; ctr < 5; ctr++) {
3991
    /* Even part */
3992
    /* Intermediate row k is paired with row 9-k; rows 8 and 9 are read from
     * wsptr[DCTSIZE*0] and wsptr[DCTSIZE*1].
     */
3993
0
    tmp0 = dataptr[DCTSIZE*0] + wsptr[DCTSIZE*1];
3994
0
    tmp1 = dataptr[DCTSIZE*1] + wsptr[DCTSIZE*0];
3995
0
    tmp12 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*7];
3996
0
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*6];
3997
0
    tmp4 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
3998
3999
0
    tmp10 = tmp0 + tmp4;
4000
0
    tmp13 = tmp0 - tmp4;
4001
0
    tmp11 = tmp1 + tmp3;
4002
0
    tmp14 = tmp1 - tmp3;
4003
    /* tmp0..tmp4 now become the differences of the same row pairs; they
     * must be taken before any dataptr[] row is overwritten below.
     */
4004
0
    tmp0 = dataptr[DCTSIZE*0] - wsptr[DCTSIZE*1];
4005
0
    tmp1 = dataptr[DCTSIZE*1] - wsptr[DCTSIZE*0];
4006
0
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*7];
4007
0
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*6];
4008
0
    tmp4 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
4009
4010
0
    dataptr[DCTSIZE*0] = (DCTELEM)
4011
0
      DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
4012
0
        CONST_BITS+PASS1_BITS);
    /* Double tmp12 in place; it is used doubled in the next output. */
4013
0
    tmp12 += tmp12;
4014
0
    dataptr[DCTSIZE*4] = (DCTELEM)
4015
0
      DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
4016
0
        MULTIPLY(tmp11 - tmp12, FIX(0.559380511)),  /* c8 */
4017
0
        CONST_BITS+PASS1_BITS);
4018
0
    tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961));    /* c6 */
4019
0
    dataptr[DCTSIZE*2] = (DCTELEM)
4020
0
      DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)),  /* c2-c6 */
4021
0
        CONST_BITS+PASS1_BITS);
4022
0
    dataptr[DCTSIZE*6] = (DCTELEM)
4023
0
      DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)),  /* c2+c6 */
4024
0
        CONST_BITS+PASS1_BITS);
4025
4026
    /* Odd part */
4027
4028
0
    tmp10 = tmp0 + tmp4;
4029
0
    tmp11 = tmp1 - tmp3;
4030
0
    dataptr[DCTSIZE*5] = (DCTELEM)
4031
0
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)),  /* 32/25 */
4032
0
        CONST_BITS+PASS1_BITS);
    /* From here on tmp2 holds the already-multiplied (not yet descaled) term. */
4033
0
    tmp2 = MULTIPLY(tmp2, FIX(1.28));                     /* 32/25 */
4034
0
    dataptr[DCTSIZE*1] = (DCTELEM)
4035
0
      DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) +          /* c1 */
4036
0
        MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 +   /* c3 */
4037
0
        MULTIPLY(tmp3, FIX(0.821810588)) +          /* c7 */
4038
0
        MULTIPLY(tmp4, FIX(0.283176630)),           /* c9 */
4039
0
        CONST_BITS+PASS1_BITS);
4040
0
    tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) -     /* (c3+c7)/2 */
4041
0
      MULTIPLY(tmp1 + tmp3, FIX(0.752365123));      /* (c1-c9)/2 */
4042
0
    tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) +   /* (c3-c7)/2 */
4043
0
      MULTIPLY(tmp11, FIX(0.64)) - tmp2;            /* 16/25 */
4044
0
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
4045
0
    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
4046
4047
0
    dataptr++;      /* advance pointer to next column */
4048
0
    wsptr++;      /* advance pointer to next column */
4049
0
  }
4050
0
}
4051
4052
4053
/*
4054
 * Perform the forward DCT on a 4x8 sample block.
4055
 *
4056
 * 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
 *
 * data        - output coefficient block.  All DCTSIZE2 entries are zeroed
 *               first; pass 1 fills columns 0..3 of DCTSIZE rows and
 *               pass 2 transforms those 4 columns in place.
 * sample_data - array of sample rows; rows 0..DCTSIZE-1 are read.
 * start_col   - index of the first sample used in each row; samples
 *               start_col .. start_col+3 are read.
4057
 */
4058
4059
GLOBAL(void)
4060
jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4061
0
{
4062
0
  INT32 tmp0, tmp1, tmp2, tmp3;
4063
0
  INT32 tmp10, tmp11, tmp12, tmp13;
4064
0
  INT32 z1;
4065
0
  DCTELEM *dataptr;
4066
0
  JSAMPROW elemptr;
4067
0
  int ctr;
4068
0
  SHIFT_TEMPS
4069
4070
  /* Pre-zero output coefficient block. */
4071
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4072
4073
  /* Pass 1: process rows.
4074
   * Note results are scaled up by sqrt(8) compared to a true DCT;
4075
   * furthermore, we scale the results by 2**PASS1_BITS.
4076
   * We must also scale the output by 8/4 = 2, which we add here.
4077
   * 4-point FDCT kernel,
4078
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4079
   */
4080
4081
0
  dataptr = data;
4082
0
  for (ctr = 0; ctr < DCTSIZE; ctr++) {
4083
0
    elemptr = sample_data[ctr] + start_col;
4084
4085
    /* Even part */
4086
4087
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
4088
0
    tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
4089
4090
0
    tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
4091
0
    tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
4092
4093
    /* Apply unsigned->signed conversion. */
    /* The shift count PASS1_BITS+1 carries the extra factor of 2 mentioned
     * in the pass 1 comment.  Only the DC term needs the CENTERJSAMPLE
     * offset; in the other outputs the sample offsets cancel.
     */
4094
0
    dataptr[0] = (DCTELEM)
4095
0
      ((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
4096
0
    dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
4097
4098
    /* Odd part */
4099
4100
0
    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
4101
    /* Add fudge factor here for final descale. */
    /* The added constant is half the divisor of the right shift by
     * CONST_BITS-PASS1_BITS-1 used below, and is shared by both outputs.
     */
4102
0
    tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
4103
4104
0
    dataptr[1] = (DCTELEM)
4105
0
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4106
0
      CONST_BITS-PASS1_BITS-1);
4107
0
    dataptr[3] = (DCTELEM)
4108
0
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4109
0
      CONST_BITS-PASS1_BITS-1);
4110
4111
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
4112
0
  }
4113
4114
  /* Pass 2: process columns.
4115
   * We remove the PASS1_BITS scaling, but leave the results scaled up
4116
   * by an overall factor of 8.
4117
   * 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4118
   */
4119
4120
0
  dataptr = data;
  /* One iteration per column (4 columns were produced by pass 1). */
4121
0
  for (ctr = 0; ctr < 4; ctr++) {
4122
    /* Even part per LL&M figure 1 --- note that published figure is faulty;
4123
     * rotator "c1" should be "c6".
4124
     */
4125
4126
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
4127
0
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
4128
0
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
4129
0
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
4130
4131
    /* Add fudge factor here for final descale. */
    /* tmp10 carries the rounding term (half of 2**PASS1_BITS) for both the
     * row 0 and the row 4 output below.
     */
4132
0
    tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
4133
0
    tmp12 = tmp0 - tmp3;
4134
0
    tmp11 = tmp1 + tmp2;
4135
0
    tmp13 = tmp1 - tmp2;
4136
    /* tmp0..tmp3 now become the differences of the same row pairs; they
     * must be taken before any dataptr[] row is overwritten below.
     */
4137
0
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
4138
0
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
4139
0
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
4140
0
    tmp3 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
4141
4142
0
    dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
4143
0
    dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
4144
4145
0
    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);       /* c6 */
4146
    /* Add fudge factor here for final descale. */
4147
0
    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4148
4149
0
    dataptr[DCTSIZE*2] = (DCTELEM)
4150
0
      RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), /* c2-c6 */
4151
0
      CONST_BITS+PASS1_BITS);
4152
0
    dataptr[DCTSIZE*6] = (DCTELEM)
4153
0
      RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), /* c2+c6 */
4154
0
      CONST_BITS+PASS1_BITS);
4155
4156
    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
4157
     * i0..i3 in the paper are tmp0..tmp3 here.
4158
     */
4159
4160
0
    tmp12 = tmp0 + tmp2;
4161
0
    tmp13 = tmp1 + tmp3;
4162
4163
0
    z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602);       /*  c3 */
4164
    /* Add fudge factor here for final descale. */
    /* The rounding term enters once here and reaches each of the four odd
     * outputs exactly once, through tmp12 or tmp13.
     */
4165
0
    z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4166
4167
0
    tmp12 = MULTIPLY(tmp12, - FIX_0_390180644);          /* -c3+c5 */
4168
0
    tmp13 = MULTIPLY(tmp13, - FIX_1_961570560);          /* -c3-c5 */
4169
0
    tmp12 += z1;
4170
0
    tmp13 += z1;
4171
4172
0
    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223);       /* -c3+c7 */
4173
0
    tmp0 = MULTIPLY(tmp0, FIX_1_501321110);              /*  c1+c3-c5-c7 */
4174
0
    tmp3 = MULTIPLY(tmp3, FIX_0_298631336);              /* -c1+c3+c5-c7 */
4175
0
    tmp0 += z1 + tmp12;
4176
0
    tmp3 += z1 + tmp13;
4177
4178
0
    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447);       /* -c1-c3 */
4179
0
    tmp1 = MULTIPLY(tmp1, FIX_3_072711026);              /*  c1+c3+c5-c7 */
4180
0
    tmp2 = MULTIPLY(tmp2, FIX_2_053119869);              /*  c1+c3-c5+c7 */
4181
0
    tmp1 += z1 + tmp13;
4182
0
    tmp2 += z1 + tmp12;
4183
4184
0
    dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0, CONST_BITS+PASS1_BITS);
4185
0
    dataptr[DCTSIZE*3] = (DCTELEM) RIGHT_SHIFT(tmp1, CONST_BITS+PASS1_BITS);
4186
0
    dataptr[DCTSIZE*5] = (DCTELEM) RIGHT_SHIFT(tmp2, CONST_BITS+PASS1_BITS);
4187
0
    dataptr[DCTSIZE*7] = (DCTELEM) RIGHT_SHIFT(tmp3, CONST_BITS+PASS1_BITS);
4188
4189
0
    dataptr++;      /* advance pointer to next column */
4190
0
  }
4191
0
}
4192
4193
4194
/*
4195
 * Perform the forward DCT on a 3x6 sample block.
4196
 *
4197
 * 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
 *
 * data        - output coefficient block.  All DCTSIZE2 entries are zeroed
 *               first; only rows 0..5 of columns 0..2 are written afterwards.
 * sample_data - array of sample rows; rows 0..5 are read.
 * start_col   - index of the first sample used in each row; samples
 *               start_col .. start_col+2 are read.
4198
 */
4199
4200
GLOBAL(void)
4201
jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4202
0
{
4203
0
  INT32 tmp0, tmp1, tmp2;
4204
0
  INT32 tmp10, tmp11, tmp12;
4205
0
  DCTELEM *dataptr;
4206
0
  JSAMPROW elemptr;
4207
0
  int ctr;
4208
0
  SHIFT_TEMPS
4209
4210
  /* Pre-zero output coefficient block. */
4211
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4212
4213
  /* Pass 1: process rows.
4214
   * Note results are scaled up by sqrt(8) compared to a true DCT;
4215
   * furthermore, we scale the results by 2**PASS1_BITS.
4216
   * We scale the results further by 2 as part of output adaption
4217
   * scaling for different DCT size.
4218
   * 3-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
4219
   */
4220
4221
0
  dataptr = data;
4222
0
  for (ctr = 0; ctr < 6; ctr++) {
4223
0
    elemptr = sample_data[ctr] + start_col;
4224
4225
    /* Even part */
4226
4227
0
    tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
4228
0
    tmp1 = GETJSAMPLE(elemptr[1]);
4229
4230
0
    tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
4231
4232
    /* Apply unsigned->signed conversion. */
    /* The extra factor of 2 shows up as +1 on the left shift here and as -1
     * on the descale count of the two outputs below.  Only the DC term needs
     * the CENTERJSAMPLE offset; in the other outputs the offsets cancel.
     */
4233
0
    dataptr[0] = (DCTELEM)
4234
0
      ((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
4235
0
    dataptr[2] = (DCTELEM)
4236
0
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
4237
0
        CONST_BITS-PASS1_BITS-1);
4238
4239
    /* Odd part */
4240
4241
0
    dataptr[1] = (DCTELEM)
4242
0
      DESCALE(MULTIPLY(tmp2, FIX(1.224744871)),               /* c1 */
4243
0
        CONST_BITS-PASS1_BITS-1);
4244
4245
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
4246
0
  }
4247
4248
  /* Pass 2: process columns.
4249
   * We remove the PASS1_BITS scaling, but leave the results scaled up
4250
   * by an overall factor of 8.
4251
   * We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
4252
   * fold into the constant multipliers (other part was done in pass 1):
4253
   * 6-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/12) * 16/9.
4254
   */
4255
4256
0
  dataptr = data;
  /* One iteration per column (3 columns were produced by pass 1). */
4257
0
  for (ctr = 0; ctr < 3; ctr++) {
4258
    /* Even part */
4259
    /* Intermediate row k is paired with row 5-k. */
4260
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*5];
4261
0
    tmp11 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*4];
4262
0
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
4263
4264
0
    tmp10 = tmp0 + tmp2;
4265
0
    tmp12 = tmp0 - tmp2;
4266
    /* tmp0..tmp2 now become the differences of the same row pairs; they
     * must be taken before any dataptr[] row is overwritten below.
     */
4267
0
    tmp0 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*5];
4268
0
    tmp1 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*4];
4269
0
    tmp2 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
4270
4271
0
    dataptr[DCTSIZE*0] = (DCTELEM)
4272
0
      DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)),         /* 16/9 */
4273
0
        CONST_BITS+PASS1_BITS);
4274
0
    dataptr[DCTSIZE*2] = (DCTELEM)
4275
0
      DESCALE(MULTIPLY(tmp12, FIX(2.177324216)),                 /* c2 */
4276
0
        CONST_BITS+PASS1_BITS);
4277
0
    dataptr[DCTSIZE*4] = (DCTELEM)
4278
0
      DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
4279
0
        CONST_BITS+PASS1_BITS);
4280
4281
    /* Odd part */
4282
4283
0
    tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829));             /* c5 */
4284
4285
0
    dataptr[DCTSIZE*1] = (DCTELEM)
4286
0
      DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)),   /* 16/9 */
4287
0
        CONST_BITS+PASS1_BITS);
4288
0
    dataptr[DCTSIZE*3] = (DCTELEM)
4289
0
      DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)),    /* 16/9 */
4290
0
        CONST_BITS+PASS1_BITS);
4291
0
    dataptr[DCTSIZE*5] = (DCTELEM)
4292
0
      DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)),   /* 16/9 */
4293
0
        CONST_BITS+PASS1_BITS);
4294
4295
0
    dataptr++;      /* advance pointer to next column */
4296
0
  }
4297
0
}
4298
4299
4300
/*
4301
 * Perform the forward DCT on a 2x4 sample block.
4302
 *
4303
 * 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
 *
 * data        - output coefficient block.  All DCTSIZE2 entries are zeroed
 *               first; only rows 0..3 of columns 0..1 are written afterwards.
 * sample_data - array of sample rows; rows 0..3 are read.
 * start_col   - index of the first sample used in each row; samples
 *               start_col and start_col+1 are read.
4304
 */
4305
4306
GLOBAL(void)
4307
jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4308
0
{
4309
0
  INT32 tmp0, tmp1;
4310
0
  INT32 tmp10, tmp11;
4311
0
  DCTELEM *dataptr;
4312
0
  JSAMPROW elemptr;
4313
0
  int ctr;
4314
0
  SHIFT_TEMPS
4315
4316
  /* Pre-zero output coefficient block. */
4317
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4318
4319
  /* Pass 1: process rows.
4320
   * Note results are scaled up by sqrt(8) compared to a true DCT.
4321
   */
  /* Unlike the larger kernels in this file, pass 1 here applies no
   * PASS1_BITS up-scaling: it stores the plain sum and difference.
   */
4322
4323
0
  dataptr = data;
4324
0
  for (ctr = 0; ctr < 4; ctr++) {
4325
0
    elemptr = sample_data[ctr] + start_col;
4326
4327
    /* Even part */
4328
4329
0
    tmp0 = GETJSAMPLE(elemptr[0]);
4330
0
    tmp1 = GETJSAMPLE(elemptr[1]);
4331
4332
    /* Apply unsigned->signed conversion. */
4333
0
    dataptr[0] = (DCTELEM) (tmp0 + tmp1 - 2 * CENTERJSAMPLE);
4334
4335
    /* Odd part */
4336
4337
0
    dataptr[1] = (DCTELEM) (tmp0 - tmp1);
4338
4339
0
    dataptr += DCTSIZE;   /* advance pointer to next row */
4340
0
  }
4341
4342
  /* Pass 2: process columns.
4343
   * We leave the results scaled up by an overall factor of 8.
4344
   * We must also scale the output by (8/2)*(8/4) = 2**3.
4345
   * 4-point FDCT kernel,
4346
   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4347
   */
4348
4349
0
  dataptr = data;
  /* One iteration per column (2 columns were produced by pass 1). */
4350
0
  for (ctr = 0; ctr < 2; ctr++) {
4351
    /* Even part */
4352
    /* Intermediate row k is paired with row 3-k. */
4353
0
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*3];
4354
0
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*2];
4355
4356
0
    tmp10 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*3];
4357
0
    tmp11 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*2];
4358
    /* The 2**3 output scaling is applied as a left shift by 3 here ... */
4359
0
    dataptr[DCTSIZE*0] = (DCTELEM) ((tmp0 + tmp1) << 3);
4360
0
    dataptr[DCTSIZE*2] = (DCTELEM) ((tmp0 - tmp1) << 3);
4361
4362
    /* Odd part */
    /* ... and as a right shift reduced by 3 (CONST_BITS-3) for the odd terms. */
4363
4364
0
    tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100);       /* c6 */
4365
    /* Add fudge factor here for final descale. */
    /* The added constant is half the divisor of the right shift by
     * CONST_BITS-3 used below, and is shared by both outputs.
     */
4366
0
    tmp0 += ONE << (CONST_BITS-3-1);
4367
4368
0
    dataptr[DCTSIZE*1] = (DCTELEM)
4369
0
      RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4370
0
      CONST_BITS-3);
4371
0
    dataptr[DCTSIZE*3] = (DCTELEM)
4372
0
      RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4373
0
      CONST_BITS-3);
4374
4375
0
    dataptr++;      /* advance pointer to next column */
4376
0
  }
4377
0
}
4378
4379
4380
/*
4381
 * Perform the forward DCT on a 1x2 sample block.
4382
 *
4383
 * 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
 *
 * data        - output coefficient block.  All DCTSIZE2 entries are zeroed
 *               first; only data[DCTSIZE*0] and data[DCTSIZE*1] are written
 *               afterwards.
 * sample_data - array of sample rows; rows 0 and 1 are read.
 * start_col   - index of the single sample read from each of those rows.
4384
 */
4385
4386
GLOBAL(void)
4387
jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4388
0
{
  /* The two input samples; note these are DCTELEM here, whereas the other
   * kernels in this part of the file use INT32 temporaries.
   */
4389
0
  DCTELEM tmp0, tmp1;
4390
4391
  /* Pre-zero output coefficient block. */
4392
0
  MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4393
4394
  /* Pass 1: empty. */
4395
4396
  /* Pass 2: process columns.
4397
   * We leave the results scaled up by an overall factor of 8.
4398
   * We must also scale the output by (8/1)*(8/2) = 2**5.
4399
   */
4400
4401
  /* Even part */
4402
4403
0
  tmp0 = GETJSAMPLE(sample_data[0][start_col]);
4404
0
  tmp1 = GETJSAMPLE(sample_data[1][start_col]);
4405
4406
  /* Apply unsigned->signed conversion. */
  /* Level-shifted sum of the two samples, scaled by 2**5 via the shift. */
4407
0
  data[DCTSIZE*0] = (tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5;
4408
4409
  /* Odd part */
  /* Difference of the two samples (sample offsets cancel), scaled by 2**5. */
4410
4411
0
  data[DCTSIZE*1] = (tmp0 - tmp1) << 5;
4412
0
}
4413
4414
#endif /* DCT_SCALING_SUPPORTED */
4415
#endif /* DCT_ISLOW_SUPPORTED */