/src/libjpeg-turbo.main/jfdctint.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*  | 
2  |  |  * jfdctint.c  | 
3  |  |  *  | 
4  |  |  * This file was part of the Independent JPEG Group's software:  | 
5  |  |  * Copyright (C) 1991-1996, Thomas G. Lane.  | 
6  |  |  * libjpeg-turbo Modifications:  | 
7  |  |  * Copyright (C) 2015, 2020, D. R. Commander.  | 
8  |  |  * For conditions of distribution and use, see the accompanying README.ijg  | 
9  |  |  * file.  | 
10  |  |  *  | 
11  |  |  * This file contains a slower but more accurate integer implementation of the  | 
12  |  |  * forward DCT (Discrete Cosine Transform).  | 
13  |  |  *  | 
14  |  |  * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT  | 
15  |  |  * on each column.  Direct algorithms are also available, but they are  | 
16  |  |  * much more complex and seem not to be any faster when reduced to code.  | 
17  |  |  *  | 
18  |  |  * This implementation is based on an algorithm described in  | 
19  |  |  *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT  | 
20  |  |  *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,  | 
21  |  |  *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.  | 
22  |  |  * The primary algorithm described there uses 11 multiplies and 29 adds.  | 
23  |  |  * We use their alternate method with 12 multiplies and 32 adds.  | 
24  |  |  * The advantage of this method is that no data path contains more than one  | 
25  |  |  * multiplication; this allows a very simple and accurate implementation in  | 
26  |  |  * scaled fixed-point arithmetic, with a minimal number of shifts.  | 
27  |  |  */  | 
28  |  |  | 
29  |  | #define JPEG_INTERNALS  | 
30  |  | #include "jinclude.h"  | 
31  |  | #include "jpeglib.h"  | 
32  |  | #include "jdct.h"               /* Private declarations for DCT subsystem */  | 
33  |  |  | 
34  |  | #ifdef DCT_ISLOW_SUPPORTED  | 
35  |  |  | 
36  |  |  | 
37  |  | /*  | 
38  |  |  * This module is specialized to the case DCTSIZE = 8.  | 
39  |  |  */  | 
40  |  |  | 
41  |  | #if DCTSIZE != 8  | 
42  |  |   Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */  | 
43  |  | #endif  | 
44  |  |  | 
45  |  |  | 
46  |  | /*  | 
47  |  |  * The poop on this scaling stuff is as follows:  | 
48  |  |  *  | 
49  |  |  * Each 1-D DCT step produces outputs which are a factor of sqrt(N)  | 
50  |  |  * larger than the true DCT outputs.  The final outputs are therefore  | 
51  |  |  * a factor of N larger than desired; since N=8 this can be cured by  | 
52  |  |  * a simple right shift at the end of the algorithm.  The advantage of  | 
53  |  |  * this arrangement is that we save two multiplications per 1-D DCT,  | 
54  |  |  * because the y0 and y4 outputs need not be divided by sqrt(N).  | 
55  |  |  * In the IJG code, this factor of 8 is removed by the quantization step  | 
56  |  |  * (in jcdctmgr.c), NOT in this module.  | 
57  |  |  *  | 
58  |  |  * We have to do addition and subtraction of the integer inputs, which  | 
59  |  |  * is no problem, and multiplication by fractional constants, which is  | 
60  |  |  * a problem to do in integer arithmetic.  We multiply all the constants  | 
61  |  |  * by CONST_SCALE and convert them to integer constants (thus retaining  | 
62  |  |  * CONST_BITS bits of precision in the constants).  After doing a  | 
63  |  |  * multiplication we have to divide the product by CONST_SCALE, with proper  | 
64  |  |  * rounding, to produce the correct output.  This division can be done  | 
65  |  |  * cheaply as a right shift of CONST_BITS bits.  We postpone shifting  | 
66  |  |  * as long as possible so that partial sums can be added together with  | 
67  |  |  * full fractional precision.  | 
68  |  |  *  | 
69  |  |  * The outputs of the first pass are scaled up by PASS1_BITS bits so that  | 
70  |  |  * they are represented to better-than-integral precision.  These outputs  | 
71  |  |  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word  | 
72  |  |  * with the recommended scaling.  (For 12-bit sample data, the intermediate  | 
73  |  |  * array is JLONG anyway.)  | 
74  |  |  *  | 
75  |  |  * To avoid overflow of the 32-bit intermediate results in pass 2, we must  | 
76  |  |  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis  | 
77  |  |  * shows that the values given below are the most effective.  | 
78  |  |  */  | 
79  |  |  | 
80  |  | #if BITS_IN_JSAMPLE == 8  | 
81  |  | #define CONST_BITS  13  | 
82  |  | #define PASS1_BITS  2  | 
83  |  | #else  | 
84  |  | #define CONST_BITS  13  | 
85  |  | #define PASS1_BITS  1           /* lose a little precision to avoid overflow */  | 
86  |  | #endif  | 
87  |  |  | 
88  |  | /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus  | 
89  |  |  * causing a lot of useless floating-point operations at run time.  | 
90  |  |  * To get around this we use the following pre-calculated constants.  | 
91  |  |  * If you change CONST_BITS you may want to add appropriate values.  | 
92  |  |  * (With a reasonable C compiler, you can just rely on the FIX() macro...)  | 
93  |  |  */  | 
94  |  |  | 
95  |  | #if CONST_BITS == 13  | 
96  |  | #define FIX_0_298631336  ((JLONG)2446)          /* FIX(0.298631336) */  | 
97  |  | #define FIX_0_390180644  ((JLONG)3196)          /* FIX(0.390180644) */  | 
98  |  | #define FIX_0_541196100  ((JLONG)4433)          /* FIX(0.541196100) */  | 
99  |  | #define FIX_0_765366865  ((JLONG)6270)          /* FIX(0.765366865) */  | 
100  |  | #define FIX_0_899976223  ((JLONG)7373)          /* FIX(0.899976223) */  | 
101  |  | #define FIX_1_175875602  ((JLONG)9633)          /* FIX(1.175875602) */  | 
102  |  | #define FIX_1_501321110  ((JLONG)12299)         /* FIX(1.501321110) */  | 
103  |  | #define FIX_1_847759065  ((JLONG)15137)         /* FIX(1.847759065) */  | 
104  |  | #define FIX_1_961570560  ((JLONG)16069)         /* FIX(1.961570560) */  | 
105  |  | #define FIX_2_053119869  ((JLONG)16819)         /* FIX(2.053119869) */  | 
106  |  | #define FIX_2_562915447  ((JLONG)20995)         /* FIX(2.562915447) */  | 
107  |  | #define FIX_3_072711026  ((JLONG)25172)         /* FIX(3.072711026) */  | 
108  |  | #else  | 
109  |  | #define FIX_0_298631336  FIX(0.298631336)  | 
110  |  | #define FIX_0_390180644  FIX(0.390180644)  | 
111  |  | #define FIX_0_541196100  FIX(0.541196100)  | 
112  |  | #define FIX_0_765366865  FIX(0.765366865)  | 
113  |  | #define FIX_0_899976223  FIX(0.899976223)  | 
114  |  | #define FIX_1_175875602  FIX(1.175875602)  | 
115  |  | #define FIX_1_501321110  FIX(1.501321110)  | 
116  |  | #define FIX_1_847759065  FIX(1.847759065)  | 
117  |  | #define FIX_1_961570560  FIX(1.961570560)  | 
118  |  | #define FIX_2_053119869  FIX(2.053119869)  | 
119  |  | #define FIX_2_562915447  FIX(2.562915447)  | 
120  |  | #define FIX_3_072711026  FIX(3.072711026)  | 
121  |  | #endif  | 
122  |  |  | 
123  |  |  | 
124  |  | /* Multiply an JLONG variable by an JLONG constant to yield an JLONG result.  | 
125  |  |  * For 8-bit samples with the recommended scaling, all the variable  | 
126  |  |  * and constant values involved are no more than 16 bits wide, so a  | 
127  |  |  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.  | 
128  |  |  * For 12-bit samples, a full 32-bit multiplication will be needed.  | 
129  |  |  */  | 
130  |  |  | 
131  |  | #if BITS_IN_JSAMPLE == 8  | 
132  | 0  | #define MULTIPLY(var, const)  MULTIPLY16C16(var, const)  | 
133  |  | #else  | 
134  |  | #define MULTIPLY(var, const)  ((var) * (const))  | 
135  |  | #endif  | 
136  |  |  | 
137  |  |  | 
138  |  | /*  | 
139  |  |  * Perform the forward DCT on one block of samples.  | 
140  |  |  */  | 
141  |  |  | 
142  |  | GLOBAL(void)  | 
143  |  | jpeg_fdct_islow(DCTELEM *data)  | 
144  | 0  | { | 
145  | 0  |   JLONG tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;  | 
146  | 0  |   JLONG tmp10, tmp11, tmp12, tmp13;  | 
147  | 0  |   JLONG z1, z2, z3, z4, z5;  | 
148  | 0  |   DCTELEM *dataptr;  | 
149  | 0  |   int ctr;  | 
150  | 0  |   SHIFT_TEMPS  | 
151  |  |  | 
152  |  |   /* Pass 1: process rows. */  | 
153  |  |   /* Note results are scaled up by sqrt(8) compared to a true DCT; */  | 
154  |  |   /* furthermore, we scale the results by 2**PASS1_BITS. */  | 
155  |  | 
  | 
156  | 0  |   dataptr = data;  | 
157  | 0  |   for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) { | 
158  | 0  |     tmp0 = dataptr[0] + dataptr[7];  | 
159  | 0  |     tmp7 = dataptr[0] - dataptr[7];  | 
160  | 0  |     tmp1 = dataptr[1] + dataptr[6];  | 
161  | 0  |     tmp6 = dataptr[1] - dataptr[6];  | 
162  | 0  |     tmp2 = dataptr[2] + dataptr[5];  | 
163  | 0  |     tmp5 = dataptr[2] - dataptr[5];  | 
164  | 0  |     tmp3 = dataptr[3] + dataptr[4];  | 
165  | 0  |     tmp4 = dataptr[3] - dataptr[4];  | 
166  |  |  | 
167  |  |     /* Even part per LL&M figure 1 --- note that published figure is faulty;  | 
168  |  |      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".  | 
169  |  |      */  | 
170  |  | 
  | 
171  | 0  |     tmp10 = tmp0 + tmp3;  | 
172  | 0  |     tmp13 = tmp0 - tmp3;  | 
173  | 0  |     tmp11 = tmp1 + tmp2;  | 
174  | 0  |     tmp12 = tmp1 - tmp2;  | 
175  |  | 
  | 
176  | 0  |     dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS);  | 
177  | 0  |     dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS);  | 
178  |  | 
  | 
179  | 0  |     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);  | 
180  | 0  |     dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),  | 
181  | 0  |                                   CONST_BITS - PASS1_BITS);  | 
182  | 0  |     dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),  | 
183  | 0  |                                   CONST_BITS - PASS1_BITS);  | 
184  |  |  | 
185  |  |     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).  | 
186  |  |      * cK represents cos(K*pi/16).  | 
187  |  |      * i0..i3 in the paper are tmp4..tmp7 here.  | 
188  |  |      */  | 
189  |  | 
  | 
190  | 0  |     z1 = tmp4 + tmp7;  | 
191  | 0  |     z2 = tmp5 + tmp6;  | 
192  | 0  |     z3 = tmp4 + tmp6;  | 
193  | 0  |     z4 = tmp5 + tmp7;  | 
194  | 0  |     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */  | 
195  |  | 
  | 
196  | 0  |     tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */  | 
197  | 0  |     tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */  | 
198  | 0  |     tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */  | 
199  | 0  |     tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */  | 
200  | 0  |     z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */  | 
201  | 0  |     z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */  | 
202  | 0  |     z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */  | 
203  | 0  |     z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */  | 
204  |  | 
  | 
205  | 0  |     z3 += z5;  | 
206  | 0  |     z4 += z5;  | 
207  |  | 
  | 
208  | 0  |     dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);  | 
209  | 0  |     dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);  | 
210  | 0  |     dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);  | 
211  | 0  |     dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);  | 
212  |  | 
  | 
213  | 0  |     dataptr += DCTSIZE;         /* advance pointer to next row */  | 
214  | 0  |   }  | 
215  |  |  | 
216  |  |   /* Pass 2: process columns.  | 
217  |  |    * We remove the PASS1_BITS scaling, but leave the results scaled up  | 
218  |  |    * by an overall factor of 8.  | 
219  |  |    */  | 
220  |  | 
  | 
221  | 0  |   dataptr = data;  | 
222  | 0  |   for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) { | 
223  | 0  |     tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];  | 
224  | 0  |     tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];  | 
225  | 0  |     tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];  | 
226  | 0  |     tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];  | 
227  | 0  |     tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];  | 
228  | 0  |     tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];  | 
229  | 0  |     tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];  | 
230  | 0  |     tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];  | 
231  |  |  | 
232  |  |     /* Even part per LL&M figure 1 --- note that published figure is faulty;  | 
233  |  |      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".  | 
234  |  |      */  | 
235  |  | 
  | 
236  | 0  |     tmp10 = tmp0 + tmp3;  | 
237  | 0  |     tmp13 = tmp0 - tmp3;  | 
238  | 0  |     tmp11 = tmp1 + tmp2;  | 
239  | 0  |     tmp12 = tmp1 - tmp2;  | 
240  |  | 
  | 
241  | 0  |     dataptr[DCTSIZE * 0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS);  | 
242  | 0  |     dataptr[DCTSIZE * 4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS);  | 
243  |  | 
  | 
244  | 0  |     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);  | 
245  | 0  |     dataptr[DCTSIZE * 2] =  | 
246  | 0  |       (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),  | 
247  | 0  |                        CONST_BITS + PASS1_BITS);  | 
248  | 0  |     dataptr[DCTSIZE * 6] =  | 
249  | 0  |       (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),  | 
250  | 0  |                        CONST_BITS + PASS1_BITS);  | 
251  |  |  | 
252  |  |     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).  | 
253  |  |      * cK represents cos(K*pi/16).  | 
254  |  |      * i0..i3 in the paper are tmp4..tmp7 here.  | 
255  |  |      */  | 
256  |  | 
  | 
257  | 0  |     z1 = tmp4 + tmp7;  | 
258  | 0  |     z2 = tmp5 + tmp6;  | 
259  | 0  |     z3 = tmp4 + tmp6;  | 
260  | 0  |     z4 = tmp5 + tmp7;  | 
261  | 0  |     z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */  | 
262  |  | 
  | 
263  | 0  |     tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */  | 
264  | 0  |     tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */  | 
265  | 0  |     tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */  | 
266  | 0  |     tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */  | 
267  | 0  |     z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */  | 
268  | 0  |     z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */  | 
269  | 0  |     z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */  | 
270  | 0  |     z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */  | 
271  |  | 
  | 
272  | 0  |     z3 += z5;  | 
273  | 0  |     z4 += z5;  | 
274  |  | 
  | 
275  | 0  |     dataptr[DCTSIZE * 7] = (DCTELEM)DESCALE(tmp4 + z1 + z3,  | 
276  | 0  |                                             CONST_BITS + PASS1_BITS);  | 
277  | 0  |     dataptr[DCTSIZE * 5] = (DCTELEM)DESCALE(tmp5 + z2 + z4,  | 
278  | 0  |                                             CONST_BITS + PASS1_BITS);  | 
279  | 0  |     dataptr[DCTSIZE * 3] = (DCTELEM)DESCALE(tmp6 + z2 + z3,  | 
280  | 0  |                                             CONST_BITS + PASS1_BITS);  | 
281  | 0  |     dataptr[DCTSIZE * 1] = (DCTELEM)DESCALE(tmp7 + z1 + z4,  | 
282  | 0  |                                             CONST_BITS + PASS1_BITS);  | 
283  |  | 
  | 
284  | 0  |     dataptr++;                  /* advance pointer to next column */  | 
285  | 0  |   }  | 
286  | 0  | }  | 
287  |  |  | 
288  |  | #endif /* DCT_ISLOW_SUPPORTED */  |