/src/ffmpeg/libavcodec/jrevdct.c
Line | Count | Source |
1 | | /* |
2 | | * This file is part of the Independent JPEG Group's software. |
3 | | * |
4 | | * The authors make NO WARRANTY or representation, either express or implied, |
5 | | * with respect to this software, its quality, accuracy, merchantability, or |
6 | | * fitness for a particular purpose. This software is provided "AS IS", and |
7 | | * you, its user, assume the entire risk as to its quality and accuracy. |
8 | | * |
9 | | * This software is copyright (C) 1991, 1992, Thomas G. Lane. |
10 | | * All Rights Reserved except as specified below. |
11 | | * |
12 | | * Permission is hereby granted to use, copy, modify, and distribute this |
13 | | * software (or portions thereof) for any purpose, without fee, subject to |
14 | | * these conditions: |
15 | | * (1) If any part of the source code for this software is distributed, then |
16 | | * this README file must be included, with this copyright and no-warranty |
17 | | * notice unaltered; and any additions, deletions, or changes to the original |
18 | | * files must be clearly indicated in accompanying documentation. |
19 | | * (2) If only executable code is distributed, then the accompanying |
20 | | * documentation must state that "this software is based in part on the work |
21 | | * of the Independent JPEG Group". |
22 | | * (3) Permission for use of this software is granted only if the user accepts |
23 | | * full responsibility for any undesirable consequences; the authors accept |
24 | | * NO LIABILITY for damages of any kind. |
25 | | * |
26 | | * These conditions apply to any software derived from or based on the IJG |
27 | | * code, not just to the unmodified library. If you use our work, you ought |
28 | | * to acknowledge us. |
29 | | * |
30 | | * Permission is NOT granted for the use of any IJG author's name or company |
31 | | * name in advertising or publicity relating to this software or products |
32 | | * derived from it. This software may be referred to only as "the Independent |
33 | | * JPEG Group's software". |
34 | | * |
35 | | * We specifically permit and encourage the use of this software as the basis |
36 | | * of commercial products, provided that all warranty or liability claims are |
37 | | * assumed by the product vendor. |
38 | | * |
39 | | * This file contains the basic inverse-DCT transformation subroutine. |
40 | | * |
41 | | * This implementation is based on an algorithm described in |
42 | | * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT |
43 | | * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics, |
44 | | * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991. |
45 | | * The primary algorithm described there uses 11 multiplies and 29 adds. |
46 | | * We use their alternate method with 12 multiplies and 32 adds. |
47 | | * The advantage of this method is that no data path contains more than one |
48 | | * multiplication; this allows a very simple and accurate implementation in |
49 | | * scaled fixed-point arithmetic, with a minimal number of shifts. |
50 | | * |
51 | | * I've made lots of modifications to attempt to take advantage of the |
52 | | * sparse nature of the DCT matrices we're getting. Although the logic |
53 | | * is cumbersome, it's straightforward and the resulting code is much |
54 | | * faster. |
55 | | * |
56 | | * A better way to do this would be to pass in the DCT block as a sparse |
57 | | * matrix, perhaps with the difference cases encoded. |
58 | | */ |
59 | | |
60 | | /** |
61 | | * @file |
62 | | * Independent JPEG Group's LLM idct. |
63 | | */ |
64 | | |
65 | | #include <stddef.h> |
66 | | #include <stdint.h> |
67 | | |
68 | | #include "libavutil/intreadwrite.h" |
69 | | |
70 | | #include "dct.h" |
71 | | #include "idctdsp.h" |
72 | | |
73 | | #define EIGHT_BIT_SAMPLES |
74 | | |
75 | 2.71G | #define DCTSIZE 8 |
76 | | #define DCTSIZE2 64 |
77 | | |
78 | | #define GLOBAL |
79 | | |
80 | 1.46G | #define RIGHT_SHIFT(x, n) ((x) >> (n)) |
81 | | |
82 | | typedef int16_t DCTBLOCK[DCTSIZE2]; |
83 | | |
84 | 754M | #define CONST_BITS 13 |
85 | | |
86 | | /* |
87 | | * This routine is specialized to the case DCTSIZE = 8. |
88 | | */ |
89 | | |
90 | | #if DCTSIZE != 8 |
91 | | Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ |
92 | | #endif |
93 | | |
94 | | |
95 | | /* |
96 | | * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT |
97 | | * on each column. Direct algorithms are also available, but they are |
98 | | * much more complex and seem not to be any faster when reduced to code. |
99 | | * |
100 | | * The poop on this scaling stuff is as follows: |
101 | | * |
102 | | * Each 1-D IDCT step produces outputs which are a factor of sqrt(N) |
103 | | * larger than the true IDCT outputs. The final outputs are therefore |
104 | | * a factor of N larger than desired; since N=8 this can be cured by |
105 | | * a simple right shift at the end of the algorithm. The advantage of |
106 | | * this arrangement is that we save two multiplications per 1-D IDCT, |
107 | | * because the y0 and y4 inputs need not be divided by sqrt(N). |
108 | | * |
109 | | * We have to do addition and subtraction of the integer inputs, which |
110 | | * is no problem, and multiplication by fractional constants, which is |
111 | | * a problem to do in integer arithmetic. We multiply all the constants |
112 | | * by CONST_SCALE and convert them to integer constants (thus retaining |
113 | | * CONST_BITS bits of precision in the constants). After doing a |
114 | | * multiplication we have to divide the product by CONST_SCALE, with proper |
115 | | * rounding, to produce the correct output. This division can be done |
116 | | * cheaply as a right shift of CONST_BITS bits. We postpone shifting |
117 | | * as long as possible so that partial sums can be added together with |
118 | | * full fractional precision. |
119 | | * |
120 | | * The outputs of the first pass are scaled up by PASS1_BITS bits so that |
121 | | * they are represented to better-than-integral precision. These outputs |
122 | | * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word |
123 | | * with the recommended scaling. (To scale up 12-bit sample data further, an |
124 | | * intermediate int32 array would be needed.) |
125 | | * |
126 | | * To avoid overflow of the 32-bit intermediate results in pass 2, we must |
127 | | * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis |
128 | | * shows that the values given below are the most effective. |
129 | | */ |
130 | | |
131 | | #ifdef EIGHT_BIT_SAMPLES |
132 | 272M | #define PASS1_BITS 2 |
133 | | #else |
134 | | #define PASS1_BITS 1 /* lose a little precision to avoid overflow */ |
135 | | #endif |
136 | | |
137 | 351M | #define ONE ((int32_t) 1) |
138 | | |
139 | 351M | #define CONST_SCALE (ONE << CONST_BITS) |
140 | | |
141 | | /* Convert a positive real constant to an integer scaled by CONST_SCALE. |
142 | | * IMPORTANT: if your compiler doesn't do this arithmetic at compile time, |
143 | | * you will pay a significant penalty in run time. In that case, figure |
144 | | * the correct integer constant values and insert them by hand. |
145 | | */ |
146 | | |
147 | | /* Actually FIX is no longer used, we precomputed them all */ |
148 | | #define FIX(x) ((int32_t) ((x) * CONST_SCALE + 0.5)) |
149 | | |
150 | | /* Descale and correctly round an int32_t value that's scaled by N bits. |
151 | | * We assume RIGHT_SHIFT rounds towards minus infinity, so adding |
152 | | * the fudge factor is correct for either sign of X. |
153 | | */ |
154 | | |
155 | 1.46G | #define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n) |
156 | | |
157 | | /* Multiply an int32_t variable by an int32_t constant to yield an int32_t result. |
158 | | * For 8-bit samples with the recommended scaling, all the variable |
159 | | * and constant values involved are no more than 16 bits wide, so a |
160 | | * 16x16->32 bit multiply can be used instead of a full 32x32 multiply; |
161 | | * this provides a useful speedup on many machines. |
162 | | * There is no way to specify a 16x16->32 multiply in portable C, but |
163 | | * some C compilers will do the right thing if you provide the correct |
164 | | * combination of casts. |
165 | | * NB: for 12-bit samples, a full 32-bit multiplication will be needed. |
166 | | */ |
167 | | |
168 | | #ifdef EIGHT_BIT_SAMPLES |
169 | | #ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */ |
170 | | #define MULTIPLY(var,const) (((int16_t) (var)) * ((int16_t) (const))) |
171 | | #endif |
172 | | #ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */ |
173 | | #define MULTIPLY(var,const) (((int16_t) (var)) * ((int32_t) (const))) |
174 | | #endif |
175 | | #endif |
176 | | |
177 | | #ifndef MULTIPLY /* default definition */ |
178 | 540M | #define MULTIPLY(var,const) ((var) * (const)) |
179 | | #endif |
180 | | |
181 | | |
182 | | /* |
183 | | Unlike our decoder where we approximate the FIXes, we need to use exact |
184 | | ones here or successive P-frames will drift too much with Reference frame coding |
185 | | */ |
186 | | #define FIX_0_211164243 1730 |
187 | | #define FIX_0_275899380 2260 |
188 | | #define FIX_0_298631336 2446 |
189 | | #define FIX_0_390180644 3196 |
190 | | #define FIX_0_509795579 4176 |
191 | | #define FIX_0_541196100 4433 |
192 | | #define FIX_0_601344887 4926 |
193 | | #define FIX_0_765366865 6270 |
194 | | #define FIX_0_785694958 6436 |
195 | | #define FIX_0_899976223 7373 |
196 | | #define FIX_1_061594337 8697 |
197 | | #define FIX_1_111140466 9102 |
198 | | #define FIX_1_175875602 9633 |
199 | | #define FIX_1_306562965 10703 |
200 | | #define FIX_1_387039845 11363 |
201 | | #define FIX_1_451774981 11893 |
202 | | #define FIX_1_501321110 12299 |
203 | | #define FIX_1_662939225 13623 |
204 | | #define FIX_1_847759065 15137 |
205 | | #define FIX_1_961570560 16069 |
206 | | #define FIX_2_053119869 16819 |
207 | | #define FIX_2_172734803 17799 |
208 | | #define FIX_2_562915447 20995 |
209 | | #define FIX_3_072711026 25172 |
210 | | |
211 | | /* |
212 | | * Perform the inverse DCT on one block of coefficients. |
213 | | */ |
214 | | |
215 | | void ff_j_rev_dct(DCTBLOCK data) |
216 | 19.6M | { |
217 | 19.6M | int32_t tmp0, tmp1, tmp2, tmp3; |
218 | 19.6M | int32_t tmp10, tmp11, tmp12, tmp13; |
219 | 19.6M | int32_t z1, z2, z3, z4, z5; |
220 | 19.6M | int32_t d0, d1, d2, d3, d4, d5, d6, d7; |
221 | 19.6M | register int16_t *dataptr; |
222 | 19.6M | int rowctr; |
223 | | |
224 | | /* Pass 1: process rows. */ |
225 | | /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ |
226 | | /* furthermore, we scale the results by 2**PASS1_BITS. */ |
227 | | |
228 | 19.6M | dataptr = data; |
229 | | |
230 | 177M | for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { |
231 | | /* Due to quantization, we will usually find that many of the input |
232 | | * coefficients are zero, especially the AC terms. We can exploit this |
233 | | * by short-circuiting the IDCT calculation for any row in which all |
234 | | * the AC terms are zero. In that case each output is equal to the |
235 | | * DC coefficient (with scale factor as needed). |
236 | | * With typical images and quantization tables, half or more of the |
237 | | * row DCT calculations can be simplified this way. |
238 | | */ |
239 | | |
240 | 157M | register uint8_t *idataptr = (uint8_t*)dataptr; |
241 | | |
242 | | /* WARNING: we do the same permutation as MMX idct to simplify the |
243 | | video core */ |
244 | 157M | d0 = dataptr[0]; |
245 | 157M | d2 = dataptr[1]; |
246 | 157M | d4 = dataptr[2]; |
247 | 157M | d6 = dataptr[3]; |
248 | 157M | d1 = dataptr[4]; |
249 | 157M | d3 = dataptr[5]; |
250 | 157M | d5 = dataptr[6]; |
251 | 157M | d7 = dataptr[7]; |
252 | | |
253 | 157M | if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) { |
254 | | /* AC terms all zero */ |
255 | 139M | if (d0) { |
256 | | /* Compute a 32 bit value to assign. */ |
257 | 8.62M | int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS)); |
258 | 8.62M | register unsigned v = (dcval & 0xffff) | ((uint32_t)dcval << 16); |
259 | | |
260 | 8.62M | AV_WN32A(&idataptr[ 0], v); |
261 | 8.62M | AV_WN32A(&idataptr[ 4], v); |
262 | 8.62M | AV_WN32A(&idataptr[ 8], v); |
263 | 8.62M | AV_WN32A(&idataptr[12], v); |
264 | 8.62M | } |
265 | | |
266 | 139M | dataptr += DCTSIZE; /* advance pointer to next row */ |
267 | 139M | continue; |
268 | 139M | } |
269 | | |
270 | | /* Even part: reverse the even part of the forward DCT. */ |
271 | | /* The rotator is sqrt(2)*c(-6). */ |
272 | 18.1M | { |
273 | 18.1M | if (d6) { |
274 | 4.36M | if (d2) { |
275 | | /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ |
276 | 3.43M | z1 = MULTIPLY(d2 + d6, FIX_0_541196100); |
277 | 3.43M | tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); |
278 | 3.43M | tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); |
279 | | |
280 | 3.43M | tmp0 = (d0 + d4) * CONST_SCALE; |
281 | 3.43M | tmp1 = (d0 - d4) * CONST_SCALE; |
282 | | |
283 | 3.43M | tmp10 = tmp0 + tmp3; |
284 | 3.43M | tmp13 = tmp0 - tmp3; |
285 | 3.43M | tmp11 = tmp1 + tmp2; |
286 | 3.43M | tmp12 = tmp1 - tmp2; |
287 | 3.43M | } else { |
288 | | /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ |
289 | 932k | tmp2 = MULTIPLY(-d6, FIX_1_306562965); |
290 | 932k | tmp3 = MULTIPLY(d6, FIX_0_541196100); |
291 | | |
292 | 932k | tmp0 = (d0 + d4) * CONST_SCALE; |
293 | 932k | tmp1 = (d0 - d4) * CONST_SCALE; |
294 | | |
295 | 932k | tmp10 = tmp0 + tmp3; |
296 | 932k | tmp13 = tmp0 - tmp3; |
297 | 932k | tmp11 = tmp1 + tmp2; |
298 | 932k | tmp12 = tmp1 - tmp2; |
299 | 932k | } |
300 | 13.8M | } else { |
301 | 13.8M | if (d2) { |
302 | | /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ |
303 | 7.30M | tmp2 = MULTIPLY(d2, FIX_0_541196100); |
304 | 7.30M | tmp3 = MULTIPLY(d2, FIX_1_306562965); |
305 | | |
306 | 7.30M | tmp0 = (d0 + d4) * CONST_SCALE; |
307 | 7.30M | tmp1 = (d0 - d4) * CONST_SCALE; |
308 | | |
309 | 7.30M | tmp10 = tmp0 + tmp3; |
310 | 7.30M | tmp13 = tmp0 - tmp3; |
311 | 7.30M | tmp11 = tmp1 + tmp2; |
312 | 7.30M | tmp12 = tmp1 - tmp2; |
313 | 7.30M | } else { |
314 | | /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ |
315 | 6.51M | tmp10 = tmp13 = (d0 + d4) * CONST_SCALE; |
316 | 6.51M | tmp11 = tmp12 = (d0 - d4) * CONST_SCALE; |
317 | 6.51M | } |
318 | 13.8M | } |
319 | | |
320 | | /* Odd part per figure 8; the matrix is unitary and hence its |
321 | | * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. |
322 | | */ |
323 | | |
324 | 18.1M | if (d7) { |
325 | 3.99M | if (d5) { |
326 | 3.05M | if (d3) { |
327 | 2.69M | if (d1) { |
328 | | /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */ |
329 | 2.44M | z1 = d7 + d1; |
330 | 2.44M | z2 = d5 + d3; |
331 | 2.44M | z3 = d7 + d3; |
332 | 2.44M | z4 = d5 + d1; |
333 | 2.44M | z5 = MULTIPLY(z3 + z4, FIX_1_175875602); |
334 | | |
335 | 2.44M | tmp0 = MULTIPLY(d7, FIX_0_298631336); |
336 | 2.44M | tmp1 = MULTIPLY(d5, FIX_2_053119869); |
337 | 2.44M | tmp2 = MULTIPLY(d3, FIX_3_072711026); |
338 | 2.44M | tmp3 = MULTIPLY(d1, FIX_1_501321110); |
339 | 2.44M | z1 = MULTIPLY(-z1, FIX_0_899976223); |
340 | 2.44M | z2 = MULTIPLY(-z2, FIX_2_562915447); |
341 | 2.44M | z3 = MULTIPLY(-z3, FIX_1_961570560); |
342 | 2.44M | z4 = MULTIPLY(-z4, FIX_0_390180644); |
343 | | |
344 | 2.44M | z3 += z5; |
345 | 2.44M | z4 += z5; |
346 | | |
347 | 2.44M | tmp0 += z1 + z3; |
348 | 2.44M | tmp1 += z2 + z4; |
349 | 2.44M | tmp2 += z2 + z3; |
350 | 2.44M | tmp3 += z1 + z4; |
351 | 2.44M | } else { |
352 | | /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */ |
353 | 251k | z2 = d5 + d3; |
354 | 251k | z3 = d7 + d3; |
355 | 251k | z5 = MULTIPLY(z3 + d5, FIX_1_175875602); |
356 | | |
357 | 251k | tmp0 = MULTIPLY(d7, FIX_0_298631336); |
358 | 251k | tmp1 = MULTIPLY(d5, FIX_2_053119869); |
359 | 251k | tmp2 = MULTIPLY(d3, FIX_3_072711026); |
360 | 251k | z1 = MULTIPLY(-d7, FIX_0_899976223); |
361 | 251k | z2 = MULTIPLY(-z2, FIX_2_562915447); |
362 | 251k | z3 = MULTIPLY(-z3, FIX_1_961570560); |
363 | 251k | z4 = MULTIPLY(-d5, FIX_0_390180644); |
364 | | |
365 | 251k | z3 += z5; |
366 | 251k | z4 += z5; |
367 | | |
368 | 251k | tmp0 += z1 + z3; |
369 | 251k | tmp1 += z2 + z4; |
370 | 251k | tmp2 += z2 + z3; |
371 | 251k | tmp3 = z1 + z4; |
372 | 251k | } |
373 | 2.69M | } else { |
374 | 360k | if (d1) { |
375 | | /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */ |
376 | 149k | z1 = d7 + d1; |
377 | 149k | z4 = d5 + d1; |
378 | 149k | z5 = MULTIPLY(d7 + z4, FIX_1_175875602); |
379 | | |
380 | 149k | tmp0 = MULTIPLY(d7, FIX_0_298631336); |
381 | 149k | tmp1 = MULTIPLY(d5, FIX_2_053119869); |
382 | 149k | tmp3 = MULTIPLY(d1, FIX_1_501321110); |
383 | 149k | z1 = MULTIPLY(-z1, FIX_0_899976223); |
384 | 149k | z2 = MULTIPLY(-d5, FIX_2_562915447); |
385 | 149k | z3 = MULTIPLY(-d7, FIX_1_961570560); |
386 | 149k | z4 = MULTIPLY(-z4, FIX_0_390180644); |
387 | | |
388 | 149k | z3 += z5; |
389 | 149k | z4 += z5; |
390 | | |
391 | 149k | tmp0 += z1 + z3; |
392 | 149k | tmp1 += z2 + z4; |
393 | 149k | tmp2 = z2 + z3; |
394 | 149k | tmp3 += z1 + z4; |
395 | 211k | } else { |
396 | | /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */ |
397 | 211k | tmp0 = MULTIPLY(-d7, FIX_0_601344887); |
398 | 211k | z1 = MULTIPLY(-d7, FIX_0_899976223); |
399 | 211k | z3 = MULTIPLY(-d7, FIX_1_961570560); |
400 | 211k | tmp1 = MULTIPLY(-d5, FIX_0_509795579); |
401 | 211k | z2 = MULTIPLY(-d5, FIX_2_562915447); |
402 | 211k | z4 = MULTIPLY(-d5, FIX_0_390180644); |
403 | 211k | z5 = MULTIPLY(d5 + d7, FIX_1_175875602); |
404 | | |
405 | 211k | z3 += z5; |
406 | 211k | z4 += z5; |
407 | | |
408 | 211k | tmp0 += z3; |
409 | 211k | tmp1 += z4; |
410 | 211k | tmp2 = z2 + z3; |
411 | 211k | tmp3 = z1 + z4; |
412 | 211k | } |
413 | 360k | } |
414 | 3.05M | } else { |
415 | 942k | if (d3) { |
416 | 267k | if (d1) { |
417 | | /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */ |
418 | 157k | z1 = d7 + d1; |
419 | 157k | z3 = d7 + d3; |
420 | 157k | z5 = MULTIPLY(z3 + d1, FIX_1_175875602); |
421 | | |
422 | 157k | tmp0 = MULTIPLY(d7, FIX_0_298631336); |
423 | 157k | tmp2 = MULTIPLY(d3, FIX_3_072711026); |
424 | 157k | tmp3 = MULTIPLY(d1, FIX_1_501321110); |
425 | 157k | z1 = MULTIPLY(-z1, FIX_0_899976223); |
426 | 157k | z2 = MULTIPLY(-d3, FIX_2_562915447); |
427 | 157k | z3 = MULTIPLY(-z3, FIX_1_961570560); |
428 | 157k | z4 = MULTIPLY(-d1, FIX_0_390180644); |
429 | | |
430 | 157k | z3 += z5; |
431 | 157k | z4 += z5; |
432 | | |
433 | 157k | tmp0 += z1 + z3; |
434 | 157k | tmp1 = z2 + z4; |
435 | 157k | tmp2 += z2 + z3; |
436 | 157k | tmp3 += z1 + z4; |
437 | 157k | } else { |
438 | | /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */ |
439 | 110k | z3 = d7 + d3; |
440 | | |
441 | 110k | tmp0 = MULTIPLY(-d7, FIX_0_601344887); |
442 | 110k | z1 = MULTIPLY(-d7, FIX_0_899976223); |
443 | 110k | tmp2 = MULTIPLY(d3, FIX_0_509795579); |
444 | 110k | z2 = MULTIPLY(-d3, FIX_2_562915447); |
445 | 110k | z5 = MULTIPLY(z3, FIX_1_175875602); |
446 | 110k | z3 = MULTIPLY(-z3, FIX_0_785694958); |
447 | | |
448 | 110k | tmp0 += z3; |
449 | 110k | tmp1 = z2 + z5; |
450 | 110k | tmp2 += z3; |
451 | 110k | tmp3 = z1 + z5; |
452 | 110k | } |
453 | 674k | } else { |
454 | 674k | if (d1) { |
455 | | /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */ |
456 | 215k | z1 = d7 + d1; |
457 | 215k | z5 = MULTIPLY(z1, FIX_1_175875602); |
458 | | |
459 | 215k | z1 = MULTIPLY(z1, FIX_0_275899380); |
460 | 215k | z3 = MULTIPLY(-d7, FIX_1_961570560); |
461 | 215k | tmp0 = MULTIPLY(-d7, FIX_1_662939225); |
462 | 215k | z4 = MULTIPLY(-d1, FIX_0_390180644); |
463 | 215k | tmp3 = MULTIPLY(d1, FIX_1_111140466); |
464 | | |
465 | 215k | tmp0 += z1; |
466 | 215k | tmp1 = z4 + z5; |
467 | 215k | tmp2 = z3 + z5; |
468 | 215k | tmp3 += z1; |
469 | 459k | } else { |
470 | | /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */ |
471 | 459k | tmp0 = MULTIPLY(-d7, FIX_1_387039845); |
472 | 459k | tmp1 = MULTIPLY(d7, FIX_1_175875602); |
473 | 459k | tmp2 = MULTIPLY(-d7, FIX_0_785694958); |
474 | 459k | tmp3 = MULTIPLY(d7, FIX_0_275899380); |
475 | 459k | } |
476 | 674k | } |
477 | 942k | } |
478 | 14.1M | } else { |
479 | 14.1M | if (d5) { |
480 | 2.90M | if (d3) { |
481 | 2.03M | if (d1) { |
482 | | /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */ |
483 | 1.67M | z2 = d5 + d3; |
484 | 1.67M | z4 = d5 + d1; |
485 | 1.67M | z5 = MULTIPLY(d3 + z4, FIX_1_175875602); |
486 | | |
487 | 1.67M | tmp1 = MULTIPLY(d5, FIX_2_053119869); |
488 | 1.67M | tmp2 = MULTIPLY(d3, FIX_3_072711026); |
489 | 1.67M | tmp3 = MULTIPLY(d1, FIX_1_501321110); |
490 | 1.67M | z1 = MULTIPLY(-d1, FIX_0_899976223); |
491 | 1.67M | z2 = MULTIPLY(-z2, FIX_2_562915447); |
492 | 1.67M | z3 = MULTIPLY(-d3, FIX_1_961570560); |
493 | 1.67M | z4 = MULTIPLY(-z4, FIX_0_390180644); |
494 | | |
495 | 1.67M | z3 += z5; |
496 | 1.67M | z4 += z5; |
497 | | |
498 | 1.67M | tmp0 = z1 + z3; |
499 | 1.67M | tmp1 += z2 + z4; |
500 | 1.67M | tmp2 += z2 + z3; |
501 | 1.67M | tmp3 += z1 + z4; |
502 | 1.67M | } else { |
503 | | /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */ |
504 | 362k | z2 = d5 + d3; |
505 | | |
506 | 362k | z5 = MULTIPLY(z2, FIX_1_175875602); |
507 | 362k | tmp1 = MULTIPLY(d5, FIX_1_662939225); |
508 | 362k | z4 = MULTIPLY(-d5, FIX_0_390180644); |
509 | 362k | z2 = MULTIPLY(-z2, FIX_1_387039845); |
510 | 362k | tmp2 = MULTIPLY(d3, FIX_1_111140466); |
511 | 362k | z3 = MULTIPLY(-d3, FIX_1_961570560); |
512 | | |
513 | 362k | tmp0 = z3 + z5; |
514 | 362k | tmp1 += z2; |
515 | 362k | tmp2 += z2; |
516 | 362k | tmp3 = z4 + z5; |
517 | 362k | } |
518 | 2.03M | } else { |
519 | 874k | if (d1) { |
520 | | /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */ |
521 | 385k | z4 = d5 + d1; |
522 | | |
523 | 385k | z5 = MULTIPLY(z4, FIX_1_175875602); |
524 | 385k | z1 = MULTIPLY(-d1, FIX_0_899976223); |
525 | 385k | tmp3 = MULTIPLY(d1, FIX_0_601344887); |
526 | 385k | tmp1 = MULTIPLY(-d5, FIX_0_509795579); |
527 | 385k | z2 = MULTIPLY(-d5, FIX_2_562915447); |
528 | 385k | z4 = MULTIPLY(z4, FIX_0_785694958); |
529 | | |
530 | 385k | tmp0 = z1 + z5; |
531 | 385k | tmp1 += z4; |
532 | 385k | tmp2 = z2 + z5; |
533 | 385k | tmp3 += z4; |
534 | 488k | } else { |
535 | | /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */ |
536 | 488k | tmp0 = MULTIPLY(d5, FIX_1_175875602); |
537 | 488k | tmp1 = MULTIPLY(d5, FIX_0_275899380); |
538 | 488k | tmp2 = MULTIPLY(-d5, FIX_1_387039845); |
539 | 488k | tmp3 = MULTIPLY(d5, FIX_0_785694958); |
540 | 488k | } |
541 | 874k | } |
542 | 11.2M | } else { |
543 | 11.2M | if (d3) { |
544 | 4.12M | if (d1) { |
545 | | /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */ |
546 | 2.63M | z5 = d1 + d3; |
547 | 2.63M | tmp3 = MULTIPLY(d1, FIX_0_211164243); |
548 | 2.63M | tmp2 = MULTIPLY(-d3, FIX_1_451774981); |
549 | 2.63M | z1 = MULTIPLY(d1, FIX_1_061594337); |
550 | 2.63M | z2 = MULTIPLY(-d3, FIX_2_172734803); |
551 | 2.63M | z4 = MULTIPLY(z5, FIX_0_785694958); |
552 | 2.63M | z5 = MULTIPLY(z5, FIX_1_175875602); |
553 | | |
554 | 2.63M | tmp0 = z1 - z4; |
555 | 2.63M | tmp1 = z2 + z4; |
556 | 2.63M | tmp2 += z5; |
557 | 2.63M | tmp3 += z5; |
558 | 2.63M | } else { |
559 | | /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */ |
560 | 1.49M | tmp0 = MULTIPLY(-d3, FIX_0_785694958); |
561 | 1.49M | tmp1 = MULTIPLY(-d3, FIX_1_387039845); |
562 | 1.49M | tmp2 = MULTIPLY(-d3, FIX_0_275899380); |
563 | 1.49M | tmp3 = MULTIPLY(d3, FIX_1_175875602); |
564 | 1.49M | } |
565 | 7.14M | } else { |
566 | 7.14M | if (d1) { |
567 | | /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */ |
568 | 5.46M | tmp0 = MULTIPLY(d1, FIX_0_275899380); |
569 | 5.46M | tmp1 = MULTIPLY(d1, FIX_0_785694958); |
570 | 5.46M | tmp2 = MULTIPLY(d1, FIX_1_175875602); |
571 | 5.46M | tmp3 = MULTIPLY(d1, FIX_1_387039845); |
572 | 5.46M | } else { |
573 | | /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */ |
574 | 1.68M | tmp0 = tmp1 = tmp2 = tmp3 = 0; |
575 | 1.68M | } |
576 | 7.14M | } |
577 | 11.2M | } |
578 | 14.1M | } |
579 | 18.1M | } |
580 | | /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
581 | | |
582 | 18.1M | dataptr[0] = (int16_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS); |
583 | 18.1M | dataptr[7] = (int16_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS); |
584 | 18.1M | dataptr[1] = (int16_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS); |
585 | 18.1M | dataptr[6] = (int16_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS); |
586 | 18.1M | dataptr[2] = (int16_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS); |
587 | 18.1M | dataptr[5] = (int16_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS); |
588 | 18.1M | dataptr[3] = (int16_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS); |
589 | 18.1M | dataptr[4] = (int16_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS); |
590 | | |
591 | 18.1M | dataptr += DCTSIZE; /* advance pointer to next row */ |
592 | 18.1M | } |
593 | | |
594 | | /* Pass 2: process columns. */ |
595 | | /* Note that we must descale the results by a factor of 8 == 2**3, */ |
596 | | /* and also undo the PASS1_BITS scaling. */ |
597 | | |
598 | 19.6M | dataptr = data; |
599 | 177M | for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { |
600 | | /* Columns of zeroes can be exploited in the same way as we did with rows. |
601 | | * However, the row calculation has created many nonzero AC terms, so the |
602 | | * simplification applies less often (typically 5% to 10% of the time). |
603 | | * On machines with very fast multiplication, it's possible that the |
604 | | * test takes more time than it's worth. In that case this section |
605 | | * may be commented out. |
606 | | */ |
607 | | |
608 | 157M | d0 = dataptr[DCTSIZE*0]; |
609 | 157M | d1 = dataptr[DCTSIZE*1]; |
610 | 157M | d2 = dataptr[DCTSIZE*2]; |
611 | 157M | d3 = dataptr[DCTSIZE*3]; |
612 | 157M | d4 = dataptr[DCTSIZE*4]; |
613 | 157M | d5 = dataptr[DCTSIZE*5]; |
614 | 157M | d6 = dataptr[DCTSIZE*6]; |
615 | 157M | d7 = dataptr[DCTSIZE*7]; |
616 | | |
617 | | /* Even part: reverse the even part of the forward DCT. */ |
618 | | /* The rotator is sqrt(2)*c(-6). */ |
619 | 157M | if (d6) { |
620 | 12.9M | if (d2) { |
621 | | /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ |
622 | 12.1M | z1 = MULTIPLY(d2 + d6, FIX_0_541196100); |
623 | 12.1M | tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); |
624 | 12.1M | tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); |
625 | | |
626 | 12.1M | tmp0 = (d0 + d4) * CONST_SCALE; |
627 | 12.1M | tmp1 = (d0 - d4) * CONST_SCALE; |
628 | | |
629 | 12.1M | tmp10 = tmp0 + tmp3; |
630 | 12.1M | tmp13 = tmp0 - tmp3; |
631 | 12.1M | tmp11 = tmp1 + tmp2; |
632 | 12.1M | tmp12 = tmp1 - tmp2; |
633 | 12.1M | } else { |
634 | | /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ |
635 | 805k | tmp2 = MULTIPLY(-d6, FIX_1_306562965); |
636 | 805k | tmp3 = MULTIPLY(d6, FIX_0_541196100); |
637 | | |
638 | 805k | tmp0 = (d0 + d4) * CONST_SCALE; |
639 | 805k | tmp1 = (d0 - d4) * CONST_SCALE; |
640 | | |
641 | 805k | tmp10 = tmp0 + tmp3; |
642 | 805k | tmp13 = tmp0 - tmp3; |
643 | 805k | tmp11 = tmp1 + tmp2; |
644 | 805k | tmp12 = tmp1 - tmp2; |
645 | 805k | } |
646 | 144M | } else { |
647 | 144M | if (d2) { |
648 | | /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ |
649 | 14.8M | tmp2 = MULTIPLY(d2, FIX_0_541196100); |
650 | 14.8M | tmp3 = MULTIPLY(d2, FIX_1_306562965); |
651 | | |
652 | 14.8M | tmp0 = (d0 + d4) * CONST_SCALE; |
653 | 14.8M | tmp1 = (d0 - d4) * CONST_SCALE; |
654 | | |
655 | 14.8M | tmp10 = tmp0 + tmp3; |
656 | 14.8M | tmp13 = tmp0 - tmp3; |
657 | 14.8M | tmp11 = tmp1 + tmp2; |
658 | 14.8M | tmp12 = tmp1 - tmp2; |
659 | 129M | } else { |
660 | | /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ |
661 | 129M | tmp10 = tmp13 = (d0 + d4) * CONST_SCALE; |
662 | 129M | tmp11 = tmp12 = (d0 - d4) * CONST_SCALE; |
663 | 129M | } |
664 | 144M | } |
665 | | |
666 | | /* Odd part per figure 8; the matrix is unitary and hence its |
667 | | * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively. |
668 | | */ |
669 | 157M | if (d7) { |
670 | 9.45M | if (d5) { |
671 | 7.62M | if (d3) { |
672 | 7.25M | if (d1) { |
673 | | /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */ |
674 | 7.04M | z1 = d7 + d1; |
675 | 7.04M | z2 = d5 + d3; |
676 | 7.04M | z3 = d7 + d3; |
677 | 7.04M | z4 = d5 + d1; |
678 | 7.04M | z5 = MULTIPLY(z3 + z4, FIX_1_175875602); |
679 | | |
680 | 7.04M | tmp0 = MULTIPLY(d7, FIX_0_298631336); |
681 | 7.04M | tmp1 = MULTIPLY(d5, FIX_2_053119869); |
682 | 7.04M | tmp2 = MULTIPLY(d3, FIX_3_072711026); |
683 | 7.04M | tmp3 = MULTIPLY(d1, FIX_1_501321110); |
684 | 7.04M | z1 = MULTIPLY(-z1, FIX_0_899976223); |
685 | 7.04M | z2 = MULTIPLY(-z2, FIX_2_562915447); |
686 | 7.04M | z3 = MULTIPLY(-z3, FIX_1_961570560); |
687 | 7.04M | z4 = MULTIPLY(-z4, FIX_0_390180644); |
688 | | |
689 | 7.04M | z3 += z5; |
690 | 7.04M | z4 += z5; |
691 | | |
692 | 7.04M | tmp0 += z1 + z3; |
693 | 7.04M | tmp1 += z2 + z4; |
694 | 7.04M | tmp2 += z2 + z3; |
695 | 7.04M | tmp3 += z1 + z4; |
696 | 7.04M | } else { |
697 | | /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */ |
698 | 211k | z2 = d5 + d3; |
699 | 211k | z3 = d7 + d3; |
700 | 211k | z5 = MULTIPLY(z3 + d5, FIX_1_175875602); |
701 | | |
702 | 211k | tmp0 = MULTIPLY(d7, FIX_0_298631336); |
703 | 211k | tmp1 = MULTIPLY(d5, FIX_2_053119869); |
704 | 211k | tmp2 = MULTIPLY(d3, FIX_3_072711026); |
705 | 211k | z1 = MULTIPLY(-d7, FIX_0_899976223); |
706 | 211k | z2 = MULTIPLY(-z2, FIX_2_562915447); |
707 | 211k | z3 = MULTIPLY(-z3, FIX_1_961570560); |
708 | 211k | z4 = MULTIPLY(-d5, FIX_0_390180644); |
709 | | |
710 | 211k | z3 += z5; |
711 | 211k | z4 += z5; |
712 | | |
713 | 211k | tmp0 += z1 + z3; |
714 | 211k | tmp1 += z2 + z4; |
715 | 211k | tmp2 += z2 + z3; |
716 | 211k | tmp3 = z1 + z4; |
717 | 211k | } |
718 | 7.25M | } else { |
719 | 375k | if (d1) { |
720 | | /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */ |
721 | 266k | z1 = d7 + d1; |
722 | 266k | z3 = d7; |
723 | 266k | z4 = d5 + d1; |
724 | 266k | z5 = MULTIPLY(z3 + z4, FIX_1_175875602); |
725 | | |
726 | 266k | tmp0 = MULTIPLY(d7, FIX_0_298631336); |
727 | 266k | tmp1 = MULTIPLY(d5, FIX_2_053119869); |
728 | 266k | tmp3 = MULTIPLY(d1, FIX_1_501321110); |
729 | 266k | z1 = MULTIPLY(-z1, FIX_0_899976223); |
730 | 266k | z2 = MULTIPLY(-d5, FIX_2_562915447); |
731 | 266k | z3 = MULTIPLY(-d7, FIX_1_961570560); |
732 | 266k | z4 = MULTIPLY(-z4, FIX_0_390180644); |
733 | | |
734 | 266k | z3 += z5; |
735 | 266k | z4 += z5; |
736 | | |
737 | 266k | tmp0 += z1 + z3; |
738 | 266k | tmp1 += z2 + z4; |
739 | 266k | tmp2 = z2 + z3; |
740 | 266k | tmp3 += z1 + z4; |
741 | 266k | } else { |
742 | | /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */ |
743 | 109k | tmp0 = MULTIPLY(-d7, FIX_0_601344887); |
744 | 109k | z1 = MULTIPLY(-d7, FIX_0_899976223); |
745 | 109k | z3 = MULTIPLY(-d7, FIX_1_961570560); |
746 | 109k | tmp1 = MULTIPLY(-d5, FIX_0_509795579); |
747 | 109k | z2 = MULTIPLY(-d5, FIX_2_562915447); |
748 | 109k | z4 = MULTIPLY(-d5, FIX_0_390180644); |
749 | 109k | z5 = MULTIPLY(d5 + d7, FIX_1_175875602); |
750 | | |
751 | 109k | z3 += z5; |
752 | 109k | z4 += z5; |
753 | | |
754 | 109k | tmp0 += z3; |
755 | 109k | tmp1 += z4; |
756 | 109k | tmp2 = z2 + z3; |
757 | 109k | tmp3 = z1 + z4; |
758 | 109k | } |
759 | 375k | } |
760 | 7.62M | } else { |
761 | 1.82M | if (d3) { |
762 | 780k | if (d1) { |
763 | | /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */ |
764 | 500k | z1 = d7 + d1; |
765 | 500k | z3 = d7 + d3; |
766 | 500k | z5 = MULTIPLY(z3 + d1, FIX_1_175875602); |
767 | | |
768 | 500k | tmp0 = MULTIPLY(d7, FIX_0_298631336); |
769 | 500k | tmp2 = MULTIPLY(d3, FIX_3_072711026); |
770 | 500k | tmp3 = MULTIPLY(d1, FIX_1_501321110); |
771 | 500k | z1 = MULTIPLY(-z1, FIX_0_899976223); |
772 | 500k | z2 = MULTIPLY(-d3, FIX_2_562915447); |
773 | 500k | z3 = MULTIPLY(-z3, FIX_1_961570560); |
774 | 500k | z4 = MULTIPLY(-d1, FIX_0_390180644); |
775 | | |
776 | 500k | z3 += z5; |
777 | 500k | z4 += z5; |
778 | | |
779 | 500k | tmp0 += z1 + z3; |
780 | 500k | tmp1 = z2 + z4; |
781 | 500k | tmp2 += z2 + z3; |
782 | 500k | tmp3 += z1 + z4; |
783 | 500k | } else { |
784 | | /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */ |
785 | 279k | z3 = d7 + d3; |
786 | | |
787 | 279k | tmp0 = MULTIPLY(-d7, FIX_0_601344887); |
788 | 279k | z1 = MULTIPLY(-d7, FIX_0_899976223); |
789 | 279k | tmp2 = MULTIPLY(d3, FIX_0_509795579); |
790 | 279k | z2 = MULTIPLY(-d3, FIX_2_562915447); |
791 | 279k | z5 = MULTIPLY(z3, FIX_1_175875602); |
792 | 279k | z3 = MULTIPLY(-z3, FIX_0_785694958); |
793 | | |
794 | 279k | tmp0 += z3; |
795 | 279k | tmp1 = z2 + z5; |
796 | 279k | tmp2 += z3; |
797 | 279k | tmp3 = z1 + z5; |
798 | 279k | } |
799 | 1.04M | } else { |
800 | 1.04M | if (d1) { |
801 | | /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */ |
802 | 623k | z1 = d7 + d1; |
803 | 623k | z5 = MULTIPLY(z1, FIX_1_175875602); |
804 | | |
805 | 623k | z1 = MULTIPLY(z1, FIX_0_275899380); |
806 | 623k | z3 = MULTIPLY(-d7, FIX_1_961570560); |
807 | 623k | tmp0 = MULTIPLY(-d7, FIX_1_662939225); |
808 | 623k | z4 = MULTIPLY(-d1, FIX_0_390180644); |
809 | 623k | tmp3 = MULTIPLY(d1, FIX_1_111140466); |
810 | | |
811 | 623k | tmp0 += z1; |
812 | 623k | tmp1 = z4 + z5; |
813 | 623k | tmp2 = z3 + z5; |
814 | 623k | tmp3 += z1; |
815 | 623k | } else { |
816 | | /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */ |
817 | 423k | tmp0 = MULTIPLY(-d7, FIX_1_387039845); |
818 | 423k | tmp1 = MULTIPLY(d7, FIX_1_175875602); |
819 | 423k | tmp2 = MULTIPLY(-d7, FIX_0_785694958); |
820 | 423k | tmp3 = MULTIPLY(d7, FIX_0_275899380); |
821 | 423k | } |
822 | 1.04M | } |
823 | 1.82M | } |
824 | 148M | } else { |
825 | 148M | if (d5) { |
826 | 7.28M | if (d3) { |
827 | 6.45M | if (d1) { |
828 | | /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */ |
829 | 5.91M | z2 = d5 + d3; |
830 | 5.91M | z4 = d5 + d1; |
831 | 5.91M | z5 = MULTIPLY(d3 + z4, FIX_1_175875602); |
832 | | |
833 | 5.91M | tmp1 = MULTIPLY(d5, FIX_2_053119869); |
834 | 5.91M | tmp2 = MULTIPLY(d3, FIX_3_072711026); |
835 | 5.91M | tmp3 = MULTIPLY(d1, FIX_1_501321110); |
836 | 5.91M | z1 = MULTIPLY(-d1, FIX_0_899976223); |
837 | 5.91M | z2 = MULTIPLY(-z2, FIX_2_562915447); |
838 | 5.91M | z3 = MULTIPLY(-d3, FIX_1_961570560); |
839 | 5.91M | z4 = MULTIPLY(-z4, FIX_0_390180644); |
840 | | |
841 | 5.91M | z3 += z5; |
842 | 5.91M | z4 += z5; |
843 | | |
844 | 5.91M | tmp0 = z1 + z3; |
845 | 5.91M | tmp1 += z2 + z4; |
846 | 5.91M | tmp2 += z2 + z3; |
847 | 5.91M | tmp3 += z1 + z4; |
848 | 5.91M | } else { |
849 | | /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */ |
850 | 534k | z2 = d5 + d3; |
851 | | |
852 | 534k | z5 = MULTIPLY(z2, FIX_1_175875602); |
853 | 534k | tmp1 = MULTIPLY(d5, FIX_1_662939225); |
854 | 534k | z4 = MULTIPLY(-d5, FIX_0_390180644); |
855 | 534k | z2 = MULTIPLY(-z2, FIX_1_387039845); |
856 | 534k | tmp2 = MULTIPLY(d3, FIX_1_111140466); |
857 | 534k | z3 = MULTIPLY(-d3, FIX_1_961570560); |
858 | | |
859 | 534k | tmp0 = z3 + z5; |
860 | 534k | tmp1 += z2; |
861 | 534k | tmp2 += z2; |
862 | 534k | tmp3 = z4 + z5; |
863 | 534k | } |
864 | 6.45M | } else { |
865 | 836k | if (d1) { |
866 | | /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */ |
867 | 580k | z4 = d5 + d1; |
868 | | |
869 | 580k | z5 = MULTIPLY(z4, FIX_1_175875602); |
870 | 580k | z1 = MULTIPLY(-d1, FIX_0_899976223); |
871 | 580k | tmp3 = MULTIPLY(d1, FIX_0_601344887); |
872 | 580k | tmp1 = MULTIPLY(-d5, FIX_0_509795579); |
873 | 580k | z2 = MULTIPLY(-d5, FIX_2_562915447); |
874 | 580k | z4 = MULTIPLY(z4, FIX_0_785694958); |
875 | | |
876 | 580k | tmp0 = z1 + z5; |
877 | 580k | tmp1 += z4; |
878 | 580k | tmp2 = z2 + z5; |
879 | 580k | tmp3 += z4; |
880 | 580k | } else { |
881 | | /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */ |
882 | 255k | tmp0 = MULTIPLY(d5, FIX_1_175875602); |
883 | 255k | tmp1 = MULTIPLY(d5, FIX_0_275899380); |
884 | 255k | tmp2 = MULTIPLY(-d5, FIX_1_387039845); |
885 | 255k | tmp3 = MULTIPLY(d5, FIX_0_785694958); |
886 | 255k | } |
887 | 836k | } |
888 | 140M | } else { |
889 | 140M | if (d3) { |
890 | 8.06M | if (d1) { |
891 | | /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */ |
892 | 6.46M | z5 = d1 + d3; |
893 | 6.46M | tmp3 = MULTIPLY(d1, FIX_0_211164243); |
894 | 6.46M | tmp2 = MULTIPLY(-d3, FIX_1_451774981); |
895 | 6.46M | z1 = MULTIPLY(d1, FIX_1_061594337); |
896 | 6.46M | z2 = MULTIPLY(-d3, FIX_2_172734803); |
897 | 6.46M | z4 = MULTIPLY(z5, FIX_0_785694958); |
898 | 6.46M | z5 = MULTIPLY(z5, FIX_1_175875602); |
899 | | |
900 | 6.46M | tmp0 = z1 - z4; |
901 | 6.46M | tmp1 = z2 + z4; |
902 | 6.46M | tmp2 += z5; |
903 | 6.46M | tmp3 += z5; |
904 | 6.46M | } else { |
905 | | /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */ |
906 | 1.59M | tmp0 = MULTIPLY(-d3, FIX_0_785694958); |
907 | 1.59M | tmp1 = MULTIPLY(-d3, FIX_1_387039845); |
908 | 1.59M | tmp2 = MULTIPLY(-d3, FIX_0_275899380); |
909 | 1.59M | tmp3 = MULTIPLY(d3, FIX_1_175875602); |
910 | 1.59M | } |
911 | 132M | } else { |
912 | 132M | if (d1) { |
913 | | /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */ |
914 | 20.1M | tmp0 = MULTIPLY(d1, FIX_0_275899380); |
915 | 20.1M | tmp1 = MULTIPLY(d1, FIX_0_785694958); |
916 | 20.1M | tmp2 = MULTIPLY(d1, FIX_1_175875602); |
917 | 20.1M | tmp3 = MULTIPLY(d1, FIX_1_387039845); |
918 | 112M | } else { |
919 | | /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */ |
920 | 112M | tmp0 = tmp1 = tmp2 = tmp3 = 0; |
921 | 112M | } |
922 | 132M | } |
923 | 140M | } |
924 | 148M | } |
925 | | |
926 | | /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
927 | | |
928 | 157M | dataptr[DCTSIZE*0] = (int16_t) DESCALE(tmp10 + tmp3, |
929 | 157M | CONST_BITS+PASS1_BITS+3); |
930 | 157M | dataptr[DCTSIZE*7] = (int16_t) DESCALE(tmp10 - tmp3, |
931 | 157M | CONST_BITS+PASS1_BITS+3); |
932 | 157M | dataptr[DCTSIZE*1] = (int16_t) DESCALE(tmp11 + tmp2, |
933 | 157M | CONST_BITS+PASS1_BITS+3); |
934 | 157M | dataptr[DCTSIZE*6] = (int16_t) DESCALE(tmp11 - tmp2, |
935 | 157M | CONST_BITS+PASS1_BITS+3); |
936 | 157M | dataptr[DCTSIZE*2] = (int16_t) DESCALE(tmp12 + tmp1, |
937 | 157M | CONST_BITS+PASS1_BITS+3); |
938 | 157M | dataptr[DCTSIZE*5] = (int16_t) DESCALE(tmp12 - tmp1, |
939 | 157M | CONST_BITS+PASS1_BITS+3); |
940 | 157M | dataptr[DCTSIZE*3] = (int16_t) DESCALE(tmp13 + tmp0, |
941 | 157M | CONST_BITS+PASS1_BITS+3); |
942 | 157M | dataptr[DCTSIZE*4] = (int16_t) DESCALE(tmp13 - tmp0, |
943 | 157M | CONST_BITS+PASS1_BITS+3); |
944 | | |
945 | 157M | dataptr++; /* advance pointer to next column */ |
946 | 157M | } |
947 | 19.6M | } |
948 | | |
949 | | #undef DCTSIZE |
950 | 31.2M | #define DCTSIZE 4 |
951 | 624M | #define DCTSTRIDE 8 |
952 | | |
953 | | void ff_j_rev_dct4(DCTBLOCK data) |
954 | 15.6M | { |
955 | 15.6M | int32_t tmp0, tmp1, tmp2, tmp3; |
956 | 15.6M | int32_t tmp10, tmp11, tmp12, tmp13; |
957 | 15.6M | int32_t z1; |
958 | 15.6M | int32_t d0, d2, d4, d6; |
959 | 15.6M | register int16_t *dataptr; |
960 | 15.6M | int rowctr; |
961 | | |
962 | | /* Pass 1: process rows. */ |
963 | | /* Note results are scaled up by sqrt(8) compared to a true IDCT; */ |
964 | | /* furthermore, we scale the results by 2**PASS1_BITS. */ |
965 | | |
966 | 15.6M | data[0] += 4; |
967 | | |
968 | 15.6M | dataptr = data; |
969 | | |
970 | 78.1M | for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { |
971 | | /* Due to quantization, we will usually find that many of the input |
972 | | * coefficients are zero, especially the AC terms. We can exploit this |
973 | | * by short-circuiting the IDCT calculation for any row in which all |
974 | | * the AC terms are zero. In that case each output is equal to the |
975 | | * DC coefficient (with scale factor as needed). |
976 | | * With typical images and quantization tables, half or more of the |
977 | | * row DCT calculations can be simplified this way. |
978 | | */ |
979 | | |
980 | 62.4M | register uint8_t *idataptr = (uint8_t*)dataptr; |
981 | | |
982 | 62.4M | d0 = dataptr[0]; |
983 | 62.4M | d2 = dataptr[1]; |
984 | 62.4M | d4 = dataptr[2]; |
985 | 62.4M | d6 = dataptr[3]; |
986 | | |
987 | 62.4M | if ((d2 | d4 | d6) == 0) { |
988 | | /* AC terms all zero */ |
989 | 48.4M | if (d0) { |
990 | | /* Compute a 32 bit value to assign. */ |
991 | 14.2M | int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS)); |
992 | 14.2M | register unsigned v = (dcval & 0xffff) | ((uint32_t)dcval << 16); |
993 | | |
994 | 14.2M | AV_WN32A(&idataptr[0], v); |
995 | 14.2M | AV_WN32A(&idataptr[4], v); |
996 | 14.2M | } |
997 | | |
998 | 48.4M | dataptr += DCTSTRIDE; /* advance pointer to next row */ |
999 | 48.4M | continue; |
1000 | 48.4M | } |
1001 | | |
1002 | | /* Even part: reverse the even part of the forward DCT. */ |
1003 | | /* The rotator is sqrt(2)*c(-6). */ |
1004 | 14.0M | if (d6) { |
1005 | 6.62M | if (d2) { |
1006 | | /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ |
1007 | 5.21M | z1 = MULTIPLY(d2 + d6, FIX_0_541196100); |
1008 | 5.21M | tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); |
1009 | 5.21M | tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); |
1010 | | |
1011 | 5.21M | tmp0 = (d0 + d4) * (1 << CONST_BITS); |
1012 | 5.21M | tmp1 = (d0 - d4) * (1 << CONST_BITS); |
1013 | | |
1014 | 5.21M | tmp10 = tmp0 + tmp3; |
1015 | 5.21M | tmp13 = tmp0 - tmp3; |
1016 | 5.21M | tmp11 = tmp1 + tmp2; |
1017 | 5.21M | tmp12 = tmp1 - tmp2; |
1018 | 5.21M | } else { |
1019 | | /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ |
1020 | 1.41M | tmp2 = MULTIPLY(-d6, FIX_1_306562965); |
1021 | 1.41M | tmp3 = MULTIPLY(d6, FIX_0_541196100); |
1022 | | |
1023 | 1.41M | tmp0 = (d0 + d4) * (1 << CONST_BITS); |
1024 | 1.41M | tmp1 = (d0 - d4) * (1 << CONST_BITS); |
1025 | | |
1026 | 1.41M | tmp10 = tmp0 + tmp3; |
1027 | 1.41M | tmp13 = tmp0 - tmp3; |
1028 | 1.41M | tmp11 = tmp1 + tmp2; |
1029 | 1.41M | tmp12 = tmp1 - tmp2; |
1030 | 1.41M | } |
1031 | 7.45M | } else { |
1032 | 7.45M | if (d2) { |
1033 | | /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ |
1034 | 6.26M | tmp2 = MULTIPLY(d2, FIX_0_541196100); |
1035 | 6.26M | tmp3 = MULTIPLY(d2, FIX_1_306562965); |
1036 | | |
1037 | 6.26M | tmp0 = (d0 + d4) * (1 << CONST_BITS); |
1038 | 6.26M | tmp1 = (d0 - d4) * (1 << CONST_BITS); |
1039 | | |
1040 | 6.26M | tmp10 = tmp0 + tmp3; |
1041 | 6.26M | tmp13 = tmp0 - tmp3; |
1042 | 6.26M | tmp11 = tmp1 + tmp2; |
1043 | 6.26M | tmp12 = tmp1 - tmp2; |
1044 | 6.26M | } else { |
1045 | | /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ |
1046 | 1.18M | tmp10 = tmp13 = (d0 + d4) * (1 << CONST_BITS); |
1047 | 1.18M | tmp11 = tmp12 = (d0 - d4) * (1 << CONST_BITS); |
1048 | 1.18M | } |
1049 | 7.45M | } |
1050 | | |
1051 | | /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
1052 | | |
1053 | 14.0M | dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS); |
1054 | 14.0M | dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS); |
1055 | 14.0M | dataptr[2] = (int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS); |
1056 | 14.0M | dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS); |
1057 | | |
1058 | 14.0M | dataptr += DCTSTRIDE; /* advance pointer to next row */ |
1059 | 14.0M | } |
1060 | | |
1061 | | /* Pass 2: process columns. */ |
1062 | | /* Note that we must descale the results by a factor of 8 == 2**3, */ |
1063 | | /* and also undo the PASS1_BITS scaling. */ |
1064 | | |
1065 | 15.6M | dataptr = data; |
1066 | 78.1M | for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) { |
1067 | | /* Columns of zeroes can be exploited in the same way as we did with rows. |
1068 | | * However, the row calculation has created many nonzero AC terms, so the |
1069 | | * simplification applies less often (typically 5% to 10% of the time). |
1070 | | * On machines with very fast multiplication, it's possible that the |
1071 | | * test takes more time than it's worth. In that case this section |
1072 | | * may be commented out. |
1073 | | */ |
1074 | | |
1075 | 62.4M | d0 = dataptr[DCTSTRIDE*0]; |
1076 | 62.4M | d2 = dataptr[DCTSTRIDE*1]; |
1077 | 62.4M | d4 = dataptr[DCTSTRIDE*2]; |
1078 | 62.4M | d6 = dataptr[DCTSTRIDE*3]; |
1079 | | |
1080 | | /* Even part: reverse the even part of the forward DCT. */ |
1081 | | /* The rotator is sqrt(2)*c(-6). */ |
1082 | 62.4M | if (d6) { |
1083 | 10.5M | if (d2) { |
1084 | | /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */ |
1085 | 9.04M | z1 = MULTIPLY(d2 + d6, FIX_0_541196100); |
1086 | 9.04M | tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065); |
1087 | 9.04M | tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865); |
1088 | | |
1089 | 9.04M | tmp0 = (d0 + d4) * (1 << CONST_BITS); |
1090 | 9.04M | tmp1 = (d0 - d4) * (1 << CONST_BITS); |
1091 | | |
1092 | 9.04M | tmp10 = tmp0 + tmp3; |
1093 | 9.04M | tmp13 = tmp0 - tmp3; |
1094 | 9.04M | tmp11 = tmp1 + tmp2; |
1095 | 9.04M | tmp12 = tmp1 - tmp2; |
1096 | 9.04M | } else { |
1097 | | /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */ |
1098 | 1.49M | tmp2 = MULTIPLY(-d6, FIX_1_306562965); |
1099 | 1.49M | tmp3 = MULTIPLY(d6, FIX_0_541196100); |
1100 | | |
1101 | 1.49M | tmp0 = (d0 + d4) * (1 << CONST_BITS); |
1102 | 1.49M | tmp1 = (d0 - d4) * (1 << CONST_BITS); |
1103 | | |
1104 | 1.49M | tmp10 = tmp0 + tmp3; |
1105 | 1.49M | tmp13 = tmp0 - tmp3; |
1106 | 1.49M | tmp11 = tmp1 + tmp2; |
1107 | 1.49M | tmp12 = tmp1 - tmp2; |
1108 | 1.49M | } |
1109 | 51.9M | } else { |
1110 | 51.9M | if (d2) { |
1111 | | /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */ |
1112 | 15.0M | tmp2 = MULTIPLY(d2, FIX_0_541196100); |
1113 | 15.0M | tmp3 = MULTIPLY(d2, FIX_1_306562965); |
1114 | | |
1115 | 15.0M | tmp0 = (d0 + d4) * (1 << CONST_BITS); |
1116 | 15.0M | tmp1 = (d0 - d4) * (1 << CONST_BITS); |
1117 | | |
1118 | 15.0M | tmp10 = tmp0 + tmp3; |
1119 | 15.0M | tmp13 = tmp0 - tmp3; |
1120 | 15.0M | tmp11 = tmp1 + tmp2; |
1121 | 15.0M | tmp12 = tmp1 - tmp2; |
1122 | 36.8M | } else { |
1123 | | /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */ |
1124 | 36.8M | tmp10 = tmp13 = (d0 + d4) * (1 << CONST_BITS); |
1125 | 36.8M | tmp11 = tmp12 = (d0 - d4) * (1 << CONST_BITS); |
1126 | 36.8M | } |
1127 | 51.9M | } |
1128 | | |
1129 | | /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */ |
1130 | | |
1131 | 62.4M | dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3); |
1132 | 62.4M | dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3); |
1133 | 62.4M | dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3); |
1134 | 62.4M | dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3); |
1135 | | |
1136 | 62.4M | dataptr++; /* advance pointer to next column */ |
1137 | 62.4M | } |
1138 | 15.6M | } |
1139 | | |
1140 | 5.20M | void ff_j_rev_dct2(DCTBLOCK data){ |
1141 | 5.20M | int d00, d01, d10, d11; |
1142 | | |
1143 | 5.20M | data[0] += 4; |
1144 | 5.20M | d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE]; |
1145 | 5.20M | d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE]; |
1146 | 5.20M | d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE]; |
1147 | 5.20M | d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE]; |
1148 | | |
1149 | 5.20M | data[0+0*DCTSTRIDE]= (d00 + d10)>>3; |
1150 | 5.20M | data[1+0*DCTSTRIDE]= (d01 + d11)>>3; |
1151 | 5.20M | data[0+1*DCTSTRIDE]= (d00 - d10)>>3; |
1152 | 5.20M | data[1+1*DCTSTRIDE]= (d01 - d11)>>3; |
1153 | 5.20M | } |
1154 | | |
1155 | 2.60k | void ff_j_rev_dct1(DCTBLOCK data){ |
1156 | 2.60k | data[0] = (data[0] + 4)>>3; |
1157 | 2.60k | } |
1158 | | |
1159 | | #undef FIX |
1160 | | #undef CONST_BITS |
1161 | | |
1162 | | void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t block[64]) |
1163 | 18.1M | { |
1164 | 18.1M | ff_j_rev_dct(block); |
1165 | 18.1M | ff_put_pixels_clamped_c(block, dest, line_size); |
1166 | 18.1M | } |
1167 | | |
1168 | | void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t block[64]) |
1169 | 1.51M | { |
1170 | 1.51M | ff_j_rev_dct(block); |
1171 | 1.51M | ff_add_pixels_clamped_c(block, dest, line_size); |
1172 | 1.51M | } |