/src/guetzli/guetzli/fdct.cc
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright 2016 Google Inc. |
3 | | * |
4 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | * you may not use this file except in compliance with the License. |
6 | | * You may obtain a copy of the License at |
7 | | * |
8 | | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | | * |
10 | | * Unless required by applicable law or agreed to in writing, software |
11 | | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | * See the License for the specific language governing permissions and |
14 | | * limitations under the License. |
15 | | */ |
16 | | |
17 | | // Integer implementation of the Discrete Cosine Transform (DCT) |
18 | | // |
19 | | // Note! DCT output is kept scaled by 16, to retain maximum 16bit precision |
20 | | |
21 | | #include "guetzli/fdct.h" |
22 | | |
23 | | namespace guetzli { |
24 | | |
25 | | namespace { |
26 | | |
27 | | /////////////////////////////////////////////////////////////////////////////// |
28 | | // Cosine table: C(k) = cos(k.pi/16)/sqrt(2), k = 1..7 using 15 bits signed |
29 | | const coeff_t kTable04[7] = { 22725, 21407, 19266, 16384, 12873, 8867, 4520 }; |
30 | | // rows #1 and #7 are pre-multiplied by 2.C(1) before the 2nd pass. |
31 | | // This multiply is merged in the table of constants used during 1st pass: |
32 | | const coeff_t kTable17[7] = { 31521, 29692, 26722, 22725, 17855, 12299, 6270 }; |
33 | | // rows #2 and #6 are pre-multiplied by 2.C(2): |
34 | | const coeff_t kTable26[7] = { 29692, 27969, 25172, 21407, 16819, 11585, 5906 }; |
35 | | // rows #3 and #5 are pre-multiplied by 2.C(3): |
36 | | const coeff_t kTable35[7] = { 26722, 25172, 22654, 19266, 15137, 10426, 5315 }; |
37 | | |
38 | | /////////////////////////////////////////////////////////////////////////////// |
39 | | // Constants (15bit precision) and C macros for IDCT vertical pass |
40 | | |
41 | | #define kTan1 (13036) // = tan(pi/16) |
42 | | #define kTan2 (27146) // = tan(2.pi/16) = sqrt(2) - 1. |
43 | | #define kTan3m1 (-21746) // = tan(3.pi/16) - 1 |
44 | | #define k2Sqrt2 (23170) // = 1 / 2.sqrt(2) |
45 | | |
46 | | // performs: {a,b} <- {a-b, a+b}, without saturation |
47 | 0 | #define BUTTERFLY(a, b) do { \ |
48 | 0 | SUB((a), (b)); \ |
49 | 0 | ADD((b), (b)); \ |
50 | 0 | ADD((b), (a)); \ |
51 | 0 | } while (0) |
52 | | |
53 | | /////////////////////////////////////////////////////////////////////////////// |
54 | | // Constants for DCT horizontal pass |
55 | | |
56 | | // Note about the CORRECT_LSB macro: |
57 | | // using 16bit fixed-point constants, we often compute products like: |
58 | | // p = (A*x + B*y + 32768) >> 16 by adding two sub-terms q = (A*x) >> 16 |
59 | | // and r = (B*y) >> 16 together. Statistically, we have p = q + r + 1 |
60 | | // in 3/4 of the cases. This can be easily seen from the relation: |
61 | | // (a + b + 1) >> 1 = (a >> 1) + (b >> 1) + ((a|b)&1) |
62 | | // The approximation we are doing is replacing ((a|b)&1) by 1. |
63 | | // In practice, this is a slightly more involved because the constants A and B |
64 | | // have also been rounded compared to their exact floating point value. |
65 | | // However, all in all the correction is quite small, and CORRECT_LSB can |
66 | | // be defined empty if needed. |
67 | | |
68 | 0 | #define COLUMN_DCT8(in) do { \ |
69 | 0 | LOAD(m0, (in)[0 * 8]); \ |
70 | 0 | LOAD(m2, (in)[2 * 8]); \ |
71 | 0 | LOAD(m7, (in)[7 * 8]); \ |
72 | 0 | LOAD(m5, (in)[5 * 8]); \ |
73 | 0 | \ |
74 | 0 | BUTTERFLY(m0, m7); \ |
75 | 0 | BUTTERFLY(m2, m5); \ |
76 | 0 | \ |
77 | 0 | LOAD(m3, (in)[3 * 8]); \ |
78 | 0 | LOAD(m4, (in)[4 * 8]); \ |
79 | 0 | BUTTERFLY(m3, m4); \ |
80 | 0 | \ |
81 | 0 | LOAD(m6, (in)[6 * 8]); \ |
82 | 0 | LOAD(m1, (in)[1 * 8]); \ |
83 | 0 | BUTTERFLY(m1, m6); \ |
84 | 0 | BUTTERFLY(m7, m4); \ |
85 | 0 | BUTTERFLY(m6, m5); \ |
86 | 0 | \ |
87 | 0 | /* RowIdct() needs 15bits fixed-point input, when the output from */ \ |
88 | 0 | /* ColumnIdct() would be 12bits. We are better doing the shift by 3 */ \ |
89 | 0 | /* now instead of in RowIdct(), because we have some multiplies to */ \ |
90 | 0 | /* perform, that can take advantage of the extra 3bits precision. */ \ |
91 | 0 | LSHIFT(m4, 3); \ |
92 | 0 | LSHIFT(m5, 3); \ |
93 | 0 | BUTTERFLY(m4, m5); \ |
94 | 0 | STORE16((in)[0 * 8], m5); \ |
95 | 0 | STORE16((in)[4 * 8], m4); \ |
96 | 0 | \ |
97 | 0 | LSHIFT(m7, 3); \ |
98 | 0 | LSHIFT(m6, 3); \ |
99 | 0 | LSHIFT(m3, 3); \ |
100 | 0 | LSHIFT(m0, 3); \ |
101 | 0 | \ |
102 | 0 | LOAD_CST(m4, kTan2); \ |
103 | 0 | m5 = m4; \ |
104 | 0 | MULT(m4, m7); \ |
105 | 0 | MULT(m5, m6); \ |
106 | 0 | SUB(m4, m6); \ |
107 | 0 | ADD(m5, m7); \ |
108 | 0 | STORE16((in)[2 * 8], m5); \ |
109 | 0 | STORE16((in)[6 * 8], m4); \ |
110 | 0 | \ |
111 | 0 | /* We should be multiplying m6 by C4 = 1/sqrt(2) here, but we only have */ \ |
112 | 0 | /* the k2Sqrt2 = 1/(2.sqrt(2)) constant that fits into 15bits. So we */ \ |
113 | 0 | /* shift by 4 instead of 3 to compensate for the additional 1/2 factor. */ \ |
114 | 0 | LOAD_CST(m6, k2Sqrt2); \ |
115 | 0 | LSHIFT(m2, 3 + 1); \ |
116 | 0 | LSHIFT(m1, 3 + 1); \ |
117 | 0 | BUTTERFLY(m1, m2); \ |
118 | 0 | MULT(m2, m6); \ |
119 | 0 | MULT(m1, m6); \ |
120 | 0 | BUTTERFLY(m3, m1); \ |
121 | 0 | BUTTERFLY(m0, m2); \ |
122 | 0 | \ |
123 | 0 | LOAD_CST(m4, kTan3m1); \ |
124 | 0 | LOAD_CST(m5, kTan1); \ |
125 | 0 | m7 = m3; \ |
126 | 0 | m6 = m1; \ |
127 | 0 | MULT(m3, m4); \ |
128 | 0 | MULT(m1, m5); \ |
129 | 0 | \ |
130 | 0 | ADD(m3, m7); \ |
131 | 0 | ADD(m1, m2); \ |
132 | 0 | CORRECT_LSB(m1); \ |
133 | 0 | CORRECT_LSB(m3); \ |
134 | 0 | MULT(m4, m0); \ |
135 | 0 | MULT(m5, m2); \ |
136 | 0 | ADD(m4, m0); \ |
137 | 0 | SUB(m0, m3); \ |
138 | 0 | ADD(m7, m4); \ |
139 | 0 | SUB(m5, m6); \ |
140 | 0 | \ |
141 | 0 | STORE16((in)[1 * 8], m1); \ |
142 | 0 | STORE16((in)[3 * 8], m0); \ |
143 | 0 | STORE16((in)[5 * 8], m7); \ |
144 | 0 | STORE16((in)[7 * 8], m5); \ |
145 | 0 | } while (0) |
146 | | |
147 | | |
148 | | // these are the macro required by COLUMN_* |
149 | 0 | #define LOAD_CST(dst, src) (dst) = (src) |
150 | 0 | #define LOAD(dst, src) (dst) = (src) |
151 | 0 | #define MULT(a, b) (a) = (((a) * (b)) >> 16) |
152 | 0 | #define ADD(a, b) (a) = (a) + (b) |
153 | 0 | #define SUB(a, b) (a) = (a) - (b) |
154 | 0 | #define LSHIFT(a, n) (a) = ((a) << (n)) |
155 | 0 | #define STORE16(a, b) (a) = (b) |
156 | 0 | #define CORRECT_LSB(a) (a) += 1 |
157 | | |
158 | | // DCT vertical pass |
159 | | |
160 | 0 | inline void ColumnDct(coeff_t* in) { |
161 | 0 | for (int i = 0; i < 8; ++i) { |
162 | 0 | int m0, m1, m2, m3, m4, m5, m6, m7; |
163 | 0 | COLUMN_DCT8(in + i); |
164 | 0 | } |
165 | 0 | } |
166 | | |
167 | | // DCT horizontal pass |
168 | | |
169 | | // We don't really need to round before descaling, since we |
170 | | // still have 4 bits of precision left as final scaled output. |
171 | 0 | #define DESCALE(a) static_cast<coeff_t>((a) >> 16) |
172 | | |
173 | 0 | void RowDct(coeff_t* in, const coeff_t* table) { |
174 | | // The Fourier transform is an unitary operator, so we're basically |
175 | | // doing the transpose of RowIdct() |
176 | 0 | const int a0 = in[0] + in[7]; |
177 | 0 | const int b0 = in[0] - in[7]; |
178 | 0 | const int a1 = in[1] + in[6]; |
179 | 0 | const int b1 = in[1] - in[6]; |
180 | 0 | const int a2 = in[2] + in[5]; |
181 | 0 | const int b2 = in[2] - in[5]; |
182 | 0 | const int a3 = in[3] + in[4]; |
183 | 0 | const int b3 = in[3] - in[4]; |
184 | | |
185 | | // even part |
186 | 0 | const int C2 = table[1]; |
187 | 0 | const int C4 = table[3]; |
188 | 0 | const int C6 = table[5]; |
189 | 0 | const int c0 = a0 + a3; |
190 | 0 | const int c1 = a0 - a3; |
191 | 0 | const int c2 = a1 + a2; |
192 | 0 | const int c3 = a1 - a2; |
193 | |
|
194 | 0 | in[0] = DESCALE(C4 * (c0 + c2)); |
195 | 0 | in[4] = DESCALE(C4 * (c0 - c2)); |
196 | 0 | in[2] = DESCALE(C2 * c1 + C6 * c3); |
197 | 0 | in[6] = DESCALE(C6 * c1 - C2 * c3); |
198 | | |
199 | | // odd part |
200 | 0 | const int C1 = table[0]; |
201 | 0 | const int C3 = table[2]; |
202 | 0 | const int C5 = table[4]; |
203 | 0 | const int C7 = table[6]; |
204 | 0 | in[1] = DESCALE(C1 * b0 + C3 * b1 + C5 * b2 + C7 * b3); |
205 | 0 | in[3] = DESCALE(C3 * b0 - C7 * b1 - C1 * b2 - C5 * b3); |
206 | 0 | in[5] = DESCALE(C5 * b0 - C1 * b1 + C7 * b2 + C3 * b3); |
207 | 0 | in[7] = DESCALE(C7 * b0 - C5 * b1 + C3 * b2 - C1 * b3); |
208 | 0 | } |
209 | | #undef DESCALE |
210 | | #undef LOAD_CST |
211 | | #undef LOAD |
212 | | #undef MULT |
213 | | #undef ADD |
214 | | #undef SUB |
215 | | #undef LSHIFT |
216 | | #undef STORE16 |
217 | | #undef CORRECT_LSB |
218 | | #undef kTan1 |
219 | | #undef kTan2 |
220 | | #undef kTan3m1 |
221 | | #undef k2Sqrt2 |
222 | | #undef BUTTERFLY |
223 | | #undef COLUMN_DCT8 |
224 | | |
225 | | } // namespace |
226 | | |
227 | | /////////////////////////////////////////////////////////////////////////////// |
228 | | // visible FDCT callable functions |
229 | | |
230 | 0 | void ComputeBlockDCT(coeff_t* coeffs) { |
231 | 0 | ColumnDct(coeffs); |
232 | 0 | RowDct(coeffs + 0 * 8, kTable04); |
233 | 0 | RowDct(coeffs + 1 * 8, kTable17); |
234 | 0 | RowDct(coeffs + 2 * 8, kTable26); |
235 | 0 | RowDct(coeffs + 3 * 8, kTable35); |
236 | 0 | RowDct(coeffs + 4 * 8, kTable04); |
237 | 0 | RowDct(coeffs + 5 * 8, kTable35); |
238 | 0 | RowDct(coeffs + 6 * 8, kTable26); |
239 | 0 | RowDct(coeffs + 7 * 8, kTable17); |
240 | 0 | } |
241 | | |
242 | | } // namespace guetzli |