/src/guetzli/guetzli/fdct.cc

Source (jump to first uncovered line)
/*
 * Copyright 2016 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Integer implementation of the Discrete Cosine Transform (DCT)
//
// Note! DCT output is kept scaled by 16, to retain maximum 16bit precision

#include "guetzli/fdct.h"

namespace guetzli {

namespace {

///////////////////////////////////////////////////////////////////////////////
// Cosine table: C(k) = cos(k.pi/16)/sqrt(2), k = 1..7 using 15 bits signed
const coeff_t kTable04[7] = { 22725, 21407, 19266, 16384, 12873,  8867, 4520 };
// rows #1 and #7 are pre-multiplied by 2.C(1) before the 2nd pass.
// This multiply is merged in the table of constants used during 1st pass:
const coeff_t kTable17[7] = { 31521, 29692, 26722, 22725, 17855, 12299, 6270 };
// rows #2 and #6 are pre-multiplied by 2.C(2):
const coeff_t kTable26[7] = { 29692, 27969, 25172, 21407, 16819, 11585, 5906 };
// rows #3 and #5 are pre-multiplied by 2.C(3):
const coeff_t kTable35[7] = { 26722, 25172, 22654, 19266, 15137, 10426, 5315 };

///////////////////////////////////////////////////////////////////////////////
// Constants (15bit precision) and C macros for IDCT vertical pass

#define kTan1   (13036)   // = tan(pi/16)
#define kTan2   (27146)   // = tan(2.pi/16) = sqrt(2) - 1.
#define kTan3m1 (-21746)  // = tan(3.pi/16) - 1
#define k2Sqrt2 (23170)   // = 1 / 2.sqrt(2)

  // performs: {a,b} <- {a-b, a+b}, without saturation
#define BUTTERFLY(a, b) do {   \
  SUB((a), (b));               \
  ADD((b), (b));               \
  ADD((b), (a));               \
} while (0)

///////////////////////////////////////////////////////////////////////////////
// Constants for DCT horizontal pass

// Note about the CORRECT_LSB macro:
// using 16bit fixed-point constants, we often compute products like:
// p = (A*x + B*y + 32768) >> 16 by adding two sub-terms q = (A*x) >> 16
// and r = (B*y) >> 16 together. Statistically, we have p = q + r + 1
// in 3/4 of the cases. This can be easily seen from the relation:
//   (a + b + 1) >> 1 = (a >> 1) + (b >> 1) + ((a|b)&1)
// The approximation we are doing is replacing ((a|b)&1) by 1.
// In practice, this is a slightly more involved because the constants A and B
// have also been rounded compared to their exact floating point value.
// However, all in all the correction is quite small, and CORRECT_LSB can
// be defined empty if needed.

#define COLUMN_DCT8(in) do { \
  LOAD(m0, (in)[0 * 8]);     \
  LOAD(m2, (in)[2 * 8]);     \
  LOAD(m7, (in)[7 * 8]);     \
  LOAD(m5, (in)[5 * 8]);     \
                             \
  BUTTERFLY(m0, m7);         \
  BUTTERFLY(m2, m5);         \
                             \
  LOAD(m3, (in)[3 * 8]);     \
  LOAD(m4, (in)[4 * 8]);     \
  BUTTERFLY(m3, m4);         \
                             \
  LOAD(m6, (in)[6 * 8]);     \
  LOAD(m1, (in)[1 * 8]);     \
  BUTTERFLY(m1, m6);         \
  BUTTERFLY(m7, m4);         \
  BUTTERFLY(m6, m5);         \
                             \
  /* RowIdct() needs 15bits fixed-point input, when the output from   */ \
  /* ColumnIdct() would be 12bits. We are better doing the shift by 3 */ \
  /* now instead of in RowIdct(), because we have some multiplies to  */ \
  /* perform, that can take advantage of the extra 3bits precision.   */ \
  LSHIFT(m4, 3);             \
  LSHIFT(m5, 3);             \
  BUTTERFLY(m4, m5);         \
  STORE16((in)[0 * 8], m5);  \
  STORE16((in)[4 * 8], m4);  \
                             \
  LSHIFT(m7, 3);             \
  LSHIFT(m6, 3);             \
  LSHIFT(m3, 3);             \
  LSHIFT(m0, 3);             \
                             \
  LOAD_CST(m4, kTan2);       \
  m5 = m4;                   \
  MULT(m4, m7);              \
  MULT(m5, m6);              \
  SUB(m4, m6);               \
  ADD(m5, m7);               \
  STORE16((in)[2 * 8], m5);  \
  STORE16((in)[6 * 8], m4);  \
                             \
  /* We should be multiplying m6 by C4 = 1/sqrt(2) here, but we only have */ \
  /* the k2Sqrt2 = 1/(2.sqrt(2)) constant that fits into 15bits. So we    */ \
  /* shift by 4 instead of 3 to compensate for the additional 1/2 factor. */ \
  LOAD_CST(m6, k2Sqrt2);     \
  LSHIFT(m2, 3 + 1);         \
  LSHIFT(m1, 3 + 1);         \
  BUTTERFLY(m1, m2);         \
  MULT(m2, m6);              \
  MULT(m1, m6);              \
  BUTTERFLY(m3, m1);         \
  BUTTERFLY(m0, m2);         \
                             \
  LOAD_CST(m4, kTan3m1);     \
  LOAD_CST(m5, kTan1);       \
  m7 = m3;                   \
  m6 = m1;                   \
  MULT(m3, m4);              \
  MULT(m1, m5);              \
                             \
  ADD(m3, m7);               \
  ADD(m1, m2);               \
  CORRECT_LSB(m1);           \
  CORRECT_LSB(m3);           \
  MULT(m4, m0);              \
  MULT(m5, m2);              \
  ADD(m4, m0);               \
  SUB(m0, m3);               \
  ADD(m7, m4);               \
  SUB(m5, m6);               \
                             \
  STORE16((in)[1 * 8], m1);  \
  STORE16((in)[3 * 8], m0);  \
  STORE16((in)[5 * 8], m7);  \
  STORE16((in)[7 * 8], m5);  \
} while (0)


// these are the macro required by COLUMN_*
#define LOAD_CST(dst, src) (dst) = (src)
#define LOAD(dst, src) (dst) = (src)
#define MULT(a, b)  (a) = (((a) * (b)) >> 16)
#define ADD(a, b)   (a) = (a) + (b)
#define SUB(a, b)   (a) = (a) - (b)
#define LSHIFT(a, n) (a) = ((a) << (n))
#define STORE16(a, b) (a) = (b)
#define CORRECT_LSB(a) (a) += 1

// DCT vertical pass

inline void ColumnDct(coeff_t* in) {
  for (int i = 0; i < 8; ++i) {
    int m0, m1, m2, m3, m4, m5, m6, m7;
    COLUMN_DCT8(in + i);
  }
}

// DCT horizontal pass

// We don't really need to round before descaling, since we
// still have 4 bits of precision left as final scaled output.
#define DESCALE(a)  static_cast<coeff_t>((a) >> 16)

void RowDct(coeff_t* in, const coeff_t* table) {
  // The Fourier transform is an unitary operator, so we're basically
  // doing the transpose of RowIdct()
  const int a0 = in[0] + in[7];
  const int b0 = in[0] - in[7];
  const int a1 = in[1] + in[6];
  const int b1 = in[1] - in[6];
  const int a2 = in[2] + in[5];
  const int b2 = in[2] - in[5];
  const int a3 = in[3] + in[4];
  const int b3 = in[3] - in[4];

  // even part
  const int C2 = table[1];
  const int C4 = table[3];
  const int C6 = table[5];
  const int c0 = a0 + a3;
  const int c1 = a0 - a3;
  const int c2 = a1 + a2;
  const int c3 = a1 - a2;

  in[0] = DESCALE(C4 * (c0 + c2));
  in[4] = DESCALE(C4 * (c0 - c2));
  in[2] = DESCALE(C2 * c1 + C6 * c3);
  in[6] = DESCALE(C6 * c1 - C2 * c3);

  // odd part
  const int C1 = table[0];
  const int C3 = table[2];
  const int C5 = table[4];
  const int C7 = table[6];
  in[1] = DESCALE(C1 * b0 + C3 * b1 + C5 * b2 + C7 * b3);
  in[3] = DESCALE(C3 * b0 - C7 * b1 - C1 * b2 - C5 * b3);
  in[5] = DESCALE(C5 * b0 - C1 * b1 + C7 * b2 + C3 * b3);
  in[7] = DESCALE(C7 * b0 - C5 * b1 + C3 * b2 - C1 * b3);
}
#undef DESCALE
#undef LOAD_CST
#undef LOAD
#undef MULT
#undef ADD
#undef SUB
#undef LSHIFT
#undef STORE16
#undef CORRECT_LSB
#undef kTan1
#undef kTan2
#undef kTan3m1
#undef k2Sqrt2
#undef BUTTERFLY
#undef COLUMN_DCT8

}  // namespace

///////////////////////////////////////////////////////////////////////////////
// visible FDCT callable functions

void ComputeBlockDCT(coeff_t* coeffs) {
  ColumnDct(coeffs);
  RowDct(coeffs + 0 * 8, kTable04);
  RowDct(coeffs + 1 * 8, kTable17);
  RowDct(coeffs + 2 * 8, kTable26);
  RowDct(coeffs + 3 * 8, kTable35);
  RowDct(coeffs + 4 * 8, kTable04);
  RowDct(coeffs + 5 * 8, kTable35);
  RowDct(coeffs + 6 * 8, kTable26);
  RowDct(coeffs + 7 * 8, kTable17);
}

}  // namespace guetzli

Coverage Report

Created: 2023-06-07 06:21

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright 2016 Google Inc.
3		*
4		* Licensed under the Apache License, Version 2.0 (the "License");
5		* you may not use this file except in compliance with the License.
6		* You may obtain a copy of the License at
7		*
8		* http://www.apache.org/licenses/LICENSE-2.0
9		*
10		* Unless required by applicable law or agreed to in writing, software
11		* distributed under the License is distributed on an "AS IS" BASIS,
12		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13		* See the License for the specific language governing permissions and
14		* limitations under the License.
15		*/
16
17		// Integer implementation of the Discrete Cosine Transform (DCT)
18		//
19		// Note! DCT output is kept scaled by 16, to retain maximum 16bit precision
20
21		#include "guetzli/fdct.h"
22
23		namespace guetzli {
24
25		namespace {
26
27		///////////////////////////////////////////////////////////////////////////////
28		// Cosine table: C(k) = cos(k.pi/16)/sqrt(2), k = 1..7 using 15 bits signed
29		const coeff_t kTable04[7] = { 22725, 21407, 19266, 16384, 12873, 8867, 4520 };
30		// rows #1 and #7 are pre-multiplied by 2.C(1) before the 2nd pass.
31		// This multiply is merged in the table of constants used during 1st pass:
32		const coeff_t kTable17[7] = { 31521, 29692, 26722, 22725, 17855, 12299, 6270 };
33		// rows #2 and #6 are pre-multiplied by 2.C(2):
34		const coeff_t kTable26[7] = { 29692, 27969, 25172, 21407, 16819, 11585, 5906 };
35		// rows #3 and #5 are pre-multiplied by 2.C(3):
36		const coeff_t kTable35[7] = { 26722, 25172, 22654, 19266, 15137, 10426, 5315 };
37
38		///////////////////////////////////////////////////////////////////////////////
39		// Constants (15bit precision) and C macros for IDCT vertical pass
40
41		#define kTan1 (13036) // = tan(pi/16)
42		#define kTan2 (27146) // = tan(2.pi/16) = sqrt(2) - 1.
43		#define kTan3m1 (-21746) // = tan(3.pi/16) - 1
44		#define k2Sqrt2 (23170) // = 1 / 2.sqrt(2)
45
46		// performs: {a,b} <- {a-b, a+b}, without saturation
47	0	#define BUTTERFLY(a, b) do { \
48	0	SUB((a), (b)); \
49	0	ADD((b), (b)); \
50	0	ADD((b), (a)); \
51	0	} while (0)
52
53		///////////////////////////////////////////////////////////////////////////////
54		// Constants for DCT horizontal pass
55
56		// Note about the CORRECT_LSB macro:
57		// using 16bit fixed-point constants, we often compute products like:
58		// p = (Ax + By + 32768) >> 16 by adding two sub-terms q = (A*x) >> 16
59		// and r = (B*y) >> 16 together. Statistically, we have p = q + r + 1
60		// in 3/4 of the cases. This can be easily seen from the relation:
61		// (a + b + 1) >> 1 = (a >> 1) + (b >> 1) + ((a\|b)&1)
62		// The approximation we are doing is replacing ((a\|b)&1) by 1.
63		// In practice, this is a slightly more involved because the constants A and B
64		// have also been rounded compared to their exact floating point value.
65		// However, all in all the correction is quite small, and CORRECT_LSB can
66		// be defined empty if needed.
67
68	0	#define COLUMN_DCT8(in) do { \
69	0	LOAD(m0, (in)[0 * 8]); \
70	0	LOAD(m2, (in)[2 * 8]); \
71	0	LOAD(m7, (in)[7 * 8]); \
72	0	LOAD(m5, (in)[5 * 8]); \
73	0	\
74	0	BUTTERFLY(m0, m7); \
75	0	BUTTERFLY(m2, m5); \
76	0	\
77	0	LOAD(m3, (in)[3 * 8]); \
78	0	LOAD(m4, (in)[4 * 8]); \
79	0	BUTTERFLY(m3, m4); \
80	0	\
81	0	LOAD(m6, (in)[6 * 8]); \
82	0	LOAD(m1, (in)[1 * 8]); \
83	0	BUTTERFLY(m1, m6); \
84	0	BUTTERFLY(m7, m4); \
85	0	BUTTERFLY(m6, m5); \
86	0	\
87	0	/* RowIdct() needs 15bits fixed-point input, when the output from */ \
88	0	/* ColumnIdct() would be 12bits. We are better doing the shift by 3 */ \
89	0	/* now instead of in RowIdct(), because we have some multiplies to */ \
90	0	/* perform, that can take advantage of the extra 3bits precision. */ \
91	0	LSHIFT(m4, 3); \
92	0	LSHIFT(m5, 3); \
93	0	BUTTERFLY(m4, m5); \
94	0	STORE16((in)[0 * 8], m5); \
95	0	STORE16((in)[4 * 8], m4); \
96	0	\
97	0	LSHIFT(m7, 3); \
98	0	LSHIFT(m6, 3); \
99	0	LSHIFT(m3, 3); \
100	0	LSHIFT(m0, 3); \
101	0	\
102	0	LOAD_CST(m4, kTan2); \
103	0	m5 = m4; \
104	0	MULT(m4, m7); \
105	0	MULT(m5, m6); \
106	0	SUB(m4, m6); \
107	0	ADD(m5, m7); \
108	0	STORE16((in)[2 * 8], m5); \
109	0	STORE16((in)[6 * 8], m4); \
110	0	\
111	0	/* We should be multiplying m6 by C4 = 1/sqrt(2) here, but we only have */ \
112	0	/* the k2Sqrt2 = 1/(2.sqrt(2)) constant that fits into 15bits. So we */ \
113	0	/* shift by 4 instead of 3 to compensate for the additional 1/2 factor. */ \
114	0	LOAD_CST(m6, k2Sqrt2); \
115	0	LSHIFT(m2, 3 + 1); \
116	0	LSHIFT(m1, 3 + 1); \
117	0	BUTTERFLY(m1, m2); \
118	0	MULT(m2, m6); \
119	0	MULT(m1, m6); \
120	0	BUTTERFLY(m3, m1); \
121	0	BUTTERFLY(m0, m2); \
122	0	\
123	0	LOAD_CST(m4, kTan3m1); \
124	0	LOAD_CST(m5, kTan1); \
125	0	m7 = m3; \
126	0	m6 = m1; \
127	0	MULT(m3, m4); \
128	0	MULT(m1, m5); \
129	0	\
130	0	ADD(m3, m7); \
131	0	ADD(m1, m2); \
132	0	CORRECT_LSB(m1); \
133	0	CORRECT_LSB(m3); \
134	0	MULT(m4, m0); \
135	0	MULT(m5, m2); \
136	0	ADD(m4, m0); \
137	0	SUB(m0, m3); \
138	0	ADD(m7, m4); \
139	0	SUB(m5, m6); \
140	0	\
141	0	STORE16((in)[1 * 8], m1); \
142	0	STORE16((in)[3 * 8], m0); \
143	0	STORE16((in)[5 * 8], m7); \
144	0	STORE16((in)[7 * 8], m5); \
145	0	} while (0)
146
147
148		// these are the macro required by COLUMN_*
149	0	#define LOAD_CST(dst, src) (dst) = (src)
150	0	#define LOAD(dst, src) (dst) = (src)
151	0	#define MULT(a, b) (a) = (((a) * (b)) >> 16)
152	0	#define ADD(a, b) (a) = (a) + (b)
153	0	#define SUB(a, b) (a) = (a) - (b)
154	0	#define LSHIFT(a, n) (a) = ((a) << (n))
155	0	#define STORE16(a, b) (a) = (b)
156	0	#define CORRECT_LSB(a) (a) += 1
157
158		// DCT vertical pass
159
160	0	inline void ColumnDct(coeff_t* in) {
161	0	for (int i = 0; i < 8; ++i) {
162	0	int m0, m1, m2, m3, m4, m5, m6, m7;
163	0	COLUMN_DCT8(in + i);
164	0	}
165	0	}
166
167		// DCT horizontal pass
168
169		// We don't really need to round before descaling, since we
170		// still have 4 bits of precision left as final scaled output.
171	0	#define DESCALE(a) static_cast<coeff_t>((a) >> 16)
172
173	0	void RowDct(coeff_t* in, const coeff_t* table) {
174		// The Fourier transform is an unitary operator, so we're basically
175		// doing the transpose of RowIdct()
176	0	const int a0 = in[0] + in[7];
177	0	const int b0 = in[0] - in[7];
178	0	const int a1 = in[1] + in[6];
179	0	const int b1 = in[1] - in[6];
180	0	const int a2 = in[2] + in[5];
181	0	const int b2 = in[2] - in[5];
182	0	const int a3 = in[3] + in[4];
183	0	const int b3 = in[3] - in[4];
184
185		// even part
186	0	const int C2 = table[1];
187	0	const int C4 = table[3];
188	0	const int C6 = table[5];
189	0	const int c0 = a0 + a3;
190	0	const int c1 = a0 - a3;
191	0	const int c2 = a1 + a2;
192	0	const int c3 = a1 - a2;
193
194	0	in[0] = DESCALE(C4 * (c0 + c2));
195	0	in[4] = DESCALE(C4 * (c0 - c2));
196	0	in[2] = DESCALE(C2 * c1 + C6 * c3);
197	0	in[6] = DESCALE(C6 * c1 - C2 * c3);
198
199		// odd part
200	0	const int C1 = table[0];
201	0	const int C3 = table[2];
202	0	const int C5 = table[4];
203	0	const int C7 = table[6];
204	0	in[1] = DESCALE(C1 * b0 + C3 * b1 + C5 * b2 + C7 * b3);
205	0	in[3] = DESCALE(C3 * b0 - C7 * b1 - C1 * b2 - C5 * b3);
206	0	in[5] = DESCALE(C5 * b0 - C1 * b1 + C7 * b2 + C3 * b3);
207	0	in[7] = DESCALE(C7 * b0 - C5 * b1 + C3 * b2 - C1 * b3);
208	0	}
209		#undef DESCALE
210		#undef LOAD_CST
211		#undef LOAD
212		#undef MULT
213		#undef ADD
214		#undef SUB
215		#undef LSHIFT
216		#undef STORE16
217		#undef CORRECT_LSB
218		#undef kTan1
219		#undef kTan2
220		#undef kTan3m1
221		#undef k2Sqrt2
222		#undef BUTTERFLY
223		#undef COLUMN_DCT8
224
225		} // namespace
226
227		///////////////////////////////////////////////////////////////////////////////
228		// visible FDCT callable functions
229
230	0	void ComputeBlockDCT(coeff_t* coeffs) {
231	0	ColumnDct(coeffs);
232	0	RowDct(coeffs + 0 * 8, kTable04);
233	0	RowDct(coeffs + 1 * 8, kTable17);
234	0	RowDct(coeffs + 2 * 8, kTable26);
235	0	RowDct(coeffs + 3 * 8, kTable35);
236	0	RowDct(coeffs + 4 * 8, kTable04);
237	0	RowDct(coeffs + 5 * 8, kTable35);
238	0	RowDct(coeffs + 6 * 8, kTable26);
239	0	RowDct(coeffs + 7 * 8, kTable17);
240	0	}
241
242		} // namespace guetzli