/src/libvpx/vp9/encoder/vp9_dct.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include <assert.h> |
12 | | #include <math.h> |
13 | | |
14 | | #include "./vp9_rtcd.h" |
15 | | #include "./vpx_config.h" |
16 | | #include "./vpx_dsp_rtcd.h" |
17 | | |
18 | | #include "vp9/common/vp9_blockd.h" |
19 | | #include "vp9/common/vp9_idct.h" |
20 | | #include "vpx_dsp/fwd_txfm.h" |
21 | | #include "vpx_ports/mem.h" |
22 | | |
23 | 0 | static void fdct4(const tran_low_t *input, tran_low_t *output) { |
24 | 0 | tran_high_t step[4]; |
25 | 0 | tran_high_t temp1, temp2; |
26 | |
|
27 | 0 | step[0] = input[0] + input[3]; |
28 | 0 | step[1] = input[1] + input[2]; |
29 | 0 | step[2] = input[1] - input[2]; |
30 | 0 | step[3] = input[0] - input[3]; |
31 | |
|
32 | 0 | temp1 = (step[0] + step[1]) * cospi_16_64; |
33 | 0 | temp2 = (step[0] - step[1]) * cospi_16_64; |
34 | 0 | output[0] = (tran_low_t)fdct_round_shift(temp1); |
35 | 0 | output[2] = (tran_low_t)fdct_round_shift(temp2); |
36 | 0 | temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; |
37 | 0 | temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; |
38 | 0 | output[1] = (tran_low_t)fdct_round_shift(temp1); |
39 | 0 | output[3] = (tran_low_t)fdct_round_shift(temp2); |
40 | 0 | } |
41 | | |
42 | 0 | static void fdct8(const tran_low_t *input, tran_low_t *output) { |
43 | 0 | tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 |
44 | 0 | tran_high_t t0, t1, t2, t3; // needs32 |
45 | 0 | tran_high_t x0, x1, x2, x3; // canbe16 |
46 | | |
47 | | // stage 1 |
48 | 0 | s0 = input[0] + input[7]; |
49 | 0 | s1 = input[1] + input[6]; |
50 | 0 | s2 = input[2] + input[5]; |
51 | 0 | s3 = input[3] + input[4]; |
52 | 0 | s4 = input[3] - input[4]; |
53 | 0 | s5 = input[2] - input[5]; |
54 | 0 | s6 = input[1] - input[6]; |
55 | 0 | s7 = input[0] - input[7]; |
56 | | |
57 | | // fdct4(step, step); |
58 | 0 | x0 = s0 + s3; |
59 | 0 | x1 = s1 + s2; |
60 | 0 | x2 = s1 - s2; |
61 | 0 | x3 = s0 - s3; |
62 | 0 | t0 = (x0 + x1) * cospi_16_64; |
63 | 0 | t1 = (x0 - x1) * cospi_16_64; |
64 | 0 | t2 = x2 * cospi_24_64 + x3 * cospi_8_64; |
65 | 0 | t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; |
66 | 0 | output[0] = (tran_low_t)fdct_round_shift(t0); |
67 | 0 | output[2] = (tran_low_t)fdct_round_shift(t2); |
68 | 0 | output[4] = (tran_low_t)fdct_round_shift(t1); |
69 | 0 | output[6] = (tran_low_t)fdct_round_shift(t3); |
70 | | |
71 | | // Stage 2 |
72 | 0 | t0 = (s6 - s5) * cospi_16_64; |
73 | 0 | t1 = (s6 + s5) * cospi_16_64; |
74 | 0 | t2 = (tran_low_t)fdct_round_shift(t0); |
75 | 0 | t3 = (tran_low_t)fdct_round_shift(t1); |
76 | | |
77 | | // Stage 3 |
78 | 0 | x0 = s4 + t2; |
79 | 0 | x1 = s4 - t2; |
80 | 0 | x2 = s7 - t3; |
81 | 0 | x3 = s7 + t3; |
82 | | |
83 | | // Stage 4 |
84 | 0 | t0 = x0 * cospi_28_64 + x3 * cospi_4_64; |
85 | 0 | t1 = x1 * cospi_12_64 + x2 * cospi_20_64; |
86 | 0 | t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; |
87 | 0 | t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; |
88 | 0 | output[1] = (tran_low_t)fdct_round_shift(t0); |
89 | 0 | output[3] = (tran_low_t)fdct_round_shift(t2); |
90 | 0 | output[5] = (tran_low_t)fdct_round_shift(t1); |
91 | 0 | output[7] = (tran_low_t)fdct_round_shift(t3); |
92 | 0 | } |
93 | | |
94 | 0 | static void fdct16(const tran_low_t in[16], tran_low_t out[16]) { |
95 | 0 | tran_high_t step1[8]; // canbe16 |
96 | 0 | tran_high_t step2[8]; // canbe16 |
97 | 0 | tran_high_t step3[8]; // canbe16 |
98 | 0 | tran_high_t input[8]; // canbe16 |
99 | 0 | tran_high_t temp1, temp2; // needs32 |
100 | | |
101 | | // step 1 |
102 | 0 | input[0] = in[0] + in[15]; |
103 | 0 | input[1] = in[1] + in[14]; |
104 | 0 | input[2] = in[2] + in[13]; |
105 | 0 | input[3] = in[3] + in[12]; |
106 | 0 | input[4] = in[4] + in[11]; |
107 | 0 | input[5] = in[5] + in[10]; |
108 | 0 | input[6] = in[6] + in[9]; |
109 | 0 | input[7] = in[7] + in[8]; |
110 | |
|
111 | 0 | step1[0] = in[7] - in[8]; |
112 | 0 | step1[1] = in[6] - in[9]; |
113 | 0 | step1[2] = in[5] - in[10]; |
114 | 0 | step1[3] = in[4] - in[11]; |
115 | 0 | step1[4] = in[3] - in[12]; |
116 | 0 | step1[5] = in[2] - in[13]; |
117 | 0 | step1[6] = in[1] - in[14]; |
118 | 0 | step1[7] = in[0] - in[15]; |
119 | | |
120 | | // fdct8(step, step); |
121 | 0 | { |
122 | 0 | tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 |
123 | 0 | tran_high_t t0, t1, t2, t3; // needs32 |
124 | 0 | tran_high_t x0, x1, x2, x3; // canbe16 |
125 | | |
126 | | // stage 1 |
127 | 0 | s0 = input[0] + input[7]; |
128 | 0 | s1 = input[1] + input[6]; |
129 | 0 | s2 = input[2] + input[5]; |
130 | 0 | s3 = input[3] + input[4]; |
131 | 0 | s4 = input[3] - input[4]; |
132 | 0 | s5 = input[2] - input[5]; |
133 | 0 | s6 = input[1] - input[6]; |
134 | 0 | s7 = input[0] - input[7]; |
135 | | |
136 | | // fdct4(step, step); |
137 | 0 | x0 = s0 + s3; |
138 | 0 | x1 = s1 + s2; |
139 | 0 | x2 = s1 - s2; |
140 | 0 | x3 = s0 - s3; |
141 | 0 | t0 = (x0 + x1) * cospi_16_64; |
142 | 0 | t1 = (x0 - x1) * cospi_16_64; |
143 | 0 | t2 = x3 * cospi_8_64 + x2 * cospi_24_64; |
144 | 0 | t3 = x3 * cospi_24_64 - x2 * cospi_8_64; |
145 | 0 | out[0] = (tran_low_t)fdct_round_shift(t0); |
146 | 0 | out[4] = (tran_low_t)fdct_round_shift(t2); |
147 | 0 | out[8] = (tran_low_t)fdct_round_shift(t1); |
148 | 0 | out[12] = (tran_low_t)fdct_round_shift(t3); |
149 | | |
150 | | // Stage 2 |
151 | 0 | t0 = (s6 - s5) * cospi_16_64; |
152 | 0 | t1 = (s6 + s5) * cospi_16_64; |
153 | 0 | t2 = fdct_round_shift(t0); |
154 | 0 | t3 = fdct_round_shift(t1); |
155 | | |
156 | | // Stage 3 |
157 | 0 | x0 = s4 + t2; |
158 | 0 | x1 = s4 - t2; |
159 | 0 | x2 = s7 - t3; |
160 | 0 | x3 = s7 + t3; |
161 | | |
162 | | // Stage 4 |
163 | 0 | t0 = x0 * cospi_28_64 + x3 * cospi_4_64; |
164 | 0 | t1 = x1 * cospi_12_64 + x2 * cospi_20_64; |
165 | 0 | t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; |
166 | 0 | t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; |
167 | 0 | out[2] = (tran_low_t)fdct_round_shift(t0); |
168 | 0 | out[6] = (tran_low_t)fdct_round_shift(t2); |
169 | 0 | out[10] = (tran_low_t)fdct_round_shift(t1); |
170 | 0 | out[14] = (tran_low_t)fdct_round_shift(t3); |
171 | 0 | } |
172 | | |
173 | | // step 2 |
174 | 0 | temp1 = (step1[5] - step1[2]) * cospi_16_64; |
175 | 0 | temp2 = (step1[4] - step1[3]) * cospi_16_64; |
176 | 0 | step2[2] = fdct_round_shift(temp1); |
177 | 0 | step2[3] = fdct_round_shift(temp2); |
178 | 0 | temp1 = (step1[4] + step1[3]) * cospi_16_64; |
179 | 0 | temp2 = (step1[5] + step1[2]) * cospi_16_64; |
180 | 0 | step2[4] = fdct_round_shift(temp1); |
181 | 0 | step2[5] = fdct_round_shift(temp2); |
182 | | |
183 | | // step 3 |
184 | 0 | step3[0] = step1[0] + step2[3]; |
185 | 0 | step3[1] = step1[1] + step2[2]; |
186 | 0 | step3[2] = step1[1] - step2[2]; |
187 | 0 | step3[3] = step1[0] - step2[3]; |
188 | 0 | step3[4] = step1[7] - step2[4]; |
189 | 0 | step3[5] = step1[6] - step2[5]; |
190 | 0 | step3[6] = step1[6] + step2[5]; |
191 | 0 | step3[7] = step1[7] + step2[4]; |
192 | | |
193 | | // step 4 |
194 | 0 | temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; |
195 | 0 | temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; |
196 | 0 | step2[1] = fdct_round_shift(temp1); |
197 | 0 | step2[2] = fdct_round_shift(temp2); |
198 | 0 | temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; |
199 | 0 | temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; |
200 | 0 | step2[5] = fdct_round_shift(temp1); |
201 | 0 | step2[6] = fdct_round_shift(temp2); |
202 | | |
203 | | // step 5 |
204 | 0 | step1[0] = step3[0] + step2[1]; |
205 | 0 | step1[1] = step3[0] - step2[1]; |
206 | 0 | step1[2] = step3[3] + step2[2]; |
207 | 0 | step1[3] = step3[3] - step2[2]; |
208 | 0 | step1[4] = step3[4] - step2[5]; |
209 | 0 | step1[5] = step3[4] + step2[5]; |
210 | 0 | step1[6] = step3[7] - step2[6]; |
211 | 0 | step1[7] = step3[7] + step2[6]; |
212 | | |
213 | | // step 6 |
214 | 0 | temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; |
215 | 0 | temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; |
216 | 0 | out[1] = (tran_low_t)fdct_round_shift(temp1); |
217 | 0 | out[9] = (tran_low_t)fdct_round_shift(temp2); |
218 | |
|
219 | 0 | temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; |
220 | 0 | temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; |
221 | 0 | out[5] = (tran_low_t)fdct_round_shift(temp1); |
222 | 0 | out[13] = (tran_low_t)fdct_round_shift(temp2); |
223 | |
|
224 | 0 | temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; |
225 | 0 | temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; |
226 | 0 | out[3] = (tran_low_t)fdct_round_shift(temp1); |
227 | 0 | out[11] = (tran_low_t)fdct_round_shift(temp2); |
228 | |
|
229 | 0 | temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; |
230 | 0 | temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; |
231 | 0 | out[7] = (tran_low_t)fdct_round_shift(temp1); |
232 | 0 | out[15] = (tran_low_t)fdct_round_shift(temp2); |
233 | 0 | } |
234 | | |
235 | 0 | static void fadst4(const tran_low_t *input, tran_low_t *output) { |
236 | 0 | tran_high_t x0, x1, x2, x3; |
237 | 0 | tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; |
238 | |
|
239 | 0 | x0 = input[0]; |
240 | 0 | x1 = input[1]; |
241 | 0 | x2 = input[2]; |
242 | 0 | x3 = input[3]; |
243 | |
|
244 | 0 | if (!(x0 | x1 | x2 | x3)) { |
245 | 0 | output[0] = output[1] = output[2] = output[3] = 0; |
246 | 0 | return; |
247 | 0 | } |
248 | | |
249 | 0 | s0 = sinpi_1_9 * x0; |
250 | 0 | s1 = sinpi_4_9 * x0; |
251 | 0 | s2 = sinpi_2_9 * x1; |
252 | 0 | s3 = sinpi_1_9 * x1; |
253 | 0 | s4 = sinpi_3_9 * x2; |
254 | 0 | s5 = sinpi_4_9 * x3; |
255 | 0 | s6 = sinpi_2_9 * x3; |
256 | 0 | s7 = x0 + x1 - x3; |
257 | |
|
258 | 0 | x0 = s0 + s2 + s5; |
259 | 0 | x1 = sinpi_3_9 * s7; |
260 | 0 | x2 = s1 - s3 + s6; |
261 | 0 | x3 = s4; |
262 | |
|
263 | 0 | s0 = x0 + x3; |
264 | 0 | s1 = x1; |
265 | 0 | s2 = x2 - x3; |
266 | 0 | s3 = x2 - x0 + x3; |
267 | | |
268 | | // 1-D transform scaling factor is sqrt(2). |
269 | 0 | output[0] = (tran_low_t)fdct_round_shift(s0); |
270 | 0 | output[1] = (tran_low_t)fdct_round_shift(s1); |
271 | 0 | output[2] = (tran_low_t)fdct_round_shift(s2); |
272 | 0 | output[3] = (tran_low_t)fdct_round_shift(s3); |
273 | 0 | } |
274 | | |
275 | 0 | static void fadst8(const tran_low_t *input, tran_low_t *output) { |
276 | 0 | tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; |
277 | |
|
278 | 0 | tran_high_t x0 = input[7]; |
279 | 0 | tran_high_t x1 = input[0]; |
280 | 0 | tran_high_t x2 = input[5]; |
281 | 0 | tran_high_t x3 = input[2]; |
282 | 0 | tran_high_t x4 = input[3]; |
283 | 0 | tran_high_t x5 = input[4]; |
284 | 0 | tran_high_t x6 = input[1]; |
285 | 0 | tran_high_t x7 = input[6]; |
286 | | |
287 | | // stage 1 |
288 | 0 | s0 = cospi_2_64 * x0 + cospi_30_64 * x1; |
289 | 0 | s1 = cospi_30_64 * x0 - cospi_2_64 * x1; |
290 | 0 | s2 = cospi_10_64 * x2 + cospi_22_64 * x3; |
291 | 0 | s3 = cospi_22_64 * x2 - cospi_10_64 * x3; |
292 | 0 | s4 = cospi_18_64 * x4 + cospi_14_64 * x5; |
293 | 0 | s5 = cospi_14_64 * x4 - cospi_18_64 * x5; |
294 | 0 | s6 = cospi_26_64 * x6 + cospi_6_64 * x7; |
295 | 0 | s7 = cospi_6_64 * x6 - cospi_26_64 * x7; |
296 | |
|
297 | 0 | x0 = fdct_round_shift(s0 + s4); |
298 | 0 | x1 = fdct_round_shift(s1 + s5); |
299 | 0 | x2 = fdct_round_shift(s2 + s6); |
300 | 0 | x3 = fdct_round_shift(s3 + s7); |
301 | 0 | x4 = fdct_round_shift(s0 - s4); |
302 | 0 | x5 = fdct_round_shift(s1 - s5); |
303 | 0 | x6 = fdct_round_shift(s2 - s6); |
304 | 0 | x7 = fdct_round_shift(s3 - s7); |
305 | | |
306 | | // stage 2 |
307 | 0 | s0 = x0; |
308 | 0 | s1 = x1; |
309 | 0 | s2 = x2; |
310 | 0 | s3 = x3; |
311 | 0 | s4 = cospi_8_64 * x4 + cospi_24_64 * x5; |
312 | 0 | s5 = cospi_24_64 * x4 - cospi_8_64 * x5; |
313 | 0 | s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; |
314 | 0 | s7 = cospi_8_64 * x6 + cospi_24_64 * x7; |
315 | |
|
316 | 0 | x0 = s0 + s2; |
317 | 0 | x1 = s1 + s3; |
318 | 0 | x2 = s0 - s2; |
319 | 0 | x3 = s1 - s3; |
320 | 0 | x4 = fdct_round_shift(s4 + s6); |
321 | 0 | x5 = fdct_round_shift(s5 + s7); |
322 | 0 | x6 = fdct_round_shift(s4 - s6); |
323 | 0 | x7 = fdct_round_shift(s5 - s7); |
324 | | |
325 | | // stage 3 |
326 | 0 | s2 = cospi_16_64 * (x2 + x3); |
327 | 0 | s3 = cospi_16_64 * (x2 - x3); |
328 | 0 | s6 = cospi_16_64 * (x6 + x7); |
329 | 0 | s7 = cospi_16_64 * (x6 - x7); |
330 | |
|
331 | 0 | x2 = fdct_round_shift(s2); |
332 | 0 | x3 = fdct_round_shift(s3); |
333 | 0 | x6 = fdct_round_shift(s6); |
334 | 0 | x7 = fdct_round_shift(s7); |
335 | |
|
336 | 0 | output[0] = (tran_low_t)x0; |
337 | 0 | output[1] = (tran_low_t)-x4; |
338 | 0 | output[2] = (tran_low_t)x6; |
339 | 0 | output[3] = (tran_low_t)-x2; |
340 | 0 | output[4] = (tran_low_t)x3; |
341 | 0 | output[5] = (tran_low_t)-x7; |
342 | 0 | output[6] = (tran_low_t)x5; |
343 | 0 | output[7] = (tran_low_t)-x1; |
344 | 0 | } |
345 | | |
346 | 0 | static void fadst16(const tran_low_t *input, tran_low_t *output) { |
347 | 0 | tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; |
348 | 0 | tran_high_t s9, s10, s11, s12, s13, s14, s15; |
349 | |
|
350 | 0 | tran_high_t x0 = input[15]; |
351 | 0 | tran_high_t x1 = input[0]; |
352 | 0 | tran_high_t x2 = input[13]; |
353 | 0 | tran_high_t x3 = input[2]; |
354 | 0 | tran_high_t x4 = input[11]; |
355 | 0 | tran_high_t x5 = input[4]; |
356 | 0 | tran_high_t x6 = input[9]; |
357 | 0 | tran_high_t x7 = input[6]; |
358 | 0 | tran_high_t x8 = input[7]; |
359 | 0 | tran_high_t x9 = input[8]; |
360 | 0 | tran_high_t x10 = input[5]; |
361 | 0 | tran_high_t x11 = input[10]; |
362 | 0 | tran_high_t x12 = input[3]; |
363 | 0 | tran_high_t x13 = input[12]; |
364 | 0 | tran_high_t x14 = input[1]; |
365 | 0 | tran_high_t x15 = input[14]; |
366 | | |
367 | | // stage 1 |
368 | 0 | s0 = x0 * cospi_1_64 + x1 * cospi_31_64; |
369 | 0 | s1 = x0 * cospi_31_64 - x1 * cospi_1_64; |
370 | 0 | s2 = x2 * cospi_5_64 + x3 * cospi_27_64; |
371 | 0 | s3 = x2 * cospi_27_64 - x3 * cospi_5_64; |
372 | 0 | s4 = x4 * cospi_9_64 + x5 * cospi_23_64; |
373 | 0 | s5 = x4 * cospi_23_64 - x5 * cospi_9_64; |
374 | 0 | s6 = x6 * cospi_13_64 + x7 * cospi_19_64; |
375 | 0 | s7 = x6 * cospi_19_64 - x7 * cospi_13_64; |
376 | 0 | s8 = x8 * cospi_17_64 + x9 * cospi_15_64; |
377 | 0 | s9 = x8 * cospi_15_64 - x9 * cospi_17_64; |
378 | 0 | s10 = x10 * cospi_21_64 + x11 * cospi_11_64; |
379 | 0 | s11 = x10 * cospi_11_64 - x11 * cospi_21_64; |
380 | 0 | s12 = x12 * cospi_25_64 + x13 * cospi_7_64; |
381 | 0 | s13 = x12 * cospi_7_64 - x13 * cospi_25_64; |
382 | 0 | s14 = x14 * cospi_29_64 + x15 * cospi_3_64; |
383 | 0 | s15 = x14 * cospi_3_64 - x15 * cospi_29_64; |
384 | |
|
385 | 0 | x0 = fdct_round_shift(s0 + s8); |
386 | 0 | x1 = fdct_round_shift(s1 + s9); |
387 | 0 | x2 = fdct_round_shift(s2 + s10); |
388 | 0 | x3 = fdct_round_shift(s3 + s11); |
389 | 0 | x4 = fdct_round_shift(s4 + s12); |
390 | 0 | x5 = fdct_round_shift(s5 + s13); |
391 | 0 | x6 = fdct_round_shift(s6 + s14); |
392 | 0 | x7 = fdct_round_shift(s7 + s15); |
393 | 0 | x8 = fdct_round_shift(s0 - s8); |
394 | 0 | x9 = fdct_round_shift(s1 - s9); |
395 | 0 | x10 = fdct_round_shift(s2 - s10); |
396 | 0 | x11 = fdct_round_shift(s3 - s11); |
397 | 0 | x12 = fdct_round_shift(s4 - s12); |
398 | 0 | x13 = fdct_round_shift(s5 - s13); |
399 | 0 | x14 = fdct_round_shift(s6 - s14); |
400 | 0 | x15 = fdct_round_shift(s7 - s15); |
401 | | |
402 | | // stage 2 |
403 | 0 | s0 = x0; |
404 | 0 | s1 = x1; |
405 | 0 | s2 = x2; |
406 | 0 | s3 = x3; |
407 | 0 | s4 = x4; |
408 | 0 | s5 = x5; |
409 | 0 | s6 = x6; |
410 | 0 | s7 = x7; |
411 | 0 | s8 = x8 * cospi_4_64 + x9 * cospi_28_64; |
412 | 0 | s9 = x8 * cospi_28_64 - x9 * cospi_4_64; |
413 | 0 | s10 = x10 * cospi_20_64 + x11 * cospi_12_64; |
414 | 0 | s11 = x10 * cospi_12_64 - x11 * cospi_20_64; |
415 | 0 | s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; |
416 | 0 | s13 = x12 * cospi_4_64 + x13 * cospi_28_64; |
417 | 0 | s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; |
418 | 0 | s15 = x14 * cospi_20_64 + x15 * cospi_12_64; |
419 | |
|
420 | 0 | x0 = s0 + s4; |
421 | 0 | x1 = s1 + s5; |
422 | 0 | x2 = s2 + s6; |
423 | 0 | x3 = s3 + s7; |
424 | 0 | x4 = s0 - s4; |
425 | 0 | x5 = s1 - s5; |
426 | 0 | x6 = s2 - s6; |
427 | 0 | x7 = s3 - s7; |
428 | 0 | x8 = fdct_round_shift(s8 + s12); |
429 | 0 | x9 = fdct_round_shift(s9 + s13); |
430 | 0 | x10 = fdct_round_shift(s10 + s14); |
431 | 0 | x11 = fdct_round_shift(s11 + s15); |
432 | 0 | x12 = fdct_round_shift(s8 - s12); |
433 | 0 | x13 = fdct_round_shift(s9 - s13); |
434 | 0 | x14 = fdct_round_shift(s10 - s14); |
435 | 0 | x15 = fdct_round_shift(s11 - s15); |
436 | | |
437 | | // stage 3 |
438 | 0 | s0 = x0; |
439 | 0 | s1 = x1; |
440 | 0 | s2 = x2; |
441 | 0 | s3 = x3; |
442 | 0 | s4 = x4 * cospi_8_64 + x5 * cospi_24_64; |
443 | 0 | s5 = x4 * cospi_24_64 - x5 * cospi_8_64; |
444 | 0 | s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; |
445 | 0 | s7 = x6 * cospi_8_64 + x7 * cospi_24_64; |
446 | 0 | s8 = x8; |
447 | 0 | s9 = x9; |
448 | 0 | s10 = x10; |
449 | 0 | s11 = x11; |
450 | 0 | s12 = x12 * cospi_8_64 + x13 * cospi_24_64; |
451 | 0 | s13 = x12 * cospi_24_64 - x13 * cospi_8_64; |
452 | 0 | s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; |
453 | 0 | s15 = x14 * cospi_8_64 + x15 * cospi_24_64; |
454 | |
|
455 | 0 | x0 = s0 + s2; |
456 | 0 | x1 = s1 + s3; |
457 | 0 | x2 = s0 - s2; |
458 | 0 | x3 = s1 - s3; |
459 | 0 | x4 = fdct_round_shift(s4 + s6); |
460 | 0 | x5 = fdct_round_shift(s5 + s7); |
461 | 0 | x6 = fdct_round_shift(s4 - s6); |
462 | 0 | x7 = fdct_round_shift(s5 - s7); |
463 | 0 | x8 = s8 + s10; |
464 | 0 | x9 = s9 + s11; |
465 | 0 | x10 = s8 - s10; |
466 | 0 | x11 = s9 - s11; |
467 | 0 | x12 = fdct_round_shift(s12 + s14); |
468 | 0 | x13 = fdct_round_shift(s13 + s15); |
469 | 0 | x14 = fdct_round_shift(s12 - s14); |
470 | 0 | x15 = fdct_round_shift(s13 - s15); |
471 | | |
472 | | // stage 4 |
473 | 0 | s2 = (-cospi_16_64) * (x2 + x3); |
474 | 0 | s3 = cospi_16_64 * (x2 - x3); |
475 | 0 | s6 = cospi_16_64 * (x6 + x7); |
476 | 0 | s7 = cospi_16_64 * (-x6 + x7); |
477 | 0 | s10 = cospi_16_64 * (x10 + x11); |
478 | 0 | s11 = cospi_16_64 * (-x10 + x11); |
479 | 0 | s14 = (-cospi_16_64) * (x14 + x15); |
480 | 0 | s15 = cospi_16_64 * (x14 - x15); |
481 | |
|
482 | 0 | x2 = fdct_round_shift(s2); |
483 | 0 | x3 = fdct_round_shift(s3); |
484 | 0 | x6 = fdct_round_shift(s6); |
485 | 0 | x7 = fdct_round_shift(s7); |
486 | 0 | x10 = fdct_round_shift(s10); |
487 | 0 | x11 = fdct_round_shift(s11); |
488 | 0 | x14 = fdct_round_shift(s14); |
489 | 0 | x15 = fdct_round_shift(s15); |
490 | |
|
491 | 0 | output[0] = (tran_low_t)x0; |
492 | 0 | output[1] = (tran_low_t)-x8; |
493 | 0 | output[2] = (tran_low_t)x12; |
494 | 0 | output[3] = (tran_low_t)-x4; |
495 | 0 | output[4] = (tran_low_t)x6; |
496 | 0 | output[5] = (tran_low_t)x14; |
497 | 0 | output[6] = (tran_low_t)x10; |
498 | 0 | output[7] = (tran_low_t)x2; |
499 | 0 | output[8] = (tran_low_t)x3; |
500 | 0 | output[9] = (tran_low_t)x11; |
501 | 0 | output[10] = (tran_low_t)x15; |
502 | 0 | output[11] = (tran_low_t)x7; |
503 | 0 | output[12] = (tran_low_t)x5; |
504 | 0 | output[13] = (tran_low_t)-x13; |
505 | 0 | output[14] = (tran_low_t)x9; |
506 | 0 | output[15] = (tran_low_t)-x1; |
507 | 0 | } |
508 | | |
509 | | static const transform_2d FHT_4[] = { |
510 | | { fdct4, fdct4 }, // DCT_DCT = 0 |
511 | | { fadst4, fdct4 }, // ADST_DCT = 1 |
512 | | { fdct4, fadst4 }, // DCT_ADST = 2 |
513 | | { fadst4, fadst4 } // ADST_ADST = 3 |
514 | | }; |
515 | | |
516 | | static const transform_2d FHT_8[] = { |
517 | | { fdct8, fdct8 }, // DCT_DCT = 0 |
518 | | { fadst8, fdct8 }, // ADST_DCT = 1 |
519 | | { fdct8, fadst8 }, // DCT_ADST = 2 |
520 | | { fadst8, fadst8 } // ADST_ADST = 3 |
521 | | }; |
522 | | |
523 | | static const transform_2d FHT_16[] = { |
524 | | { fdct16, fdct16 }, // DCT_DCT = 0 |
525 | | { fadst16, fdct16 }, // ADST_DCT = 1 |
526 | | { fdct16, fadst16 }, // DCT_ADST = 2 |
527 | | { fadst16, fadst16 } // ADST_ADST = 3 |
528 | | }; |
529 | | |
530 | | void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, |
531 | 0 | int tx_type) { |
532 | 0 | if (tx_type == DCT_DCT) { |
533 | 0 | vpx_fdct4x4_c(input, output, stride); |
534 | 0 | } else { |
535 | 0 | tran_low_t out[4 * 4]; |
536 | 0 | int i, j; |
537 | 0 | tran_low_t temp_in[4], temp_out[4]; |
538 | 0 | const transform_2d ht = FHT_4[tx_type]; |
539 | | |
540 | | // Columns |
541 | 0 | for (i = 0; i < 4; ++i) { |
542 | 0 | for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16; |
543 | 0 | if (i == 0 && temp_in[0]) temp_in[0] += 1; |
544 | 0 | ht.cols(temp_in, temp_out); |
545 | 0 | for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j]; |
546 | 0 | } |
547 | | |
548 | | // Rows |
549 | 0 | for (i = 0; i < 4; ++i) { |
550 | 0 | for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4]; |
551 | 0 | ht.rows(temp_in, temp_out); |
552 | 0 | for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2; |
553 | 0 | } |
554 | 0 | } |
555 | 0 | } |
556 | | |
557 | | void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, |
558 | 0 | int tx_type) { |
559 | 0 | if (tx_type == DCT_DCT) { |
560 | 0 | vpx_fdct8x8_c(input, output, stride); |
561 | 0 | } else { |
562 | 0 | tran_low_t out[64]; |
563 | 0 | int i, j; |
564 | 0 | tran_low_t temp_in[8], temp_out[8]; |
565 | 0 | const transform_2d ht = FHT_8[tx_type]; |
566 | | |
567 | | // Columns |
568 | 0 | for (i = 0; i < 8; ++i) { |
569 | 0 | for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4; |
570 | 0 | ht.cols(temp_in, temp_out); |
571 | 0 | for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j]; |
572 | 0 | } |
573 | | |
574 | | // Rows |
575 | 0 | for (i = 0; i < 8; ++i) { |
576 | 0 | for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8]; |
577 | 0 | ht.rows(temp_in, temp_out); |
578 | 0 | for (j = 0; j < 8; ++j) |
579 | 0 | output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; |
580 | 0 | } |
581 | 0 | } |
582 | 0 | } |
583 | | |
584 | | /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per |
585 | | pixel. */ |
586 | 0 | void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { |
587 | 0 | int i; |
588 | 0 | tran_high_t a1, b1, c1, d1, e1; |
589 | 0 | const int16_t *ip_pass0 = input; |
590 | 0 | const tran_low_t *ip = NULL; |
591 | 0 | tran_low_t *op = output; |
592 | |
|
593 | 0 | for (i = 0; i < 4; i++) { |
594 | 0 | a1 = ip_pass0[0 * stride]; |
595 | 0 | b1 = ip_pass0[1 * stride]; |
596 | 0 | c1 = ip_pass0[2 * stride]; |
597 | 0 | d1 = ip_pass0[3 * stride]; |
598 | |
|
599 | 0 | a1 += b1; |
600 | 0 | d1 = d1 - c1; |
601 | 0 | e1 = (a1 - d1) >> 1; |
602 | 0 | b1 = e1 - b1; |
603 | 0 | c1 = e1 - c1; |
604 | 0 | a1 -= c1; |
605 | 0 | d1 += b1; |
606 | 0 | op[0] = (tran_low_t)a1; |
607 | 0 | op[4] = (tran_low_t)c1; |
608 | 0 | op[8] = (tran_low_t)d1; |
609 | 0 | op[12] = (tran_low_t)b1; |
610 | |
|
611 | 0 | ip_pass0++; |
612 | 0 | op++; |
613 | 0 | } |
614 | 0 | ip = output; |
615 | 0 | op = output; |
616 | |
|
617 | 0 | for (i = 0; i < 4; i++) { |
618 | 0 | a1 = ip[0]; |
619 | 0 | b1 = ip[1]; |
620 | 0 | c1 = ip[2]; |
621 | 0 | d1 = ip[3]; |
622 | |
|
623 | 0 | a1 += b1; |
624 | 0 | d1 -= c1; |
625 | 0 | e1 = (a1 - d1) >> 1; |
626 | 0 | b1 = e1 - b1; |
627 | 0 | c1 = e1 - c1; |
628 | 0 | a1 -= c1; |
629 | 0 | d1 += b1; |
630 | 0 | op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR); |
631 | 0 | op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR); |
632 | 0 | op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR); |
633 | 0 | op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR); |
634 | |
|
635 | 0 | ip += 4; |
636 | 0 | op += 4; |
637 | 0 | } |
638 | 0 | } |
639 | | |
640 | | void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, |
641 | 0 | int tx_type) { |
642 | 0 | if (tx_type == DCT_DCT) { |
643 | 0 | vpx_fdct16x16_c(input, output, stride); |
644 | 0 | } else { |
645 | 0 | tran_low_t out[256]; |
646 | 0 | int i, j; |
647 | 0 | tran_low_t temp_in[16], temp_out[16]; |
648 | 0 | const transform_2d ht = FHT_16[tx_type]; |
649 | | |
650 | | // Columns |
651 | 0 | for (i = 0; i < 16; ++i) { |
652 | 0 | for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4; |
653 | 0 | ht.cols(temp_in, temp_out); |
654 | 0 | for (j = 0; j < 16; ++j) |
655 | 0 | out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; |
656 | 0 | } |
657 | | |
658 | | // Rows |
659 | 0 | for (i = 0; i < 16; ++i) { |
660 | 0 | for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16]; |
661 | 0 | ht.rows(temp_in, temp_out); |
662 | 0 | for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j]; |
663 | 0 | } |
664 | 0 | } |
665 | 0 | } |
666 | | |
667 | | #if CONFIG_VP9_HIGHBITDEPTH |
668 | | void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, |
669 | 0 | int tx_type) { |
670 | 0 | vp9_fht4x4_c(input, output, stride, tx_type); |
671 | 0 | } |
672 | | |
673 | | void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, |
674 | 0 | int tx_type) { |
675 | 0 | vp9_fht8x8_c(input, output, stride, tx_type); |
676 | 0 | } |
677 | | |
678 | | void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, |
679 | 0 | int stride) { |
680 | 0 | vp9_fwht4x4_c(input, output, stride); |
681 | 0 | } |
682 | | |
683 | | void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, |
684 | 0 | int tx_type) { |
685 | 0 | vp9_fht16x16_c(input, output, stride, tx_type); |
686 | 0 | } |
687 | | #endif // CONFIG_VP9_HIGHBITDEPTH |