/src/aom/av1/common/av1_inv_txfm2d.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #include "config/aom_dsp_rtcd.h" |
13 | | #include "config/av1_rtcd.h" |
14 | | |
15 | | #include "av1/common/enums.h" |
16 | | #include "av1/common/av1_txfm.h" |
17 | | #include "av1/common/av1_inv_txfm1d.h" |
18 | | #include "av1/common/av1_inv_txfm1d_cfg.h" |
19 | | |
20 | | void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, |
21 | 1.22M | int stride, int bd) { |
22 | | /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, |
23 | | 0.5 shifts per pixel. */ |
24 | 1.22M | int i; |
25 | 1.22M | tran_low_t output[16]; |
26 | 1.22M | tran_low_t a1, b1, c1, d1, e1; |
27 | 1.22M | const tran_low_t *ip = input; |
28 | 1.22M | tran_low_t *op = output; |
29 | 1.22M | uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
30 | | |
31 | 6.10M | for (i = 0; i < 4; i++) { |
32 | 4.88M | a1 = ip[4 * 0] >> UNIT_QUANT_SHIFT; |
33 | 4.88M | c1 = ip[4 * 1] >> UNIT_QUANT_SHIFT; |
34 | 4.88M | d1 = ip[4 * 2] >> UNIT_QUANT_SHIFT; |
35 | 4.88M | b1 = ip[4 * 3] >> UNIT_QUANT_SHIFT; |
36 | 4.88M | a1 += c1; |
37 | 4.88M | d1 -= b1; |
38 | 4.88M | e1 = (a1 - d1) >> 1; |
39 | 4.88M | b1 = e1 - b1; |
40 | 4.88M | c1 = e1 - c1; |
41 | 4.88M | a1 -= b1; |
42 | 4.88M | d1 += c1; |
43 | | |
44 | 4.88M | op[4 * 0] = a1; |
45 | 4.88M | op[4 * 1] = b1; |
46 | 4.88M | op[4 * 2] = c1; |
47 | 4.88M | op[4 * 3] = d1; |
48 | 4.88M | ip++; |
49 | 4.88M | op++; |
50 | 4.88M | } |
51 | | |
52 | 1.22M | ip = output; |
53 | 6.10M | for (i = 0; i < 4; i++) { |
54 | 4.88M | a1 = ip[0]; |
55 | 4.88M | c1 = ip[1]; |
56 | 4.88M | d1 = ip[2]; |
57 | 4.88M | b1 = ip[3]; |
58 | 4.88M | a1 += c1; |
59 | 4.88M | d1 -= b1; |
60 | 4.88M | e1 = (a1 - d1) >> 1; |
61 | 4.88M | b1 = e1 - b1; |
62 | 4.88M | c1 = e1 - c1; |
63 | 4.88M | a1 -= b1; |
64 | 4.88M | d1 += c1; |
65 | | |
66 | 4.88M | range_check_value(a1, bd + 1); |
67 | 4.88M | range_check_value(b1, bd + 1); |
68 | 4.88M | range_check_value(c1, bd + 1); |
69 | 4.88M | range_check_value(d1, bd + 1); |
70 | | |
71 | 4.88M | dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); |
72 | 4.88M | dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd); |
73 | 4.88M | dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd); |
74 | 4.88M | dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd); |
75 | | |
76 | 4.88M | ip += 4; |
77 | 4.88M | dest++; |
78 | 4.88M | } |
79 | 1.22M | } |
80 | | |
81 | | void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, |
82 | 1.26M | int dest_stride, int bd) { |
83 | 1.26M | int i; |
84 | 1.26M | tran_low_t a1, e1; |
85 | 1.26M | tran_low_t tmp[4]; |
86 | 1.26M | const tran_low_t *ip = in; |
87 | 1.26M | tran_low_t *op = tmp; |
88 | 1.26M | uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); |
89 | 1.26M | (void)bd; |
90 | | |
91 | 1.26M | a1 = ip[0 * 4] >> UNIT_QUANT_SHIFT; |
92 | 1.26M | e1 = a1 >> 1; |
93 | 1.26M | a1 -= e1; |
94 | 1.26M | op[0] = a1; |
95 | 1.26M | op[1] = op[2] = op[3] = e1; |
96 | | |
97 | 1.26M | ip = tmp; |
98 | 6.31M | for (i = 0; i < 4; i++) { |
99 | 5.05M | e1 = ip[0] >> 1; |
100 | 5.05M | a1 = ip[0] - e1; |
101 | 5.05M | dest[dest_stride * 0] = |
102 | 5.05M | highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); |
103 | 5.05M | dest[dest_stride * 1] = |
104 | 5.05M | highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); |
105 | 5.05M | dest[dest_stride * 2] = |
106 | 5.05M | highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); |
107 | 5.05M | dest[dest_stride * 3] = |
108 | 5.05M | highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); |
109 | 5.05M | ip++; |
110 | 5.05M | dest++; |
111 | 5.05M | } |
112 | 1.26M | } |
113 | | |
114 | 3.21M | static inline TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) { |
115 | 3.21M | switch (txfm_type) { |
116 | 358k | case TXFM_TYPE_DCT4: return av1_idct4; |
117 | 710k | case TXFM_TYPE_DCT8: return av1_idct8; |
118 | 481k | case TXFM_TYPE_DCT16: return av1_idct16; |
119 | 310k | case TXFM_TYPE_DCT32: return av1_idct32; |
120 | 78.1k | case TXFM_TYPE_DCT64: return av1_idct64; |
121 | 287k | case TXFM_TYPE_ADST4: return av1_iadst4; |
122 | 429k | case TXFM_TYPE_ADST8: return av1_iadst8; |
123 | 301k | case TXFM_TYPE_ADST16: return av1_iadst16; |
124 | 83.6k | case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c; |
125 | 114k | case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c; |
126 | 50.6k | case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c; |
127 | 2.99k | case TXFM_TYPE_IDENTITY32: return av1_iidentity32_c; |
128 | 0 | default: assert(0); return NULL; |
129 | 3.21M | } |
130 | 3.21M | } |
131 | | |
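| | // Per-size rounding shifts: shift[0] is applied after the row transform and
| | // shift[1] after the column transform. The values are stored negated and
| | // passed as -shift[i] to av1_round_shift_array() in inv_txfm2d_add_c().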
132 | | static const int8_t inv_shift_4x4[2] = { 0, -4 }; |
133 | | static const int8_t inv_shift_8x8[2] = { -1, -4 }; |
134 | | static const int8_t inv_shift_16x16[2] = { -2, -4 }; |
135 | | static const int8_t inv_shift_32x32[2] = { -2, -4 }; |
136 | | static const int8_t inv_shift_64x64[2] = { -2, -4 }; |
137 | | static const int8_t inv_shift_4x8[2] = { 0, -4 }; |
138 | | static const int8_t inv_shift_8x4[2] = { 0, -4 }; |
139 | | static const int8_t inv_shift_8x16[2] = { -1, -4 }; |
140 | | static const int8_t inv_shift_16x8[2] = { -1, -4 }; |
141 | | static const int8_t inv_shift_16x32[2] = { -1, -4 }; |
142 | | static const int8_t inv_shift_32x16[2] = { -1, -4 }; |
143 | | static const int8_t inv_shift_32x64[2] = { -1, -4 }; |
144 | | static const int8_t inv_shift_64x32[2] = { -1, -4 }; |
145 | | static const int8_t inv_shift_4x16[2] = { -1, -4 }; |
146 | | static const int8_t inv_shift_16x4[2] = { -1, -4 }; |
147 | | static const int8_t inv_shift_8x32[2] = { -2, -4 }; |
148 | | static const int8_t inv_shift_32x8[2] = { -2, -4 }; |
149 | | static const int8_t inv_shift_16x64[2] = { -2, -4 }; |
150 | | static const int8_t inv_shift_64x16[2] = { -2, -4 }; |
151 | | |
152 | | const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL] = { |
153 | | inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32, |
154 | | inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16, |
155 | | inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64, |
156 | | inv_shift_64x32, inv_shift_4x16, inv_shift_16x4, inv_shift_8x32, |
157 | | inv_shift_32x8, inv_shift_16x64, inv_shift_64x16, |
158 | | }; |
159 | | |
160 | | static const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 }; |
161 | | |
162 | | void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, |
163 | 1.60M | TXFM_2D_FLIP_CFG *cfg) { |
164 | 1.60M | assert(cfg != NULL); |
165 | 1.60M | cfg->tx_size = tx_size; |
166 | 1.60M | av1_zero(cfg->stage_range_col); |
167 | 1.60M | av1_zero(cfg->stage_range_row); |
168 | 1.60M | set_flip_cfg(tx_type, cfg); |
169 | 1.60M | const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; |
170 | 1.60M | const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; |
171 | 1.60M | cfg->shift = av1_inv_txfm_shift_ls[tx_size]; |
172 | 1.60M | const int txw_idx = get_txw_idx(tx_size); |
173 | 1.60M | const int txh_idx = get_txh_idx(tx_size); |
174 | 1.60M | cfg->cos_bit_col = INV_COS_BIT; |
175 | 1.60M | cfg->cos_bit_row = INV_COS_BIT; |
176 | 1.60M | cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; |
177 | 1.60M | if (cfg->txfm_type_col == TXFM_TYPE_ADST4) { |
178 | 150k | memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range)); |
179 | 150k | } |
180 | 1.60M | cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; |
181 | 1.60M | if (cfg->txfm_type_row == TXFM_TYPE_ADST4) { |
182 | 136k | memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range)); |
183 | 136k | } |
184 | 1.60M | cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; |
185 | 1.60M | cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; |
186 | 1.60M | } |
187 | | |
188 | | void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, |
189 | | const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size, |
190 | 1.60M | int bd) { |
191 | 1.60M | const int fwd_shift = inv_start_range[tx_size]; |
192 | 1.60M | const int8_t *shift = cfg->shift; |
193 | 1.60M | int8_t opt_range_row, opt_range_col; |
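| | // Fixed intermediate bit ranges per bit depth; the asserts below verify that
| | // they cover the computed real_range values (ADST4 stage 1 is the known
| | // exception, see the comments inside the loops).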
194 | 1.60M | if (bd == 8) { |
195 | 764k | opt_range_row = 16; |
196 | 764k | opt_range_col = 16; |
197 | 840k | } else if (bd == 10) { |
198 | 768k | opt_range_row = 18; |
199 | 768k | opt_range_col = 16; |
200 | 768k | } else { |
201 | 71.8k | assert(bd == 12); |
202 | 71.8k | opt_range_row = 20; |
203 | 71.8k | opt_range_col = 18; |
204 | 71.8k | } |
205 | | // The extra i < MAX_TXFM_STAGE_NUM bound mutes a spurious array-bounds warning.
206 | 13.0M | for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { |
207 | 11.4M | int real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1; |
208 | 11.4M | (void)real_range_row; |
209 | 11.4M | if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) { |
210 | | // ADST4 may use one extra bit on top of opt_range_row at stage 1,
211 | | // so opt_range_row >= real_range_row need not hold here.
212 | 136k | stage_range_row[i] = opt_range_row; |
213 | 11.2M | } else { |
214 | 11.2M | assert(opt_range_row >= real_range_row); |
215 | 11.2M | stage_range_row[i] = opt_range_row; |
216 | 11.2M | } |
217 | 11.4M | } |
218 | | // The extra i < MAX_TXFM_STAGE_NUM bound mutes a spurious array-bounds warning.
219 | 12.5M | for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { |
220 | 10.8M | int real_range_col = |
221 | 10.8M | cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1; |
222 | 10.8M | (void)real_range_col; |
223 | 10.8M | if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) { |
224 | | // ADST4 may use one extra bit on top of opt_range_col at stage 1,
225 | | // so opt_range_col >= real_range_col need not hold here.
226 | 150k | stage_range_col[i] = opt_range_col; |
227 | 10.7M | } else { |
228 | 10.7M | assert(opt_range_col >= real_range_col); |
229 | 10.7M | stage_range_col[i] = opt_range_col; |
230 | 10.7M | } |
231 | 10.8M | } |
232 | 1.60M | } |
233 | | |
234 | | static inline void inv_txfm2d_add_c(const int32_t *input, uint16_t *output, |
235 | | int stride, TXFM_2D_FLIP_CFG *cfg, |
236 | | int32_t *txfm_buf, TX_SIZE tx_size, |
237 | 1.60M | int bd) { |
238 | | // Note when assigning txfm_size_col, we use the txfm_size from the |
239 | | // row configuration and vice versa. This is intentionally done to |
240 | | // accurately perform rectangular transforms. When the transform is |
241 | | // rectangular, the number of columns will be the same as the |
242 | | // txfm_size stored in the row cfg struct. It will make no difference |
243 | | // for square transforms. |
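| | // For example, with cfg->tx_size == TX_4X8 the block is 4 wide by 8 high,
| | // so txfm_size_col is 4 (samples per row pass) and txfm_size_row is 8
| | // (samples per column pass).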
244 | 1.60M | const int txfm_size_col = tx_size_wide[cfg->tx_size]; |
245 | 1.60M | const int txfm_size_row = tx_size_high[cfg->tx_size]; |
246 | | // Take the shift from the larger dimension in the rectangular case. |
247 | 1.60M | const int8_t *shift = cfg->shift; |
248 | 1.60M | const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); |
249 | 1.60M | int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; |
250 | 1.60M | int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; |
251 | 1.60M | assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); |
252 | 1.60M | assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); |
253 | 1.60M | av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd); |
254 | | |
255 | 1.60M | const int8_t cos_bit_col = cfg->cos_bit_col; |
256 | 1.60M | const int8_t cos_bit_row = cfg->cos_bit_row; |
257 | 1.60M | const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col); |
258 | 1.60M | const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row); |
259 | | |
260 | | // txfm_buf's length is txfm_size_row * txfm_size_col + 2 * |
261 | | // AOMMAX(txfm_size_row, txfm_size_col) |
262 | | // it is used for intermediate data buffering |
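| | // For example, an 8x16 transform uses 8 * 16 + 2 * 16 = 160 elements:
| | // temp_in[16], temp_out[16], and a 128-element buf for the row-transform
| | // results.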
263 | 1.60M | const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); |
264 | 1.60M | int32_t *temp_in = txfm_buf; |
265 | 1.60M | int32_t *temp_out = temp_in + buf_offset; |
266 | 1.60M | int32_t *buf = temp_out + buf_offset; |
267 | 1.60M | int32_t *buf_ptr = buf; |
268 | 1.60M | int c, r; |
269 | | |
270 | | // Rows |
271 | 21.8M | for (r = 0; r < txfm_size_row; ++r) { |
272 | 20.2M | if (abs(rect_type) == 1) { |
273 | 104M | for (c = 0; c < txfm_size_col; ++c) { |
274 | 97.7M | temp_in[c] = round_shift( |
275 | 97.7M | (int64_t)input[c * txfm_size_row + r] * NewInvSqrt2, NewSqrt2Bits); |
276 | 97.7M | } |
277 | 6.37M | clamp_buf(temp_in, txfm_size_col, bd + 8); |
278 | 6.37M | txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); |
279 | 13.8M | } else { |
280 | 312M | for (c = 0; c < txfm_size_col; ++c) { |
281 | 298M | temp_in[c] = input[c * txfm_size_row + r]; |
282 | 298M | } |
283 | 13.8M | clamp_buf(temp_in, txfm_size_col, bd + 8); |
284 | 13.8M | txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); |
285 | 13.8M | } |
286 | 20.2M | av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); |
287 | 20.2M | buf_ptr += txfm_size_col; |
288 | 20.2M | } |
289 | | |
290 | | // Columns |
291 | 22.6M | for (c = 0; c < txfm_size_col; ++c) { |
292 | 21.0M | if (cfg->lr_flip == 0) { |
293 | 416M | for (r = 0; r < txfm_size_row; ++r) |
294 | 395M | temp_in[r] = buf[r * txfm_size_col + c]; |
295 | 21.0M | } else { |
296 | | // flip left right |
297 | 466k | for (r = 0; r < txfm_size_row; ++r) |
298 | 420k | temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; |
299 | 46.5k | } |
300 | 21.0M | clamp_buf(temp_in, txfm_size_row, AOMMAX(bd + 6, 16)); |
301 | 21.0M | txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); |
302 | 21.0M | av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); |
303 | 21.0M | if (cfg->ud_flip == 0) { |
304 | 416M | for (r = 0; r < txfm_size_row; ++r) { |
305 | 395M | output[r * stride + c] = |
306 | 395M | highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); |
307 | 395M | } |
308 | 20.9M | } else { |
309 | | // flip upside down |
310 | 563k | for (r = 0; r < txfm_size_row; ++r) { |
311 | 499k | output[r * stride + c] = highbd_clip_pixel_add( |
312 | 499k | output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); |
313 | 499k | } |
314 | 64.4k | } |
315 | 21.0M | } |
316 | 1.60M | } |
317 | | |
318 | | static inline void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output, |
319 | | int stride, int32_t *txfm_buf, |
320 | | TX_TYPE tx_type, TX_SIZE tx_size, |
321 | 1.60M | int bd) { |
322 | 1.60M | TXFM_2D_FLIP_CFG cfg; |
323 | 1.60M | av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg); |
324 | | // Forward shift sum uses larger square size, to be consistent with what |
325 | | // av1_gen_inv_stage_range() does for inverse shifts. |
326 | 1.60M | inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, tx_size, bd); |
327 | 1.60M | } |
328 | | |
329 | | void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, |
330 | 98.6k | int stride, TX_TYPE tx_type, int bd) { |
331 | 98.6k | DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); |
332 | 98.6k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd); |
333 | 98.6k | } |
334 | | |
335 | | void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, |
336 | 121k | int stride, TX_TYPE tx_type, int bd) { |
337 | 121k | DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); |
338 | 121k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd); |
339 | 121k | } |
340 | | |
341 | | void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, |
342 | 99.7k | int stride, TX_TYPE tx_type, int bd) { |
343 | 99.7k | DECLARE_ALIGNED(32, int, txfm_buf[8 * 16 + 16 + 16]); |
344 | 99.7k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd); |
345 | 99.7k | } |
346 | | |
347 | | void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, |
348 | 146k | int stride, TX_TYPE tx_type, int bd) { |
349 | 146k | DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]); |
350 | 146k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd); |
351 | 146k | } |
352 | | |
353 | | void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, |
354 | 37.5k | int stride, TX_TYPE tx_type, int bd) { |
355 | 37.5k | DECLARE_ALIGNED(32, int, txfm_buf[16 * 32 + 32 + 32]); |
356 | 37.5k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd); |
357 | 37.5k | } |
358 | | |
359 | | void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, |
360 | 31.1k | int stride, TX_TYPE tx_type, int bd) { |
361 | 31.1k | DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]); |
362 | 31.1k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd); |
363 | 31.1k | } |
364 | | |
365 | | void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, |
366 | 160k | int stride, TX_TYPE tx_type, int bd) { |
367 | 160k | DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 4 + 4]); |
368 | 160k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd); |
369 | 160k | } |
370 | | |
371 | | void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, |
372 | 354k | int stride, TX_TYPE tx_type, int bd) { |
373 | 354k | DECLARE_ALIGNED(32, int, txfm_buf[8 * 8 + 8 + 8]); |
374 | 354k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X8, bd); |
375 | 354k | } |
376 | | |
377 | | void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, |
378 | 158k | int stride, TX_TYPE tx_type, int bd) { |
379 | 158k | DECLARE_ALIGNED(32, int, txfm_buf[16 * 16 + 16 + 16]); |
380 | 158k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd); |
381 | 158k | } |
382 | | |
383 | | void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, |
384 | 76.1k | int stride, TX_TYPE tx_type, int bd) { |
385 | 76.1k | DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); |
386 | 76.1k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd); |
387 | 76.1k | } |
388 | | |
389 | | void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, |
390 | 26.4k | int stride, TX_TYPE tx_type, int bd) { |
391 | | // TODO(urvang): Can the same array be reused, instead of using a new array? |
392 | | // Remap 32x32 input into a modified 64x64 by: |
393 | | // - Copying over these values in top-left 32x32 locations. |
394 | | // - Setting the rest of the locations to 0. |
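| | // Only the top-left 32x32 coefficients are coded for 64-point transforms,
| | // so the remaining locations of mod_input are simply zero-filled.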
395 | 26.4k | int32_t mod_input[64 * 64]; |
396 | 874k | for (int col = 0; col < 32; ++col) { |
397 | 847k | memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input)); |
398 | 847k | memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input)); |
399 | 847k | } |
400 | 26.4k | memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input)); |
401 | 26.4k | DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]); |
402 | 26.4k | inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64, |
403 | 26.4k | bd); |
404 | 26.4k | } |
405 | | |
406 | | void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, |
407 | 3.47k | int stride, TX_TYPE tx_type, int bd) { |
408 | | // Remap 32x32 input into a modified 64x32 by: |
409 | | // - Copying over these values in top-left 32x32 locations. |
410 | | // - Setting the rest of the locations to 0. |
411 | 3.47k | int32_t mod_input[32 * 64]; |
412 | 3.47k | memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input)); |
413 | 3.47k | memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input)); |
414 | 3.47k | DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); |
415 | 3.47k | inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32, |
416 | 3.47k | bd); |
417 | 3.47k | } |
418 | | |
419 | | void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, |
420 | 8.28k | int stride, TX_TYPE tx_type, int bd) { |
421 | | // Remap 32x32 input into a modified 32x64 input by: |
422 | | // - Copying over these values in top-left 32x32 locations. |
423 | | // - Setting the rest of the locations to 0. |
424 | 8.28k | int32_t mod_input[64 * 32]; |
425 | 273k | for (int col = 0; col < 32; ++col) { |
426 | 265k | memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input)); |
427 | 265k | memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input)); |
428 | 265k | } |
429 | 8.28k | DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); |
430 | 8.28k | inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64, |
431 | 8.28k | bd); |
432 | 8.28k | } |
433 | | |
434 | | void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, |
435 | 7.46k | int stride, TX_TYPE tx_type, int bd) { |
436 | | // Remap 16x32 input into a modified 16x64 input by: |
437 | | // - Copying over these values in top-left 16x32 locations. |
438 | | // - Setting the rest of the locations to 0. |
439 | 7.46k | int32_t mod_input[64 * 16]; |
440 | 126k | for (int col = 0; col < 16; ++col) { |
441 | 119k | memcpy(mod_input + col * 64, input + col * 32, 32 * sizeof(*mod_input)); |
442 | 119k | memset(mod_input + col * 64 + 32, 0, 32 * sizeof(*mod_input)); |
443 | 119k | } |
444 | 7.46k | DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); |
445 | 7.46k | inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_16X64, |
446 | 7.46k | bd); |
447 | 7.46k | } |
448 | | |
449 | | void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, |
450 | 5.94k | int stride, TX_TYPE tx_type, int bd) { |
451 | | // Remap 32x16 input into a modified 64x16 by: |
452 | | // - Copying over these values in top-left 32x16 locations. |
453 | | // - Setting the rest of the locations to 0. |
454 | 5.94k | int32_t mod_input[16 * 64]; |
455 | 5.94k | memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input)); |
456 | 5.94k | memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input)); |
457 | 5.94k | DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); |
458 | 5.94k | inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16, |
459 | 5.94k | bd); |
460 | 5.94k | } |
461 | | |
462 | | void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, |
463 | 72.7k | int stride, TX_TYPE tx_type, int bd) { |
464 | 72.7k | DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); |
465 | 72.7k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X16, bd); |
466 | 72.7k | } |
467 | | |
468 | | void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, |
469 | 115k | int stride, TX_TYPE tx_type, int bd) { |
470 | 115k | DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); |
471 | 115k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd); |
472 | 115k | } |
473 | | |
474 | | void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, |
475 | 36.9k | int stride, TX_TYPE tx_type, int bd) { |
476 | 36.9k | DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); |
477 | 36.9k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X32, bd); |
478 | 36.9k | } |
479 | | |
480 | | void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, |
481 | 43.7k | int stride, TX_TYPE tx_type, int bd) { |
482 | 43.7k | DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); |
483 | 43.7k | inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd); |
484 | 43.7k | } |