/src/libhevc/encoder/ihevce_had_satd.c
Line | Count | Source (jump to first uncovered line) |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2018 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | |
21 | | /** |
22 | | ****************************************************************************** |
23 | | * @file ihevce_had_satd.c |
24 | | * |
25 | | * @brief |
26 | | * This file contains functions of Hadamard SAD and SATD |
27 | | * |
28 | | * @author |
29 | | * Ittiam |
30 | | * |
31 | | * List of Functions |
32 | | * <TODO: TO BE ADDED> |
33 | | * |
34 | | ****************************************************************************** |
35 | | */ |
36 | | |
37 | | /*****************************************************************************/ |
38 | | /* File Includes */ |
39 | | /*****************************************************************************/ |
40 | | /* System include files */ |
41 | | #include <stdio.h> |
42 | | #include <string.h> |
43 | | #include <stdlib.h> |
44 | | #include <assert.h> |
45 | | #include <stdarg.h> |
46 | | #include <math.h> |
47 | | |
48 | | /* User include files */ |
49 | | #include "ihevc_typedefs.h" |
50 | | #include "itt_video_api.h" |
51 | | #include "ihevce_api.h" |
52 | | |
53 | | #include "rc_cntrl_param.h" |
54 | | #include "rc_frame_info_collector.h" |
55 | | #include "rc_look_ahead_params.h" |
56 | | |
57 | | #include "ihevc_defs.h" |
58 | | #include "ihevc_structs.h" |
59 | | #include "ihevc_platform_macros.h" |
60 | | #include "ihevc_deblk.h" |
61 | | #include "ihevc_itrans_recon.h" |
62 | | #include "ihevc_chroma_itrans_recon.h" |
63 | | #include "ihevc_chroma_intra_pred.h" |
64 | | #include "ihevc_intra_pred.h" |
65 | | #include "ihevc_inter_pred.h" |
66 | | #include "ihevc_mem_fns.h" |
67 | | #include "ihevc_padding.h" |
68 | | #include "ihevc_weighted_pred.h" |
69 | | #include "ihevc_sao.h" |
70 | | #include "ihevc_resi_trans.h" |
71 | | #include "ihevc_quant_iquant_ssd.h" |
72 | | #include "ihevc_cabac_tables.h" |
73 | | |
74 | | #include "ihevce_defs.h" |
75 | | #include "ihevce_lap_enc_structs.h" |
76 | | #include "ihevce_multi_thrd_structs.h" |
77 | | #include "ihevce_multi_thrd_funcs.h" |
78 | | #include "ihevce_me_common_defs.h" |
79 | | #include "ihevce_had_satd.h" |
80 | | #include "ihevce_error_codes.h" |
81 | | #include "ihevce_bitstream.h" |
82 | | #include "ihevce_cabac.h" |
83 | | #include "ihevce_rdoq_macros.h" |
84 | | #include "ihevce_function_selector.h" |
85 | | #include "ihevce_enc_structs.h" |
86 | | #include "ihevce_cmn_utils_instr_set_router.h" |
87 | | #include "hme_datatype.h" |
88 | | #include "hme_interface.h" |
89 | | #include "hme_common_defs.h" |
90 | | #include "hme_defs.h" |
91 | | |
92 | | /*****************************************************************************/ |
93 | | /* Function Definitions */ |
94 | | /*****************************************************************************/ |
95 | | |
96 | | static void ihevce_hadamard_4x4_8bit( |
97 | | UWORD8 *pu1_src, |
98 | | WORD32 src_strd, |
99 | | UWORD8 *pu1_pred, |
100 | | WORD32 pred_strd, |
101 | | WORD16 *pi2_dst, |
102 | | WORD32 dst_strd) |
103 | 1.88G | { |
104 | 1.88G | WORD32 k; |
105 | 1.88G | WORD16 m[16]; |
106 | | |
107 | | /*===== hadamard horz transform =====*/ |
108 | 9.43G | for(k = 0; k < 4; k++) |
109 | 7.55G | { |
110 | 7.55G | WORD32 r0, r1, r2, r3; |
111 | 7.55G | WORD32 h0, h1, h2, h3; |
112 | | |
113 | | /* Compute the residue block */ |
114 | 7.55G | r0 = pu1_src[0] - pu1_pred[0]; |
115 | 7.55G | r1 = pu1_src[1] - pu1_pred[1]; |
116 | 7.55G | r2 = pu1_src[2] - pu1_pred[2]; |
117 | 7.55G | r3 = pu1_src[3] - pu1_pred[3]; |
118 | | |
119 | 7.55G | h0 = r0 + r1; |
120 | 7.55G | h1 = r0 - r1; |
121 | 7.55G | h2 = r2 + r3; |
122 | 7.55G | h3 = r2 - r3; |
123 | | |
124 | 7.55G | m[k * 4 + 0] = h0 + h2; |
125 | 7.55G | m[k * 4 + 1] = h1 + h3; |
126 | 7.55G | m[k * 4 + 2] = h0 - h2; |
127 | 7.55G | m[k * 4 + 3] = h1 - h3; |
128 | | |
129 | 7.55G | pu1_pred += pred_strd; |
130 | 7.55G | pu1_src += src_strd; |
131 | 7.55G | } |
132 | | |
133 | | /*===== hadamard vert transform =====*/ |
134 | 9.43G | for(k = 0; k < 4; k++) |
135 | 7.55G | { |
136 | 7.55G | WORD32 v0, v1, v2, v3; |
137 | | |
138 | 7.55G | v0 = m[0 + k] + m[4 + k]; |
139 | 7.55G | v1 = m[0 + k] - m[4 + k]; |
140 | 7.55G | v2 = m[8 + k] + m[12 + k]; |
141 | 7.55G | v3 = m[8 + k] - m[12 + k]; |
142 | | |
143 | 7.55G | pi2_dst[0 * dst_strd + k] = v0 + v2; |
144 | 7.55G | pi2_dst[1 * dst_strd + k] = v1 + v3; |
145 | 7.55G | pi2_dst[2 * dst_strd + k] = v0 - v2; |
146 | 7.55G | pi2_dst[3 * dst_strd + k] = v1 - v3; |
147 | 7.55G | } |
148 | 1.88G | } |
149 | | |
150 | | static void ihevce_hadamard_8x8_8bit( |
151 | | UWORD8 *pu1_src, |
152 | | WORD32 src_strd, |
153 | | UWORD8 *pu1_pred, |
154 | | WORD32 pred_strd, |
155 | | WORD16 *pi2_dst, |
156 | | WORD32 dst_strd) |
157 | 289M | { |
158 | 289M | WORD32 i; |
159 | | |
160 | | // y0 |
161 | 289M | ihevce_hadamard_4x4_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd); |
162 | | // y1 |
163 | 289M | ihevce_hadamard_4x4_8bit(pu1_src + 4, src_strd, pu1_pred + 4, pred_strd, pi2_dst + 4, dst_strd); |
164 | | // y2 |
165 | 289M | ihevce_hadamard_4x4_8bit( |
166 | 289M | pu1_src + 4 * src_strd, |
167 | 289M | src_strd, |
168 | 289M | pu1_pred + 4 * pred_strd, |
169 | 289M | pred_strd, |
170 | 289M | pi2_dst + (4 * dst_strd), |
171 | 289M | dst_strd); |
172 | | // y3 |
173 | 289M | ihevce_hadamard_4x4_8bit( |
174 | 289M | pu1_src + 4 + 4 * src_strd, |
175 | 289M | src_strd, |
176 | 289M | pu1_pred + 4 + 4 * pred_strd, |
177 | 289M | pred_strd, |
178 | 289M | pi2_dst + (4 * dst_strd) + 4, |
179 | 289M | dst_strd); |
180 | | |
181 | | /* Child HAD results combined as follows to get Parent result */ |
182 | | /* _ _ */ |
183 | | /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */ |
184 | | /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */ |
185 | | /* \- -/ */ |
186 | 4.92G | for(i = 0; i < 16; i++) |
187 | 4.63G | { |
188 | 4.63G | WORD32 idx = (i >> 2) * dst_strd + (i % 4); |
189 | 4.63G | WORD16 a0 = pi2_dst[idx]; |
190 | 4.63G | WORD16 a1 = pi2_dst[4 + idx]; |
191 | 4.63G | WORD16 a2 = pi2_dst[(4 * dst_strd) + idx]; |
192 | 4.63G | WORD16 a3 = pi2_dst[(4 * dst_strd) + 4 + idx]; |
193 | | |
194 | 4.63G | WORD16 b0 = (a0 + a1); |
195 | 4.63G | WORD16 b1 = (a0 - a1); |
196 | 4.63G | WORD16 b2 = (a2 + a3); |
197 | 4.63G | WORD16 b3 = (a2 - a3); |
198 | | |
199 | 4.63G | pi2_dst[idx] = b0 + b2; |
200 | 4.63G | pi2_dst[4 + idx] = b1 + b3; |
201 | 4.63G | pi2_dst[(4 * dst_strd) + idx] = b0 - b2; |
202 | 4.63G | pi2_dst[(4 * dst_strd) + 4 + idx] = b1 - b3; |
203 | 4.63G | } |
204 | 289M | } |
205 | | |
206 | | static void ihevce_hadamard_16x16_8bit( |
207 | | UWORD8 *pu1_src, |
208 | | WORD32 src_strd, |
209 | | UWORD8 *pu1_pred, |
210 | | WORD32 pred_strd, |
211 | | WORD16 *pi2_dst, |
212 | | WORD32 dst_strd) |
213 | 46.9M | { |
214 | 46.9M | WORD32 i; |
215 | | |
216 | | // y0 |
217 | 46.9M | ihevce_hadamard_8x8_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd); |
218 | | // y1 |
219 | 46.9M | ihevce_hadamard_8x8_8bit(pu1_src + 8, src_strd, pu1_pred + 8, pred_strd, pi2_dst + 8, dst_strd); |
220 | | // y2 |
221 | 46.9M | ihevce_hadamard_8x8_8bit( |
222 | 46.9M | pu1_src + 8 * src_strd, |
223 | 46.9M | src_strd, |
224 | 46.9M | pu1_pred + 8 * pred_strd, |
225 | 46.9M | pred_strd, |
226 | 46.9M | pi2_dst + (8 * dst_strd), |
227 | 46.9M | dst_strd); |
228 | | // y3 |
229 | 46.9M | ihevce_hadamard_8x8_8bit( |
230 | 46.9M | pu1_src + 8 + 8 * src_strd, |
231 | 46.9M | src_strd, |
232 | 46.9M | pu1_pred + 8 + 8 * pred_strd, |
233 | 46.9M | pred_strd, |
234 | 46.9M | pi2_dst + (8 * dst_strd) + 8, |
235 | 46.9M | dst_strd); |
236 | | |
237 | | /* Child HAD results combined as follows to get Parent result */ |
238 | | /* _ _ */ |
239 | | /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */ |
240 | | /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */ |
241 | | /* \- -/ */ |
242 | 3.04G | for(i = 0; i < 64; i++) |
243 | 3.00G | { |
244 | 3.00G | WORD32 idx = (i >> 3) * dst_strd + (i % 8); |
245 | 3.00G | WORD16 a0 = pi2_dst[idx]; |
246 | 3.00G | WORD16 a1 = pi2_dst[8 + idx]; |
247 | 3.00G | WORD16 a2 = pi2_dst[(8 * dst_strd) + idx]; |
248 | 3.00G | WORD16 a3 = pi2_dst[(8 * dst_strd) + 8 + idx]; |
249 | | |
250 | 3.00G | WORD16 b0 = (a0 + a1) >> 1; |
251 | 3.00G | WORD16 b1 = (a0 - a1) >> 1; |
252 | 3.00G | WORD16 b2 = (a2 + a3) >> 1; |
253 | 3.00G | WORD16 b3 = (a2 - a3) >> 1; |
254 | | |
255 | 3.00G | pi2_dst[idx] = b0 + b2; |
256 | 3.00G | pi2_dst[8 + idx] = b1 + b3; |
257 | 3.00G | pi2_dst[(8 * dst_strd) + idx] = b0 - b2; |
258 | 3.00G | pi2_dst[(8 * dst_strd) + 8 + idx] = b1 - b3; |
259 | 3.00G | } |
260 | 46.9M | } |
261 | | |
262 | | static void ihevce_hadamard_32x32_8bit( |
263 | | UWORD8 *pu1_src, |
264 | | WORD32 src_strd, |
265 | | UWORD8 *pu1_pred, |
266 | | WORD32 pred_strd, |
267 | | WORD16 *pi2_dst, |
268 | | WORD32 dst_strd) |
269 | 4.42M | { |
270 | 4.42M | WORD32 i; |
271 | | |
272 | | // y0 |
273 | 4.42M | ihevce_hadamard_16x16_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd); |
274 | | // y1 |
275 | 4.42M | ihevce_hadamard_16x16_8bit( |
276 | 4.42M | pu1_src + 16, src_strd, pu1_pred + 16, pred_strd, pi2_dst + 16, dst_strd); |
277 | | // y2 |
278 | 4.42M | ihevce_hadamard_16x16_8bit( |
279 | 4.42M | pu1_src + 16 * src_strd, |
280 | 4.42M | src_strd, |
281 | 4.42M | pu1_pred + 16 * pred_strd, |
282 | 4.42M | pred_strd, |
283 | 4.42M | pi2_dst + (16 * dst_strd), |
284 | 4.42M | dst_strd); |
285 | | // y3 |
286 | 4.42M | ihevce_hadamard_16x16_8bit( |
287 | 4.42M | pu1_src + 16 + 16 * src_strd, |
288 | 4.42M | src_strd, |
289 | 4.42M | pu1_pred + 16 + 16 * pred_strd, |
290 | 4.42M | pred_strd, |
291 | 4.42M | pi2_dst + (16 * dst_strd) + 16, |
292 | 4.42M | dst_strd); |
293 | | |
294 | | /* Child HAD results combined as follows to get Parent result */ |
295 | | /* _ _ */ |
296 | | /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */ |
297 | | /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */ |
298 | | /* \- -/ */ |
299 | 1.13G | for(i = 0; i < 256; i++) |
300 | 1.13G | { |
301 | 1.13G | WORD32 idx = (i >> 4) * dst_strd + (i % 16); |
302 | 1.13G | WORD16 a0 = pi2_dst[idx] >> 2; |
303 | 1.13G | WORD16 a1 = pi2_dst[16 + idx] >> 2; |
304 | 1.13G | WORD16 a2 = pi2_dst[(16 * dst_strd) + idx] >> 2; |
305 | 1.13G | WORD16 a3 = pi2_dst[(16 * dst_strd) + 16 + idx] >> 2; |
306 | | |
307 | 1.13G | WORD16 b0 = (a0 + a1); |
308 | 1.13G | WORD16 b1 = (a0 - a1); |
309 | 1.13G | WORD16 b2 = (a2 + a3); |
310 | 1.13G | WORD16 b3 = (a2 - a3); |
311 | | |
312 | 1.13G | pi2_dst[idx] = b0 + b2; |
313 | 1.13G | pi2_dst[16 + idx] = b1 + b3; |
314 | 1.13G | pi2_dst[(16 * dst_strd) + idx] = b0 - b2; |
315 | 1.13G | pi2_dst[(16 * dst_strd) + 16 + idx] = b1 - b3; |
316 | 1.13G | } |
317 | 4.42M | } |
318 | | |
319 | | /** |
320 | | ******************************************************************************* |
321 | | * |
322 | | * @brief |
323 | | * Compute Hadamard sad for 4x4 block with 8-bit input |
324 | | * |
325 | | * @par Description: |
326 | | * |
327 | | * @param[in] pu1_origin |
328 | | * UWORD8 pointer to the current block |
329 | | * |
330 | | * @param[in] src_strd |
331 | | * WORD32 Source stride |
332 | | * |
333 | | * @param[in] pu1_pred_buf |
334 | | * UWORD8 pointer to the prediction block |
335 | | * |
336 | | * @param[in] pred_strd |
337 | | * WORD32 Pred stride |
338 | | * |
339 | | * @param[in] pi2_dst |
340 | | * WORD16 pointer to the transform block |
341 | | * |
342 | | * @param[in] dst_strd |
343 | | * WORD32 Destination stride |
344 | | * |
345 | | * @param[in] size |
346 | | * WORD32 transform Block size |
347 | | * |
348 | | * @returns hadamard SAD |
349 | | * |
350 | | * @remarks |
351 | | * Not updating the transform destination now. Only returning the SATD |
352 | | * |
353 | | ******************************************************************************* |
354 | | */ |
355 | | UWORD32 ihevce_HAD_4x4_8bit( |
356 | | UWORD8 *pu1_origin, |
357 | | WORD32 src_strd, |
358 | | UWORD8 *pu1_pred_buf, |
359 | | WORD32 pred_strd, |
360 | | WORD16 *pi2_dst, |
361 | | WORD32 dst_strd) |
362 | 293M | { |
363 | 293M | WORD32 k; |
364 | 293M | WORD16 v[16]; |
365 | 293M | UWORD32 u4_sad = 0; |
366 | | |
367 | 293M | (void)pi2_dst; |
368 | 293M | (void)dst_strd; |
369 | 293M | ihevce_hadamard_4x4_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 4); |
370 | | |
371 | 4.98G | for(k = 0; k < 16; ++k) |
372 | 4.69G | u4_sad += abs(v[k]); |
373 | 293M | u4_sad = ((u4_sad + 2) >> 2); |
374 | | |
375 | 293M | return u4_sad; |
376 | 293M | } |
377 | | |
378 | | /** |
379 | | ******************************************************************************* |
380 | | * |
381 | | * @brief |
382 | | * Computes Hadamard Sad for 8x8 block with 8-bit input |
383 | | * |
384 | | * @par Description: |
385 | | * |
386 | | * @param[in] pu1_origin |
387 | | * UWORD8 pointer to the current block |
388 | | * |
389 | | * @param[in] src_strd |
390 | | * WORD32 Source stride |
391 | | * |
392 | | * @param[in] pu1_pred_buf |
393 | | * UWORD8 pointer to the prediction block |
394 | | * |
395 | | * @param[in] pred_strd |
396 | | * WORD32 Pred stride |
397 | | * |
398 | | * @param[in] pi2_dst |
399 | | * WORD16 pointer to the transform block |
400 | | * |
401 | | * @param[in] dst_strd |
402 | | * WORD32 Destination stride |
403 | | * |
404 | | * @param[in] size |
405 | | * WORD32 transform Block size |
406 | | * |
407 | | * @returns Hadamard SAD |
408 | | * |
409 | | * @remarks |
410 | | * Not updating the transform destination now. Only returning the SATD |
411 | | * |
412 | | ******************************************************************************* |
413 | | */ |
414 | | UWORD32 ihevce_HAD_8x8_8bit( |
415 | | UWORD8 *pu1_origin, |
416 | | WORD32 src_strd, |
417 | | UWORD8 *pu1_pred_buf, |
418 | | WORD32 pred_strd, |
419 | | WORD16 *pi2_dst, |
420 | | WORD32 dst_strd) |
421 | 102M | { |
422 | 102M | WORD32 k; |
423 | 102M | UWORD32 u4_sad = 0; |
424 | 102M | WORD16 v[64]; |
425 | | |
426 | 102M | (void)pi2_dst; |
427 | 102M | (void)dst_strd; |
428 | 102M | ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8); |
429 | | |
430 | 6.64G | for(k = 0; k < 64; ++k) |
431 | 6.53G | u4_sad += abs(v[k]); |
432 | 102M | u4_sad = ((u4_sad + 4) >> 3); |
433 | | |
434 | 102M | return u4_sad; |
435 | 102M | } |
436 | | |
437 | | /** |
438 | | ******************************************************************************* |
439 | | * |
440 | | * @brief |
441 | | * Compute dc suppressed hadamard sad for 8x8 block with 8-bit input |
442 | | * |
443 | | * @par Description: |
444 | | * |
445 | | * @param[in] pu1_origin |
446 | | * UWORD8 pointer to the current block |
447 | | * |
448 | | * @param[in] src_strd |
449 | | * WORD32 Source stride |
450 | | * |
451 | | * @param[in] pu1_pred_buf |
452 | | * UWORD8 pointer to the prediction block |
453 | | * |
454 | | * @param[in] pred_strd |
455 | | * WORD32 Pred stride |
456 | | * |
457 | | * @param[in] pi2_dst |
458 | | * WORD16 pointer to the transform block |
459 | | * |
460 | | * @param[in] dst_strd |
461 | | * WORD32 Destination stride |
462 | | * |
463 | | * @param[in] size |
464 | | * WORD32 transform Block size |
465 | | * |
466 | | * @returns Hadamard SAD with DC Suppressed |
467 | | * |
468 | | * @remarks |
469 | | * Not updating the transform destination now. Only returning the SATD |
470 | | * |
471 | | ******************************************************************************* |
472 | | */ |
473 | | UWORD32 ihevce_compute_ac_had_8x8_8bit( |
474 | | UWORD8 *pu1_origin, |
475 | | WORD32 src_strd, |
476 | | UWORD8 *pu1_pred_buf, |
477 | | WORD32 pred_strd, |
478 | | WORD16 *pi2_dst, |
479 | | WORD32 dst_strd) |
480 | 0 | { |
481 | 0 | WORD32 k; |
482 | 0 | UWORD32 u4_sad = 0; |
483 | 0 | WORD16 v[64]; |
484 | |
|
485 | 0 | (void)pi2_dst; |
486 | 0 | (void)dst_strd; |
487 | 0 | ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8); |
488 | |
|
489 | 0 | v[0] = 0; |
490 | 0 | for(k = 0; k < 64; ++k) |
491 | 0 | u4_sad += abs(v[k]); |
492 | 0 | u4_sad = ((u4_sad + 4) >> 3); |
493 | |
|
494 | 0 | return u4_sad; |
495 | 0 | } |
496 | | |
497 | | /** |
498 | | ******************************************************************************* |
499 | | * |
500 | | * @brief |
501 | | * Computes Hadamard Sad for 16x16 block with 8-bit input |
502 | | * |
503 | | * @par Description: |
504 | | * |
505 | | * @param[in] pu1_origin |
506 | | * UWORD8 pointer to the current block |
507 | | * |
508 | | * @param[in] src_strd |
509 | | * WORD32 Source stride |
510 | | * |
511 | | * @param[in] pu1_pred_buf |
512 | | * UWORD8 pointer to the prediction block |
513 | | * |
514 | | * @param[in] pred_strd |
515 | | * WORD32 Pred stride |
516 | | * |
517 | | * @param[in] pi2_dst |
518 | | * WORD16 pointer to the transform block |
519 | | * |
520 | | * @param[in] dst_strd |
521 | | * WORD32 Destination stride |
522 | | * |
523 | | * @param[in] size |
524 | | * WORD32 transform Block size |
525 | | * |
526 | | * @returns Hadamard SAD |
527 | | * |
528 | | * @remarks |
529 | | * Not updating the transform destination now. Only returning the SATD |
530 | | * |
531 | | ******************************************************************************* |
532 | | */ |
533 | | UWORD32 ihevce_HAD_16x16_8bit( |
534 | | UWORD8 *pu1_origin, |
535 | | WORD32 src_strd, |
536 | | UWORD8 *pu1_pred_buf, |
537 | | WORD32 pred_strd, |
538 | | WORD16 *pi2_dst, |
539 | | WORD32 dst_strd) |
540 | 29.2M | { |
541 | 29.2M | WORD32 k; |
542 | 29.2M | UWORD32 u4_sad = 0; |
543 | 29.2M | WORD16 v[256]; |
544 | | |
545 | 29.2M | (void)pi2_dst; |
546 | 29.2M | (void)dst_strd; |
547 | 29.2M | ihevce_hadamard_16x16_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 16); |
548 | | |
549 | 7.50G | for(k = 0; k < 256; ++k) |
550 | 7.47G | u4_sad += abs(v[k]); |
551 | 29.2M | u4_sad = ((u4_sad + 4) >> 3); |
552 | | |
553 | 29.2M | return u4_sad; |
554 | 29.2M | } |
555 | | |
556 | | /** |
557 | | ******************************************************************************* |
558 | | * |
559 | | * @brief |
560 | | * Computes Hadamard Sad for 32x32 block with 8-bit input |
561 | | * |
562 | | * @par Description: |
563 | | * |
564 | | * @param[in] pu1_origin |
565 | | * UWORD8 pointer to the current block |
566 | | * |
567 | | * @param[in] src_strd |
568 | | * WORD32 Source stride |
569 | | * |
570 | | * @param[in] pu1_pred_buf |
571 | | * UWORD8 pointer to the prediction block |
572 | | * |
573 | | * @param[in] pred_strd |
574 | | * WORD32 Pred stride |
575 | | * |
576 | | * @param[in] pi2_dst |
577 | | * WORD16 pointer to the transform block |
578 | | * |
579 | | * @param[in] dst_strd |
580 | | * WORD32 Destination stride |
581 | | * |
582 | | * @param[in] size |
583 | | * WORD32 transform Block size |
584 | | * |
585 | | * @returns Hadamard SAD |
586 | | * |
587 | | * @remarks |
588 | | * Not updating the transform destination now. Only returning the SATD |
589 | | * |
590 | | ******************************************************************************* |
591 | | */ |
592 | | UWORD32 ihevce_HAD_32x32_8bit( |
593 | | UWORD8 *pu1_origin, |
594 | | WORD32 src_strd, |
595 | | UWORD8 *pu1_pred_buf, |
596 | | WORD32 pred_strd, |
597 | | WORD16 *pi2_dst, |
598 | | WORD32 dst_strd) |
599 | 4.42M | { |
600 | 4.42M | WORD32 k; |
601 | 4.42M | UWORD32 u4_sad = 0; |
602 | 4.42M | WORD16 v[32 * 32]; |
603 | | |
604 | 4.42M | (void)pi2_dst; |
605 | 4.42M | (void)dst_strd; |
606 | 4.42M | ihevce_hadamard_32x32_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 32); |
607 | | |
608 | 4.53G | for(k = 0; k < 32 * 32; ++k) |
609 | 4.53G | u4_sad += abs(v[k]); |
610 | 4.42M | u4_sad = ((u4_sad + 2) >> 2); |
611 | | |
612 | 4.42M | return u4_sad; |
613 | 4.42M | } |
614 | | |
615 | | //#if COMPUTE_16x16_R == C |
616 | | /** |
617 | | ******************************************************************************* |
618 | | * |
619 | | * @brief |
620 | | * Computes 8x8 transform using children 4x4 hadamard results |
621 | | * |
622 | | * @par Description: |
623 | | * |
624 | | * @param[in] pi2_4x4_had |
625 | | * WORD16 pointer to 4x4 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order) |
626 | | * |
627 | | * @param[in] had4_strd |
628 | | * stride of 4x4 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3 |
629 | | * |
630 | | * @param[out] pi2_dst |
631 | | * destination buffer where 8x8 hadamard result is stored |
632 | | * |
633 | | * @param[in] dst_stride |
634 | | * stride of destination block |
635 | | * |
636 | | * @param[in] i4_frm_qstep |
637 | | * frm_qstep value based on the which the threshold value is calculated |
638 | | * |
639 | | * @returns |
640 | | * 8x8 Hadamard SATD |
641 | | * @remarks |
642 | | * |
643 | | ******************************************************************************* |
644 | | */ |
645 | | static UWORD32 ihevce_compute_8x8HAD_using_4x4( |
646 | | WORD16 *pi2_4x4_had, |
647 | | WORD32 had4_strd, |
648 | | WORD16 *pi2_dst, |
649 | | WORD32 dst_strd, |
650 | | WORD32 i4_frm_qstep, |
651 | | WORD32 *pi4_cbf) |
652 | 31.7M | { |
653 | | /* Qstep value is right shifted by 8 */ |
654 | 31.7M | WORD32 threshold = (i4_frm_qstep >> 8); |
655 | | |
656 | | /* Initialize pointers to 4 subblocks of 4x4 HAD buffer */ |
657 | 31.7M | WORD16 *pi2_y0 = pi2_4x4_had; |
658 | 31.7M | WORD16 *pi2_y1 = pi2_4x4_had + 4; |
659 | 31.7M | WORD16 *pi2_y2 = pi2_4x4_had + had4_strd * 4; |
660 | 31.7M | WORD16 *pi2_y3 = pi2_4x4_had + had4_strd * 4 + 4; |
661 | | |
662 | | /* Initialize pointers to store 8x8 HAD output */ |
663 | 31.7M | WORD16 *pi2_dst0 = pi2_dst; |
664 | 31.7M | WORD16 *pi2_dst1 = pi2_dst + 4; |
665 | 31.7M | WORD16 *pi2_dst2 = pi2_dst + dst_strd * 4; |
666 | 31.7M | WORD16 *pi2_dst3 = pi2_dst + dst_strd * 4 + 4; |
667 | | |
668 | 31.7M | UWORD32 u4_satd = 0; |
669 | 31.7M | WORD32 i; |
670 | | |
671 | | /* Child HAD results combined as follows to get Parent result */ |
672 | | /* _ _ */ |
673 | | /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */ |
674 | | /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */ |
675 | | /* \- -/ */ |
676 | 539M | for(i = 0; i < 16; i++) |
677 | 507M | { |
678 | 507M | WORD32 src_idx = (i >> 2) * had4_strd + (i % 4); |
679 | 507M | WORD32 dst_idx = (i >> 2) * dst_strd + (i % 4); |
680 | | |
681 | 507M | WORD16 a0 = pi2_y0[src_idx]; |
682 | 507M | WORD16 a1 = pi2_y1[src_idx]; |
683 | 507M | WORD16 a2 = pi2_y2[src_idx]; |
684 | 507M | WORD16 a3 = pi2_y3[src_idx]; |
685 | | |
686 | 507M | WORD16 b0 = (a0 + a1); |
687 | 507M | WORD16 b1 = (a0 - a1); |
688 | 507M | WORD16 b2 = (a2 + a3); |
689 | 507M | WORD16 b3 = (a2 - a3); |
690 | | |
691 | 507M | pi2_dst0[dst_idx] = b0 + b2; |
692 | 507M | pi2_dst1[dst_idx] = b1 + b3; |
693 | 507M | pi2_dst2[dst_idx] = b0 - b2; |
694 | 507M | pi2_dst3[dst_idx] = b1 - b3; |
695 | | |
696 | 507M | if(ABS(pi2_dst0[dst_idx]) > threshold) |
697 | 20.6M | *pi4_cbf = 1; |
698 | 507M | if(ABS(pi2_dst1[dst_idx]) > threshold) |
699 | 20.0M | *pi4_cbf = 1; |
700 | 507M | if(ABS(pi2_dst2[dst_idx]) > threshold) |
701 | 20.2M | *pi4_cbf = 1; |
702 | 507M | if(ABS(pi2_dst3[dst_idx]) > threshold) |
703 | 19.8M | *pi4_cbf = 1; |
704 | | |
705 | 507M | u4_satd += ABS(pi2_dst0[dst_idx]); |
706 | 507M | u4_satd += ABS(pi2_dst1[dst_idx]); |
707 | 507M | u4_satd += ABS(pi2_dst2[dst_idx]); |
708 | 507M | u4_satd += ABS(pi2_dst3[dst_idx]); |
709 | 507M | } |
710 | | |
711 | | /* return the 8x8 satd */ |
712 | 31.7M | return (u4_satd); |
713 | 31.7M | } |
714 | | |
715 | | /** |
716 | | ******************************************************************************* |
717 | | * |
718 | | * @brief |
719 | | * Computes Residue and Hadamard Transform for four 4x4 blocks (Z scan) of |
720 | | * a 8x8 block (Residue is computed for 8-bit src and prediction buffers) |
721 | | * Modified to incorporate the dead-zone implementation - Lokesh |
722 | | * |
723 | | * @par Description: |
724 | | * |
725 | | * @param[in] pu1_origin |
726 | | * UWORD8 pointer to the current block |
727 | | * |
728 | | * @param[in] src_strd |
729 | | * WORD32 Source stride |
730 | | * |
731 | | * @param[in] pu1_pred |
732 | | * UWORD8 pointer to the prediction block |
733 | | * |
734 | | * @param[in] pred_strd |
735 | | * WORD32 Pred stride |
736 | | * |
737 | | * @param[out] pi2_dst |
738 | | * WORD16 pointer to the transform block |
739 | | * |
740 | | * @param[in] dst_strd |
741 | | * WORD32 Destination stride |
742 | | * |
743 | | * @param[out] pi4_hsad |
744 | | * array for storing hadmard sad of each 4x4 block |
745 | | * |
746 | | * @param[in] hsad_stride |
747 | | * stride of hadmard sad destination buffer (for Zscan order of storing sads) |
748 | | * |
749 | | * @param[in] i4_frm_qstep |
750 | | * frm_qstep value based on the which the threshold value is calculated |
751 | | * |
752 | | * @returns |
753 | | * |
754 | | * @remarks |
755 | | * |
756 | | ******************************************************************************* |
757 | | */ |
758 | | static WORD32 ihevce_had4_4x4( |
759 | | UWORD8 *pu1_src, |
760 | | WORD32 src_strd, |
761 | | UWORD8 *pu1_pred, |
762 | | WORD32 pred_strd, |
763 | | WORD16 *pi2_dst4x4, |
764 | | WORD32 dst_strd, |
765 | | WORD32 *pi4_hsad, |
766 | | WORD32 hsad_stride, |
767 | | WORD32 i4_frm_qstep) |
768 | 108M | { |
769 | 108M | WORD32 i, k; |
770 | 108M | WORD32 i4_child_total_sad = 0; |
771 | | |
772 | 108M | (void)i4_frm_qstep; |
773 | | /* -------- Compute four 4x4 HAD Transforms ---------*/ |
774 | 543M | for(i = 0; i < 4; i++) |
775 | 434M | { |
776 | 434M | UWORD8 *pu1_pi0, *pu1_pi1; |
777 | 434M | WORD16 *pi2_dst; |
778 | 434M | WORD32 blkx, blky; |
779 | 434M | UWORD32 u4_hsad = 0; |
780 | | // TODO: choose deadzone as f(qstep) |
781 | 434M | WORD32 threshold = 0; |
782 | | |
783 | | /*****************************************************/ |
784 | | /* Assuming the looping structure of the four */ |
785 | | /* blocks is in Z scan order of 4x4s in a 8x8 */ |
786 | | /* block instead of raster scan */ |
787 | | /*****************************************************/ |
788 | 434M | blkx = (i & 0x1); |
789 | 434M | blky = (i >> 1); |
790 | | |
791 | 434M | pu1_pi0 = pu1_src + (blkx * 4) + (blky * 4 * src_strd); |
792 | 434M | pu1_pi1 = pu1_pred + (blkx * 4) + (blky * 4 * pred_strd); |
793 | 434M | pi2_dst = pi2_dst4x4 + (blkx * 4) + (blky * 4 * dst_strd); |
794 | | |
795 | 434M | ihevce_hadamard_4x4_8bit(pu1_pi0, src_strd, pu1_pi1, pred_strd, pi2_dst, dst_strd); |
796 | | |
797 | 2.17G | for(k = 0; k < 4; k++) |
798 | 1.73G | { |
799 | 1.73G | if(ABS(pi2_dst[0 * dst_strd + k]) < threshold) |
800 | 0 | pi2_dst[0 * dst_strd + k] = 0; |
801 | | |
802 | 1.73G | if(ABS(pi2_dst[1 * dst_strd + k]) < threshold) |
803 | 0 | pi2_dst[1 * dst_strd + k] = 0; |
804 | | |
805 | 1.73G | if(ABS(pi2_dst[2 * dst_strd + k]) < threshold) |
806 | 0 | pi2_dst[2 * dst_strd + k] = 0; |
807 | | |
808 | 1.73G | if(ABS(pi2_dst[3 * dst_strd + k]) < threshold) |
809 | 0 | pi2_dst[3 * dst_strd + k] = 0; |
810 | | |
811 | | /* Accumulate the SATD */ |
812 | 1.73G | u4_hsad += ABS(pi2_dst[0 * dst_strd + k]); |
813 | 1.73G | u4_hsad += ABS(pi2_dst[1 * dst_strd + k]); |
814 | 1.73G | u4_hsad += ABS(pi2_dst[2 * dst_strd + k]); |
815 | 1.73G | u4_hsad += ABS(pi2_dst[3 * dst_strd + k]); |
816 | 1.73G | } |
817 | | |
818 | | /*===== Normalize the HSAD =====*/ |
819 | 434M | pi4_hsad[blkx + (blky * hsad_stride)] = ((u4_hsad + 2) >> 2); |
820 | 434M | i4_child_total_sad += ((u4_hsad + 2) >> 2); |
821 | 434M | } |
822 | 108M | return i4_child_total_sad; |
823 | 108M | } |
824 | | |
825 | | /** |
826 | | ******************************************************************************* |
827 | | * |
828 | | * @brief |
829 | | * HSAD is returned for the 4, 4x4 in 8x8 |
830 | | * |
831 | | * @par Description: |
832 | | * |
833 | | * @param[in] pu1_origin |
834 | | * UWORD8 pointer to the current block |
835 | | * |
836 | | * @param[in] src_strd |
837 | | * WORD32 Source stride |
838 | | * |
839 | | * @param[in] pu1_pred |
840 | | * UWORD8 pointer to the prediction block |
841 | | * |
842 | | * @param[in] pred_strd |
843 | | * WORD32 Pred stride |
844 | | * |
845 | | * @param[out] pi2_dst |
846 | | * WORD16 pointer to the transform output block |
847 | | * |
848 | | * @param[out] dst_strd |
849 | | * WORD32 Destination stride |
850 | | * |
851 | | * @param[out] ppi4_hsad |
852 | | * pointer to base pointers for storing hadmard sads of various |
853 | | * block sizes (4x4 to 32x32) |
854 | | * |
855 | | * @param[in] pos_x_y_4x4 |
856 | | * Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB |
857 | | * Lower 16bits denote xpos and upper 16ypos of the 4x4block |
858 | | * |
859 | | * @param[in] num_4x4_in_row |
860 | | * Denotes the number of current 4x4 blocks in a ctb/CU/MB |
861 | | * |
862 | | * @returns |
863 | | * |
864 | | * @remarks |
865 | | * |
866 | | ******************************************************************************* |
867 | | */ |
868 | | void ihevce_had_8x8_using_4_4x4( |
869 | | UWORD8 *pu1_src, |
870 | | WORD32 src_strd, |
871 | | UWORD8 *pu1_pred, |
872 | | WORD32 pred_strd, |
873 | | WORD16 *pi2_dst, |
874 | | WORD32 dst_strd, |
875 | | WORD32 **ppi4_hsad, |
876 | | WORD32 pos_x_y_4x4, |
877 | | WORD32 num_4x4_in_row) |
878 | 76.9M | { |
879 | 76.9M | WORD16 ai2_4x4_had[64]; |
880 | 76.9M | WORD32 pos_x = pos_x_y_4x4 & 0xFFFF; |
881 | 76.9M | WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF; |
882 | 76.9M | WORD32 *pi4_4x4_hsad; |
883 | 76.9M | WORD32 *pi4_8x8_hsad; |
884 | | |
885 | 76.9M | (void)pi2_dst; |
886 | 76.9M | (void)dst_strd; |
887 | 76.9M | ASSERT(pos_x >= 0); |
888 | 76.9M | ASSERT(pos_y >= 0); |
889 | | |
890 | | /* Initialize pointers to store 4x4 and 8x8 HAD SATDs */ |
891 | 76.9M | pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row; |
892 | 76.9M | pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1); |
893 | | |
894 | | /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */ |
895 | 76.9M | pi4_8x8_hsad[0] = ihevce_had4_4x4( |
896 | 76.9M | pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0); |
897 | 76.9M | } |
898 | | |
899 | | /** |
900 | | ******************************************************************************* |
901 | | * |
902 | | * @brief |
903 | | * Recursive Hadamard Transform for 8x8 block. HSAD is returned for the 8x8 |
904 | | * block and its four subblocks(4x4). |
905 | | * |
906 | | * @par Description: |
907 | | * |
908 | | * @param[in] pu1_origin |
909 | | * UWORD8 pointer to the current block |
910 | | * |
911 | | * @param[in] src_strd |
912 | | * WORD32 Source stride |
913 | | * |
914 | | * @param[in] pu1_pred |
915 | | * UWORD8 pointer to the prediction block |
916 | | * |
917 | | * @param[in] pred_strd |
918 | | * WORD32 Pred stride |
919 | | * |
920 | | * @param[out] pi2_dst |
921 | | * WORD16 pointer to the transform output block |
922 | | * |
923 | | * @param[out] dst_strd |
924 | | * WORD32 Destination stride |
925 | | * |
926 | | * @param[out] ppi4_hsad |
927 | | * pointer to base pointers for storing Hadamard SADs of various |
928 | | * block sizes (4x4 to 32x32) |
929 | | * |
930 | | * @param[in] pos_x_y_4x4 |
931 | | * Denotes packed x,y position of current 4x4 block w.r.t. start of ctb/CU/MB |
932 | | * Lower 16 bits denote xpos and upper 16 bits denote ypos of the 4x4 block |
933 | | * |
934 | | * @param[in] num_4x4_in_row |
935 | | * Denotes the number of 4x4 blocks in one row (used as the row stride of the 4x4 result arrays) |
936 | | * |
937 | | * @param[in] i4_frm_qstep |
938 | | * frm_qstep value based on the which the threshold value is calculated |
939 | | * |
940 | | * @returns |
941 | | * Packed WORD32: (best cost << 2) + (TU split flag << 1) + early_cbf |
942 | | * @remarks |
943 | | * |
944 | | ******************************************************************************* |
945 | | */ |
946 | | WORD32 ihevce_had_8x8_using_4_4x4_r( |
947 | | UWORD8 *pu1_src, |
948 | | WORD32 src_strd, |
949 | | UWORD8 *pu1_pred, |
950 | | WORD32 pred_strd, |
951 | | WORD16 *pi2_dst, |
952 | | WORD32 dst_strd, |
953 | | WORD32 **ppi4_hsad, |
954 | | WORD32 **ppi4_tu_split, |
955 | | WORD32 **ppi4_tu_early_cbf, |
956 | | WORD32 pos_x_y_4x4, |
957 | | WORD32 num_4x4_in_row, |
958 | | WORD32 lambda, |
959 | | WORD32 lambda_q_shift, |
960 | | WORD32 i4_frm_qstep, |
961 | | WORD32 i4_cur_depth, |
962 | | WORD32 i4_max_depth, |
963 | | WORD32 i4_max_tr_size, |
964 | | WORD32 *pi4_tu_split_cost, |
965 | | void *pv_func_sel) |
966 | 31.7M | {
967 | 31.7M | WORD16 ai2_4x4_had[64];
968 | 31.7M | WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
969 | 31.7M | WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
970 | 31.7M | WORD32 *pi4_4x4_hsad;
971 | 31.7M | WORD32 *pi4_8x8_hsad;
972 | 31.7M | WORD32 *pi4_8x8_tu_split;
973 | | 
974 | 31.7M | WORD32 *pi4_8x8_tu_early_cbf;
975 | | 
976 | 31.7M | UWORD32 u4_satd;
977 | 31.7M | WORD32 cost_child = 0, cost_parent = 0;
978 | 31.7M | WORD32 early_cbf = 0;
979 | | 
980 | 31.7M | const UWORD8 u1_cur_tr_size = 8;
981 | | /* Best cost chosen for the current 8x8; the split decision is packed into its LSB before returning */ |
982 | 31.7M | WORD32 best_cost = 0;
983 | | 
984 | 31.7M | (void)pv_func_sel;
985 | 31.7M | ASSERT(pos_x >= 0);
986 | 31.7M | ASSERT(pos_y >= 0);
987 | | 
988 | | /* Locate the output slots: 4x4 SATDs, plus the 8x8 SATD, TU split flag and early CBF entries */ |
989 | 31.7M | pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
990 | 31.7M | pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
991 | 31.7M | pi4_8x8_tu_split = ppi4_tu_split[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
992 | 31.7M | pi4_8x8_tu_early_cbf =
993 | 31.7M | ppi4_tu_early_cbf[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
994 | | 
995 | | /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call; the return value is used as the child cost --------- */ |
996 | 31.7M | cost_child = ihevce_had4_4x4(
997 | 31.7M | pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
998 | | 
999 | | /* -------- Compute 8x8 HAD Transform using 4x4 results ------------- */ |
1000 | 31.7M | u4_satd = ihevce_compute_8x8HAD_using_4x4(
1001 | 31.7M | ai2_4x4_had, 8, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1002 | | 
1003 | | /* Parent (8x8) cost: the 8x8 SATD normalized with rounding, (satd + 4) >> 3 */ |
1004 | 31.7M | cost_parent = ((u4_satd + 4) >> 3);
1005 | | 
1006 | | /* Child cost also pays for 4 CBF flags; the extra 1 in the shift is because 0.5 bits per bin is assumed */ |
1007 | 31.7M | cost_child += ((4) * lambda) >> (lambda_q_shift + 1);
1008 | | 
1009 | 31.7M | if(i4_cur_depth < i4_max_depth)
1010 | 16.4M | {
1011 | 16.4M | if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
1012 | 757k | {
1013 | | // Split chosen: add the flag bits to the running split cost and set the LSB of best_cost |
1014 | 757k | *pi4_tu_split_cost += (4 * lambda) >> (lambda_q_shift + 1);
1015 | 757k | best_cost = cost_child;
1016 | 757k | best_cost <<= 1;
1017 | 757k | best_cost++;
1018 | 757k | pi4_8x8_tu_split[0] = 1;
1019 | 757k | pi4_8x8_hsad[0] = cost_child;
1020 | 757k | }
1021 | 15.7M | else
1022 | 15.7M | {
1023 | | // No split (parent is not more expensive): the LSB of best_cost stays 0 |
1024 | 15.7M | best_cost = cost_parent;
1025 | 15.7M | best_cost <<= 1;
1026 | 15.7M | pi4_8x8_tu_split[0] = 0;
1027 | 15.7M | pi4_8x8_hsad[0] = cost_parent;
1028 | 15.7M | }
1029 | 16.4M | }
1030 | 15.2M | else
1031 | 15.2M | {
1032 | | // Maximum depth reached: splitting is not considered, the LSB of best_cost stays 0 |
1033 | 15.2M | best_cost = cost_parent;
1034 | 15.2M | best_cost <<= 1;
1035 | 15.2M | pi4_8x8_tu_split[0] = 0;
1036 | 15.2M | pi4_8x8_hsad[0] = cost_parent;
1037 | 15.2M | }
1038 | | 
1039 | 31.7M | pi4_8x8_tu_early_cbf[0] = early_cbf;
1040 | | 
1041 | | /* Packed return: best_cost already carries tu_split_flag in its LSB; it is shifted once more and early_cbf is added below it */ |
1042 | 31.7M | return ((best_cost << 1) + early_cbf);
1043 | 31.7M | }
1044 | | |
1045 | | /** |
1046 | | ******************************************************************************* |
1047 | | * |
1048 | | * @brief |
1049 | | * Computes 16x16 transform using children 8x8 hadamard results |
1050 | | * Modified to incorporate the dead-zone implementation - Lokesh |
1051 | | * |
1052 | | * @par Description: |
1053 | | * |
1054 | | * @param[in] pi2_8x8_had |
1055 | | * WORD16 pointer to 8x8 Hadamard buffer (y0, y1, y2, y3 Hadamard in Z-scan order) |
1056 | | * |
1057 | | * @param[in] had8_strd |
1058 | | * stride of 8x8 Hadamard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3 |
1059 | | * |
1060 | | * @param[out] pi2_dst |
1061 | | * destination buffer where the 16x16 Hadamard result is stored |
1062 | | * |
1063 | | * @param[in] dst_strd |
1064 | | * stride of destination block |
1065 | | * |
1066 | | * @param[in] i4_frm_qstep |
1067 | | * frm_qstep value based on the which the threshold value is calculated |
1068 | | * |
1069 | | * @returns |
1070 | | * 16x16 Hadamard SATD |
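1071 | | * @remarks |
1072 | | * The value at pi4_cbf is set to 1 when any output coefficient magnitude exceeds (i4_frm_qstep >> 8); it is never cleared here |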
1073 | | ******************************************************************************* |
1074 | | */ |
1075 | | static UWORD32 ihevce_compute_16x16HAD_using_8x8( |
1076 | | WORD16 *pi2_8x8_had, |
1077 | | WORD32 had8_strd, |
1078 | | WORD16 *pi2_dst, |
1079 | | WORD32 dst_strd, |
1080 | | WORD32 i4_frm_qstep, |
1081 | | WORD32 *pi4_cbf) |
1082 | 7.19M | {
1083 | | /* Dead-zone threshold: the frame qstep value right shifted by 8 */ |
1084 | 7.19M | WORD32 threshold = (i4_frm_qstep >> 8);
1085 | | 
1086 | | /* Pointers to the four 8x8 HAD sub-blocks of the input (top-left, top-right, bottom-left, bottom-right) */ |
1087 | 7.19M | WORD16 *pi2_y0 = pi2_8x8_had;
1088 | 7.19M | WORD16 *pi2_y1 = pi2_8x8_had + 8;
1089 | 7.19M | WORD16 *pi2_y2 = pi2_8x8_had + had8_strd * 8;
1090 | 7.19M | WORD16 *pi2_y3 = pi2_8x8_had + had8_strd * 8 + 8;
1091 | | 
1092 | | /* Pointers to the four 8x8 quadrants of the 16x16 HAD output */ |
1093 | 7.19M | WORD16 *pi2_dst0 = pi2_dst;
1094 | 7.19M | WORD16 *pi2_dst1 = pi2_dst + 8;
1095 | 7.19M | WORD16 *pi2_dst2 = pi2_dst + dst_strd * 8;
1096 | 7.19M | WORD16 *pi2_dst3 = pi2_dst + dst_strd * 8 + 8;
1097 | | 
1098 | 7.19M | UWORD32 u4_satd = 0;
1099 | 7.19M | WORD32 i;
1100 | | 
1101 | | /* Child HAD results combined as follows to get Parent result (the first-stage sums and differences are halved) */ |
1102 | | /* _ _ */ |
1103 | | /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */ |
1104 | | /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */ |
1105 | | /* \- -/ */ |
1106 | 467M | for(i = 0; i < 64; i++)
1107 | 460M | {
1108 | 460M | WORD32 src_idx = (i >> 3) * had8_strd + (i % 8);
1109 | 460M | WORD32 dst_idx = (i >> 3) * dst_strd + (i % 8);
1110 | | 
1111 | 460M | WORD16 a0 = pi2_y0[src_idx];
1112 | 460M | WORD16 a1 = pi2_y1[src_idx];
1113 | 460M | WORD16 a2 = pi2_y2[src_idx];
1114 | 460M | WORD16 a3 = pi2_y3[src_idx];
1115 | | 
1116 | 460M | WORD16 b0 = (a0 + a1) >> 1;
1117 | 460M | WORD16 b1 = (a0 - a1) >> 1;
1118 | 460M | WORD16 b2 = (a2 + a3) >> 1;
1119 | 460M | WORD16 b3 = (a2 - a3) >> 1;
1120 | | 
1121 | 460M | pi2_dst0[dst_idx] = b0 + b2;
1122 | 460M | pi2_dst1[dst_idx] = b1 + b3;
1123 | 460M | pi2_dst2[dst_idx] = b0 - b2;
1124 | 460M | pi2_dst3[dst_idx] = b1 - b3;
1125 | | 
1126 | | /* Flag a coded block when any output magnitude exceeds the dead-zone threshold (the outputs themselves are not zeroed) */ |
1127 | 460M | if(ABS(pi2_dst0[dst_idx]) > threshold)
1128 | 31.9M | *pi4_cbf = 1;
1129 | 460M | if(ABS(pi2_dst1[dst_idx]) > threshold)
1130 | 31.7M | *pi4_cbf = 1;
1131 | 460M | if(ABS(pi2_dst2[dst_idx]) > threshold)
1132 | 31.6M | *pi4_cbf = 1;
1133 | 460M | if(ABS(pi2_dst3[dst_idx]) > threshold)
1134 | 31.4M | *pi4_cbf = 1;
1135 | | 
1136 | 460M | u4_satd += ABS(pi2_dst0[dst_idx]);
1137 | 460M | u4_satd += ABS(pi2_dst1[dst_idx]);
1138 | 460M | u4_satd += ABS(pi2_dst2[dst_idx]);
1139 | 460M | u4_satd += ABS(pi2_dst3[dst_idx]);
1140 | 460M | }
1141 | | 
1142 | | /* return the 16x16 satd: the sum of absolute values of all output coefficients */ |
1143 | 7.19M | return (u4_satd);
1144 | 7.19M | }
1145 | | |
1146 | | /** |
1147 | | ******************************************************************************* |
1148 | | * |
1149 | | * @brief |
1150 | | * Hadamard Transform for 16x16 block with 8x8 and 4x4 SATD updates. |
1151 | | * Uses recursive 8x8 had output to compute satd for 16x16 and its children |
1152 | | * |
1153 | | * @par Description: |
1154 | | * |
1155 | | * @param[in] pu1_src |
1156 | | * UWORD8 pointer to the current block |
1157 | | * |
1158 | | * @param[in] src_strd |
1159 | | * WORD32 Source stride |
1160 | | * |
1161 | | * @param[in] pu1_pred |
1162 | | * UWORD8 pointer to the prediction block |
1163 | | * |
1164 | | * @param[in] pred_strd |
1165 | | * WORD32 Pred stride |
1166 | | * |
1167 | | * @param[out] pi2_dst |
1168 | | * WORD16 pointer to the transform output block |
1169 | | * |
1170 | | * @param[out] dst_strd |
1171 | | * WORD32 Destination stride |
1172 | | * |
1173 | | * @param[out] ppi4_hsad |
1174 | | * pointer to base pointers for storing Hadamard SADs of various |
1175 | | * block sizes (4x4 to 32x32) |
1176 | | * |
1177 | | * @param[in] pos_x_y_4x4 |
1178 | | * Denotes packed x,y position of current 4x4 block w.r.t. start of ctb/CU/MB |
1179 | | * Lower 16 bits denote xpos and upper 16 bits denote ypos of the 4x4 block |
1180 | | * |
1181 | | * @param[in] num_4x4_in_row |
1182 | | * Denotes the number of 4x4 blocks in one row (used as the row stride of the 4x4 result arrays) |
1183 | | * |
1184 | | * @param[in] lambda |
1185 | | * lambda values is the cost factor calculated based on QP |
1186 | | * |
1187 | | * @param[in] lambda_q_shift |
1188 | | * lambda_q_shift used to reverse the lambda value back from q8 format |
1189 | | * |
1190 | | * @param[in] i4_cur_depth |
1191 | | * i4_cur_depth gives the current TU depth with respect to the CU |
1192 | | * |
1193 | | * @param[in] i4_frm_qstep |
1194 | | * frm_qstep value based on the which the threshold value is calculated |
1195 | | * |
1196 | | * @returns |
1197 | | * Packed WORD32: (best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag |
1198 | | * @remarks |
1199 | | * |
1200 | | ******************************************************************************* |
1201 | | */ |
1202 | | |
1203 | | WORD32 ihevce_had_16x16_r( |
1204 | | UWORD8 *pu1_src, |
1205 | | WORD32 src_strd, |
1206 | | UWORD8 *pu1_pred, |
1207 | | WORD32 pred_strd, |
1208 | | WORD16 *pi2_dst, |
1209 | | WORD32 dst_strd, |
1210 | | WORD32 **ppi4_hsad, |
1211 | | WORD32 **ppi4_tu_split, |
1212 | | WORD32 **ppi4_tu_early_cbf, |
1213 | | WORD32 pos_x_y_4x4, |
1214 | | WORD32 num_4x4_in_row, |
1215 | | WORD32 lambda, |
1216 | | WORD32 lambda_q_shift, |
1217 | | WORD32 i4_frm_qstep, |
1218 | | WORD32 i4_cur_depth, |
1219 | | WORD32 i4_max_depth, |
1220 | | WORD32 i4_max_tr_size, |
1221 | | WORD32 *pi4_tu_split_cost, |
1222 | | void *pv_func_sel) |
1223 | 7.19M | {
1224 | 7.19M | WORD16 ai2_8x8_had[256];
1225 | 7.19M | WORD32 *pi4_16x16_hsad;
1226 | 7.19M | WORD32 *pi4_16x16_tu_split;
1227 | | 
1228 | 7.19M | WORD32 *pi4_16x16_tu_early_cbf;
1229 | | 
1230 | 7.19M | UWORD32 u4_satd = 0;
1231 | 7.19M | WORD32 tu_split_flag = 0;
1232 | 7.19M | WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
1233 | 7.19M | const UWORD8 u1_cur_tr_size = 16;
1234 | | 
1235 | | /* cost_parent : Stores the cost of the parent HAD transform (16x16) */ |
1236 | | /* cost_child : Stores the summed cost of the four child HAD transforms (8x8) */ |
1237 | 7.19M | WORD32 cost_parent = 0, cost_child = 0;
1238 | | 
1239 | | /* best_cost holds the best cost that is packed into the return value at the end of the function */ |
1240 | | /* best_cost_tu_split receives the packed cost/flags value returned by each 8x8 child call */ |
1241 | 7.19M | WORD32 best_cost = 0, best_cost_tu_split;
1242 | 7.19M | WORD32 i;
1243 | | 
1244 | 7.19M | WORD16 *pi2_y0;
1245 | 7.19M | UWORD8 *pu1_src0;
1246 | 7.19M | UWORD8 *pu1_pred0;
1247 | 7.19M | WORD32 pos_x_y_4x4_0;
1248 | | 
1249 | 7.19M | WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
1250 | 7.19M | WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
1251 | | 
1252 | 7.19M | ASSERT(pos_x >= 0);
1253 | 7.19M | ASSERT(pos_y >= 0);
1254 | | 
1255 | | /* Locate the 16x16 output slots (indexed at a quarter of the 4x4 resolution in x and y) */ |
1256 | 7.19M | pi4_16x16_hsad = ppi4_hsad[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1257 | | 
1258 | 7.19M | pi4_16x16_tu_split =
1259 | 7.19M | ppi4_tu_split[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1260 | | 
1261 | 7.19M | pi4_16x16_tu_early_cbf =
1262 | 7.19M | ppi4_tu_early_cbf[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1263 | | 
1264 | | /* -------- Compute the four 8x8 HAD Transforms of the 16x16 (top-left, top-right, bottom-left, bottom-right), one depth deeper --------- */ |
1265 | 35.9M | for(i = 0; i < 4; i++)
1266 | 28.7M | {
1267 | 28.7M | pu1_src0 = pu1_src + (i & 0x01) * 8 + (i >> 1) * src_strd * 8;
1268 | 28.7M | pu1_pred0 = pu1_pred + (i & 0x01) * 8 + (i >> 1) * pred_strd * 8;
1269 | 28.7M | pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1270 | 28.7M | pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1271 | | 
1272 | 28.7M | best_cost_tu_split = ihevce_had_8x8_using_4_4x4_r(
1273 | 28.7M | pu1_src0,
1274 | 28.7M | src_strd,
1275 | 28.7M | pu1_pred0,
1276 | 28.7M | pred_strd,
1277 | 28.7M | pi2_y0,
1278 | 28.7M | 16,
1279 | 28.7M | ppi4_hsad,
1280 | 28.7M | ppi4_tu_split,
1281 | 28.7M | ppi4_tu_early_cbf,
1282 | 28.7M | pos_x_y_4x4_0,
1283 | 28.7M | num_4x4_in_row,
1284 | 28.7M | lambda,
1285 | 28.7M | lambda_q_shift,
1286 | 28.7M | i4_frm_qstep,
1287 | 28.7M | i4_cur_depth + 1,
1288 | 28.7M | i4_max_depth,
1289 | 28.7M | i4_max_tr_size,
1290 | 28.7M | pi4_tu_split_cost,
1291 | 28.7M | pv_func_sel);
1292 | | 
1293 | | /* Cost is shifted by two bits for Tu_split_flag and early cbf flag */ |
1294 | 28.7M | best_cost = (best_cost_tu_split >> 2);
1295 | | 
1296 | | /* Last but one bit stores the information regarding the TU_Split */ |
1297 | 28.7M | tu_split_flag += (best_cost_tu_split & 0x3) >> 1;
1298 | | 
1299 | | /* Last bit stores the information regarding the early_cbf */ |
1300 | 28.7M | i4_early_cbf_flag += (best_cost_tu_split & 0x1);
1301 | | 
1302 | 28.7M | cost_child += best_cost;
1303 | | 
1304 | 28.7M | tu_split_flag <<= 1;
1305 | 28.7M | i4_early_cbf_flag <<= 1;
1306 | 28.7M | }
1307 | | 
1308 | | /* -------- Compute 16x16 HAD Transform using 8x8 results ------------- */ |
1309 | 7.19M | pi2_y0 = ai2_8x8_had;
1310 | | 
1311 | | /* i4_frm_qstep is forwarded to the helper together with the address of early_cbf */ |
1312 | 7.19M | u4_satd =
1313 | 7.19M | ihevce_compute_16x16HAD_using_8x8(pi2_y0, 16, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1314 | | 
1315 | | /* store the normalized satd as the parent cost, rounded: (satd + 4) >> 3 */ |
1316 | 7.19M | cost_parent = ((u4_satd + 4) >> 3);
1317 | | 
1318 | | /* 4 TU_Split flags, 4 CBF Flags; the extra 1 in the shift is because 0.5 bits per bin is assumed */ |
1319 | 7.19M | cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1320 | | 
1321 | 7.19M | i4_early_cbf_flag += early_cbf;
1322 | | 
1323 | | /* Right now the depth is hard-coded to 4: The depth can be modified from the config file |
1324 | | which decides the extent to which TU_REC needs to be done */ |
1325 | 7.19M | if(i4_cur_depth < i4_max_depth)
1326 | 5.25M | {
1327 | 5.25M | if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
1328 | 524k | {
1329 | | // Split chosen: add the flag bits to the running split cost and set bit 0 of tu_split_flag |
1330 | 524k | *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1331 | 524k | tu_split_flag += 1;
1332 | 524k | best_cost = cost_child;
1333 | 524k | }
1334 | 4.72M | else
1335 | 4.72M | {
1336 | | // No split at 16x16: bit 0 of tu_split_flag stays 0, the child 8x8 bits above it are kept |
1337 | 4.72M | tu_split_flag += 0;
1338 | 4.72M | best_cost = cost_parent;
1339 | 4.72M | }
1340 | 5.25M | }
1341 | 1.94M | else
1342 | 1.94M | {
1343 | | // Maximum depth reached: splitting is not considered, bit 0 of tu_split_flag stays 0 |
1344 | 1.94M | tu_split_flag += 0;
1345 | 1.94M | best_cost = cost_parent;
1346 | 1.94M | }
1347 | | 
1348 | 7.19M | pi4_16x16_hsad[0] = best_cost;
1349 | 7.19M | pi4_16x16_tu_split[0] = tu_split_flag;
1350 | 7.19M | pi4_16x16_tu_early_cbf[0] = i4_early_cbf_flag;
1351 | | 
1352 | | /* Packed return: best_cost from bit 10 upwards, the 5-bit tu_split_flag in bits 5-9, the 5-bit early cbf flags in bits 0-4 */ |
1353 | 7.19M | return ((best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag);
1354 | 7.19M | }
1355 | | |
1356 | | //#endif |
1357 | | /** |
1358 | | ******************************************************************************* |
1359 | | * |
1360 | | * @brief |
1361 | | * Computes 32x32 transform using children 16x16 hadamard results |
1362 | | * |
1363 | | * @par Description: |
1364 | | * |
1365 | | * @param[in] pi2_16x16_had |
1366 | | * WORD16 pointer to 16x16 Hadamard buffer (y0, y1, y2, y3 Hadamard in Z-scan order) |
1367 | | * |
1368 | | * @param[in] had16_strd |
1369 | | * stride of 16x16 Hadamard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3 |
1370 | | * |
1371 | | * @param[out] pi2_dst |
1372 | | * destination buffer where the 32x32 Hadamard result is stored |
1373 | | * |
1374 | | * @param[in] dst_strd |
1375 | | * stride of destination block |
1376 | | * |
1377 | | * @param[in] i4_frm_qstep |
1378 | | * frm_qstep value based on the which the threshold value is calculated |
1379 | | * |
1380 | | * @returns |
1381 | | * 32x32 Hadamard SATD |
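1382 | | * @remarks |
1383 | | * The value at pi4_cbf is set to 1 when any output coefficient magnitude exceeds (i4_frm_qstep >> 8); it is never cleared here |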
1384 | | ******************************************************************************* |
1385 | | */ |
1386 | | //#if COMPUTE_32x32_USING_16X16 == C |
1387 | | UWORD32 ihevce_compute_32x32HAD_using_16x16( |
1388 | | WORD16 *pi2_16x16_had, |
1389 | | WORD32 had16_strd, |
1390 | | WORD16 *pi2_dst, |
1391 | | WORD32 dst_strd, |
1392 | | WORD32 i4_frm_qstep, |
1393 | | WORD32 *pi4_cbf) |
1394 | 1.12M | {
1395 | | /* Dead-zone threshold: the frame qstep value right shifted by 8 */ |
1396 | 1.12M | WORD32 threshold = (i4_frm_qstep >> 8);
1397 | | 
1398 | | /* Pointers to the four 16x16 HAD sub-blocks of the input (top-left, top-right, bottom-left, bottom-right) */ |
1399 | 1.12M | WORD16 *pi2_y0 = pi2_16x16_had;
1400 | 1.12M | WORD16 *pi2_y1 = pi2_16x16_had + 16;
1401 | 1.12M | WORD16 *pi2_y2 = pi2_16x16_had + had16_strd * 16;
1402 | 1.12M | WORD16 *pi2_y3 = pi2_16x16_had + had16_strd * 16 + 16;
1403 | | 
1404 | | /* Pointers to the four 16x16 quadrants of the 32x32 HAD output */ |
1405 | 1.12M | WORD16 *pi2_dst0 = pi2_dst;
1406 | 1.12M | WORD16 *pi2_dst1 = pi2_dst + 16;
1407 | 1.12M | WORD16 *pi2_dst2 = pi2_dst + dst_strd * 16;
1408 | 1.12M | WORD16 *pi2_dst3 = pi2_dst + dst_strd * 16 + 16;
1409 | | 
1410 | 1.12M | UWORD32 u4_satd = 0;
1411 | 1.12M | WORD32 i;
1412 | | 
1413 | | /* Child HAD results combined as follows to get Parent result (each child value is right shifted by 2 first) */ |
1414 | | /* _ _ */ |
1415 | | /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */ |
1416 | | /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */ |
1417 | | /* \- -/ */ |
1418 | 289M | for(i = 0; i < 256; i++)
1419 | 288M | {
1420 | 288M | WORD32 src_idx = (i >> 4) * had16_strd + (i % 16);
1421 | 288M | WORD32 dst_idx = (i >> 4) * dst_strd + (i % 16);
1422 | | 
1423 | 288M | WORD16 a0 = pi2_y0[src_idx] >> 2;
1424 | 288M | WORD16 a1 = pi2_y1[src_idx] >> 2;
1425 | 288M | WORD16 a2 = pi2_y2[src_idx] >> 2;
1426 | 288M | WORD16 a3 = pi2_y3[src_idx] >> 2;
1427 | | 
1428 | 288M | WORD16 b0 = (a0 + a1);
1429 | 288M | WORD16 b1 = (a0 - a1);
1430 | 288M | WORD16 b2 = (a2 + a3);
1431 | 288M | WORD16 b3 = (a2 - a3);
1432 | | 
1433 | 288M | pi2_dst0[dst_idx] = b0 + b2;
1434 | 288M | pi2_dst1[dst_idx] = b1 + b3;
1435 | 288M | pi2_dst2[dst_idx] = b0 - b2;
1436 | 288M | pi2_dst3[dst_idx] = b1 - b3;
1437 | | 
1438 | | /* Flag a coded block when any output magnitude exceeds the dead-zone threshold (the outputs themselves are not zeroed) */ |
1439 | 288M | if(ABS(pi2_dst0[dst_idx]) > threshold)
1440 | 35.0M | *pi4_cbf = 1;
1441 | 288M | if(ABS(pi2_dst1[dst_idx]) > threshold)
1442 | 34.8M | *pi4_cbf = 1;
1443 | 288M | if(ABS(pi2_dst2[dst_idx]) > threshold)
1444 | 34.8M | *pi4_cbf = 1;
1445 | 288M | if(ABS(pi2_dst3[dst_idx]) > threshold)
1446 | 34.7M | *pi4_cbf = 1;
1447 | | 
1448 | 288M | u4_satd += ABS(pi2_dst0[dst_idx]);
1449 | 288M | u4_satd += ABS(pi2_dst1[dst_idx]);
1450 | 288M | u4_satd += ABS(pi2_dst2[dst_idx]);
1451 | 288M | u4_satd += ABS(pi2_dst3[dst_idx]);
1452 | 288M | }
1453 | | 
1454 | | /* return the 32x32 satd: the sum of absolute values of all output coefficients */ |
1455 | 1.12M | return (u4_satd);
1456 | 1.12M | }
1457 | | //#endif |
1458 | | |
1459 | | /** |
1460 | | ******************************************************************************* |
1461 | | * |
1462 | | * @brief |
1463 | | * Hadamard Transform for 32x32 block with 16x16, 8x8 and 4x4 SATD updates. |
1464 | | * Uses recursive 16x16 had output to compute satd for 32x32 and its children |
1465 | | * |
1466 | | * @par Description: |
1467 | | * |
1468 | | * @param[in] pu1_src |
1469 | | * UWORD8 pointer to the current block |
1470 | | * |
1471 | | * @param[in] src_strd |
1472 | | * WORD32 Source stride |
1473 | | * |
1474 | | * @param[in] pu1_pred |
1475 | | * UWORD8 pointer to the prediction block |
1476 | | * |
1477 | | * @param[in] pred_strd |
1478 | | * WORD32 Pred stride |
1479 | | * |
1480 | | * @param[out] pi2_dst |
1481 | | * WORD16 pointer to the transform output block |
1482 | | * |
1483 | | * @param[out] dst_strd |
1484 | | * WORD32 Destination stride |
1485 | | * |
1486 | | * @param[out] ppi4_hsad |
1487 | | * pointer to base pointers for storing Hadamard SADs of various |
1488 | | * block sizes (4x4 to 32x32) |
1489 | | * |
1490 | | * @param[in] pos_x_y_4x4 |
1491 | | * Denotes packed x,y position of current 4x4 block w.r.t. start of ctb/CU/MB |
1492 | | * Lower 16 bits denote xpos and upper 16 bits denote ypos of the 4x4 block |
1493 | | * |
1494 | | * @param[in] num_4x4_in_row |
1495 | | * Denotes the number of 4x4 blocks in one row (used as the row stride of the 4x4 result arrays) |
1496 | | * |
1497 | | * @param[in] lambda |
1498 | | * lambda values is the cost factor calculated based on QP |
1499 | | * |
1500 | | * @param[in] lambda_q_shift |
1501 | | * lambda_q_shift used to reverse the lambda value back from q8 format |
1502 | | * |
1503 | | * @param[in] i4_cur_depth |
1504 | | * i4_cur_depth gives the current TU depth with respect to the CU |
1505 | | * |
1506 | | * @param[in] i4_frm_qstep |
1507 | | * frm_qstep value based on the which the threshold value is calculated |
1508 | | * |
1509 | | * |
1510 | | * @returns |
1511 | | * None (void); the 32x32 results are stored through ppi4_hsad, ppi4_tu_split and ppi4_tu_early_cbf |
1512 | | * @remarks |
1513 | | * |
1514 | | ******************************************************************************* |
1515 | | */ |
1516 | | void ihevce_had_32x32_r( |
1517 | | UWORD8 *pu1_src, |
1518 | | WORD32 src_strd, |
1519 | | UWORD8 *pu1_pred, |
1520 | | WORD32 pred_strd, |
1521 | | WORD16 *pi2_dst, |
1522 | | WORD32 dst_strd, |
1523 | | WORD32 **ppi4_hsad, |
1524 | | WORD32 **ppi4_tu_split, |
1525 | | WORD32 **ppi4_tu_early_cbf, |
1526 | | WORD32 pos_x_y_4x4, |
1527 | | WORD32 num_4x4_in_row, |
1528 | | WORD32 lambda, |
1529 | | WORD32 lambda_q_shift, |
1530 | | WORD32 i4_frm_qstep, |
1531 | | WORD32 i4_cur_depth, |
1532 | | WORD32 i4_max_depth, |
1533 | | WORD32 i4_max_tr_size, |
1534 | | WORD32 *pi4_tu_split_cost, |
1535 | | me_func_selector_t *ps_func_selector) |
1536 | | 
1537 | 1.12M | {
1538 | 1.12M | WORD16 ai2_16x16_had[1024];
1539 | 1.12M | WORD32 *pi4_32x32_hsad;
1540 | 1.12M | WORD32 *pi4_32x32_tu_split;
1541 | 1.12M | WORD32 *pi4_32x32_tu_early_cbf;
1542 | | 
1543 | 1.12M | WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
1544 | 1.12M | WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
1545 | 1.12M | WORD32 tu_split_flag = 0;
1546 | 1.12M | const UWORD8 u1_cur_tr_size = 32;
1547 | 1.12M | WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
1548 | | 
1549 | | /* cost_parent : Stores the cost of the parent HAD transform (32x32) */ |
1550 | | /* cost_child : Stores the summed cost of the four child HAD transforms (16x16) */ |
1551 | 1.12M | WORD32 cost_child = 0, cost_parent = 0;
1552 | | 
1553 | | /* best cost for the entire TU (32x32); stored to the 32x32 HSAD slot at the end (the function returns void) */ |
1554 | 1.12M | WORD32 best_cost = 0;
1555 | | /*captures the best cost and tu_split at child level */ |
1556 | 1.12M | WORD32 best_cost_tu_split;
1557 | | 
1558 | | /* Initialize pointers to the 4 16x16 blocks in the 32x32 (HAD buffer, source and prediction) */ |
1559 | 1.12M | WORD16 *pi2_y0 = ai2_16x16_had;
1560 | 1.12M | WORD16 *pi2_y1 = ai2_16x16_had + 16;
1561 | 1.12M | WORD16 *pi2_y2 = ai2_16x16_had + 32 * 16;
1562 | 1.12M | WORD16 *pi2_y3 = ai2_16x16_had + 32 * 16 + 16;
1563 | | 
1564 | 1.12M | UWORD8 *pu1_src0 = pu1_src;
1565 | 1.12M | UWORD8 *pu1_src1 = pu1_src + 16;
1566 | 1.12M | UWORD8 *pu1_src2 = pu1_src + src_strd * 16;
1567 | 1.12M | UWORD8 *pu1_src3 = pu1_src + src_strd * 16 + 16;
1568 | | 
1569 | 1.12M | UWORD8 *pu1_pred0 = pu1_pred;
1570 | 1.12M | UWORD8 *pu1_pred1 = pu1_pred + 16;
1571 | 1.12M | UWORD8 *pu1_pred2 = pu1_pred + pred_strd * 16;
1572 | 1.12M | UWORD8 *pu1_pred3 = pu1_pred + pred_strd * 16 + 16;
1573 | | 
1574 | 1.12M | ASSERT(pos_x >= 0);
1575 | 1.12M | ASSERT(pos_y >= 0);
1576 | | 
1577 | | /* Initialize pointers to store 32x32 SATDs */ |
1578 | 1.12M | pi4_32x32_hsad = ppi4_hsad[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1579 | | 
1580 | 1.12M | pi4_32x32_tu_split =
1581 | 1.12M | ppi4_tu_split[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1582 | | 
1583 | 1.12M | pi4_32x32_tu_early_cbf =
1584 | 1.12M | ppi4_tu_early_cbf[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1585 | | 
1586 | | /* -------- Compute the four 16x16 HAD Transforms of the 32x32 through the function selector, one depth deeper (top-left first) --------- */ |
1587 | 1.12M | best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1588 | 1.12M | pu1_src0,
1589 | 1.12M | src_strd,
1590 | 1.12M | pu1_pred0,
1591 | 1.12M | pred_strd,
1592 | 1.12M | pi2_y0,
1593 | 1.12M | 32,
1594 | 1.12M | ppi4_hsad,
1595 | 1.12M | ppi4_tu_split,
1596 | 1.12M | ppi4_tu_early_cbf,
1597 | 1.12M | pos_x_y_4x4,
1598 | 1.12M | num_4x4_in_row,
1599 | 1.12M | lambda,
1600 | 1.12M | lambda_q_shift,
1601 | 1.12M | i4_frm_qstep,
1602 | 1.12M | i4_cur_depth + 1,
1603 | 1.12M | i4_max_depth,
1604 | 1.12M | i4_max_tr_size,
1605 | 1.12M | pi4_tu_split_cost,
1606 | 1.12M | NULL);
1607 | | 
1608 | | /* cost is shifted by 10bits */ |
1609 | 1.12M | best_cost = best_cost_tu_split >> 10;
1610 | | 
1611 | | /* Tu split is present in the 6-10 bits */ |
1612 | 1.12M | tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1613 | | 
1614 | | /*Early CBF info is present in the last 5 bits */ |
1615 | 1.12M | i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1616 | | 
1617 | 1.12M | tu_split_flag <<= 5;
1618 | 1.12M | i4_early_cbf_flag <<= 5;
1619 | | 
1620 | 1.12M | cost_child += best_cost;
1621 | | 
1622 | 1.12M | best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1623 | 1.12M | pu1_src1,
1624 | 1.12M | src_strd,
1625 | 1.12M | pu1_pred1,
1626 | 1.12M | pred_strd,
1627 | 1.12M | pi2_y1,
1628 | 1.12M | 32,
1629 | 1.12M | ppi4_hsad,
1630 | 1.12M | ppi4_tu_split,
1631 | 1.12M | ppi4_tu_early_cbf,
1632 | 1.12M | pos_x_y_4x4 + 4,
1633 | 1.12M | num_4x4_in_row,
1634 | 1.12M | lambda,
1635 | 1.12M | lambda_q_shift,
1636 | 1.12M | i4_frm_qstep,
1637 | 1.12M | i4_cur_depth + 1,
1638 | 1.12M | i4_max_depth,
1639 | 1.12M | i4_max_tr_size,
1640 | 1.12M | pi4_tu_split_cost,
1641 | 1.12M | NULL);
1642 | | 
1643 | | /* cost is shifted by 10bits */ |
1644 | 1.12M | best_cost = best_cost_tu_split >> 10;
1645 | | 
1646 | | /* Tu split is present in the 6-10 bits */ |
1647 | 1.12M | tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1648 | | 
1649 | | /*Early CBF info is present in the last 5 bits */ |
1650 | 1.12M | i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1651 | | 
1652 | 1.12M | tu_split_flag <<= 5;
1653 | 1.12M | i4_early_cbf_flag <<= 5;
1654 | | 
1655 | 1.12M | cost_child += best_cost;
1656 | | 
1657 | 1.12M | best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1658 | 1.12M | pu1_src2,
1659 | 1.12M | src_strd,
1660 | 1.12M | pu1_pred2,
1661 | 1.12M | pred_strd,
1662 | 1.12M | pi2_y2,
1663 | 1.12M | 32,
1664 | 1.12M | ppi4_hsad,
1665 | 1.12M | ppi4_tu_split,
1666 | 1.12M | ppi4_tu_early_cbf,
1667 | 1.12M | pos_x_y_4x4 + (4 << 16),
1668 | 1.12M | num_4x4_in_row,
1669 | 1.12M | lambda,
1670 | 1.12M | lambda_q_shift,
1671 | 1.12M | i4_frm_qstep,
1672 | 1.12M | i4_cur_depth + 1,
1673 | 1.12M | i4_max_depth,
1674 | 1.12M | i4_max_tr_size,
1675 | 1.12M | pi4_tu_split_cost,
1676 | 1.12M | NULL);
1677 | | 
1678 | | /* cost is shifted by 10bits */ |
1679 | 1.12M | best_cost = best_cost_tu_split >> 10;
1680 | | 
1681 | | /* Tu split is present in the 6-10 bits */ |
1682 | 1.12M | tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1683 | | 
1684 | | /*Early CBF info is present in the last 5 bits */ |
1685 | 1.12M | i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1686 | | 
1687 | 1.12M | tu_split_flag <<= 5;
1688 | 1.12M | i4_early_cbf_flag <<= 5;
1689 | | 
1690 | 1.12M | cost_child += best_cost;
1691 | | 
1692 | 1.12M | best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1693 | 1.12M | pu1_src3,
1694 | 1.12M | src_strd,
1695 | 1.12M | pu1_pred3,
1696 | 1.12M | pred_strd,
1697 | 1.12M | pi2_y3,
1698 | 1.12M | 32,
1699 | 1.12M | ppi4_hsad,
1700 | 1.12M | ppi4_tu_split,
1701 | 1.12M | ppi4_tu_early_cbf,
1702 | 1.12M | pos_x_y_4x4 + (4 << 16) + 4,
1703 | 1.12M | num_4x4_in_row,
1704 | 1.12M | lambda,
1705 | 1.12M | lambda_q_shift,
1706 | 1.12M | i4_frm_qstep,
1707 | 1.12M | i4_cur_depth + 1,
1708 | 1.12M | i4_max_depth,
1709 | 1.12M | i4_max_tr_size,
1710 | 1.12M | pi4_tu_split_cost,
1711 | 1.12M | NULL);
1712 | | 
1713 | | /* cost is shifted by 10bits */ |
1714 | 1.12M | best_cost = best_cost_tu_split >> 10;
1715 | | 
1716 | | /* Tu split is present in the 6-10 bits */ |
1717 | 1.12M | tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1718 | | 
1719 | | /*Early CBF info is present in the last 5 bits; after this last child both accumulators are shifted by 1 only, leaving bit 0 for the 32x32 level */ |
1720 | 1.12M | i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1721 | | 
1722 | 1.12M | tu_split_flag <<= 1;
1723 | 1.12M | i4_early_cbf_flag <<= 1;
1724 | | 
1725 | 1.12M | cost_child += best_cost;
1726 | | 
1727 | 1.12M | {
1728 | 1.12M | UWORD32 u4_satd = 0;
1729 | | 
1730 | 1.12M | u4_satd = ps_func_selector->pf_compute_32x32HAD_using_16x16(
1731 | 1.12M | pi2_y0, 32, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1732 | | 
1733 | 1.12M | cost_parent = ((u4_satd + 2) >> 2);
1734 | 1.12M | }
1735 | | 
1736 | | /* 4 TU_Split flags , 4 CBF Flags*/ |
1737 | 1.12M | cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1738 | | 
1739 | 1.12M | i4_early_cbf_flag += early_cbf;
1740 | | 
1741 | | /* 1 TU_SPlit flag, 1 CBF flag */ |
1742 | | //cost_parent += ((1 + 1)* lambda) >> (lambda_q_shift + 1); |
1743 | | 
1744 | 1.12M | if(i4_cur_depth < i4_max_depth)
1745 | 1.06M | {
1746 | 1.06M | if((cost_child < cost_parent) || (u1_cur_tr_size > i4_max_tr_size))
1747 | 266k | {
1748 | 266k | *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1749 | 266k | best_cost = cost_child;
1750 | 266k | tu_split_flag++;
1751 | 266k | }
1752 | 795k | else
1753 | 795k | {
1754 | 795k | tu_split_flag = 0;
1755 | 795k | best_cost = cost_parent;
1756 | 795k | }
1757 | 1.06M | }
1758 | 63.7k | else
1759 | 63.7k | {
1760 | 63.7k | tu_split_flag = 0;
1761 | 63.7k | best_cost = cost_parent;
1762 | 63.7k | }
1763 | | 
1764 | 1.12M | pi4_32x32_tu_split[0] = tu_split_flag;
1765 | | 
1766 | 1.12M | pi4_32x32_hsad[0] = best_cost;
1767 | | 
1768 | 1.12M | pi4_32x32_tu_early_cbf[0] = i4_early_cbf_flag;
1769 | 1.12M | }