/src/x264/encoder/analyse.c
Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * analyse.c: macroblock analysis |
3 | | ***************************************************************************** |
4 | | * Copyright (C) 2003-2025 x264 project |
5 | | * |
6 | | * Authors: Laurent Aimar <fenrir@via.ecp.fr> |
7 | | * Loren Merritt <lorenm@u.washington.edu> |
8 | | * Fiona Glaser <fiona@x264.com> |
9 | | * |
10 | | * This program is free software; you can redistribute it and/or modify |
11 | | * it under the terms of the GNU General Public License as published by |
12 | | * the Free Software Foundation; either version 2 of the License, or |
13 | | * (at your option) any later version. |
14 | | * |
15 | | * This program is distributed in the hope that it will be useful, |
16 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18 | | * GNU General Public License for more details. |
19 | | * |
20 | | * You should have received a copy of the GNU General Public License |
21 | | * along with this program; if not, write to the Free Software |
22 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
23 | | * |
24 | | * This program is also available under a commercial proprietary license. |
25 | | * For more information, contact us at licensing@x264.com. |
26 | | *****************************************************************************/ |
27 | | |
28 | | #include "common/common.h" |
29 | | #include "macroblock.h" |
30 | | #include "me.h" |
31 | | #include "ratecontrol.h" |
32 | | #include "analyse.h" |
33 | | #include "rdo.c" |
34 | | |
/* Per-prediction-list (L0/L1) analysis state for one macroblock: motion
 * estimation results and accumulated costs for every partition size. */
typedef struct
{
    x264_me_t me16x16;
    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */
    x264_me_t me8x8[4];
    x264_me_t me4x4[4][4];
    x264_me_t me8x4[4][2];
    x264_me_t me4x8[4][2];
    x264_me_t me16x8[2];
    x264_me_t me8x16[2];
    int i_rd16x16;          /* RD cost of the 16x16 mode (COST_MAX until computed) */
    int i_cost8x8;
    int i_cost4x4[4];       /* cost per 8x8 partition */
    int i_cost8x4[4];       /* cost per 8x8 partition */
    int i_cost4x8[4];       /* cost per 8x8 partition */
    int i_cost16x8;
    int i_cost8x16;
    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3], [ref][5] is for alignment */
    ALIGNED_8( int16_t mvc[32][6][2] );
} x264_mb_analysis_list_t;
55 | | |
/* Scratch state for analysing a single macroblock: lambda/QP context,
 * intra-mode SATD scores, and inter costs for both reference lists. */
typedef struct
{
    /* conduct the analysis using this lamda and QP */
    int i_lambda;
    int i_lambda2;
    int i_qp;
    uint16_t *p_cost_mv;            /* lambda-scaled mv bit-cost table for i_qp */
    uint16_t *p_cost_ref[2];        /* ref-index bit-cost tables, one per list */
    int i_mbrd;                     /* 0 = SATD only, 1 = RD mode decision, 2 = RD refinement, 3 = QPRD */


    /* I: Intra part */
    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_fast_intra;
    int b_force_intra;              /* For Periodic Intra Refresh. Only supported in P-frames. */
    int b_avoid_topright;           /* For Periodic Intra Refresh: don't predict from top-right pixels. */
    int b_try_skip;

    /* Luma part */
    int i_satd_i16x16;
    int i_satd_i16x16_dir[7];       /* per-direction SATD for i16x16 prediction modes */
    int i_predict16x16;             /* best i16x16 mode found so far */

    int i_satd_i8x8;
    int i_cbp_i8x8_luma;
    ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
    int i_predict8x8[4];

    int i_satd_i4x4;
    int i_predict4x4[16];

    int i_satd_pcm;                 /* approximate cost of I_PCM, or COST_MAX if disabled */

    /* Chroma part */
    int i_satd_chroma;
    int i_satd_chroma_dir[7];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi;              /* used the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4];
    int i_satd8x8[3][4];            /* [L0,L1,BI][8x8 0..3] SATD only */
    int i_cost_est16x8[2];          /* Per-partition estimated cost */
    int i_cost_est8x16[2];
    int i_cost16x8bi;
    int i_cost8x16bi;
    int i_rd16x16bi;
    int i_rd16x16direct;
    int i_rd16x8bi;
    int i_rd8x16bi;
    int i_rd8x8bi;

    int i_mb_partition16x8[2];      /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8;              /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;
    int b_early_terminate;          /* allow early-out shortcuts (subme < 11) */

} x264_mb_analysis_t;
122 | | |
/* TODO: calculate CABAC costs */
/* Approximate CAVLC code-length costs (multiplied by lambda at use sites)
 * for coding macroblock / sub-macroblock types, indexed by the type enum. */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
{
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
static const uint8_t i_mb_b16x8_cost_table[17] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
static const uint8_t i_sub_mb_b_cost_table[13] =
{
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
static const uint8_t i_sub_mb_p_cost_table[4] =
{
    5, 3, 3, 1
};
140 | | |
141 | | static void analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ); |
142 | | |
/* Populate the lazily-allocated cost tables for one QP: lambda-scaled mv
 * bit costs (signed-indexable via pointer offsetting), ref-index costs,
 * full-pel mv costs for exhaustive search, and i4x4 mode costs.
 * Returns 0 on success (or if already initialized), -1 on alloc failure. */
static int init_costs( x264_t *h, float *logs, int qp )
{
    if( h->cost_mv[qp] )
        return 0;

    int mv_range = h->param.analyse.i_mv_range << PARAM_INTERLACED;
    int lambda = x264_lambda_tab[qp];
    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
    CHECKED_MALLOC( h->cost_mv[qp], (4*4*mv_range + 1) * sizeof(uint16_t) );
    /* Offset the pointer into the middle so it can be indexed by signed mv deltas. */
    h->cost_mv[qp] += 2*4*mv_range;
    for( int i = 0; i <= 2*4*mv_range; i++ )
    {
        h->cost_mv[qp][-i] =
        h->cost_mv[qp][i] = X264_MIN( (int)(lambda * logs[i] + .5f), UINT16_MAX );
    }
    /* Ref-index costs: te(v)-coded; a single active ref (i==0) costs nothing. */
    for( int i = 0; i < 3; i++ )
        for( int j = 0; j < 33; j++ )
            h->cost_table->ref[qp][i][j] = i ? X264_MIN( lambda * bs_size_te( i, j ), UINT16_MAX ) : 0;
    /* Full-pel tables (one per qpel phase) are only needed by ESA/TESA search. */
    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
    {
        for( int j = 0; j < 4; j++ )
        {
            CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*mv_range + 1) * sizeof(uint16_t) );
            h->cost_mv_fpel[qp][j] += 2*mv_range;
            for( int i = -2*mv_range; i < 2*mv_range; i++ )
                h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
        }
    }
    uint16_t *cost_i4x4_mode = h->cost_table->i4x4_mode[qp];
    /* index 8 == most-probable mode -> free; any other mode costs 3 bits. */
    for( int i = 0; i < 17; i++ )
        cost_i4x4_mode[i] = 3*lambda*(i!=8);
    return 0;
fail:
    return -1;
}
178 | | |
179 | | int x264_analyse_init_costs( x264_t *h ) |
180 | 0 | { |
181 | 0 | int mv_range = h->param.analyse.i_mv_range << PARAM_INTERLACED; |
182 | 0 | float *logs = x264_malloc( (2*4*mv_range+1) * sizeof(float) ); |
183 | 0 | if( !logs ) |
184 | 0 | return -1; |
185 | | |
186 | 0 | logs[0] = 0.718f; |
187 | 0 | for( int i = 1; i <= 2*4*mv_range; i++ ) |
188 | 0 | logs[i] = log2f( i+1 ) * 2.0f + 1.718f; |
189 | |
|
190 | 0 | for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ ) |
191 | 0 | if( init_costs( h, logs, qp ) ) |
192 | 0 | goto fail; |
193 | | |
194 | 0 | if( init_costs( h, logs, X264_LOOKAHEAD_QP ) ) |
195 | 0 | goto fail; |
196 | | |
197 | 0 | x264_free( logs ); |
198 | 0 | return 0; |
199 | 0 | fail: |
200 | 0 | x264_free( logs ); |
201 | 0 | return -1; |
202 | 0 | } Unexecuted instantiation: x264_8_analyse_init_costs Unexecuted instantiation: x264_10_analyse_init_costs |
203 | | |
204 | | void x264_analyse_free_costs( x264_t *h ) |
205 | 0 | { |
206 | 0 | int mv_range = h->param.analyse.i_mv_range << PARAM_INTERLACED; |
207 | 0 | for( int i = 0; i < QP_MAX+1; i++ ) |
208 | 0 | { |
209 | 0 | if( h->cost_mv[i] ) |
210 | 0 | x264_free( h->cost_mv[i] - 2*4*mv_range ); |
211 | 0 | for( int j = 0; j < 4; j++ ) |
212 | 0 | { |
213 | 0 | if( h->cost_mv_fpel[i][j] ) |
214 | 0 | x264_free( h->cost_mv_fpel[i][j] - 2*mv_range ); |
215 | 0 | } |
216 | 0 | } |
217 | 0 | } Unexecuted instantiation: x264_8_analyse_free_costs Unexecuted instantiation: x264_10_analyse_free_costs |
218 | | |
/* Incrementally apply weighted prediction to L0 reference planes, scaling
 * rows up to roughly (16 + end) plus padding.  Progress is tracked in
 * h->fenc->i_lines_weighted so repeated calls only process new rows.
 * Only the first weighted reference drives the geometry; all weighted refs
 * from that index onward are scaled, then the loop exits (break). */
void x264_analyse_weight_frame( x264_t *h, int end )
{
    for( int j = 0; j < h->i_ref[0]; j++ )
    {
        if( h->sh.weight[j][0].weightfn )
        {
            x264_frame_t *frame = h->fref[0][j];
            int width = frame->i_width[0] + PADH2;
            int i_padv = PADV << PARAM_INTERLACED;
            int offset, height;
            /* Start from the top-left of the padded plane. */
            pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH_ALIGN;
            /* Rows still to weight: target row (clamped to plane height) minus rows already done. */
            height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
            offset = h->fenc->i_lines_weighted*frame->i_stride[0];
            h->fenc->i_lines_weighted += height;
            if( height )
                for( int k = j; k < h->i_ref[0]; k++ )
                    if( h->sh.weight[k][0].weightfn )
                    {
                        pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH_ALIGN;
                        x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
                                                 src + offset, frame->i_stride[0],
                                                 width, height, &h->sh.weight[k][0] );
                    }
            break;
        }
    }
}
246 | | |
247 | | /* initialize an array of lambda*nbits for all possible mvs */ |
248 | | static void mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a ) |
249 | 0 | { |
250 | 0 | a->p_cost_mv = h->cost_mv[a->i_qp]; |
251 | 0 | a->p_cost_ref[0] = h->cost_table->ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)]; |
252 | 0 | a->p_cost_ref[1] = h->cost_table->ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)]; |
253 | 0 | } |
254 | | |
/* Set up all QP-derived state for macroblock analysis: lambdas, trellis
 * lambdas, psy/chroma lambda offsets, and noise-reduction buffers.
 * QPs above QP_MAX_SPEC (emergency denoising range) select the emergency
 * NR buffers and are then clamped to the spec maximum for actual coding. */
static void mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
    a->i_lambda = x264_lambda_tab[qp];
    a->i_lambda2 = x264_lambda2_tab[qp];

    /* Trellis quantization only at mbrd levels when i_trellis > 1. */
    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
    if( h->param.analyse.i_trellis )
    {
        h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp];
        h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp];
        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
    }
    h->mb.i_psy_rd_lambda = a->i_lambda;
    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
    int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;

    if( qp > QP_MAX_SPEC )
    {
        h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
        h->nr_residual_sum = h->nr_residual_sum_buf[1];
        h->nr_count = h->nr_count_buf[1];
        h->mb.b_noise_reduction = 1;
        qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
    }
    else
    {
        h->nr_offset = h->nr_offset_denoise;
        h->nr_residual_sum = h->nr_residual_sum_buf[0];
        h->nr_count = h->nr_count_buf[0];
        h->mb.b_noise_reduction = 0;
    }

    a->i_qp = h->mb.i_qp = qp;
    h->mb.i_chroma_qp = h->chroma_qp_table[qp];
}
293 | | |
/* Initialize the analysis context for one macroblock: RD level, intra
 * SATD seeds, PCM cost, allowed MV ranges (including frame-threading and
 * Periodic Intra Refresh restrictions), inter cost seeds, and the fast
 * intra / forced intra decisions. */
static void mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
    h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
    a->b_early_terminate = h->param.analyse.i_subpel_refine < 11;

    mb_analyse_init_qp( h, a, qp );

    h->mb.b_transform_8x8 = 0;

    /* I: Intra part */
    /* Seed all intra costs to COST_MAX so COPY2_IF_LT comparisons work. */
    a->i_satd_i16x16 =
    a->i_satd_i8x8 =
    a->i_satd_i4x4 = COST_MAX;
    a->i_satd_chroma = CHROMA_FORMAT ? COST_MAX : 0;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it.
     * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */
    uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8;
    a->i_satd_pcm = !h->param.i_avcintra_class && !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX;

    a->b_fast_intra = 0;
    a->b_avoid_topright = 0;
    h->mb.i_skip_intra =
        h->mb.b_lossless ? 0 :
        a->i_mbrd ? 2 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = X264_MAX( h->mb.mv_min[0], -i_fmv_range );
        h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max[0], i_fmv_range-1 );
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
        {
            int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
            int max_mv = max_x - 4*16*h->mb.i_mb_x;
            /* If we're left of the refresh bar, don't reference right of it. */
            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
        }
        h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
        /* Vertical ranges only change once per row (or row pair with MBAFF). */
        if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
        {
            int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
            int thread_mvy_range = i_fmv_range;

            if( h->i_thread_frames > 1 )
            {
                int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                /* Wait until each reference frame has been reconstructed far
                 * enough, and clamp the vertical search range accordingly. */
                for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                    for( int j = 0; j < h->i_ref[i]; j++ )
                    {
                        int completed = x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
                        thread_mvy_range = X264_MIN( thread_mvy_range, completed - pix_y );
                    }

                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( PARAM_INTERLACED )
                    thread_mvy_range >>= 1;

                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
            }

            if( PARAM_INTERLACED )
            {
                /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
                for( int i = 0; i < 3; i++ )
                {
                    int j = i == 2;
                    mb_y = (h->mb.i_mb_y >> j) + (i == 1);
                    h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
                    h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
                    h->mb.mv_miny_spel_row[i] = X264_MAX( h->mb.mv_miny_row[i], -i_fmv_range );
                    h->mb.mv_maxy_spel_row[i] = X264_MIN3( h->mb.mv_maxy_row[i], i_fmv_range-1, 4*thread_mvy_range );
                    h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
                    h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
                }
            }
            else
            {
                h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
                h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
                h->mb.mv_min_spel[1] = X264_MAX( h->mb.mv_min[1], -i_fmv_range );
                h->mb.mv_max_spel[1] = X264_MIN3( h->mb.mv_max[1], i_fmv_range-1, 4*thread_mvy_range );
                h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
                h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
            }
        }
        if( PARAM_INTERLACED )
        {
            /* Select the precomputed row limits matching this MB's field/frame coding. */
            int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
            h->mb.mv_min[1] = h->mb.mv_miny_row[i];
            h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
            h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
            h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
            h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
            h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
        }

        /* Seed inter costs to COST_MAX. */
        a->l0.me16x16.cost =
        a->l0.i_rd16x16 =
        a->l0.i_cost8x8 =
        a->l0.i_cost16x8 =
        a->l0.i_cost8x16 = COST_MAX;
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->l1.me16x16.cost =
            a->l1.i_rd16x16 =
            a->l1.i_cost8x8 =
            a->i_cost8x8direct[0] =
            a->i_cost8x8direct[1] =
            a->i_cost8x8direct[2] =
            a->i_cost8x8direct[3] =
            a->l1.i_cost16x8 =
            a->l1.i_cost8x16 =
            a->i_rd16x16bi =
            a->i_rd16x16direct =
            a->i_rd8x8bi =
            a->i_rd16x8bi =
            a->i_rd8x16bi =
            a->i_cost16x16bi =
            a->i_cost16x16direct =
            a->i_cost8x8bi =
            a->i_cost16x8bi =
            a->i_cost8x16bi = COST_MAX;
        }
        else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
            for( int i = 0; i < 4; i++ )
            {
                a->l0.i_cost4x4[i] =
                a->l0.i_cost8x4[i] =
                a->l0.i_cost4x8[i] = COST_MAX;
            }

        /* Fast intra decision */
        if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            /* Intra is deemed likely if any neighbor (or the co-located MB in
             * the previous frame, or enough MBs so far) was intra. */
            if( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
                IS_INTRA( h->mb.i_mb_type_top ) ||
                IS_INTRA( h->mb.i_mb_type_topleft ) ||
                IS_INTRA( h->mb.i_mb_type_topright ) ||
                (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
                (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16] + h->stat.frame.i_mb_count[I_PCM])) )
            { /* intra is likely */ }
            else
            {
                a->b_fast_intra = 1;
            }
        }
        h->mb.b_skip_mc = 0;
        /* Periodic Intra Refresh: force intra inside the refresh column range. */
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
        {
            a->b_force_intra = 1;
            a->b_fast_intra = 0;
            a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
        }
        else
            a->b_force_intra = 0;
    }
}
472 | | |
473 | | /* Prediction modes allowed for various combinations of neighbors. */ |
474 | | /* Terminated by a -1. */ |
475 | | /* In order, no neighbors, left, top, top/left, top/left/topleft */ |
/* i16x16 modes usable per neighbor combination; each row is -1 terminated.
 * Row index: 0 = none, 1 = left, 2 = top, 3 = top+left, 4 = top+left+topleft. */
static const int8_t i16x16_mode_available[5][5] =
{
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};
484 | | |
/* Chroma modes usable per neighbor combination; same row layout as
 * i16x16_mode_available. */
static const int8_t chroma_mode_available[5][5] =
{
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
};
493 | | |
/* i8x8 modes: [avoid_topright][neighbor combination][modes, -1 terminated].
 * The second table drops modes that read top-right pixels (PIR restriction). */
static const int8_t i8x8_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_H, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
    }
};
511 | | |
/* i4x4 modes: [avoid_topright][neighbor combination][modes, -1 terminated].
 * Same layout as i8x8_mode_available but with the 4x4 top-right rules. */
static const int8_t i4x4_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1},
    }
};
529 | | |
530 | | static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour ) |
531 | 0 | { |
532 | 0 | int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); |
533 | 0 | idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT); |
534 | 0 | return i16x16_mode_available[idx]; |
535 | 0 | } |
536 | | |
537 | | static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour ) |
538 | 0 | { |
539 | 0 | int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); |
540 | 0 | idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT); |
541 | 0 | return chroma_mode_available[idx]; |
542 | 0 | } |
543 | | |
544 | | static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i ) |
545 | 0 | { |
546 | 0 | int avoid_topright = force_intra && (i&1); |
547 | 0 | int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); |
548 | 0 | idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT); |
549 | 0 | return i8x8_mode_available[avoid_topright][idx]; |
550 | 0 | } |
551 | | |
552 | | static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i ) |
553 | 0 | { |
554 | 0 | int avoid_topright = force_intra && ((i&5) == 5); |
555 | 0 | int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); |
556 | 0 | idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT); |
557 | 0 | return i4x4_mode_available[avoid_topright][idx]; |
558 | 0 | } |
559 | | |
560 | | /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */ |
561 | | static inline void psy_trellis_init( x264_t *h, int do_both_dct ) |
562 | 0 | { |
563 | 0 | if( do_both_dct || h->mb.b_transform_8x8 ) |
564 | 0 | h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], (pixel*)x264_zero ); |
565 | 0 | if( do_both_dct || !h->mb.b_transform_8x8 ) |
566 | 0 | h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], (pixel*)x264_zero ); |
567 | 0 | } |
568 | | |
569 | | /* Reset fenc satd scores cache for psy RD */ |
/* Reset fenc satd scores cache for psy RD.  Also triggers the psy-trellis
 * DCT precomputation when trellis==2; no-op beyond that if psy-rd is off. */
static inline void mb_init_fenc_cache( x264_t *h, int b_satd )
{
    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
        psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
    if( !h->mb.i_psy_rd )
        return;

    /* Clear the 8 hadamard cache slots (two per 128-bit store) plus the flag. */
    M128( &h->mb.pic.fenc_hadamard_cache[0] ) = M128_ZERO;
    M128( &h->mb.pic.fenc_hadamard_cache[2] ) = M128_ZERO;
    M128( &h->mb.pic.fenc_hadamard_cache[4] ) = M128_ZERO;
    M128( &h->mb.pic.fenc_hadamard_cache[6] ) = M128_ZERO;
    h->mb.pic.fenc_hadamard_cache[8] = 0;
    if( b_satd )
        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}
585 | | |
/* Select the best intra chroma prediction mode by SATD + mode-bit cost.
 * Results go to a->i_satd_chroma / a->i_predict8x8chroma and
 * h->mb.i_chroma_pred_mode.  Early-outs if already computed.
 * In 4:4:4, chroma uses the luma i16x16 mode, so only its cost is measured. */
static void mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
    if( a->i_satd_chroma < COST_MAX )
        return;

    if( CHROMA444 )
    {
        if( !h->mb.b_chroma_me )
        {
            a->i_satd_chroma = 0;
            return;
        }

        /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
        if( h->mb.b_lossless )
        {
            x264_predict_lossless_16x16( h, 1, a->i_predict16x16 );
            x264_predict_lossless_16x16( h, 2, a->i_predict16x16 );
        }
        else
        {
            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
        }
        a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
                         + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
        return;
    }

    const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];

    /* Prediction selection for chroma */
    if( predict_mode[3] >= 0 && !h->mb.b_lossless )
    {
        /* Fast path: V/H/DC scored in one batched call, plane scored separately. */
        int satdu[4], satdv[4];
        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE );
        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );

        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );

            a->i_satd_chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }
    else
    {
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_satd;
            int i_mode = *predict_mode;

            /* we do the prediction */
            if( h->mb.b_lossless )
                x264_predict_lossless_chroma( h, i_mode );
            else
            {
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
            }

            /* we calculate the cost */
            i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) +
                     h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );

            a->i_satd_chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }

    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
}
666 | | |
667 | | /* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */ |
668 | | static void mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter ) |
669 | 0 | { |
670 | 0 | const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter; |
671 | 0 | pixel *p_src = h->mb.pic.p_fenc[0]; |
672 | 0 | pixel *p_dst = h->mb.pic.p_fdec[0]; |
673 | 0 | static const int8_t intra_analysis_shortcut[2][2][2][5] = |
674 | 0 | { |
675 | 0 | {{{I_PRED_4x4_HU, -1, -1, -1, -1}, |
676 | 0 | {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}}, |
677 | 0 | {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1}, |
678 | 0 | {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}}, |
679 | 0 | {{{I_PRED_4x4_HU, -1, -1, -1, -1}, |
680 | 0 | {-1, -1, -1, -1, -1}}, |
681 | 0 | {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1}, |
682 | 0 | {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}}, |
683 | 0 | }; |
684 | |
|
685 | 0 | int idx; |
686 | 0 | int lambda = a->i_lambda; |
687 | | |
688 | | /*---------------- Try all mode and calculate their score ---------------*/ |
689 | | /* Disabled i16x16 for AVC-Intra compat */ |
690 | 0 | if( !h->param.i_avcintra_class ) |
691 | 0 | { |
692 | 0 | const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra ); |
693 | | |
694 | | /* Not heavily tuned */ |
695 | 0 | static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 }; |
696 | 0 | int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX; |
697 | |
|
698 | 0 | if( !h->mb.b_lossless && predict_mode[3] >= 0 ) |
699 | 0 | { |
700 | 0 | h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir ); |
701 | 0 | a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0); |
702 | 0 | a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1); |
703 | 0 | a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2); |
704 | 0 | COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 ); |
705 | 0 | COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 ); |
706 | 0 | COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 ); |
707 | | |
708 | | /* Plane is expensive, so don't check it unless one of the previous modes was useful. */ |
709 | 0 | if( a->i_satd_i16x16 <= i16x16_thresh ) |
710 | 0 | { |
711 | 0 | h->predict_16x16[I_PRED_16x16_P]( p_dst ); |
712 | 0 | a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE ); |
713 | 0 | a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3); |
714 | 0 | COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 ); |
715 | 0 | } |
716 | 0 | } |
717 | 0 | else |
718 | 0 | { |
719 | 0 | for( ; *predict_mode >= 0; predict_mode++ ) |
720 | 0 | { |
721 | 0 | int i_satd; |
722 | 0 | int i_mode = *predict_mode; |
723 | |
|
724 | 0 | if( h->mb.b_lossless ) |
725 | 0 | x264_predict_lossless_16x16( h, 0, i_mode ); |
726 | 0 | else |
727 | 0 | h->predict_16x16[i_mode]( p_dst ); |
728 | |
|
729 | 0 | i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE ) + |
730 | 0 | lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] ); |
731 | 0 | COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode ); |
732 | 0 | a->i_satd_i16x16_dir[i_mode] = i_satd; |
733 | 0 | } |
734 | 0 | } |
735 | |
|
736 | 0 | if( h->sh.i_type == SLICE_TYPE_B ) |
737 | | /* cavlc mb type prefix */ |
738 | 0 | a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16]; |
739 | |
|
740 | 0 | if( a->i_satd_i16x16 > i16x16_thresh ) |
741 | 0 | return; |
742 | 0 | } |
743 | | |
744 | 0 | uint16_t *cost_i4x4_mode = h->cost_table->i4x4_mode[a->i_qp] + 8; |
745 | | /* 8x8 prediction selection */ |
746 | 0 | if( flags & X264_ANALYSE_I8x8 ) |
747 | 0 | { |
748 | 0 | ALIGNED_ARRAY_32( pixel, edge,[36] ); |
749 | 0 | x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8]; |
750 | 0 | int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 ); |
751 | | |
752 | | // FIXME some bias like in i4x4? |
753 | 0 | int i_cost = lambda * 4; /* base predmode costs */ |
754 | 0 | h->mb.i_cbp_luma = 0; |
755 | |
|
756 | 0 | if( h->sh.i_type == SLICE_TYPE_B ) |
757 | 0 | i_cost += lambda * i_mb_b_cost_table[I_8x8]; |
758 | |
|
759 | 0 | for( idx = 0;; idx++ ) |
760 | 0 | { |
761 | 0 | int x = idx&1; |
762 | 0 | int y = idx>>1; |
763 | 0 | pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE; |
764 | 0 | pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE; |
765 | 0 | int i_best = COST_MAX; |
766 | 0 | int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx ); |
767 | |
|
768 | 0 | const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx ); |
769 | 0 | h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS ); |
770 | |
|
771 | 0 | if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 ) |
772 | 0 | { |
773 | | /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */ |
774 | 0 | i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] ); |
775 | 0 | i_cost += i_best & 0xffff; |
776 | 0 | i_best >>= 16; |
777 | 0 | a->i_predict8x8[idx] = i_best; |
778 | 0 | if( idx == 3 || i_cost > i_satd_thresh ) |
779 | 0 | break; |
780 | 0 | x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best ); |
781 | 0 | } |
782 | 0 | else |
783 | 0 | { |
784 | 0 | if( !h->mb.b_lossless && predict_mode[5] >= 0 ) |
785 | 0 | { |
786 | 0 | ALIGNED_ARRAY_16( int32_t, satd,[4] ); |
787 | 0 | h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd ); |
788 | 0 | int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; |
789 | 0 | if( i_pred_mode < 3 ) |
790 | 0 | satd[i_pred_mode] -= 3 * lambda; |
791 | 0 | for( int i = 2; i >= 0; i-- ) |
792 | 0 | { |
793 | 0 | int cost = satd[i]; |
794 | 0 | a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda; |
795 | 0 | COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i ); |
796 | 0 | } |
797 | | |
798 | | /* Take analysis shortcuts: don't analyse modes that are too |
799 | | * far away direction-wise from the favored mode. */ |
800 | 0 | if( a->i_mbrd < 1 + a->b_fast_intra ) |
801 | 0 | predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical]; |
802 | 0 | else |
803 | 0 | predict_mode += 3; |
804 | 0 | } |
805 | |
|
806 | 0 | for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ ) |
807 | 0 | { |
808 | 0 | int i_satd; |
809 | 0 | int i_mode = *predict_mode; |
810 | |
|
811 | 0 | if( h->mb.b_lossless ) |
812 | 0 | x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge ); |
813 | 0 | else |
814 | 0 | h->predict_8x8[i_mode]( p_dst_by, edge ); |
815 | |
|
816 | 0 | i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ); |
817 | 0 | if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ) |
818 | 0 | i_satd -= 3 * lambda; |
819 | |
|
820 | 0 | COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode ); |
821 | 0 | a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda; |
822 | 0 | } |
823 | 0 | i_cost += i_best + 3*lambda; |
824 | |
|
825 | 0 | if( idx == 3 || i_cost > i_satd_thresh ) |
826 | 0 | break; |
827 | 0 | if( h->mb.b_lossless ) |
828 | 0 | x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge ); |
829 | 0 | else |
830 | 0 | h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge ); |
831 | 0 | x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] ); |
832 | 0 | } |
833 | | /* we need to encode this block now (for next ones) */ |
834 | 0 | x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 ); |
835 | 0 | } |
836 | |
|
837 | 0 | if( idx == 3 ) |
838 | 0 | { |
839 | 0 | a->i_satd_i8x8 = i_cost; |
840 | 0 | if( h->mb.i_skip_intra ) |
841 | 0 | { |
842 | 0 | h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 ); |
843 | 0 | h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ); |
844 | 0 | h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ); |
845 | 0 | h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ); |
846 | 0 | h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ); |
847 | 0 | h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma; |
848 | 0 | if( h->mb.i_skip_intra == 2 ) |
849 | 0 | h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) ); |
850 | 0 | } |
851 | 0 | } |
852 | 0 | else |
853 | 0 | { |
854 | 0 | static const uint16_t cost_div_fix8[3] = {1024,512,341}; |
855 | 0 | a->i_satd_i8x8 = COST_MAX; |
856 | 0 | i_cost = (i_cost * cost_div_fix8[idx]) >> 8; |
857 | 0 | } |
858 | | /* Not heavily tuned */ |
859 | 0 | static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 }; |
860 | 0 | if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 ) |
861 | 0 | return; |
862 | 0 | } |
863 | | |
864 | | /* 4x4 prediction selection */ |
865 | 0 | if( flags & X264_ANALYSE_I4x4 ) |
866 | 0 | { |
867 | 0 | int i_cost = lambda * (24+16); /* 24from JVT (SATD0), 16 from base predmode costs */ |
868 | 0 | int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX; |
869 | 0 | h->mb.i_cbp_luma = 0; |
870 | |
|
871 | 0 | if( a->b_early_terminate && a->i_mbrd ) |
872 | 0 | i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8; |
873 | |
|
874 | 0 | if( h->sh.i_type == SLICE_TYPE_B ) |
875 | 0 | i_cost += lambda * i_mb_b_cost_table[I_4x4]; |
876 | |
|
877 | 0 | for( idx = 0;; idx++ ) |
878 | 0 | { |
879 | 0 | pixel *p_src_by = p_src + block_idx_xy_fenc[idx]; |
880 | 0 | pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx]; |
881 | 0 | int i_best = COST_MAX; |
882 | 0 | int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx ); |
883 | |
|
884 | 0 | const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx ); |
885 | |
|
886 | 0 | if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP ) |
887 | | /* emulate missing topright samples */ |
888 | 0 | MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] ); |
889 | |
|
890 | 0 | if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 ) |
891 | 0 | { |
892 | | /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */ |
893 | 0 | i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode ); |
894 | 0 | i_cost += i_best & 0xffff; |
895 | 0 | i_best >>= 16; |
896 | 0 | a->i_predict4x4[idx] = i_best; |
897 | 0 | if( i_cost > i_satd_thresh || idx == 15 ) |
898 | 0 | break; |
899 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best; |
900 | 0 | } |
901 | 0 | else |
902 | 0 | { |
903 | 0 | if( !h->mb.b_lossless && predict_mode[5] >= 0 ) |
904 | 0 | { |
905 | 0 | ALIGNED_ARRAY_16( int32_t, satd,[4] ); |
906 | 0 | h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd ); |
907 | 0 | int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V]; |
908 | 0 | if( i_pred_mode < 3 ) |
909 | 0 | satd[i_pred_mode] -= 3 * lambda; |
910 | 0 | i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC; |
911 | 0 | COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H ); |
912 | 0 | COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V ); |
913 | | |
914 | | /* Take analysis shortcuts: don't analyse modes that are too |
915 | | * far away direction-wise from the favored mode. */ |
916 | 0 | if( a->i_mbrd < 1 + a->b_fast_intra ) |
917 | 0 | predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical]; |
918 | 0 | else |
919 | 0 | predict_mode += 3; |
920 | 0 | } |
921 | |
|
922 | 0 | if( i_best > 0 ) |
923 | 0 | { |
924 | 0 | for( ; *predict_mode >= 0; predict_mode++ ) |
925 | 0 | { |
926 | 0 | int i_satd; |
927 | 0 | int i_mode = *predict_mode; |
928 | |
|
929 | 0 | if( h->mb.b_lossless ) |
930 | 0 | x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode ); |
931 | 0 | else |
932 | 0 | h->predict_4x4[i_mode]( p_dst_by ); |
933 | |
|
934 | 0 | i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_src_by, FENC_STRIDE, p_dst_by, FDEC_STRIDE ); |
935 | 0 | if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ) |
936 | 0 | { |
937 | 0 | i_satd -= lambda * 3; |
938 | 0 | if( i_satd <= 0 ) |
939 | 0 | { |
940 | 0 | i_best = i_satd; |
941 | 0 | a->i_predict4x4[idx] = i_mode; |
942 | 0 | break; |
943 | 0 | } |
944 | 0 | } |
945 | | |
946 | 0 | COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode ); |
947 | 0 | } |
948 | 0 | } |
949 | |
|
950 | 0 | i_cost += i_best + 3 * lambda; |
951 | 0 | if( i_cost > i_satd_thresh || idx == 15 ) |
952 | 0 | break; |
953 | 0 | if( h->mb.b_lossless ) |
954 | 0 | x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] ); |
955 | 0 | else |
956 | 0 | h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by ); |
957 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx]; |
958 | 0 | } |
959 | | /* we need to encode this block now (for next ones) */ |
960 | 0 | x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 ); |
961 | 0 | } |
962 | 0 | if( idx == 15 ) |
963 | 0 | { |
964 | 0 | a->i_satd_i4x4 = i_cost; |
965 | 0 | if( h->mb.i_skip_intra ) |
966 | 0 | { |
967 | 0 | h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 ); |
968 | 0 | h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ); |
969 | 0 | h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ); |
970 | 0 | h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ); |
971 | 0 | h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ); |
972 | 0 | h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma; |
973 | 0 | if( h->mb.i_skip_intra == 2 ) |
974 | 0 | h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) ); |
975 | 0 | } |
976 | 0 | } |
977 | 0 | else |
978 | 0 | a->i_satd_i4x4 = COST_MAX; |
979 | 0 | } |
980 | 0 | } |
981 | | |
/* Re-evaluate the surviving intra modes (I_16x16, I_4x4, I_8x8) with full
 * rate-distortion cost instead of the SATD estimates computed earlier.
 *
 * h             - encoder context; h->mb holds the macroblock being analysed
 * a             - per-MB analysis state; i_satd_* fields are read as SATD
 *                 estimates and overwritten with RD costs (or COST_MAX)
 * i_satd_thresh - only modes whose SATD estimate beats this threshold are
 *                 re-costed; disabled (COST_MAX) when early termination is off
 *
 * NOTE(review): the 16x16 -> 4x4 -> 8x8 evaluation order appears deliberate:
 * rd_cost_mb() encodes the MB, and intra_rd_refine() relies on the pixels/coefs
 * of the most recently encoded mode still being in the decode buffers — do not
 * reorder these without checking that dependency. */
static void intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
{
    if( !a->b_early_terminate )
        i_satd_thresh = COST_MAX;

    /* I_16x16: replace the SATD estimate with the true RD cost, or mark the
     * mode as rejected (COST_MAX) if it didn't pass the threshold. */
    if( a->i_satd_i16x16 < i_satd_thresh )
    {
        h->mb.i_type = I_16x16;
        analyse_update_cache( h, a );
        a->i_satd_i16x16 = rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i16x16 = COST_MAX;

    /* I_4x4: same treatment. */
    if( a->i_satd_i4x4 < i_satd_thresh )
    {
        h->mb.i_type = I_4x4;
        analyse_update_cache( h, a );
        a->i_satd_i4x4 = rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i4x4 = COST_MAX;

    /* I_8x8: additionally snapshot the luma CBP produced by the RD encode,
     * which intra_rd_refine() restores before each 8x8 trial re-encode. */
    if( a->i_satd_i8x8 < i_satd_thresh )
    {
        h->mb.i_type = I_8x8;
        analyse_update_cache( h, a );
        a->i_satd_i8x8 = rd_cost_mb( h, a->i_lambda2 );
        a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
    }
    else
        a->i_satd_i8x8 = COST_MAX;
}
1015 | | |
/* Final RD refinement of the chosen intra mode: re-examine alternative
 * prediction directions (16x16 whole-MB, chroma, and per-block 4x4/8x8) with
 * exact RD cost, keeping the best one found.
 *
 * Preconditions: h->mb.i_type has been decided (I_16x16 / I_4x4 / I_8x8) and
 * intra_rd() has just encoded the MB, so the decode buffers still hold the
 * pixels/coefs of the current mode.
 *
 * Side effects: updates a->i_predict16x16 / i_predict4x4[] / i_predict8x8[],
 * a->i_predict8x8chroma, a->i_cbp_i8x8_luma, h->mb.i_chroma_pred_mode,
 * h->mb.i_cbp_chroma, the intra-pred-mode / non_zero_count caches, and the
 * fdec pixel buffers. */
static void intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
    uint64_t i_satd, i_best;
    int plane_count = CHROMA444 ? 3 : 1;
    h->mb.i_skip_intra = 0;

    /* I_16x16: try every available prediction direction other than the one
     * already chosen; skip directions whose SATD was far worse (9/8 threshold)
     * when early termination is enabled. */
    if( h->mb.i_type == I_16x16 )
    {
        int old_pred_mode = a->i_predict16x16;
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
        int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX;
        i_best = a->i_satd_i16x16;
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
                continue;
            h->mb.i_intra16x16_pred_mode = i_mode;
            i_satd = rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
        }
    }

    /* RD selection for chroma prediction (subsampled chroma only; 4:4:4 chroma
     * is handled as luma planes elsewhere). */
    if( CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422 )
    {
        const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
        if( predict_mode[1] >= 0 )
        {
            int8_t predict_mode_sorted[4];
            int i_max;
            int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;

            /* Collect the candidate modes worth re-costing: everything below
             * the threshold except the currently selected mode. */
            for( i_max = 0; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
                    predict_mode_sorted[i_max++] = i_mode;
            }

            if( i_max > 0 )
            {
                int i_cbp_chroma_best = h->mb.i_cbp_chroma;
                int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
                /* the previous thing encoded was intra_rd(), so the pixels and
                 * coefs for the current chroma mode are still around, so we only
                 * have to recount the bits. */
                i_best = rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
                for( int i = 0; i < i_max; i++ )
                {
                    int i_mode = predict_mode_sorted[i];
                    if( h->mb.b_lossless )
                        x264_predict_lossless_chroma( h, i_mode );
                    else
                    {
                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
                    }
                    /* if we've already found a mode that needs no residual, then
                     * probably any mode with a residual will be worse.
                     * so avoid dct on the remaining modes to improve speed. */
                    i_satd = rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                    COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
                }
                h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
                h->mb.i_cbp_chroma = i_cbp_chroma_best;
            }
        }
    }

    /* I_4x4: refine each of the 16 luma 4x4 blocks in raster order. Each trial
     * encode overwrites the 4x4 region of fdec, so the best candidate's pixels
     * and nnz are snapshotted (pels/nnz) and restored afterwards — later
     * blocks predict from these reconstructed samples. */
    if( h->mb.i_type == I_4x4 )
    {
        pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
        int nnz[3] = {0};
        for( int idx = 0; idx < 16; idx++ )
        {
            pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
                             CHROMA_FORMAT ? h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx] : NULL,
                             CHROMA_FORMAT ? h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx] : NULL};
            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                for( int p = 0; p < plane_count; p++ )
                    /* emulate missing topright samples */
                    MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );

            /* Try each available direction; remember the reconstruction of the
             * best one so it can be reinstated after the loop. */
            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                i_satd = rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict4x4[idx] = i_mode;
                    i_best = i_satd;
                    for( int p = 0; p < plane_count; p++ )
                    {
                        pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
                        pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
                        pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
                        pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
                        nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
                    }
                }
            }

            /* Restore the winning reconstruction and its nnz into the decode
             * buffers for use by subsequent blocks. */
            for( int p = 0; p < plane_count; p++ )
            {
                MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
                MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
                MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
                MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
                h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
            }

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
    }
    /* I_8x8: same refine-and-restore scheme per 8x8 block, but only the
     * bottom row and (for left-column blocks) right edge need restoring,
     * since that is all later blocks predict from. */
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
        pixel4 pels_h[3][2] = {{0}};
        pixel pels_v[3][7] = {{0}};
        uint16_t nnz[3][2] = {{0}}; //shut up gcc
        for( int idx = 0; idx < 4; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            int s8 = X264_SCAN8_0 + 2*x + 16*y;
            pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
                             CHROMA_FORMAT ? h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE : NULL,
                             CHROMA_FORMAT ? h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE : NULL};
            int cbp_luma_new = 0;
            /* Wider threshold (11/8) than i16x16's 9/8 — 8x8 SATD estimates
             * are presumably less reliable predictors of RD cost. */
            int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;

            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
            for( int p = 0; p < plane_count; p++ )
                h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
                    continue;

                /* Each trial must start from the CBP state left by intra_rd(),
                 * since rd_cost_i8x8() updates it. */
                h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
                i_satd = rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );

                if( i_best > i_satd )
                {
                    a->i_predict8x8[idx] = i_mode;
                    cbp_luma_new = h->mb.i_cbp_luma;
                    i_best = i_satd;

                    /* Snapshot the prediction-relevant edges: bottom row always,
                     * right column only for left-side blocks (idx 0 and 2). */
                    for( int p = 0; p < plane_count; p++ )
                    {
                        pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
                        pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
                        if( !(idx&1) )
                            for( int j = 0; j < 7; j++ )
                                pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
                        nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
                        nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
                    }
                }
            }
            a->i_cbp_i8x8_luma = cbp_luma_new;
            /* Restore the winner's edges and nnz for subsequent blocks. */
            for( int p = 0; p < plane_count; p++ )
            {
                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
                if( !(idx&1) )
                    for( int j = 0; j < 7; j++ )
                        dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
                M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
                M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
            }

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }
    }
}
1202 | | |
/* Initialize an x264_me_t's source-frame (fenc) pointers and strides for the
 * partition at (xoff, yoff) within the current macroblock. Chroma offsets are
 * scaled by the chroma subsampling shifts. Also wires in the shared MV cost
 * table from the analysis context `a` (captured from the enclosing scope). */
#define LOAD_FENC(m, src, xoff, yoff) \
{ \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    if( CHROMA_FORMAT ) \
    { \
        (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
        (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
    } \
}
1216 | | |
/* Initialize an x264_me_t's reference-frame pointers for (list, ref) at
 * partition offset (xoff, yoff). Planes [1..3] (half-pel planes — presumably;
 * confirm against the p_fref layout in common/) are only loaded when subpel
 * refinement is enabled. 4:4:4 loads full per-plane chroma refs; other chroma
 * formats load a single interleaved chroma plane with a vertically-shifted
 * offset. Also sets the ESA integral-image pointer, a no-weight default, and
 * the ref index. LOAD_FENC must have been applied first (uses i_stride). */
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
{ \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    if( h->param.analyse.i_subpel_refine ) \
    { \
        (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
        (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
        (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    } \
    if( CHROMA444 ) \
    { \
        (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
        if( h->param.analyse.i_subpel_refine ) \
        { \
            (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
            (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
            (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
            (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
            (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
            (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
        } \
    } \
    else if( CHROMA_FORMAT ) \
        (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
    if( h->param.analyse.i_me_method >= X264_ME_ESA ) \
        (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = x264_weight_none; \
    (m)->i_ref = ref; \
}
1247 | | |
/* Override the working luma reference pointer with the weighted-prediction
 * plane and install the slice's weights. Overwrites the defaults set by
 * LOAD_HPELS, so it must come after it. NOTE(review): references `i_ref`
 * from the enclosing scope rather than the `ref` parameter — callers must
 * have a local named i_ref. */
#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[i_ref];
1251 | | |
/* Bit cost (lambda-scaled) of signalling reference index `ref` in `list`,
 * looked up from the precomputed table in the analysis context `a`. */
#define REF_COST(list, ref) \
    (a->p_cost_ref[list][ref])
1254 | | |
/* Motion-estimate the whole macroblock as a single 16x16 P partition against
 * every available L0 reference frame, keeping the cheapest result in
 * a->l0.me16x16.
 *
 * Side effects: fills a->l0.mvc[ref][0] and h->mb.mvr (MV predictors for
 * neighbors), caches the winning ref, and may set h->mb.i_type to P_SKIP
 * (early skip detection) or P_SKIP via the RD path below. */
static void mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    int i_mvc;
    ALIGNED_ARRAY_8( int16_t, mvc,[8],[2] );
    /* Shared halfpel-refinement threshold lets later refs terminate early;
     * only used with multiple refs and early termination on. */
    int i_halfpel_thresh = INT_MAX;
    int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;

    /* 16x16 Search on all ref frame */
    m.i_pixel = PIXEL_16x16;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    a->l0.me16x16.cost = INT_MAX;
    for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
    {
        m.i_ref_cost = REF_COST( 0, i_ref );
        /* Bias the threshold by the ref cost so comparisons between refs are
         * on mv+distortion alone; undone after the search below. */
        i_halfpel_thresh -= m.i_ref_cost;

        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );

        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );

        if( h->mb.ref_blind_dupe == i_ref )
        {
            /* This ref is a duplicate of ref 0: reuse ref 0's MV and only do
             * cheap qpel refinement instead of a full search. */
            CP32( m.mv, a->l0.mvc[0][0] );
            x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
        }
        else
        {
            x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
        }

        /* save mv for predicting neighbors */
        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
        CP32( a->l0.mvc[i_ref][0], m.mv );

        /* early termination
         * SSD threshold would probably be better than SATD */
        if( i_ref == 0
            && a->b_try_skip
            && m.cost-m.cost_mv < 300*a->i_lambda
            && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
             + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
            && x264_macroblock_probe_pskip( h ) )
        {
            h->mb.i_type = P_SKIP;
            analyse_update_cache( h, a );
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
            return;
        }

        m.cost += m.i_ref_cost;
        i_halfpel_thresh += m.i_ref_cost;

        if( m.cost < a->l0.me16x16.cost )
            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
    }

    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );

    h->mb.i_type = P_L0;
    if( a->i_mbrd )
    {
        mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
        /* If the best MV equals the skip MV on ref 0, compute the true RD
         * cost; an all-zero CBP means skip really is free, so take P_SKIP. */
        if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
            a->l0.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 );
            if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
                h->mb.i_type = P_SKIP;
        }
    }
}
1333 | | |
/* Motion-estimate the macroblock as four 8x8 P partitions, allowing each
 * partition its own L0 reference (mixed refs). Results go to a->l0.me8x8[]
 * and the summed cost to a->l0.i_cost8x8; sub-partition types are all set
 * to D_L0_8x8. */
static void mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    int i_maxref = h->mb.pic.i_fref[0]-1;

    h->mb.i_partition = D_8x8;

/* Raise i_maxref to cover a neighbor's ref index (duplicate refs excluded,
 * since they are handled via the ref_blind_dupe jump in the loop below). */
#define CHECK_NEIGHBOUR(i)\
{\
    int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
    if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
        i_maxref = ref;\
}

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
        h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
    {
        i_maxref = 0;
        CHECK_NEIGHBOUR( -8 - 1 );
        CHECK_NEIGHBOUR( -8 + 0 );
        CHECK_NEIGHBOUR( -8 + 2 );
        CHECK_NEIGHBOUR( -8 + 4 );
        CHECK_NEIGHBOUR(  0 - 1 );
        CHECK_NEIGHBOUR( 2*8 - 1 );
    }
#undef CHECK_NEIGHBOUR

    /* Seed mvc[ref][0] with each ref's 16x16 MV from the neighbor cache. */
    for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x8[i];
        int x8 = i&1;
        int y8 = i>>1;

        m.i_pixel = PIXEL_8x8;

        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
        l0m->cost = INT_MAX;
        /* Non-linear ref iteration: after i_maxref, optionally jump to the
         * blind-duplicate ref (see bottom of loop). */
        for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
        {
            m.i_ref_cost = REF_COST( 0, i_ref );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            if( h->mb.ref_blind_dupe == i_ref )
            {
                /* Duplicate of ref 0: reuse ref 0's MV for this partition and
                 * do cheap refinement only. */
                CP32( m.mv, a->l0.mvc[0][i+1] );
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );

            m.cost += m.i_ref_cost;

            /* mvc[ref][i+1] feeds the MV-candidate list of later partitions. */
            CP32( a->l0.mvc[i_ref][i+1], m.mv );

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
            if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
                i_ref = h->mb.ref_blind_dupe;
            else
                i_ref++;
        }
        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );

        /* Pure distortion (no mv/ref signalling cost) for sub-partition RD. */
        a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );

        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
           are effectively zero. */
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* P_8x8 ref0 has no ref cost */
    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
    M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
}
1424 | | |
/* Motion-estimate the macroblock as four 8x8 P partitions that all share the
 * reference frame chosen by the 16x16 analysis (the non-mixed-refs variant of
 * mb_analyse_inter_p8x8_mixed_ref). Fills a->l0.me8x8[] and a->l0.i_cost8x8. */
static void mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
{
    /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
     * reference frame flags. Thus, if we're not doing mixedrefs, just
     * don't bother analysing the dupes. */
    const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
    /* With CAVLC, ref 0 costs nothing to signal. */
    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
    pixel **p_fenc = h->mb.pic.p_fenc;
    int i_mvc;
    int16_t (*mvc)[2] = a->l0.mvc[i_ref];

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    /* MV candidate list starts with the 16x16 result and grows with each
     * partition's MV as the loop proceeds. */
    i_mvc = 1;
    CP32( mvc[0], a->l0.me16x16.mv );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *m = &a->l0.me8x8[i];
        int x8 = i&1;
        int y8 = i>>1;

        m->i_pixel = PIXEL_8x8;
        m->i_ref_cost = i_ref_cost;

        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
        LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
        x264_me_search( h, m, mvc, i_mvc );

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );

        CP32( mvc[i_mvc], m->mv );
        i_mvc++;

        /* Pure distortion (no MV signalling cost) for later RD decisions. */
        a->i_satd8x8[0][i] = m->cost - m->cost_mv;

        /* mb type cost */
        m->cost += i_ref_cost;
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* theoretically this should include 4*ref_cost,
     * but 3 seems a better approximation of cabac. */
    if( h->param.b_cabac )
        a->l0.i_cost8x8 -= i_ref_cost;
    M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
}
1479 | | |
/* P-frame 16x8 partition analysis. For each of the two 16x8 halves, the
 * candidate refs are the (at most two distinct) refs chosen by the two
 * underlying 8x8 searches. i_best_satd is the best cost found so far and
 * drives early termination after the first half. */
static void mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_ARRAY_8( int16_t, mvc,[3],[2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_16x8;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me16x8[i];
        const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int ref8[2] = { minref, maxref };
        /* Search each distinct ref only once. */
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_16x8;

        LOAD_FENC( &m, p_fenc, 0, 8*i );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );

            /* Cache the candidate ref before MV prediction so the predictor
             * sees consistent neighbor refs. */
            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
        {
            /* COST_MAX disqualifies 16x8 from the final mode decision. */
            a->l0.i_cost16x8 = COST_MAX;
            return;
        }

        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
    }

    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
}
1545 | | |
/* P-frame 8x16 partition analysis; vertical-split counterpart of
 * mb_analyse_inter_p16x8. Candidate refs come from the two vertically
 * stacked 8x8 blocks of each half; i_best_satd drives early termination
 * after the first half. */
static void mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_ARRAY_8( int16_t, mvc,[3],[2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x16;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x16[i];
        const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int ref8[2] = { minref, maxref };
        /* Search each distinct ref only once. */
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_8x16;

        LOAD_FENC( &m, p_fenc, 8*i, 0 );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            /* MV candidates: 16x16 MV plus the two co-located 8x8 MVs. */
            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][i+3] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );

            /* Cache the candidate ref before MV prediction so the predictor
             * sees consistent neighbor refs. */
            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
        {
            /* COST_MAX disqualifies 8x16 from the final mode decision. */
            a->l0.i_cost8x16 = COST_MAX;
            return;
        }

        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
    }

    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
}
1610 | | |
/* Chroma mbcmp cost of the luma sub-8x8 MVs inside 8x8 block i8x8, for one
 * of the partition shapes 4x4/8x4/4x8. ALWAYS_INLINE + a constant 'chroma'
 * argument lets the compiler specialize one copy per chroma format (the
 * wrapper below passes only compile-time constants).
 * Returns the summed U+V mbcmp score; MC results for both planes are built
 * side by side in pix1/pix2. */
static ALWAYS_INLINE int mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
                                                                pixel **p_fref, int i8x8, int size, int chroma )
{
    ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
    pixel *pix2 = pix1+8;
    int i_stride = h->mb.pic.i_stride[1];
    int chroma_h_shift = chroma <= CHROMA_422;
    int chroma_v_shift = chroma == CHROMA_420;
    /* Offset of this 8x8 block inside the chroma reference plane. */
    int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
    int i_ref = a->l0.me8x8[i8x8].i_ref;
    /* Vertical chroma MV adjustment for interlaced field refs (4:2:0 only) --
     * NOTE(review): '&' between MB_INTERLACED and i_ref appears to be the usual
     * x264 bitwise idiom; confirm against other call sites. */
    int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
    x264_weight_t *weight = h->sh.weight[i_ref];

    // FIXME weight can be done on 4x4 blocks even if mc is smaller
#define CHROMA4x4MC( width, height, me, x, y ) \
    if( chroma == CHROMA_444 ) \
    { \
        int mvx = (me).mv[0] + 4*2*x; \
        int mvy = (me).mv[1] + 4*2*y; \
        h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
        h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
    } \
    else \
    { \
        int offset = x + (2>>chroma_v_shift)*16*y; \
        int chroma_height = (2>>chroma_v_shift)*height; \
        h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
                         (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
        if( weight[1].weightfn ) \
            weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
        if( weight[2].weightfn ) \
            weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
    }

    if( size == PIXEL_4x4 )
    {
        x264_me_t *m = a->l0.me4x4[i8x8];
        CHROMA4x4MC( 2,2, m[0], 0,0 );
        CHROMA4x4MC( 2,2, m[1], 2,0 );
        CHROMA4x4MC( 2,2, m[2], 0,2 );
        CHROMA4x4MC( 2,2, m[3], 2,2 );
    }
    else if( size == PIXEL_8x4 )
    {
        x264_me_t *m = a->l0.me8x4[i8x8];
        CHROMA4x4MC( 4,2, m[0], 0,0 );
        CHROMA4x4MC( 4,2, m[1], 0,2 );
    }
    else
    {
        x264_me_t *m = a->l0.me4x8[i8x8];
        CHROMA4x4MC( 2,4, m[0], 0,0 );
        CHROMA4x4MC( 2,4, m[1], 2,0 );
    }
#undef CHROMA4x4MC

    /* Offset of this 8x8 block inside the encoded chroma planes. */
    int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
    int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
    return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
         + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
}
1674 | | |
1675 | | static int mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size ) |
1676 | 0 | { |
1677 | 0 | if( CHROMA_FORMAT == CHROMA_444 ) |
1678 | 0 | return mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 ); |
1679 | 0 | else if( CHROMA_FORMAT == CHROMA_422 ) |
1680 | 0 | return mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 ); |
1681 | 0 | else |
1682 | 0 | return mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 ); |
1683 | 0 | } |
1684 | | |
/* Sub-8x8 analysis: search the four 4x4 blocks of 8x8 block i8x8, reusing
 * the ref already chosen for that 8x8 block. Fills a->l0.me4x4[i8x8][*] and
 * a->l0.i_cost4x4[i8x8] (including ref and sub-mb-type costs, plus chroma
 * ME cost when enabled and not 4:4:4). */
static void mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
    {
        const int idx = 4*i8x8 + i4x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        /* Only the first 4x4 block seeds the search with the 8x8 MV; later
         * blocks rely purely on the predictor. */
        const int i_mvc = (i4x4 == 0);

        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];

        m->i_pixel = PIXEL_4x4;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );

        /* Cache immediately so the next block's MV prediction sees it. */
        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
    }
    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
                            a->l0.me4x4[i8x8][1].cost +
                            a->l0.me4x4[i8x8][2].cost +
                            a->l0.me4x4[i8x8][3].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
    if( h->mb.b_chroma_me && !CHROMA444 )
        a->l0.i_cost4x4[i8x8] += mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
}
1723 | | |
/* Sub-8x8 analysis: search the two 8x4 blocks of 8x8 block i8x8, reusing
 * the ref of that 8x8 block. Fills a->l0.me8x4[i8x8][*] and
 * a->l0.i_cost8x4[i8x8] (ref + sub-mb-type costs included, plus chroma ME
 * cost when enabled and not 4:4:4). Runs after the 4x4 search, whose first
 * MV seeds this one. */
static void mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i8x4 = 0; i8x4 < 2; i8x4++ )
    {
        const int idx = 4*i8x8 + 2*i8x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        /* Only the first 8x4 block takes the extra MV candidate. */
        const int i_mvc = (i8x4 == 0);

        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];

        m->i_pixel = PIXEL_8x4;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
    }
    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
    if( h->mb.b_chroma_me && !CHROMA444 )
        a->l0.i_cost8x4[i8x8] += mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
}
1759 | | |
/* Sub-8x8 analysis: search the two 4x8 blocks of 8x8 block i8x8; vertical
 * counterpart of mb_analyse_inter_p8x4. Fills a->l0.me4x8[i8x8][*] and
 * a->l0.i_cost4x8[i8x8] (ref + sub-mb-type costs included, plus chroma ME
 * cost when enabled and not 4:4:4). */
static void mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i4x8 = 0; i4x8 < 2; i4x8++ )
    {
        const int idx = 4*i8x8 + i4x8;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        /* Only the first 4x8 block takes the extra MV candidate. */
        const int i_mvc = (i4x8 == 0);

        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];

        m->i_pixel = PIXEL_4x8;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
    }
    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
    if( h->mb.b_chroma_me && !CHROMA444 )
        a->l0.i_cost4x8[i8x8] += mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
}
1795 | | |
/* Chroma mbcmp cost of bi-prediction for one partition: motion-compensate
 * both lists' chroma (as luma planes in 4:4:4, interleaved mc_chroma
 * otherwise), average with the bipred weights, and compare against fenc.
 * idx selects the partition within the mode given by i_pixel. */
static ALWAYS_INLINE int analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
{
    ALIGNED_ARRAY_32( pixel, pix, [4],[16*16] );
    ALIGNED_ARRAY_32( pixel, bi, [2],[16*16] );
    int i_chroma_cost = 0;
    int chromapix = h->luma2chroma_pixel[i_pixel];

#define COST_BI_CHROMA( m0, m1, width, height ) \
{ \
    if( CHROMA444 ) \
    { \
        h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
    } \
    else \
    { \
        int v_shift = CHROMA_V_SHIFT; \
        int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
        int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
        h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
                         m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
        h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
                         m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
    } \
    h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
    h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
    i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
                  + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
}

    /* One instantiation per partition mode; m0/m1 are the L0/L1 ME results. */
    if( i_pixel == PIXEL_16x16 )
        COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
    else if( i_pixel == PIXEL_16x8 )
        COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
    else if( i_pixel == PIXEL_8x16 )
        COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
    else
        COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )

    return i_chroma_cost;
}
1843 | | |
/* Cost the B_DIRECT mode by comparing fenc against the already-reconstructed
 * direct prediction in fdec. When sub-16x16 B analysis is enabled, also
 * compute per-8x8 direct costs (used to mix direct with other 8x8 modes);
 * chroma is included when chroma ME is on. */
static void mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
{
    /* Assumes that fdec still contains the results of
     * x264_mb_predict_mv_direct16x16 and x264_mb_mc */

    pixel *p_fenc = h->mb.pic.p_fenc[0];
    pixel *p_fdec = h->mb.pic.p_fdec[0];

    a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
    if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
    {
        int chromapix = h->luma2chroma_pixel[PIXEL_8x8];

        for( int i = 0; i < 4; i++ )
        {
            const int x = (i&1)*8;
            const int y = (i>>1)*8;
            a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
                                                              &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
            if( h->mb.b_chroma_me )
            {
                int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
                int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
                a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
                                                                   &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
                                       + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
                                                                   &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
            }
            /* 16x16 total is the raw distortion; the sub-mb type cost below
             * applies only to the per-8x8 scores. */
            a->i_cost16x16direct += a->i_cost8x8direct[i];

            /* mb type cost */
            a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
        }
    }
    else
    {
        /* Single 16x16 comparison when sub-partitions won't be analysed. */
        a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
        if( h->mb.b_chroma_me )
        {
            int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
            a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
                                 +  h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
        }
    }
}
1889 | | |
/* B-frame 16x16 analysis: search list0 and list1 over all refs, detect fast
 * skip, then form and cost the bidirectional (BI) candidate, including a
 * forced 0,0/0,0 MV pair test. Results go to a->l0.me16x16, a->l1.me16x16,
 * a->{l0,l1}.bi16x16 and a->i_cost16x16bi. May early-out by setting
 * h->mb.i_type = B_SKIP. */
static void mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_32( pixel, pix0,[16*16] );
    ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
    pixel *src0, *src1;
    intptr_t stride0 = 16, stride1 = 16;
    int i_ref, i_mvc;
    ALIGNED_ARRAY_8( int16_t, mvc,[9],[2] );
    int try_skip = a->b_try_skip;
    int list1_skipped = 0;
    int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
    /* Halfpel thresholds only pay off with multiple refs per list. */
    int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL,
                                (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL};

    x264_me_t m;
    m.i_pixel = PIXEL_16x16;

    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    /* 16x16 Search on list 0 and list 1 */
    a->l0.me16x16.cost = INT_MAX;
    a->l1.me16x16.cost = INT_MAX;
    for( int l = 1; l >= 0; )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

        /* This loop is extremely munged in order to facilitate the following order of operations,
         * necessary for an efficient fast skip.
         * 1. Search list1 ref0.
         * 2. Search list0 ref0.
         * 3. Try skip.
         * 4. Search the rest of list0.
         * 5. Go back and finish list1.
         */
        for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
        {
            if( try_skip && l == 1 && i_ref > 0 )
            {
                /* Defer the rest of list1 until after the skip attempt. */
                list1_skipped = 1;
                break;
            }

            m.i_ref_cost = REF_COST( l, i_ref );

            /* search with ref */
            LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
            x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
            x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );

            /* add ref cost */
            m.cost += m.i_ref_cost;

            if( m.cost < lX->me16x16.cost )
                h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );

            /* save mv for predicting neighbors */
            CP32( lX->mvc[i_ref][0], m.mv );
            CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );

            /* Fast skip detection. */
            if( i_ref == 0 && try_skip )
            {
                /* Skip is only plausible when both lists' best ref0 MVs are
                 * within 1 qpel (component sum) of the direct-mode MVs. */
                if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
                    abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
                {
                    try_skip = 0;
                }
                else if( !l )
                {
                    /* We already tested skip */
                    h->mb.i_type = B_SKIP;
                    analyse_update_cache( h, a );
                    return;
                }
            }
        }
        /* All of list1 finished on the second pass: done. */
        if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
            break;
        if( list1_skipped && l == 0 )
            l = 1;
        else
            l--;
    }

    /* get cost of BI mode */
    h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
    h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
    int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
    src0 = h->mc.get_ref( pix0, &stride0,
                          h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none );
    src1 = h->mc.get_ref( pix1, &stride1,
                          h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none );

    h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );

    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                     + ref_costs
                     + a->l0.bi16x16.cost_mv
                     + a->l1.bi16x16.cost_mv;

    if( h->mb.b_chroma_me )
        a->i_cost16x16bi += analyse_bi_chroma( h, a, 0, PIXEL_16x16 );

    /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
    if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
    {
        /* MV bit cost of coding 0,0 against each list's predictor. */
        int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
                       + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
        int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
                       + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
        /* With zero MVs no interpolation is needed: average the raw ref planes. */
        h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
        int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                   + ref_costs + l0_mv_cost + l1_mv_cost;

        /* Chroma cost of the 0,0 candidate, only when it might actually win. */
        if( h->mb.b_chroma_me && cost00 < a->i_cost16x16bi )
        {
            ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );

            if( CHROMA444 )
            {
                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
            }
            else
            {
                ALIGNED_ARRAY_64( pixel, pixuv, [2],[16*FENC_STRIDE] );
                int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
                int v_shift = CHROMA_V_SHIFT;

                /* Interlaced field refs still need mc_chroma for the field
                 * MV offset; otherwise just deinterleave the ref chroma. */
                if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
                {
                    int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
                    h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
                                     h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
                }
                else
                    h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
                                                         h->mb.pic.i_stride[1], 16>>v_shift );

                if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
                {
                    int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
                    h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
                                     h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
                }
                else
                    h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
                                                         h->mb.pic.i_stride[1], 16>>v_shift );

                h->mc.avg[chromapix]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );

                cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
                       +  h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
            }
        }

        if( cost00 < a->i_cost16x16bi )
        {
            M32( a->l0.bi16x16.mv ) = 0;
            M32( a->l1.bi16x16.mv ) = 0;
            a->l0.bi16x16.cost_mv = l0_mv_cost;
            a->l1.bi16x16.cost_mv = l1_mv_cost;
            a->i_cost16x16bi = cost00;
        }
    }

    /* mb type cost */
    a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
    a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
    a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
}
2075 | | |
2076 | | static inline void mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i ) |
2077 | 0 | { |
2078 | 0 | int x = 2*(i&1); |
2079 | 0 | int y = i&2; |
2080 | |
|
2081 | 0 | switch( h->mb.i_sub_partition[i] ) |
2082 | 0 | { |
2083 | 0 | case D_L0_8x8: |
2084 | 0 | x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv ); |
2085 | 0 | break; |
2086 | 0 | case D_L0_8x4: |
2087 | 0 | x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv ); |
2088 | 0 | x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv ); |
2089 | 0 | break; |
2090 | 0 | case D_L0_4x8: |
2091 | 0 | x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv ); |
2092 | 0 | x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv ); |
2093 | 0 | break; |
2094 | 0 | case D_L0_4x4: |
2095 | 0 | x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv ); |
2096 | 0 | x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv ); |
2097 | 0 | x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv ); |
2098 | 0 | x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv ); |
2099 | 0 | break; |
2100 | 0 | default: |
2101 | 0 | x264_log( h, X264_LOG_ERROR, "internal error\n" ); |
2102 | 0 | break; |
2103 | 0 | } |
2104 | 0 | } |
2105 | | |
2106 | | static void mb_load_mv_direct8x8( x264_t *h, int idx ) |
2107 | 0 | { |
2108 | 0 | int x = 2*(idx&1); |
2109 | 0 | int y = idx&2; |
2110 | 0 | x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] ); |
2111 | 0 | x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] ); |
2112 | 0 | x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] ); |
2113 | 0 | x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] ); |
2114 | 0 | } |
2115 | | |
/* Store ref/MV for a (possibly bi-predictive) partition into the MB cache.
 * For each list, if the partition type 'part' actually uses that list
 * (per x264_mb_partition_listX_table), cache the corresponding me struct's
 * ref and MV; otherwise mark the list unused (ref = -1, MV = 0, and zero the
 * MVD too when b_mvd is set).  Expects 'h' and 'b_mvd' in the caller's scope. */
  2116 |       | #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
  2117 |     0 |     if( x264_mb_partition_listX_table[0][part] ) \
  2118 |     0 |     { \
  2119 |     0 |         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
  2120 |     0 |         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
  2121 |     0 |     } \
  2122 |     0 |     else \
  2123 |     0 |     { \
  2124 |     0 |         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
  2125 |     0 |         x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
  2126 |     0 |         if( b_mvd ) \
  2127 |     0 |             x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
  2128 |     0 |     } \
  2129 |     0 |     if( x264_mb_partition_listX_table[1][part] ) \
  2130 |     0 |     { \
  2131 |     0 |         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
  2132 |     0 |         x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
  2133 |     0 |     } \
  2134 |     0 |     else \
  2135 |     0 |     { \
  2136 |     0 |         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
  2137 |     0 |         x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
  2138 |     0 |         if( b_mvd ) \
  2139 |     0 |             x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
  2140 |     0 |     }
2141 | | |
/* Write the chosen sub-partition's refs/MVs for B-frame 8x8 block 'i' into
 * the MB cache.  Direct sub-blocks reuse mb_load_mv_direct8x8 (and, when
 * b_mvd is set, also zero the MVDs and mark the block skippable); all other
 * sub-partition types go through CACHE_MV_BI. */
  2142 |       | static inline void mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
  2143 |     0 | {
  2144 |     0 |     int x = 2*(i&1);
  2145 |     0 |     int y = i&2;
  2146 |     0 |     if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
  2147 |     0 |     {
  2148 |     0 |         mb_load_mv_direct8x8( h, i );
  2149 |     0 |         if( b_mvd )
  2150 |     0 |         {
  2151 |     0 |             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
  2152 |     0 |             x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
  2153 |     0 |             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
  2154 |     0 |         }
  2155 |     0 |     }
  2156 |     0 |     else
  2157 |     0 |     {
  2158 |     0 |         CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
  2159 |     0 |     }
  2160 |     0 | }
/* Cache refs/MVs for B-frame 16x8 partition 'i' (i = 0 top, 1 bottom). */
  2161 |       | static inline void mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
  2162 |     0 | {
  2163 |     0 |     CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
  2164 |     0 | }
/* Cache refs/MVs for B-frame 8x16 partition 'i' (i = 0 left, 1 right). */
  2165 |       | static inline void mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
  2166 |     0 | {
  2167 |     0 |     CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
  2168 |     0 | }
  2169 |       | #undef CACHE_MV_BI
2170 | | |
/* B-frame 8x8 partition analysis with per-block reference selection: each of
 * the four 8x8 blocks may pick its own reference frame in each list.  For every
 * block it searches all candidate refs per list, then evaluates L0, L1, BI and
 * (precomputed) direct costs and records the cheapest sub-partition.  The total
 * goes into a->i_cost8x8bi. */
  2171 |       | static void mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
  2172 |     0 | {
  2173 |     0 |     ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
  2174 |     0 |     int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
  2175 |       |
  2176 |       |     /* early termination: if 16x16 chose ref 0, then evaluate no refs older
  2177 |       |      * than those used by the neighbors */
  2178 |     0 | #define CHECK_NEIGHBOUR(i)\
  2179 |     0 | {\
  2180 |     0 |     int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
  2181 |     0 |     if( ref > i_maxref[l] )\
  2182 |     0 |         i_maxref[l] = ref;\
  2183 |     0 | }
  2184 |
       |
/* Clamp the per-list ref search range using the neighbouring blocks' refs
 * (cache offsets: top row and left column of this MB in scan8 layout). */
  2185 |     0 |     for( int l = 0; l < 2; l++ )
  2186 |     0 |     {
  2187 |     0 |         x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
  2188 |     0 |         if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
  2189 |     0 |             h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
  2190 |     0 |         {
  2191 |     0 |             i_maxref[l] = 0;
  2192 |     0 |             CHECK_NEIGHBOUR(  -8 - 1 );
  2193 |     0 |             CHECK_NEIGHBOUR(  -8 + 0 );
  2194 |     0 |             CHECK_NEIGHBOUR(  -8 + 2 );
  2195 |     0 |             CHECK_NEIGHBOUR(  -8 + 4 );
  2196 |     0 |             CHECK_NEIGHBOUR(   0 - 1 );
  2197 |     0 |             CHECK_NEIGHBOUR( 2*8 - 1 );
  2198 |     0 |         }
  2199 |     0 |     }
  2200 |       |
  2201 |       |     /* XXX Needed for x264_mb_predict_mv */
  2202 |     0 |     h->mb.i_partition = D_8x8;
  2203 |
       |
  2204 |     0 |     a->i_cost8x8bi = 0;
  2205 |
       |
  2206 |     0 |     for( int i = 0; i < 4; i++ )
  2207 |     0 |     {
  2208 |     0 |         int x8 = i&1;
  2209 |     0 |         int y8 = i>>1;
  2210 |     0 |         int i_part_cost;
  2211 |     0 |         int i_part_cost_bi;
  2212 |     0 |         intptr_t stride[2] = {8,8};
  2213 |     0 |         pixel *src[2];
  2214 |     0 |         x264_me_t m;
  2215 |     0 |         m.i_pixel = PIXEL_8x8;
  2216 |     0 |         LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
  2217 |
       |
/* Per-list motion search over all allowed refs; keep the cheapest. */
  2218 |     0 |         for( int l = 0; l < 2; l++ )
  2219 |     0 |         {
  2220 |     0 |             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
  2221 |
       |
  2222 |     0 |             lX->me8x8[i].cost = INT_MAX;
  2223 |     0 |             for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
  2224 |     0 |             {
  2225 |     0 |                 m.i_ref_cost = REF_COST( l, i_ref );
  2226 |
       |
  2227 |     0 |                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
  2228 |
       |
  2229 |     0 |                 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
  2230 |     0 |                 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
  2231 |     0 |                 x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
  2232 |     0 |                 m.cost += m.i_ref_cost;
  2233 |
       |
  2234 |     0 |                 if( m.cost < lX->me8x8[i].cost )
  2235 |     0 |                 {
  2236 |     0 |                     h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
  2237 |     0 |                     a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
  2238 |     0 |                 }
  2239 |       |
  2240 |       |                 /* save mv for predicting other partitions within this MB */
  2241 |     0 |                 CP32( lX->mvc[i_ref][i+1], m.mv );
  2242 |     0 |             }
  2243 |     0 |         }
  2244 |       |
  2245 |       |         /* BI mode */
  2246 |     0 |         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
  2247 |     0 |                                 a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none );
  2248 |     0 |         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
  2249 |     0 |                                 a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none );
  2250 |     0 |         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
  2251 |     0 |                               h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
  2252 |
       |
  2253 |     0 |         a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
  2254 |     0 |         i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
  2255 |     0 |                          + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
  2256 |     0 |                          + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
  2257 |
       |
  2258 |     0 |         if( h->mb.b_chroma_me )
  2259 |     0 |         {
  2260 |     0 |             int i_chroma_cost = analyse_bi_chroma( h, a, i, PIXEL_8x8 );
  2261 |     0 |             i_part_cost_bi += i_chroma_cost;
  2262 |     0 |             a->i_satd8x8[2][i] += i_chroma_cost;
  2263 |     0 |         }
  2264 |
       |
  2265 |     0 |         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
  2266 |     0 |         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
  2267 |
       |
/* Pick the cheapest of L0 / L1 / BI / direct for this 8x8 block. */
  2268 |     0 |         i_part_cost = a->l0.me8x8[i].cost;
  2269 |     0 |         h->mb.i_sub_partition[i] = D_L0_8x8;
  2270 |     0 |         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
  2271 |     0 |         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
  2272 |     0 |         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
  2273 |     0 |         a->i_cost8x8bi += i_part_cost;
  2274 |       |
  2275 |       |         /* XXX Needed for x264_mb_predict_mv */
  2276 |     0 |         mb_cache_mv_b8x8( h, a, i, 0 );
  2277 |     0 |     }
  2278 |       |
  2279 |       |     /* mb type cost */
  2280 |     0 |     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
  2281 |     0 | }
2282 | | |
/* B-frame 8x8 partition analysis with refs fixed to those chosen by the 16x16
 * search (cheaper variant of mb_analyse_inter_b8x8_mixed_ref).  Each 8x8 block
 * is searched per list around the 16x16 MV, then L0/L1/BI/direct costs are
 * compared and the total accumulated in a->i_cost8x8bi. */
  2283 |       | static void mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
  2284 |     0 | {
  2285 |     0 |     pixel **p_fref[2] =
  2286 |     0 |         { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
  2287 |     0 |           h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
  2288 |     0 |     ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
  2289 |       |
  2290 |       |     /* XXX Needed for x264_mb_predict_mv */
  2291 |     0 |     h->mb.i_partition = D_8x8;
  2292 |
       |
  2293 |     0 |     a->i_cost8x8bi = 0;
  2294 |
       |
  2295 |     0 |     for( int i = 0; i < 4; i++ )
  2296 |     0 |     {
  2297 |     0 |         int x8 = i&1;
  2298 |     0 |         int y8 = i>>1;
  2299 |     0 |         int i_part_cost;
  2300 |     0 |         int i_part_cost_bi = 0;
  2301 |     0 |         intptr_t stride[2] = {8,8};
  2302 |     0 |         pixel *src[2];
  2303 |
       |
  2304 |     0 |         for( int l = 0; l < 2; l++ )
  2305 |     0 |         {
  2306 |     0 |             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
  2307 |     0 |             x264_me_t *m = &lX->me8x8[i];
  2308 |     0 |             m->i_pixel = PIXEL_8x8;
  2309 |     0 |             LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
  2310 |
       |
  2311 |     0 |             m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
  2312 |     0 |             m->i_ref = lX->me16x16.i_ref;
  2313 |
       |
  2314 |     0 |             LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );
  2315 |
       |
/* Search around the 16x16 MV as the sole candidate predictor. */
  2316 |     0 |             x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
  2317 |     0 |             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
  2318 |     0 |             x264_me_search( h, m, &lX->me16x16.mv, 1 );
  2319 |     0 |             a->i_satd8x8[l][i] = m->cost - m->cost_mv;
  2320 |     0 |             m->cost += m->i_ref_cost;
  2321 |
       |
  2322 |     0 |             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
  2323 |       |
  2324 |       |             /* save mv for predicting other partitions within this MB */
  2325 |     0 |             CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );
  2326 |       |
  2327 |       |             /* BI mode */
  2328 |     0 |             src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
  2329 |     0 |                                     m->mv[0], m->mv[1], 8, 8, x264_weight_none );
  2330 |     0 |             i_part_cost_bi += m->cost_mv + m->i_ref_cost;
  2331 |     0 |         }
  2332 |     0 |         h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
  2333 |     0 |         a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
  2334 |     0 |         i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
  2335 |     0 |         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
  2336 |     0 |         a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
  2337 |
       |
  2338 |     0 |         if( h->mb.b_chroma_me )
  2339 |     0 |         {
  2340 |     0 |             int i_chroma_cost = analyse_bi_chroma( h, a, i, PIXEL_8x8 );
  2341 |     0 |             i_part_cost_bi += i_chroma_cost;
  2342 |     0 |             a->i_satd8x8[2][i] += i_chroma_cost;
  2343 |     0 |         }
  2344 |
       |
/* Pick the cheapest of L0 / L1 / BI / direct for this 8x8 block. */
  2345 |     0 |         i_part_cost = a->l0.me8x8[i].cost;
  2346 |     0 |         h->mb.i_sub_partition[i] = D_L0_8x8;
  2347 |     0 |         COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
  2348 |     0 |         COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
  2349 |     0 |         COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
  2350 |     0 |         a->i_cost8x8bi += i_part_cost;
  2351 |       |
  2352 |       |         /* XXX Needed for x264_mb_predict_mv */
  2353 |     0 |         mb_cache_mv_b8x8( h, a, i, 0 );
  2354 |     0 |     }
  2355 |       |
  2356 |       |     /* mb type cost */
  2357 |     0 |     a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
  2358 |     0 | }
2359 | | |
/* B-frame 16x8 partition analysis.  For each half (i = 0 top, 1 bottom) and
 * each list, the candidate refs are limited to those already chosen by the two
 * overlapping 8x8 blocks; L0, L1 and BI costs are then compared per half.
 * i_best_satd is the best cost so far, used for early termination.
 * On early-out, a->i_cost16x8bi is set to COST_MAX. */
  2360 |       | static void mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
  2361 |     0 | {
  2362 |     0 |     ALIGNED_ARRAY_32( pixel, pix,[2],[16*8] );
  2363 |     0 |     ALIGNED_ARRAY_8( int16_t, mvc,[3],[2] );
  2364 |
       |
  2365 |     0 |     h->mb.i_partition = D_16x8;
  2366 |     0 |     a->i_cost16x8bi = 0;
  2367 |
       |
  2368 |     0 |     for( int i = 0; i < 2; i++ )
  2369 |     0 |     {
  2370 |     0 |         int i_part_cost;
  2371 |     0 |         int i_part_cost_bi = 0;
  2372 |     0 |         intptr_t stride[2] = {16,16};
  2373 |     0 |         pixel *src[2];
  2374 |     0 |         x264_me_t m;
  2375 |     0 |         m.i_pixel = PIXEL_16x8;
  2376 |     0 |         LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
  2377 |
       |
  2378 |     0 |         for( int l = 0; l < 2; l++ )
  2379 |     0 |         {
  2380 |     0 |             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
/* Candidate refs: the two 8x8 blocks covered by this 16x8 half
 * (dedup when both picked the same ref). */
  2381 |     0 |             int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
  2382 |     0 |             int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
  2383 |     0 |             lX->me16x8[i].cost = INT_MAX;
  2384 |     0 |             for( int j = 0; j < i_ref8s; j++ )
  2385 |     0 |             {
  2386 |     0 |                 int i_ref = ref8[j];
  2387 |     0 |                 m.i_ref_cost = REF_COST( l, i_ref );
  2388 |
       |
  2389 |     0 |                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
  2390 |
       |
  2391 |     0 |                 CP32( mvc[0], lX->mvc[i_ref][0] );
  2392 |     0 |                 CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
  2393 |     0 |                 CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
  2394 |
       |
  2395 |     0 |                 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
  2396 |     0 |                 x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
  2397 |     0 |                 x264_me_search( h, &m, mvc, 3 );
  2398 |     0 |                 m.cost += m.i_ref_cost;
  2399 |
       |
  2400 |     0 |                 if( m.cost < lX->me16x8[i].cost )
  2401 |     0 |                     h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
  2402 |     0 |             }
  2403 |     0 |         }
  2404 |       |
  2405 |       |         /* BI mode */
  2406 |     0 |         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
  2407 |     0 |                                 a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none );
  2408 |     0 |         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
  2409 |     0 |                                 a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none );
  2410 |     0 |         h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
  2411 |     0 |                                h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
  2412 |
       |
  2413 |     0 |         i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
  2414 |     0 |                         + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
  2415 |     0 |                         + a->l1.me16x8[i].i_ref_cost;
  2416 |
       |
  2417 |     0 |         if( h->mb.b_chroma_me )
  2418 |     0 |             i_part_cost_bi += analyse_bi_chroma( h, a, i, PIXEL_16x8 );
  2419 |
       |
  2420 |     0 |         i_part_cost = a->l0.me16x8[i].cost;
  2421 |     0 |         a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
  2422 |
       |
  2423 |     0 |         if( a->l1.me16x8[i].cost < i_part_cost )
  2424 |     0 |         {
  2425 |     0 |             i_part_cost = a->l1.me16x8[i].cost;
  2426 |     0 |             a->i_mb_partition16x8[i] = D_L1_8x8;
  2427 |     0 |         }
  2428 |     0 |         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
  2429 |     0 |         {
  2430 |     0 |             i_part_cost = i_part_cost_bi;
  2431 |     0 |             a->i_mb_partition16x8[i] = D_BI_8x8;
  2432 |     0 |         }
  2433 |     0 |         a->i_cost16x8bi += i_part_cost;
  2434 |       |
  2435 |       |         /* Early termination based on the current SATD score of partition[0]
  2436 |       |            plus the estimated SATD score of partition[1] */
  2437 |     0 |         if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
  2438 |     0 |             * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
  2439 |     0 |         {
  2440 |     0 |             a->i_cost16x8bi = COST_MAX;
  2441 |     0 |             return;
  2442 |     0 |         }
  2443 |       |
  2444 |     0 |         mb_cache_mv_b16x8( h, a, i, 0 );
  2445 |     0 |     }
  2446 |       |
  2447 |       |     /* mb type cost */
  2448 |     0 |     a->i_mb_type16x8 = B_L0_L0
  2449 |     0 |         + (a->i_mb_partition16x8[0]>>2) * 3
  2450 |     0 |         + (a->i_mb_partition16x8[1]>>2);
  2451 |     0 |     a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
  2452 |     0 | }
2453 | | |
/* B-frame 8x16 partition analysis — mirror of mb_analyse_inter_b16x8 with the
 * two halves side by side (i = 0 left, 1 right).  Candidate refs per list come
 * from the vertically-stacked 8x8 blocks each half covers; L0/L1/BI costs are
 * compared per half and accumulated in a->i_cost8x16bi (COST_MAX on early-out). */
  2454 |       | static void mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
  2455 |     0 | {
  2456 |     0 |     ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
  2457 |     0 |     ALIGNED_ARRAY_8( int16_t, mvc,[3],[2] );
  2458 |
       |
  2459 |     0 |     h->mb.i_partition = D_8x16;
  2460 |     0 |     a->i_cost8x16bi = 0;
  2461 |
       |
  2462 |     0 |     for( int i = 0; i < 2; i++ )
  2463 |     0 |     {
  2464 |     0 |         int i_part_cost;
  2465 |     0 |         int i_part_cost_bi = 0;
  2466 |     0 |         intptr_t stride[2] = {8,8};
  2467 |     0 |         pixel *src[2];
  2468 |     0 |         x264_me_t m;
  2469 |     0 |         m.i_pixel = PIXEL_8x16;
  2470 |     0 |         LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
  2471 |
       |
  2472 |     0 |         for( int l = 0; l < 2; l++ )
  2473 |     0 |         {
  2474 |     0 |             x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
/* Candidate refs: the upper and lower 8x8 blocks of this 8x16 half. */
  2475 |     0 |             int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
  2476 |     0 |             int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
  2477 |     0 |             lX->me8x16[i].cost = INT_MAX;
  2478 |     0 |             for( int j = 0; j < i_ref8s; j++ )
  2479 |     0 |             {
  2480 |     0 |                 int i_ref = ref8[j];
  2481 |     0 |                 m.i_ref_cost = REF_COST( l, i_ref );
  2482 |
       |
  2483 |     0 |                 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );
  2484 |
       |
  2485 |     0 |                 CP32( mvc[0], lX->mvc[i_ref][0] );
  2486 |     0 |                 CP32( mvc[1], lX->mvc[i_ref][i+1] );
  2487 |     0 |                 CP32( mvc[2], lX->mvc[i_ref][i+3] );
  2488 |
       |
  2489 |     0 |                 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
  2490 |     0 |                 x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
  2491 |     0 |                 x264_me_search( h, &m, mvc, 3 );
  2492 |     0 |                 m.cost += m.i_ref_cost;
  2493 |
       |
  2494 |     0 |                 if( m.cost < lX->me8x16[i].cost )
  2495 |     0 |                     h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
  2496 |     0 |             }
  2497 |     0 |         }
  2498 |       |
  2499 |       |         /* BI mode */
  2500 |     0 |         src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
  2501 |     0 |                                 a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none );
  2502 |     0 |         src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
  2503 |     0 |                                 a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none );
  2504 |     0 |         h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );
  2505 |
       |
  2506 |     0 |         i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
  2507 |     0 |                         + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
  2508 |     0 |                         + a->l1.me8x16[i].i_ref_cost;
  2509 |
       |
  2510 |     0 |         if( h->mb.b_chroma_me )
  2511 |     0 |             i_part_cost_bi += analyse_bi_chroma( h, a, i, PIXEL_8x16 );
  2512 |
       |
  2513 |     0 |         i_part_cost = a->l0.me8x16[i].cost;
  2514 |     0 |         a->i_mb_partition8x16[i] = D_L0_8x8;
  2515 |
       |
  2516 |     0 |         if( a->l1.me8x16[i].cost < i_part_cost )
  2517 |     0 |         {
  2518 |     0 |             i_part_cost = a->l1.me8x16[i].cost;
  2519 |     0 |             a->i_mb_partition8x16[i] = D_L1_8x8;
  2520 |     0 |         }
  2521 |     0 |         if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
  2522 |     0 |         {
  2523 |     0 |             i_part_cost = i_part_cost_bi;
  2524 |     0 |             a->i_mb_partition8x16[i] = D_BI_8x8;
  2525 |     0 |         }
  2526 |     0 |         a->i_cost8x16bi += i_part_cost;
  2527 |       |
  2528 |       |         /* Early termination based on the current SATD score of partition[0]
  2529 |       |            plus the estimated SATD score of partition[1] */
  2530 |     0 |         if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
  2531 |     0 |             * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
  2532 |     0 |         {
  2533 |     0 |             a->i_cost8x16bi = COST_MAX;
  2534 |     0 |             return;
  2535 |     0 |         }
  2536 |       |
  2537 |     0 |         mb_cache_mv_b8x16( h, a, i, 0 );
  2538 |     0 |     }
  2539 |       |
  2540 |       |     /* mb type cost */
  2541 |     0 |     a->i_mb_type8x16 = B_L0_L0
  2542 |     0 |         + (a->i_mb_partition8x16[0]>>2) * 3
  2543 |     0 |         + (a->i_mb_partition8x16[1]>>2);
  2544 |     0 |     a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
  2545 |     0 | }
2546 | | |
/* Rescore the P-frame partition candidates with full rate-distortion cost.
 * i_satd is the best SATD-based cost so far; partitions whose SATD cost
 * exceeds ~5/4 of it are pruned (unless early termination is disabled).
 * RD costs overwrite the corresponding a->l0.i_* fields (COST_MAX if pruned).
 * For P_8x8 with sub-8x8 analysis enabled, each 8x8 block's sub-partition
 * type is also re-decided by per-partition RD. */
  2547 |       | static void mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
  2548 |     0 | {
  2549 |     0 |     int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX;
  2550 |
       |
  2551 |     0 |     h->mb.i_type = P_L0;
  2552 |     0 |     if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) )
  2553 |     0 |     {
  2554 |     0 |         h->mb.i_partition = D_16x16;
  2555 |     0 |         analyse_update_cache( h, a );
  2556 |     0 |         a->l0.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 );
  2557 |     0 |     }
  2558 |
       |
  2559 |     0 |     if( a->l0.i_cost16x8 < thresh )
  2560 |     0 |     {
  2561 |     0 |         h->mb.i_partition = D_16x8;
  2562 |     0 |         analyse_update_cache( h, a );
  2563 |     0 |         a->l0.i_cost16x8 = rd_cost_mb( h, a->i_lambda2 );
  2564 |     0 |     }
  2565 |     0 |     else
  2566 |     0 |         a->l0.i_cost16x8 = COST_MAX;
  2567 |
       |
  2568 |     0 |     if( a->l0.i_cost8x16 < thresh )
  2569 |     0 |     {
  2570 |     0 |         h->mb.i_partition = D_8x16;
  2571 |     0 |         analyse_update_cache( h, a );
  2572 |     0 |         a->l0.i_cost8x16 = rd_cost_mb( h, a->i_lambda2 );
  2573 |     0 |     }
  2574 |     0 |     else
  2575 |     0 |         a->l0.i_cost8x16 = COST_MAX;
  2576 |
       |
  2577 |     0 |     if( a->l0.i_cost8x8 < thresh )
  2578 |     0 |     {
  2579 |     0 |         h->mb.i_type = P_8x8;
  2580 |     0 |         h->mb.i_partition = D_8x8;
  2581 |     0 |         if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
  2582 |     0 |         {
  2583 |     0 |             x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
  2584 |     0 |             x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
  2585 |     0 |             x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
  2586 |     0 |             x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
  2587 |       |             /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
  2588 |       |              * for future blocks are those left over from previous RDO calls. */
  2589 |     0 |             for( int i = 0; i < 4; i++ )
  2590 |     0 |             {
/* Per 8x8 block: re-decide the sub-partition (4x4/8x4/4x8/8x8) by
 * per-partition RD among candidates within 5/4 of the best SATD cost. */
  2591 |     0 |                 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
  2592 |     0 |                 int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX;
  2593 |     0 |                 int subtype, btype = D_L0_8x8;
  2594 |     0 |                 uint64_t bcost = COST_MAX64;
  2595 |     0 |                 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
  2596 |     0 |                 {
  2597 |     0 |                     uint64_t cost;
  2598 |     0 |                     if( costs[subtype] > sub8x8_thresh )
  2599 |     0 |                         continue;
  2600 |     0 |                     h->mb.i_sub_partition[i] = subtype;
  2601 |     0 |                     mb_cache_mv_p8x8( h, a, i );
  2602 |     0 |                     if( subtype == btype )
  2603 |     0 |                         continue;
  2604 |     0 |                     cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
  2605 |     0 |                     COPY2_IF_LT( bcost, cost, btype, subtype );
  2606 |     0 |                 }
  2607 |     0 |                 if( h->mb.i_sub_partition[i] != btype )
  2608 |     0 |                 {
  2609 |     0 |                     h->mb.i_sub_partition[i] = btype;
  2610 |     0 |                     mb_cache_mv_p8x8( h, a, i );
  2611 |     0 |                 }
  2612 |     0 |             }
  2613 |     0 |         }
  2614 |     0 |         else
  2615 |     0 |             analyse_update_cache( h, a );
  2616 |     0 |         a->l0.i_cost8x8 = rd_cost_mb( h, a->i_lambda2 );
  2617 |     0 |     }
  2618 |     0 |     else
  2619 |     0 |         a->l0.i_cost8x8 = COST_MAX;
  2620 |     0 | }
2621 | | |
/* Rescore the B-frame mode candidates (direct, L0, L1, BI 16x16, 8x8, 16x8,
 * 8x16) with full rate-distortion cost.  Candidates whose SATD cost exceeds
 * the psy-adjusted threshold are left at COST_MAX; already-computed RD costs
 * (!= COST_MAX) are not recomputed. */
  2622 |       | static void mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
  2623 |     0 | {
  2624 |     0 |     int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX;
  2625 |
       |
  2626 |     0 |     if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
  2627 |     0 |     {
  2628 |     0 |         h->mb.i_type = B_DIRECT;
  2629 |       |         /* Assumes direct/skip MC is still in fdec */
  2630 |       |         /* Requires b-rdo to be done before intra analysis */
  2631 |     0 |         h->mb.b_skip_mc = 1;
  2632 |     0 |         analyse_update_cache( h, a );
  2633 |     0 |         a->i_rd16x16direct = rd_cost_mb( h, a->i_lambda2 );
  2634 |     0 |         h->mb.b_skip_mc = 0;
  2635 |     0 |     }
  2636 |       |
  2637 |       |     //FIXME not all the update_cache calls are needed
  2638 |     0 |     h->mb.i_partition = D_16x16;
  2639 |       |     /* L0 */
  2640 |     0 |     if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX )
  2641 |     0 |     {
  2642 |     0 |         h->mb.i_type = B_L0_L0;
  2643 |     0 |         analyse_update_cache( h, a );
  2644 |     0 |         a->l0.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 );
  2645 |     0 |     }
  2646 |       |
  2647 |       |     /* L1 */
  2648 |     0 |     if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX )
  2649 |     0 |     {
  2650 |     0 |         h->mb.i_type = B_L1_L1;
  2651 |     0 |         analyse_update_cache( h, a );
  2652 |     0 |         a->l1.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 );
  2653 |     0 |     }
  2654 |       |
  2655 |       |     /* BI */
  2656 |     0 |     if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX )
  2657 |     0 |     {
  2658 |     0 |         h->mb.i_type = B_BI_BI;
  2659 |     0 |         analyse_update_cache( h, a );
  2660 |     0 |         a->i_rd16x16bi = rd_cost_mb( h, a->i_lambda2 );
  2661 |     0 |     }
  2662 |       |
  2663 |       |     /* 8x8 */
  2664 |     0 |     if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX )
  2665 |     0 |     {
  2666 |     0 |         h->mb.i_type = B_8x8;
  2667 |     0 |         h->mb.i_partition = D_8x8;
  2668 |     0 |         analyse_update_cache( h, a );
  2669 |     0 |         a->i_rd8x8bi = rd_cost_mb( h, a->i_lambda2 );
  2670 |     0 |         x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
  2671 |     0 |     }
  2672 |       |
  2673 |       |     /* 16x8 */
  2674 |     0 |     if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX )
  2675 |     0 |     {
  2676 |     0 |         h->mb.i_type = a->i_mb_type16x8;
  2677 |     0 |         h->mb.i_partition = D_16x8;
  2678 |     0 |         analyse_update_cache( h, a );
  2679 |     0 |         a->i_rd16x8bi = rd_cost_mb( h, a->i_lambda2 );
  2680 |     0 |     }
  2681 |       |
  2682 |       |     /* 8x16 */
  2683 |     0 |     if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX )
  2684 |     0 |     {
  2685 |     0 |         h->mb.i_type = a->i_mb_type8x16;
  2686 |     0 |         h->mb.i_partition = D_8x16;
  2687 |     0 |         analyse_update_cache( h, a );
  2688 |     0 |         a->i_rd8x16bi = rd_cost_mb( h, a->i_lambda2 );
  2689 |     0 |     }
  2690 |     0 | }
2691 | | |
/* After the final mode decision, refine the motion vectors of every
 * bi-predicted partition with a joint L0/L1 SATD search
 * (x264_me_refine_bidir_satd).  No-op for intra MBs and for partitions that
 * ended up single-list or direct. */
  2692 |       | static void refine_bidir( x264_t *h, x264_mb_analysis_t *a )
  2693 |     0 | {
  2694 |     0 |     int i_biweight;
  2695 |
       |
  2696 |     0 |     if( IS_INTRA(h->mb.i_type) )
  2697 |     0 |         return;
  2698 |       |
  2699 |     0 |     switch( h->mb.i_partition )
  2700 |     0 |     {
  2701 |     0 |         case D_16x16:
  2702 |     0 |             if( h->mb.i_type == B_BI_BI )
  2703 |     0 |             {
  2704 |     0 |                 i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
  2705 |     0 |                 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
  2706 |     0 |             }
  2707 |     0 |             break;
  2708 |     0 |         case D_16x8:
  2709 |     0 |             for( int i = 0; i < 2; i++ )
  2710 |     0 |                 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
  2711 |     0 |                 {
  2712 |     0 |                     i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
  2713 |     0 |                     x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
  2714 |     0 |                 }
  2715 |     0 |             break;
  2716 |     0 |         case D_8x16:
  2717 |     0 |             for( int i = 0; i < 2; i++ )
  2718 |     0 |                 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
  2719 |     0 |                 {
  2720 |     0 |                     i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
  2721 |     0 |                     x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
  2722 |     0 |                 }
  2723 |     0 |             break;
  2724 |     0 |         case D_8x8:
  2725 |     0 |             for( int i = 0; i < 4; i++ )
  2726 |     0 |                 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
  2727 |     0 |                 {
  2728 |     0 |                     i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
  2729 |     0 |                     x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
  2730 |     0 |                 }
  2731 |     0 |             break;
  2732 |     0 |     }
  2733 |     0 | }
2734 | | |
/* Decide between the 8x8 and 4x4 transform for the current MB by comparing the
 * SA8D (8x8 proxy) and SATD (4x4 proxy) of the motion-compensated prediction.
 * Runs MC first; sets h->mb.b_transform_8x8 and b_skip_mc on completion.
 * Only active when 8x8 transform is allowed, enabled, and not lossless. */
  2735 |       | static inline void mb_analyse_transform( x264_t *h )
  2736 |     0 | {
  2737 |     0 |     if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
  2738 |     0 |     {
  2739 |       |         /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
  2740 |     0 |         x264_mb_mc( h );
  2741 |
       |
  2742 |     0 |         int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
  2743 |     0 |         int i_cost8 = 0, i_cost4 = 0;
  2744 |       |         /* Not all platforms have a merged SATD function */
  2745 |     0 |         if( h->pixf.sa8d_satd[PIXEL_16x16] )
  2746 |     0 |         {
/* Merged function returns SA8D in the low 32 bits, SATD in the high 32. */
  2747 |     0 |             uint64_t cost = 0;
  2748 |     0 |             for( int p = 0; p < plane_count; p++ )
  2749 |     0 |             {
  2750 |     0 |                 cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
  2751 |     0 |                                                         h->mb.pic.p_fdec[p], FDEC_STRIDE );
  2752 |
       |
  2753 |     0 |             }
  2754 |     0 |             i_cost8 = (uint32_t)cost;
  2755 |     0 |             i_cost4 = (uint32_t)(cost >> 32);
  2756 |     0 |         }
  2757 |     0 |         else
  2758 |     0 |         {
  2759 |     0 |             for( int p = 0; p < plane_count; p++ )
  2760 |     0 |             {
  2761 |     0 |                 i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
  2762 |     0 |                                                       h->mb.pic.p_fdec[p], FDEC_STRIDE );
  2763 |     0 |                 i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
  2764 |     0 |                                                       h->mb.pic.p_fdec[p], FDEC_STRIDE );
  2765 |     0 |             }
  2766 |     0 |         }
  2767 |
       |
  2768 |     0 |         h->mb.b_transform_8x8 = i_cost8 < i_cost4;
  2769 |     0 |         h->mb.b_skip_mc = 1;
  2770 |     0 |     }
  2771 |     0 | }
2772 | | |
/* Re-test the transform-size decision with full RD cost.  Flips
 * b_transform_8x8, recomputes the MB RD cost, and keeps the flip only if it
 * is not worse; otherwise restores the flag and the saved sub-partitions.
 * *i_satd is rescaled proportionally so callers comparing SATD stay coherent. */
  2773 |       | static inline void mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
  2774 |     0 | {
  2775 |     0 |     if( h->param.analyse.b_transform_8x8 && h->pps->b_transform_8x8_mode )
  2776 |     0 |     {
  2777 |     0 |         uint32_t subpart_bak = M32( h->mb.i_sub_partition );
  2778 |       |         /* Try switching the subpartitions to 8x8 so that we can use 8x8 transform mode */
  2779 |     0 |         if( h->mb.i_type == P_8x8 )
  2780 |     0 |             M32( h->mb.i_sub_partition ) = D_L0_8x8*0x01010101;
  2781 |     0 |         else if( !x264_transform_allowed[h->mb.i_type] )
  2782 |     0 |             return;
  2783 |       |
  2784 |     0 |         analyse_update_cache( h, a );
  2785 |     0 |         h->mb.b_transform_8x8 ^= 1;
  2786 |       |         /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
  2787 |     0 |         int i_rd8 = rd_cost_mb( h, a->i_lambda2 );
  2788 |
       |
  2789 |     0 |         if( *i_rd >= i_rd8 )
  2790 |     0 |         {
  2791 |     0 |             if( *i_rd > 0 )
  2792 |     0 |                 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
  2793 |     0 |             *i_rd = i_rd8;
  2794 |     0 |         }
  2795 |     0 |         else
  2796 |     0 |         {
  2797 |     0 |             h->mb.b_transform_8x8 ^= 1;
  2798 |     0 |             M32( h->mb.i_sub_partition ) = subpart_bak;
  2799 |     0 |         }
  2800 |     0 |     }
  2801 |     0 | }
2802 | | |
  2803 |       | /* Rate-distortion optimal QP selection.
  2804 |       |  * FIXME: More than half of the benefit of this function seems to be
  2805 |       |  * in the way it improves the coding of chroma DC (by decimating or
  2806 |       |  * finding a better way to code a single DC coefficient.)
  2807 |       |  * There must be a more efficient way to get that portion of the benefit
  2808 |       |  * without doing full QP-RD, but RD-decimation doesn't seem to do the
  2809 |       |  * trick. */
/* Walks QP up and down from the current value, computing full MB RD cost at
 * each step, until the allowed number of consecutive non-improvements
 * ('failures') is exceeded.  The previous MB's QP is always tried.  Leaves
 * h->mb.i_qp / i_chroma_qp set to the best QP found and may flip
 * b_transform_8x8 if the new QP changes that tradeoff. */
  2810 |       | static inline void mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
  2811 |     0 | {
  2812 |     0 |     int bcost, cost, failures, prevcost, origcost;
  2813 |     0 |     int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
  2814 |     0 |     int last_qp_tried = 0;
  2815 |     0 |     origcost = bcost = rd_cost_mb( h, a->i_lambda2 );
  2816 |     0 |     int origcbp = h->mb.cbp[h->mb.i_mb_xy];
  2817 |       |
  2818 |       |     /* If CBP is already zero, don't raise the quantizer any higher. */
  2819 |     0 |     for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
  2820 |     0 |     {
  2821 |       |         /* Without psy-RD, require monotonicity when moving quant away from previous
  2822 |       |          * macroblock's quant; allow 1 failure when moving quant towards previous quant.
  2823 |       |          * With psy-RD, allow 1 failure when moving quant away from previous quant,
  2824 |       |          * allow 2 failures when moving quant towards previous quant.
  2825 |       |          * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
  2826 |     0 |         int threshold = (!!h->mb.i_psy_rd);
  2827 |       |         /* Raise the threshold for failures if we're moving towards the last QP. */
  2828 |     0 |         if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
  2829 |     0 |             ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
  2830 |     0 |             threshold++;
  2831 |     0 |         h->mb.i_qp = orig_qp;
  2832 |     0 |         failures = 0;
  2833 |     0 |         prevcost = origcost;
  2834 |       |
  2835 |       |         /* If the current QP results in an empty CBP, it's highly likely that lower QPs
  2836 |       |          * (up to a point) will too.  So, jump down to where the threshold will kick in
  2837 |       |          * and check the QP there.  If the CBP is still empty, skip the main loop.
  2838 |       |          * If it isn't empty, we would have ended up having to check this QP anyways,
  2839 |       |          * so as long as we store it for later lookup, we lose nothing. */
  2840 |     0 |         int already_checked_qp = -1;
  2841 |     0 |         int already_checked_cost = COST_MAX;
  2842 |     0 |         if( direction == -1 )
  2843 |     0 |         {
  2844 |     0 |             if( !origcbp )
  2845 |     0 |             {
  2846 |     0 |                 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, SPEC_QP( h->param.rc.i_qp_min ) );
  2847 |     0 |                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
  2848 |     0 |                 already_checked_cost = rd_cost_mb( h, a->i_lambda2 );
  2849 |     0 |                 if( !h->mb.cbp[h->mb.i_mb_xy] )
  2850 |     0 |                 {
  2851 |       |                     /* If our empty-CBP block is lower QP than the last QP,
  2852 |       |                      * the last QP almost surely doesn't have a CBP either. */
  2853 |     0 |                     if( h->mb.i_last_qp > h->mb.i_qp )
  2854 |     0 |                         last_qp_tried = 1;
  2855 |     0 |                     break;
  2856 |     0 |                 }
  2857 |     0 |                 already_checked_qp = h->mb.i_qp;
  2858 |     0 |                 h->mb.i_qp = orig_qp;
  2859 |     0 |             }
  2860 |     0 |         }
  2861 |       |
/* Main walk: step QP in 'direction' until failures exceed the threshold
 * or (when raising QP) the CBP becomes empty. */
  2862 |     0 |         h->mb.i_qp += direction;
  2863 |     0 |         while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
  2864 |     0 |         {
  2865 |     0 |             if( h->mb.i_last_qp == h->mb.i_qp )
  2866 |     0 |                 last_qp_tried = 1;
  2867 |     0 |             if( h->mb.i_qp == already_checked_qp )
  2868 |     0 |                 cost = already_checked_cost;
  2869 |     0 |             else
  2870 |     0 |             {
  2871 |     0 |                 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
  2872 |     0 |                 cost = rd_cost_mb( h, a->i_lambda2 );
  2873 |     0 |                 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
  2874 |     0 |             }
  2875 |       |
  2876 |       |             /* We can't assume that the costs are monotonic over QPs.
  2877 |       |              * Tie case-as-failure seems to give better results. */
  2878 |     0 |             if( cost < prevcost )
  2879 |     0 |                 failures = 0;
  2880 |     0 |             else
  2881 |     0 |                 failures++;
  2882 |     0 |             prevcost = cost;
  2883 |
       |
  2884 |     0 |             if( failures > threshold )
  2885 |     0 |                 break;
  2886 |     0 |             if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
  2887 |     0 |                 break;
  2888 |     0 |             h->mb.i_qp += direction;
  2889 |     0 |         }
  2890 |     0 |     }
  2891 |       |
  2892 |       |     /* Always try the last block's QP. */
  2893 |     0 |     if( !last_qp_tried )
  2894 |     0 |     {
  2895 |     0 |         h->mb.i_qp = h->mb.i_last_qp;
  2896 |     0 |         h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
  2897 |     0 |         cost = rd_cost_mb( h, a->i_lambda2 );
  2898 |     0 |         COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
  2899 |     0 |     }
  2900 |
       |
  2901 |     0 |     h->mb.i_qp = bqp;
  2902 |     0 |     h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
  2903 |       |
  2904 |       |     /* Check transform again; decision from before may no longer be optimal. */
  2905 |     0 |     if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
  2906 |     0 |         x264_mb_transform_8x8_allowed( h ) )
  2907 |     0 |     {
  2908 |     0 |         h->mb.b_transform_8x8 ^= 1;
  2909 |     0 |         cost = rd_cost_mb( h, a->i_lambda2 );
  2910 |     0 |         if( cost > bcost )
  2911 |     0 |             h->mb.b_transform_8x8 ^= 1;
  2912 |     0 |     }
  2913 |     0 | }
2914 | | |
2915 | | /***************************************************************************** |
2916 | | * x264_macroblock_analyse: |
2917 | | *****************************************************************************/ |
2918 | | void x264_macroblock_analyse( x264_t *h ) |
2919 | 0 | { |
2920 | 0 | x264_mb_analysis_t analysis; |
2921 | 0 | int i_cost = COST_MAX; |
2922 | |
|
2923 | 0 | h->mb.i_qp = x264_ratecontrol_mb_qp( h ); |
2924 | | /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB, |
2925 | | * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */ |
2926 | 0 | if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 ) |
2927 | 0 | h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp; |
2928 | |
|
2929 | 0 | if( h->param.analyse.b_mb_info ) |
2930 | 0 | h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */ |
2931 | 0 | mb_analyse_init( h, &analysis, h->mb.i_qp ); |
2932 | | |
2933 | | /*--------------------------- Do the analysis ---------------------------*/ |
2934 | 0 | if( h->sh.i_type == SLICE_TYPE_I ) |
2935 | 0 | { |
2936 | 0 | intra_analysis: |
2937 | 0 | if( analysis.i_mbrd ) |
2938 | 0 | mb_init_fenc_cache( h, analysis.i_mbrd >= 2 ); |
2939 | 0 | mb_analyse_intra( h, &analysis, COST_MAX ); |
2940 | 0 | if( analysis.i_mbrd ) |
2941 | 0 | intra_rd( h, &analysis, COST_MAX ); |
2942 | |
|
2943 | 0 | i_cost = analysis.i_satd_i16x16; |
2944 | 0 | h->mb.i_type = I_16x16; |
2945 | 0 | COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 ); |
2946 | 0 | COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 ); |
2947 | 0 | if( analysis.i_satd_pcm < i_cost ) |
2948 | 0 | h->mb.i_type = I_PCM; |
2949 | | |
2950 | 0 | else if( analysis.i_mbrd >= 2 ) |
2951 | 0 | intra_rd_refine( h, &analysis ); |
2952 | 0 | } |
2953 | 0 | else if( h->sh.i_type == SLICE_TYPE_P ) |
2954 | 0 | { |
2955 | 0 | int b_skip = 0; |
2956 | |
|
2957 | 0 | h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 ); |
2958 | |
|
2959 | 0 | analysis.b_try_skip = 0; |
2960 | 0 | if( analysis.b_force_intra ) |
2961 | 0 | { |
2962 | 0 | if( !h->param.analyse.b_psy ) |
2963 | 0 | { |
2964 | 0 | mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) ); |
2965 | 0 | goto intra_analysis; |
2966 | 0 | } |
2967 | 0 | } |
2968 | 0 | else |
2969 | 0 | { |
2970 | | /* Special fast-skip logic using information from mb_info. */ |
2971 | 0 | if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) ) |
2972 | 0 | { |
2973 | 0 | if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred && |
2974 | 0 | h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp ) |
2975 | 0 | { |
2976 | 0 | h->mb.i_partition = D_16x16; |
2977 | | /* Use the P-SKIP MV if we can... */ |
2978 | 0 | if( !M32(h->mb.cache.pskip_mv) ) |
2979 | 0 | { |
2980 | 0 | b_skip = 1; |
2981 | 0 | h->mb.i_type = P_SKIP; |
2982 | 0 | } |
2983 | | /* Otherwise, just force a 16x16 block. */ |
2984 | 0 | else |
2985 | 0 | { |
2986 | 0 | h->mb.i_type = P_L0; |
2987 | 0 | analysis.l0.me16x16.i_ref = 0; |
2988 | 0 | M32( analysis.l0.me16x16.mv ) = 0; |
2989 | 0 | } |
2990 | 0 | goto skip_analysis; |
2991 | 0 | } |
2992 | | /* Reset the information accordingly */ |
2993 | 0 | else if( h->param.analyse.b_mb_info_update ) |
2994 | 0 | h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT; |
2995 | 0 | } |
2996 | | |
2997 | 0 | int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1]; |
2998 | | /* If the current macroblock is off the frame, just skip it. */ |
2999 | 0 | if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid ) |
3000 | 0 | b_skip = 1; |
3001 | | /* Fast P_SKIP detection */ |
3002 | 0 | else if( h->param.analyse.b_fast_pskip ) |
3003 | 0 | { |
3004 | 0 | if( skip_invalid ) |
3005 | | // FIXME don't need to check this if the reference frame is done |
3006 | 0 | {} |
3007 | 0 | else if( h->param.analyse.i_subpel_refine >= 3 ) |
3008 | 0 | analysis.b_try_skip = 1; |
3009 | 0 | else if( h->mb.i_mb_type_left[0] == P_SKIP || |
3010 | 0 | h->mb.i_mb_type_top == P_SKIP || |
3011 | 0 | h->mb.i_mb_type_topleft == P_SKIP || |
3012 | 0 | h->mb.i_mb_type_topright == P_SKIP ) |
3013 | 0 | b_skip = x264_macroblock_probe_pskip( h ); |
3014 | 0 | } |
3015 | 0 | } |
3016 | | |
3017 | 0 | h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 ); |
3018 | |
|
3019 | 0 | if( b_skip ) |
3020 | 0 | { |
3021 | 0 | h->mb.i_type = P_SKIP; |
3022 | 0 | h->mb.i_partition = D_16x16; |
3023 | 0 | assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 ); |
3024 | 0 | skip_analysis: |
3025 | | /* Set up MVs for future predictors */ |
3026 | 0 | for( int i = 0; i < h->mb.pic.i_fref[0]; i++ ) |
3027 | 0 | M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0; |
3028 | 0 | } |
3029 | 0 | else |
3030 | 0 | { |
3031 | 0 | const unsigned int flags = h->param.analyse.inter; |
3032 | 0 | int i_type; |
3033 | 0 | int i_partition; |
3034 | 0 | int i_satd_inter, i_satd_intra; |
3035 | |
|
3036 | 0 | mb_analyse_load_costs( h, &analysis ); |
3037 | |
|
3038 | 0 | mb_analyse_inter_p16x16( h, &analysis ); |
3039 | |
|
3040 | 0 | if( h->mb.i_type == P_SKIP ) |
3041 | 0 | { |
3042 | 0 | for( int i = 1; i < h->mb.pic.i_fref[0]; i++ ) |
3043 | 0 | M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0; |
3044 | 0 | return; |
3045 | 0 | } |
3046 | | |
3047 | 0 | if( flags & X264_ANALYSE_PSUB16x16 ) |
3048 | 0 | { |
3049 | 0 | if( h->param.analyse.b_mixed_references ) |
3050 | 0 | mb_analyse_inter_p8x8_mixed_ref( h, &analysis ); |
3051 | 0 | else |
3052 | 0 | mb_analyse_inter_p8x8( h, &analysis ); |
3053 | 0 | } |
3054 | | |
3055 | | /* Select best inter mode */ |
3056 | 0 | i_type = P_L0; |
3057 | 0 | i_partition = D_16x16; |
3058 | 0 | i_cost = analysis.l0.me16x16.cost; |
3059 | |
|
3060 | 0 | if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate || |
3061 | 0 | analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) ) |
3062 | 0 | { |
3063 | 0 | i_type = P_8x8; |
3064 | 0 | i_partition = D_8x8; |
3065 | 0 | i_cost = analysis.l0.i_cost8x8; |
3066 | | |
3067 | | /* Do sub 8x8 */ |
3068 | 0 | if( flags & X264_ANALYSE_PSUB8x8 ) |
3069 | 0 | { |
3070 | 0 | for( int i = 0; i < 4; i++ ) |
3071 | 0 | { |
3072 | 0 | mb_analyse_inter_p4x4( h, &analysis, i ); |
3073 | 0 | int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv; |
3074 | 0 | if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 ) |
3075 | 0 | { |
3076 | 0 | int i_cost8x8 = analysis.l0.i_cost4x4[i]; |
3077 | 0 | h->mb.i_sub_partition[i] = D_L0_4x4; |
3078 | |
|
3079 | 0 | mb_analyse_inter_p8x4( h, &analysis, i ); |
3080 | 0 | COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i], |
3081 | 0 | h->mb.i_sub_partition[i], D_L0_8x4 ); |
3082 | |
|
3083 | 0 | mb_analyse_inter_p4x8( h, &analysis, i ); |
3084 | 0 | COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i], |
3085 | 0 | h->mb.i_sub_partition[i], D_L0_4x8 ); |
3086 | |
|
3087 | 0 | i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost; |
3088 | 0 | } |
3089 | 0 | mb_cache_mv_p8x8( h, &analysis, i ); |
3090 | 0 | } |
3091 | 0 | analysis.l0.i_cost8x8 = i_cost; |
3092 | 0 | } |
3093 | 0 | } |
3094 | | |
3095 | | /* Now do 16x8/8x16 */ |
3096 | 0 | int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv; |
3097 | 0 | if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate || |
3098 | 0 | analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) ) |
3099 | 0 | { |
3100 | 0 | int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost |
3101 | 0 | + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1; |
3102 | 0 | analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost; |
3103 | |
|
3104 | 0 | mb_analyse_inter_p16x8( h, &analysis, i_cost ); |
3105 | 0 | COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 ); |
3106 | |
|
3107 | 0 | i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost |
3108 | 0 | + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1; |
3109 | 0 | analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost; |
3110 | |
|
3111 | 0 | mb_analyse_inter_p8x16( h, &analysis, i_cost ); |
3112 | 0 | COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 ); |
3113 | 0 | } |
3114 | |
|
3115 | 0 | h->mb.i_partition = i_partition; |
3116 | | |
3117 | | /* refine qpel */ |
3118 | | //FIXME mb_type costs? |
3119 | 0 | if( analysis.i_mbrd || !h->mb.i_subpel_refine ) |
3120 | 0 | { |
3121 | | /* refine later */ |
3122 | 0 | } |
3123 | 0 | else if( i_partition == D_16x16 ) |
3124 | 0 | { |
3125 | 0 | x264_me_refine_qpel( h, &analysis.l0.me16x16 ); |
3126 | 0 | i_cost = analysis.l0.me16x16.cost; |
3127 | 0 | } |
3128 | 0 | else if( i_partition == D_16x8 ) |
3129 | 0 | { |
3130 | 0 | x264_me_refine_qpel( h, &analysis.l0.me16x8[0] ); |
3131 | 0 | x264_me_refine_qpel( h, &analysis.l0.me16x8[1] ); |
3132 | 0 | i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost; |
3133 | 0 | } |
3134 | 0 | else if( i_partition == D_8x16 ) |
3135 | 0 | { |
3136 | 0 | x264_me_refine_qpel( h, &analysis.l0.me8x16[0] ); |
3137 | 0 | x264_me_refine_qpel( h, &analysis.l0.me8x16[1] ); |
3138 | 0 | i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost; |
3139 | 0 | } |
3140 | 0 | else if( i_partition == D_8x8 ) |
3141 | 0 | { |
3142 | 0 | i_cost = 0; |
3143 | 0 | for( int i8x8 = 0; i8x8 < 4; i8x8++ ) |
3144 | 0 | { |
3145 | 0 | switch( h->mb.i_sub_partition[i8x8] ) |
3146 | 0 | { |
3147 | 0 | case D_L0_8x8: |
3148 | 0 | x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] ); |
3149 | 0 | i_cost += analysis.l0.me8x8[i8x8].cost; |
3150 | 0 | break; |
3151 | 0 | case D_L0_8x4: |
3152 | 0 | x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] ); |
3153 | 0 | x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] ); |
3154 | 0 | i_cost += analysis.l0.me8x4[i8x8][0].cost + |
3155 | 0 | analysis.l0.me8x4[i8x8][1].cost; |
3156 | 0 | break; |
3157 | 0 | case D_L0_4x8: |
3158 | 0 | x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] ); |
3159 | 0 | x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] ); |
3160 | 0 | i_cost += analysis.l0.me4x8[i8x8][0].cost + |
3161 | 0 | analysis.l0.me4x8[i8x8][1].cost; |
3162 | 0 | break; |
3163 | | |
3164 | 0 | case D_L0_4x4: |
3165 | 0 | x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] ); |
3166 | 0 | x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] ); |
3167 | 0 | x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] ); |
3168 | 0 | x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] ); |
3169 | 0 | i_cost += analysis.l0.me4x4[i8x8][0].cost + |
3170 | 0 | analysis.l0.me4x4[i8x8][1].cost + |
3171 | 0 | analysis.l0.me4x4[i8x8][2].cost + |
3172 | 0 | analysis.l0.me4x4[i8x8][3].cost; |
3173 | 0 | break; |
3174 | 0 | default: |
3175 | 0 | x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" ); |
3176 | 0 | break; |
3177 | 0 | } |
3178 | 0 | } |
3179 | 0 | } |
3180 | | |
3181 | 0 | if( h->mb.b_chroma_me ) |
3182 | 0 | { |
3183 | 0 | if( CHROMA444 ) |
3184 | 0 | { |
3185 | 0 | mb_analyse_intra( h, &analysis, i_cost ); |
3186 | 0 | mb_analyse_intra_chroma( h, &analysis ); |
3187 | 0 | } |
3188 | 0 | else |
3189 | 0 | { |
3190 | 0 | mb_analyse_intra_chroma( h, &analysis ); |
3191 | 0 | mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma ); |
3192 | 0 | } |
3193 | 0 | analysis.i_satd_i16x16 += analysis.i_satd_chroma; |
3194 | 0 | analysis.i_satd_i8x8 += analysis.i_satd_chroma; |
3195 | 0 | analysis.i_satd_i4x4 += analysis.i_satd_chroma; |
3196 | 0 | } |
3197 | 0 | else |
3198 | 0 | mb_analyse_intra( h, &analysis, i_cost ); |
3199 | |
|
3200 | 0 | i_satd_inter = i_cost; |
3201 | 0 | i_satd_intra = X264_MIN3( analysis.i_satd_i16x16, |
3202 | 0 | analysis.i_satd_i8x8, |
3203 | 0 | analysis.i_satd_i4x4 ); |
3204 | |
|
3205 | 0 | if( analysis.i_mbrd ) |
3206 | 0 | { |
3207 | 0 | mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) ); |
3208 | 0 | i_type = P_L0; |
3209 | 0 | i_partition = D_16x16; |
3210 | 0 | i_cost = analysis.l0.i_rd16x16; |
3211 | 0 | COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 ); |
3212 | 0 | COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 ); |
3213 | 0 | COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 ); |
3214 | 0 | h->mb.i_type = i_type; |
3215 | 0 | h->mb.i_partition = i_partition; |
3216 | 0 | if( i_cost < COST_MAX ) |
3217 | 0 | mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost ); |
3218 | 0 | intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 ); |
3219 | 0 | } |
3220 | |
|
3221 | 0 | COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 ); |
3222 | 0 | COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 ); |
3223 | 0 | COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 ); |
3224 | 0 | COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM ); |
3225 | |
|
3226 | 0 | h->mb.i_type = i_type; |
3227 | |
|
3228 | 0 | if( analysis.b_force_intra && !IS_INTRA(i_type) ) |
3229 | 0 | { |
3230 | | /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if |
3231 | | * it was an inter block. */ |
3232 | 0 | analyse_update_cache( h, &analysis ); |
3233 | 0 | x264_macroblock_encode( h ); |
3234 | 0 | for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ ) |
3235 | 0 | h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 ); |
3236 | 0 | if( !CHROMA444 ) |
3237 | 0 | { |
3238 | 0 | int height = 16 >> CHROMA_V_SHIFT; |
3239 | 0 | h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height ); |
3240 | 0 | h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height ); |
3241 | 0 | } |
3242 | 0 | mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) ); |
3243 | 0 | goto intra_analysis; |
3244 | 0 | } |
3245 | | |
3246 | 0 | if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM ) |
3247 | 0 | { |
3248 | 0 | if( IS_INTRA( h->mb.i_type ) ) |
3249 | 0 | { |
3250 | 0 | intra_rd_refine( h, &analysis ); |
3251 | 0 | } |
3252 | 0 | else if( i_partition == D_16x16 ) |
3253 | 0 | { |
3254 | 0 | x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref ); |
3255 | 0 | analysis.l0.me16x16.cost = i_cost; |
3256 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 ); |
3257 | 0 | } |
3258 | 0 | else if( i_partition == D_16x8 ) |
3259 | 0 | { |
3260 | 0 | M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101; |
3261 | 0 | x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref ); |
3262 | 0 | x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref ); |
3263 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 ); |
3264 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 ); |
3265 | 0 | } |
3266 | 0 | else if( i_partition == D_8x16 ) |
3267 | 0 | { |
3268 | 0 | M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101; |
3269 | 0 | x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref ); |
3270 | 0 | x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref ); |
3271 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 ); |
3272 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 ); |
3273 | 0 | } |
3274 | 0 | else if( i_partition == D_8x8 ) |
3275 | 0 | { |
3276 | 0 | analyse_update_cache( h, &analysis ); |
3277 | 0 | for( int i8x8 = 0; i8x8 < 4; i8x8++ ) |
3278 | 0 | { |
3279 | 0 | if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 ) |
3280 | 0 | { |
3281 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 ); |
3282 | 0 | } |
3283 | 0 | else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 ) |
3284 | 0 | { |
3285 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 ); |
3286 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 ); |
3287 | 0 | } |
3288 | 0 | else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 ) |
3289 | 0 | { |
3290 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 ); |
3291 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 ); |
3292 | 0 | } |
3293 | 0 | else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 ) |
3294 | 0 | { |
3295 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 ); |
3296 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 ); |
3297 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 ); |
3298 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 ); |
3299 | 0 | } |
3300 | 0 | } |
3301 | 0 | } |
3302 | 0 | } |
3303 | 0 | } |
3304 | 0 | } |
3305 | 0 | else if( h->sh.i_type == SLICE_TYPE_B ) |
3306 | 0 | { |
3307 | 0 | int i_bskip_cost = COST_MAX; |
3308 | 0 | int b_skip = 0; |
3309 | |
|
3310 | 0 | if( analysis.i_mbrd ) |
3311 | 0 | mb_init_fenc_cache( h, analysis.i_mbrd >= 2 ); |
3312 | |
|
3313 | 0 | h->mb.i_type = B_SKIP; |
3314 | 0 | if( h->mb.b_direct_auto_write ) |
3315 | 0 | { |
3316 | | /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */ |
3317 | 0 | for( int i = 0; i < 2; i++ ) |
3318 | 0 | { |
3319 | 0 | int b_changed = 1; |
3320 | 0 | h->sh.b_direct_spatial_mv_pred ^= 1; |
3321 | 0 | analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL ); |
3322 | 0 | if( analysis.b_direct_available ) |
3323 | 0 | { |
3324 | 0 | if( b_changed ) |
3325 | 0 | { |
3326 | 0 | x264_mb_mc( h ); |
3327 | 0 | b_skip = x264_macroblock_probe_bskip( h ); |
3328 | 0 | } |
3329 | 0 | h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip; |
3330 | 0 | } |
3331 | 0 | else |
3332 | 0 | b_skip = 0; |
3333 | 0 | } |
3334 | 0 | } |
3335 | 0 | else |
3336 | 0 | analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL ); |
3337 | |
|
3338 | 0 | analysis.b_try_skip = 0; |
3339 | 0 | if( analysis.b_direct_available ) |
3340 | 0 | { |
3341 | 0 | if( !h->mb.b_direct_auto_write ) |
3342 | 0 | x264_mb_mc( h ); |
3343 | | /* If the current macroblock is off the frame, just skip it. */ |
3344 | 0 | if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height ) |
3345 | 0 | b_skip = 1; |
3346 | 0 | else if( analysis.i_mbrd ) |
3347 | 0 | { |
3348 | 0 | i_bskip_cost = ssd_mb( h ); |
3349 | | /* 6 = minimum cavlc cost of a non-skipped MB */ |
3350 | 0 | b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8); |
3351 | 0 | } |
3352 | 0 | else if( !h->mb.b_direct_auto_write ) |
3353 | 0 | { |
3354 | | /* Conditioning the probe on neighboring block types |
3355 | | * doesn't seem to help speed or quality. */ |
3356 | 0 | analysis.b_try_skip = x264_macroblock_probe_bskip( h ); |
3357 | 0 | if( h->param.analyse.i_subpel_refine < 3 ) |
3358 | 0 | b_skip = analysis.b_try_skip; |
3359 | 0 | } |
3360 | | /* Set up MVs for future predictors */ |
3361 | 0 | if( b_skip ) |
3362 | 0 | { |
3363 | 0 | for( int i = 0; i < h->mb.pic.i_fref[0]; i++ ) |
3364 | 0 | M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0; |
3365 | 0 | for( int i = 0; i < h->mb.pic.i_fref[1]; i++ ) |
3366 | 0 | M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0; |
3367 | 0 | } |
3368 | 0 | } |
3369 | |
|
3370 | 0 | if( !b_skip ) |
3371 | 0 | { |
3372 | 0 | const unsigned int flags = h->param.analyse.inter; |
3373 | 0 | int i_type; |
3374 | 0 | int i_partition; |
3375 | 0 | int i_satd_inter; |
3376 | 0 | h->mb.b_skip_mc = 0; |
3377 | 0 | h->mb.i_type = B_DIRECT; |
3378 | |
|
3379 | 0 | mb_analyse_load_costs( h, &analysis ); |
3380 | | |
3381 | | /* select best inter mode */ |
3382 | | /* direct must be first */ |
3383 | 0 | if( analysis.b_direct_available ) |
3384 | 0 | mb_analyse_inter_direct( h, &analysis ); |
3385 | |
|
3386 | 0 | mb_analyse_inter_b16x16( h, &analysis ); |
3387 | |
|
3388 | 0 | if( h->mb.i_type == B_SKIP ) |
3389 | 0 | { |
3390 | 0 | for( int i = 1; i < h->mb.pic.i_fref[0]; i++ ) |
3391 | 0 | M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0; |
3392 | 0 | for( int i = 1; i < h->mb.pic.i_fref[1]; i++ ) |
3393 | 0 | M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0; |
3394 | 0 | return; |
3395 | 0 | } |
3396 | | |
3397 | 0 | i_type = B_L0_L0; |
3398 | 0 | i_partition = D_16x16; |
3399 | 0 | i_cost = analysis.l0.me16x16.cost; |
3400 | 0 | COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 ); |
3401 | 0 | COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI ); |
3402 | 0 | COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT ); |
3403 | |
|
3404 | 0 | if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 ) |
3405 | 0 | { |
3406 | 0 | mb_analyse_b_rd( h, &analysis, i_cost ); |
3407 | 0 | if( i_bskip_cost < analysis.i_rd16x16direct && |
3408 | 0 | i_bskip_cost < analysis.i_rd16x16bi && |
3409 | 0 | i_bskip_cost < analysis.l0.i_rd16x16 && |
3410 | 0 | i_bskip_cost < analysis.l1.i_rd16x16 ) |
3411 | 0 | { |
3412 | 0 | h->mb.i_type = B_SKIP; |
3413 | 0 | analyse_update_cache( h, &analysis ); |
3414 | 0 | return; |
3415 | 0 | } |
3416 | 0 | } |
3417 | | |
3418 | 0 | if( flags & X264_ANALYSE_BSUB16x16 ) |
3419 | 0 | { |
3420 | 0 | if( h->param.analyse.b_mixed_references ) |
3421 | 0 | mb_analyse_inter_b8x8_mixed_ref( h, &analysis ); |
3422 | 0 | else |
3423 | 0 | mb_analyse_inter_b8x8( h, &analysis ); |
3424 | |
|
3425 | 0 | COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 ); |
3426 | | |
3427 | | /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */ |
3428 | 0 | int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0; |
3429 | 0 | int i_mb_type, i_partition16x8[2], i_partition8x16[2]; |
3430 | 0 | for( int i = 0; i < 2; i++ ) |
3431 | 0 | { |
3432 | 0 | int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost; |
3433 | 0 | int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost; |
3434 | | // 16x8 |
3435 | 0 | i_best_cost = COST_MAX; |
3436 | 0 | i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1]; |
3437 | 0 | i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1]; |
3438 | 0 | i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1]; |
3439 | 0 | avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost |
3440 | 0 | + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1; |
3441 | 0 | avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost |
3442 | 0 | + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1; |
3443 | 0 | COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 ); |
3444 | 0 | COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 ); |
3445 | 0 | COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 ); |
3446 | 0 | analysis.i_cost_est16x8[i] = i_best_cost; |
3447 | | |
3448 | | // 8x16 |
3449 | 0 | i_best_cost = COST_MAX; |
3450 | 0 | i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2]; |
3451 | 0 | i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2]; |
3452 | 0 | i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2]; |
3453 | 0 | avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost |
3454 | 0 | + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1; |
3455 | 0 | avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost |
3456 | 0 | + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1; |
3457 | 0 | COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 ); |
3458 | 0 | COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 ); |
3459 | 0 | COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 ); |
3460 | 0 | analysis.i_cost_est8x16[i] = i_best_cost; |
3461 | 0 | } |
3462 | 0 | i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2); |
3463 | 0 | analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type]; |
3464 | 0 | i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1]; |
3465 | 0 | i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2); |
3466 | 0 | analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type]; |
3467 | 0 | i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1]; |
3468 | | |
3469 | | /* We can gain a little speed by checking the mode with the lowest estimated cost first */ |
3470 | 0 | int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total; |
3471 | 0 | if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) ) |
3472 | 0 | { |
3473 | 0 | mb_analyse_inter_b16x8( h, &analysis, i_cost ); |
3474 | 0 | COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 ); |
3475 | 0 | } |
3476 | 0 | if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost ) |
3477 | 0 | { |
3478 | 0 | mb_analyse_inter_b8x16( h, &analysis, i_cost ); |
3479 | 0 | COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 ); |
3480 | 0 | } |
3481 | 0 | if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) ) |
3482 | 0 | { |
3483 | 0 | mb_analyse_inter_b16x8( h, &analysis, i_cost ); |
3484 | 0 | COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 ); |
3485 | 0 | } |
3486 | 0 | } |
3487 | |
|
3488 | 0 | if( analysis.i_mbrd || !h->mb.i_subpel_refine ) |
3489 | 0 | { |
3490 | | /* refine later */ |
3491 | 0 | } |
3492 | | /* refine qpel */ |
3493 | 0 | else if( i_partition == D_16x16 ) |
3494 | 0 | { |
3495 | 0 | analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0]; |
3496 | 0 | analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1]; |
3497 | 0 | if( i_type == B_L0_L0 ) |
3498 | 0 | { |
3499 | 0 | x264_me_refine_qpel( h, &analysis.l0.me16x16 ); |
3500 | 0 | i_cost = analysis.l0.me16x16.cost |
3501 | 0 | + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0]; |
3502 | 0 | } |
3503 | 0 | else if( i_type == B_L1_L1 ) |
3504 | 0 | { |
3505 | 0 | x264_me_refine_qpel( h, &analysis.l1.me16x16 ); |
3506 | 0 | i_cost = analysis.l1.me16x16.cost |
3507 | 0 | + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1]; |
3508 | 0 | } |
3509 | 0 | else if( i_type == B_BI_BI ) |
3510 | 0 | { |
3511 | 0 | x264_me_refine_qpel( h, &analysis.l0.bi16x16 ); |
3512 | 0 | x264_me_refine_qpel( h, &analysis.l1.bi16x16 ); |
3513 | 0 | } |
3514 | 0 | } |
3515 | 0 | else if( i_partition == D_16x8 ) |
3516 | 0 | { |
3517 | 0 | for( int i = 0; i < 2; i++ ) |
3518 | 0 | { |
3519 | 0 | if( analysis.i_mb_partition16x8[i] != D_L1_8x8 ) |
3520 | 0 | x264_me_refine_qpel( h, &analysis.l0.me16x8[i] ); |
3521 | 0 | if( analysis.i_mb_partition16x8[i] != D_L0_8x8 ) |
3522 | 0 | x264_me_refine_qpel( h, &analysis.l1.me16x8[i] ); |
3523 | 0 | } |
3524 | 0 | } |
3525 | 0 | else if( i_partition == D_8x16 ) |
3526 | 0 | { |
3527 | 0 | for( int i = 0; i < 2; i++ ) |
3528 | 0 | { |
3529 | 0 | if( analysis.i_mb_partition8x16[i] != D_L1_8x8 ) |
3530 | 0 | x264_me_refine_qpel( h, &analysis.l0.me8x16[i] ); |
3531 | 0 | if( analysis.i_mb_partition8x16[i] != D_L0_8x8 ) |
3532 | 0 | x264_me_refine_qpel( h, &analysis.l1.me8x16[i] ); |
3533 | 0 | } |
3534 | 0 | } |
3535 | 0 | else if( i_partition == D_8x8 ) |
3536 | 0 | { |
3537 | 0 | for( int i = 0; i < 4; i++ ) |
3538 | 0 | { |
3539 | 0 | x264_me_t *m; |
3540 | 0 | int i_part_cost_old; |
3541 | 0 | int i_type_cost; |
3542 | 0 | int i_part_type = h->mb.i_sub_partition[i]; |
3543 | 0 | int b_bidir = (i_part_type == D_BI_8x8); |
3544 | |
|
3545 | 0 | if( i_part_type == D_DIRECT_8x8 ) |
3546 | 0 | continue; |
3547 | 0 | if( x264_mb_partition_listX_table[0][i_part_type] ) |
3548 | 0 | { |
3549 | 0 | m = &analysis.l0.me8x8[i]; |
3550 | 0 | i_part_cost_old = m->cost; |
3551 | 0 | i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8]; |
3552 | 0 | m->cost -= i_type_cost; |
3553 | 0 | x264_me_refine_qpel( h, m ); |
3554 | 0 | if( !b_bidir ) |
3555 | 0 | analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old; |
3556 | 0 | } |
3557 | 0 | if( x264_mb_partition_listX_table[1][i_part_type] ) |
3558 | 0 | { |
3559 | 0 | m = &analysis.l1.me8x8[i]; |
3560 | 0 | i_part_cost_old = m->cost; |
3561 | 0 | i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8]; |
3562 | 0 | m->cost -= i_type_cost; |
3563 | 0 | x264_me_refine_qpel( h, m ); |
3564 | 0 | if( !b_bidir ) |
3565 | 0 | analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old; |
3566 | 0 | } |
3567 | | /* TODO: update mvp? */ |
3568 | 0 | } |
3569 | 0 | } |
3570 | |
|
3571 | 0 | i_satd_inter = i_cost; |
3572 | |
|
3573 | 0 | if( analysis.i_mbrd ) |
3574 | 0 | { |
3575 | 0 | mb_analyse_b_rd( h, &analysis, i_satd_inter ); |
3576 | 0 | i_type = B_SKIP; |
3577 | 0 | i_cost = i_bskip_cost; |
3578 | 0 | i_partition = D_16x16; |
3579 | 0 | COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 ); |
3580 | 0 | COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 ); |
3581 | 0 | COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI ); |
3582 | 0 | COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT ); |
3583 | 0 | COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 ); |
3584 | 0 | COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 ); |
3585 | 0 | COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 ); |
3586 | |
|
3587 | 0 | h->mb.i_type = i_type; |
3588 | 0 | h->mb.i_partition = i_partition; |
3589 | 0 | } |
3590 | |
|
3591 | 0 | if( h->mb.b_chroma_me ) |
3592 | 0 | { |
3593 | 0 | if( CHROMA444 ) |
3594 | 0 | { |
3595 | 0 | mb_analyse_intra( h, &analysis, i_satd_inter ); |
3596 | 0 | mb_analyse_intra_chroma( h, &analysis ); |
3597 | 0 | } |
3598 | 0 | else |
3599 | 0 | { |
3600 | 0 | mb_analyse_intra_chroma( h, &analysis ); |
3601 | 0 | mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma ); |
3602 | 0 | } |
3603 | 0 | analysis.i_satd_i16x16 += analysis.i_satd_chroma; |
3604 | 0 | analysis.i_satd_i8x8 += analysis.i_satd_chroma; |
3605 | 0 | analysis.i_satd_i4x4 += analysis.i_satd_chroma; |
3606 | 0 | } |
3607 | 0 | else |
3608 | 0 | mb_analyse_intra( h, &analysis, i_satd_inter ); |
3609 | |
|
3610 | 0 | if( analysis.i_mbrd ) |
3611 | 0 | { |
3612 | 0 | mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost ); |
3613 | 0 | intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 ); |
3614 | 0 | } |
3615 | |
|
3616 | 0 | COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 ); |
3617 | 0 | COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 ); |
3618 | 0 | COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 ); |
3619 | 0 | COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM ); |
3620 | |
|
3621 | 0 | h->mb.i_type = i_type; |
3622 | 0 | h->mb.i_partition = i_partition; |
3623 | |
|
3624 | 0 | if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM ) |
3625 | 0 | intra_rd_refine( h, &analysis ); |
3626 | 0 | if( h->mb.i_subpel_refine >= 5 ) |
3627 | 0 | refine_bidir( h, &analysis ); |
3628 | |
|
3629 | 0 | if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP ) |
3630 | 0 | { |
3631 | 0 | int i_biweight; |
3632 | 0 | analyse_update_cache( h, &analysis ); |
3633 | |
|
3634 | 0 | if( i_partition == D_16x16 ) |
3635 | 0 | { |
3636 | 0 | if( i_type == B_L0_L0 ) |
3637 | 0 | { |
3638 | 0 | analysis.l0.me16x16.cost = i_cost; |
3639 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 ); |
3640 | 0 | } |
3641 | 0 | else if( i_type == B_L1_L1 ) |
3642 | 0 | { |
3643 | 0 | analysis.l1.me16x16.cost = i_cost; |
3644 | 0 | x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 ); |
3645 | 0 | } |
3646 | 0 | else if( i_type == B_BI_BI ) |
3647 | 0 | { |
3648 | 0 | i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref]; |
3649 | 0 | x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 ); |
3650 | 0 | } |
3651 | 0 | } |
3652 | 0 | else if( i_partition == D_16x8 ) |
3653 | 0 | { |
3654 | 0 | for( int i = 0; i < 2; i++ ) |
3655 | 0 | { |
3656 | 0 | h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i]; |
3657 | 0 | if( analysis.i_mb_partition16x8[i] == D_L0_8x8 ) |
3658 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 ); |
3659 | 0 | else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 ) |
3660 | 0 | x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 ); |
3661 | 0 | else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 ) |
3662 | 0 | { |
3663 | 0 | i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref]; |
3664 | 0 | x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 ); |
3665 | 0 | } |
3666 | 0 | } |
3667 | 0 | } |
3668 | 0 | else if( i_partition == D_8x16 ) |
3669 | 0 | { |
3670 | 0 | for( int i = 0; i < 2; i++ ) |
3671 | 0 | { |
3672 | 0 | h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i]; |
3673 | 0 | if( analysis.i_mb_partition8x16[i] == D_L0_8x8 ) |
3674 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 ); |
3675 | 0 | else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 ) |
3676 | 0 | x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 ); |
3677 | 0 | else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 ) |
3678 | 0 | { |
3679 | 0 | i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref]; |
3680 | 0 | x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 ); |
3681 | 0 | } |
3682 | 0 | } |
3683 | 0 | } |
3684 | 0 | else if( i_partition == D_8x8 ) |
3685 | 0 | { |
3686 | 0 | for( int i = 0; i < 4; i++ ) |
3687 | 0 | { |
3688 | 0 | if( h->mb.i_sub_partition[i] == D_L0_8x8 ) |
3689 | 0 | x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 ); |
3690 | 0 | else if( h->mb.i_sub_partition[i] == D_L1_8x8 ) |
3691 | 0 | x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 ); |
3692 | 0 | else if( h->mb.i_sub_partition[i] == D_BI_8x8 ) |
3693 | 0 | { |
3694 | 0 | i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref]; |
3695 | 0 | x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 ); |
3696 | 0 | } |
3697 | 0 | } |
3698 | 0 | } |
3699 | 0 | } |
3700 | 0 | } |
3701 | 0 | } |
3702 | | |
3703 | 0 | analyse_update_cache( h, &analysis ); |
3704 | | |
3705 | | /* In rare cases we can end up qpel-RDing our way back to a larger partition size |
3706 | | * without realizing it. Check for this and account for it if necessary. */ |
3707 | 0 | if( analysis.i_mbrd >= 2 ) |
3708 | 0 | { |
3709 | | /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */ |
3710 | 0 | static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2}; |
3711 | 0 | int list = check_mv_lists[h->mb.i_type] - 1; |
3712 | 0 | if( list >= 0 && h->mb.i_partition != D_16x16 && |
3713 | 0 | M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) && |
3714 | 0 | h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] ) |
3715 | 0 | h->mb.i_partition = D_16x16; |
3716 | 0 | } |
3717 | |
|
3718 | 0 | if( !analysis.i_mbrd ) |
3719 | 0 | mb_analyse_transform( h ); |
3720 | |
|
3721 | 0 | if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) ) |
3722 | 0 | mb_analyse_qp_rd( h, &analysis ); |
3723 | |
|
3724 | 0 | h->mb.b_trellis = h->param.analyse.i_trellis; |
3725 | 0 | h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type )); |
3726 | |
|
3727 | 0 | if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 ) |
3728 | 0 | psy_trellis_init( h, 0 ); |
3729 | 0 | if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction ) |
3730 | 0 | h->mb.i_skip_intra = 0; |
3731 | 0 | } Unexecuted instantiation: x264_8_macroblock_analyse Unexecuted instantiation: x264_10_macroblock_analyse |
3732 | | |
3733 | | /*-------------------- Update MB from the analysis ----------------------*/ |
3734 | | static void analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ) |
3735 | 0 | { |
3736 | 0 | switch( h->mb.i_type ) |
3737 | 0 | { |
3738 | 0 | case I_4x4: |
3739 | 0 | for( int i = 0; i < 16; i++ ) |
3740 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i]; |
3741 | |
|
3742 | 0 | mb_analyse_intra_chroma( h, a ); |
3743 | 0 | break; |
3744 | 0 | case I_8x8: |
3745 | 0 | for( int i = 0; i < 4; i++ ) |
3746 | 0 | x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] ); |
3747 | |
|
3748 | 0 | mb_analyse_intra_chroma( h, a ); |
3749 | 0 | break; |
3750 | 0 | case I_16x16: |
3751 | 0 | h->mb.i_intra16x16_pred_mode = a->i_predict16x16; |
3752 | 0 | mb_analyse_intra_chroma( h, a ); |
3753 | 0 | break; |
3754 | | |
3755 | 0 | case I_PCM: |
3756 | 0 | break; |
3757 | | |
3758 | 0 | case P_L0: |
3759 | 0 | switch( h->mb.i_partition ) |
3760 | 0 | { |
3761 | 0 | case D_16x16: |
3762 | 0 | x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref ); |
3763 | 0 | x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv ); |
3764 | 0 | break; |
3765 | | |
3766 | 0 | case D_16x8: |
3767 | 0 | x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref ); |
3768 | 0 | x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref ); |
3769 | 0 | x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv ); |
3770 | 0 | x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv ); |
3771 | 0 | break; |
3772 | | |
3773 | 0 | case D_8x16: |
3774 | 0 | x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref ); |
3775 | 0 | x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref ); |
3776 | 0 | x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv ); |
3777 | 0 | x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv ); |
3778 | 0 | break; |
3779 | | |
3780 | 0 | default: |
3781 | 0 | x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition ); |
3782 | 0 | break; |
3783 | 0 | } |
3784 | 0 | break; |
3785 | | |
3786 | 0 | case P_8x8: |
3787 | 0 | x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref ); |
3788 | 0 | x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref ); |
3789 | 0 | x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref ); |
3790 | 0 | x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref ); |
3791 | 0 | for( int i = 0; i < 4; i++ ) |
3792 | 0 | mb_cache_mv_p8x8( h, a, i ); |
3793 | 0 | break; |
3794 | | |
3795 | 0 | case P_SKIP: |
3796 | 0 | { |
3797 | 0 | h->mb.i_partition = D_16x16; |
3798 | 0 | x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 ); |
3799 | 0 | x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv ); |
3800 | 0 | break; |
3801 | 0 | } |
3802 | | |
3803 | 0 | case B_SKIP: |
3804 | 0 | case B_DIRECT: |
3805 | 0 | h->mb.i_partition = h->mb.cache.direct_partition; |
3806 | 0 | mb_load_mv_direct8x8( h, 0 ); |
3807 | 0 | mb_load_mv_direct8x8( h, 1 ); |
3808 | 0 | mb_load_mv_direct8x8( h, 2 ); |
3809 | 0 | mb_load_mv_direct8x8( h, 3 ); |
3810 | 0 | break; |
3811 | | |
3812 | 0 | case B_8x8: |
3813 | | /* optimize: cache might not need to be rewritten */ |
3814 | 0 | for( int i = 0; i < 4; i++ ) |
3815 | 0 | mb_cache_mv_b8x8( h, a, i, 1 ); |
3816 | 0 | break; |
3817 | | |
3818 | 0 | default: /* the rest of the B types */ |
3819 | 0 | switch( h->mb.i_partition ) |
3820 | 0 | { |
3821 | 0 | case D_16x16: |
3822 | 0 | switch( h->mb.i_type ) |
3823 | 0 | { |
3824 | 0 | case B_L0_L0: |
3825 | 0 | x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref ); |
3826 | 0 | x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv ); |
3827 | |
|
3828 | 0 | x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 ); |
3829 | 0 | x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 ); |
3830 | 0 | x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 ); |
3831 | 0 | break; |
3832 | 0 | case B_L1_L1: |
3833 | 0 | x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 ); |
3834 | 0 | x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 ); |
3835 | 0 | x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 ); |
3836 | |
|
3837 | 0 | x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref ); |
3838 | 0 | x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv ); |
3839 | 0 | break; |
3840 | 0 | case B_BI_BI: |
3841 | 0 | x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref ); |
3842 | 0 | x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv ); |
3843 | |
|
3844 | 0 | x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref ); |
3845 | 0 | x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv ); |
3846 | 0 | break; |
3847 | 0 | } |
3848 | 0 | break; |
3849 | 0 | case D_16x8: |
3850 | 0 | mb_cache_mv_b16x8( h, a, 0, 1 ); |
3851 | 0 | mb_cache_mv_b16x8( h, a, 1, 1 ); |
3852 | 0 | break; |
3853 | 0 | case D_8x16: |
3854 | 0 | mb_cache_mv_b8x16( h, a, 0, 1 ); |
3855 | 0 | mb_cache_mv_b8x16( h, a, 1, 1 ); |
3856 | 0 | break; |
3857 | 0 | default: |
3858 | 0 | x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" ); |
3859 | 0 | break; |
3860 | 0 | } |
3861 | 0 | } |
3862 | | |
3863 | 0 | #ifndef NDEBUG |
3864 | 0 | if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) ) |
3865 | 0 | { |
3866 | 0 | for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ ) |
3867 | 0 | { |
3868 | 0 | int completed; |
3869 | 0 | int ref = h->mb.cache.ref[l][x264_scan8[0]]; |
3870 | 0 | if( ref < 0 ) |
3871 | 0 | continue; |
3872 | 0 | completed = x264_frame_cond_wait( h->fref[l][ ref >> MB_INTERLACED ]->orig, -1 ); |
3873 | 0 | if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed ) |
3874 | 0 | { |
3875 | 0 | x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n"); |
3876 | 0 | x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type); |
3877 | 0 | x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref, |
3878 | 0 | h->mb.cache.mv[l][x264_scan8[15]][0], |
3879 | 0 | h->mb.cache.mv[l][x264_scan8[15]][1] ); |
3880 | 0 | x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]); |
3881 | 0 | x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y); |
3882 | 0 | x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed ); |
3883 | 0 | x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n"); |
3884 | 0 | mb_analyse_intra( h, a, COST_MAX ); |
3885 | 0 | h->mb.i_type = I_16x16; |
3886 | 0 | h->mb.i_intra16x16_pred_mode = a->i_predict16x16; |
3887 | 0 | mb_analyse_intra_chroma( h, a ); |
3888 | 0 | } |
3889 | 0 | } |
3890 | 0 | } |
3891 | 0 | #endif |
3892 | 0 | } |
3893 | | |
3894 | | #include "slicetype.c" |
3895 | | |