/src/x264/common/macroblock.c
Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * macroblock.c: macroblock common functions |
3 | | ***************************************************************************** |
4 | | * Copyright (C) 2003-2025 x264 project |
5 | | * |
6 | | * Authors: Fiona Glaser <fiona@x264.com> |
7 | | * Laurent Aimar <fenrir@via.ecp.fr> |
8 | | * Loren Merritt <lorenm@u.washington.edu> |
9 | | * Henrik Gramner <henrik@gramner.com> |
10 | | * |
11 | | * This program is free software; you can redistribute it and/or modify |
12 | | * it under the terms of the GNU General Public License as published by |
13 | | * the Free Software Foundation; either version 2 of the License, or |
14 | | * (at your option) any later version. |
15 | | * |
16 | | * This program is distributed in the hope that it will be useful, |
17 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
18 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
19 | | * GNU General Public License for more details. |
20 | | * |
21 | | * You should have received a copy of the GNU General Public License |
22 | | * along with this program; if not, write to the Free Software |
23 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
24 | | * |
25 | | * This program is also available under a commercial proprietary license. |
26 | | * For more information, contact us at licensing@x264.com. |
27 | | *****************************************************************************/ |
28 | | |
29 | | #include "common.h" |
30 | | |
/* Motion-compensate one full-resolution plane (luma, or a 4:4:4 chroma plane)
 * for a single reference list. Writes the 4*width x 4*height prediction into
 * the reconstruction buffer at 4x4-block position (x,y). List 0 applies the
 * per-reference explicit weights; list 1 is always unweighted here. */
#define MC_LUMA(list,p) \
    h->mc.mc_luma( &h->mb.pic.p_fdec[p][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \
                   &h->mb.pic.p_fref[list][i_ref][p*4], h->mb.pic.i_stride[p], \
                   mvx, mvy, 4*width, 4*height, \
                   list ? x264_weight_none : &h->sh.weight[i_ref][p] );
36 | | |
/* Motion compensate one list-0 partition of the current macroblock.
 * (x,y) is the partition's position within the MB in 4x4-block units;
 * width/height give its size in the same units. Handles luma, both chroma
 * layouts (4:4:4 uses the luma path; 4:2:x uses mc_chroma), and explicit
 * weighted prediction. */
static NOINLINE void mb_mc_0xywh( x264_t *h, int x, int y, int width, int height )
{
    int i8 = x264_scan8[0]+x+8*y;
    int i_ref = h->mb.cache.ref[0][i8];
    /* Clip the cached quarter-pel MV to the legal range, then make it
     * absolute within the MB by adding the partition offset (16 qpel units
     * per 4-pixel block). */
    int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
    int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;

    MC_LUMA( 0, 0 );

    if( CHROMA444 )
    {
        /* Full-resolution chroma: reuse the luma MC path for both planes. */
        MC_LUMA( 0, 1 );
        MC_LUMA( 0, 2 );
    }
    else if( CHROMA_FORMAT )
    {
        int v_shift = CHROMA_V_SHIFT;
        // Chroma in 4:2:0 is offset if MCing from a field of opposite parity
        if( v_shift & MB_INTERLACED & i_ref )
            mvy += (h->mb.i_mb_y & 1)*4 - 2;

        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
        height = 4*height >> v_shift;

        /* mc_chroma fills both interleaved chroma planes at once. */
        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset],
                         &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                         h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
                         mvx, 2*mvy>>v_shift, 2*width, height );

        /* Explicit weighting is applied in-place on the chroma prediction. */
        if( h->sh.weight[i_ref][1].weightfn )
            h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
                                                       &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
                                                       &h->sh.weight[i_ref][1], height );
        if( h->sh.weight[i_ref][2].weightfn )
            h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                                                       &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                                                       &h->sh.weight[i_ref][2], height );
    }
}
/* Motion compensate one list-1 partition (B slices only). Same coordinate
 * conventions as mb_mc_0xywh; list 1 never applies explicit weights, so the
 * chroma path has no weighting step. */
static NOINLINE void mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
{
    int i8 = x264_scan8[0]+x+8*y;
    int i_ref = h->mb.cache.ref[1][i8];
    int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
    int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;

    MC_LUMA( 1, 0 );

    if( CHROMA444 )
    {
        MC_LUMA( 1, 1 );
        MC_LUMA( 1, 2 );
    }
    else if( CHROMA_FORMAT )
    {
        int v_shift = CHROMA_V_SHIFT;
        /* 4:2:0 field MC from opposite parity needs a half-pel chroma shift
         * (same adjustment as in mb_mc_0xywh). */
        if( v_shift & MB_INTERLACED & i_ref )
            mvy += (h->mb.i_mb_y & 1)*4 - 2;

        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
        h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset],
                         &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
                         h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
                         mvx, 2*mvy>>v_shift, 2*width, 4*height>>v_shift );
    }
}
103 | | |
/* Bi-predictive MC for one plane: fetch the list-0 and list-1 predictions
 * (get_ref may return a pointer straight into the reference frame instead of
 * filling the tmp buffer, and reports the stride it chose), then average them
 * with the bipred weight into the reconstruction buffer. Expects src0/src1,
 * tmp0/tmp1, i_stride0/i_stride1, i_mode and weight in the caller's scope. */
#define MC_LUMA_BI(p) \
    src0 = h->mc.get_ref( tmp0, &i_stride0, &h->mb.pic.p_fref[0][i_ref0][p*4], h->mb.pic.i_stride[p], \
                          mvx0, mvy0, 4*width, 4*height, x264_weight_none ); \
    src1 = h->mc.get_ref( tmp1, &i_stride1, &h->mb.pic.p_fref[1][i_ref1][p*4], h->mb.pic.i_stride[p], \
                          mvx1, mvy1, 4*width, 4*height, x264_weight_none ); \
    h->mc.avg[i_mode]( &h->mb.pic.p_fdec[p][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, \
                       src0, i_stride0, src1, i_stride1, weight );
111 | | |
/* Motion compensate one bi-predicted partition: predictions from both lists
 * are fetched into temporaries and blended with the per-ref-pair bipred
 * weight. Same (x,y,width,height) conventions as mb_mc_0xywh. */
static NOINLINE void mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
{
    int i8 = x264_scan8[0]+x+8*y;
    int i_ref0 = h->mb.cache.ref[0][i8];
    int i_ref1 = h->mb.cache.ref[1][i8];
    int weight = h->mb.bipred_weight[i_ref0][i_ref1];
    int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
    int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] ) + 4*4*x;
    int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
    int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
    int i_mode = x264_size2pixel[height][width];
    intptr_t i_stride0 = 16, i_stride1 = 16;
    /* Scratch buffers for the two predictions; chroma packs U at +0 and V at
     * +8 within the 16-pixel stride. */
    ALIGNED_ARRAY_32( pixel, tmp0,[16*16] );
    ALIGNED_ARRAY_32( pixel, tmp1,[16*16] );
    pixel *src0, *src1;

    MC_LUMA_BI( 0 );

    if( CHROMA444 )
    {
        MC_LUMA_BI( 1 );
        MC_LUMA_BI( 2 );
    }
    else if( CHROMA_FORMAT )
    {
        int v_shift = CHROMA_V_SHIFT;
        /* Per-list opposite-parity field adjustment (see mb_mc_0xywh). */
        if( v_shift & MB_INTERLACED & i_ref0 )
            mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
        if( v_shift & MB_INTERLACED & i_ref1 )
            mvy1 += (h->mb.i_mb_y & 1)*4 - 2;

        h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
                         mvx0, 2*mvy0>>v_shift, 2*width, 4*height>>v_shift );
        h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
                         mvx1, 2*mvy1>>v_shift, 2*width, 4*height>>v_shift );

        /* Blend U and V separately with the chroma-sized averaging kernel. */
        int chromapix = h->luma2chroma_pixel[i_mode];
        int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
        h->mc.avg[chromapix]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
        h->mc.avg[chromapix]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
    }
}

#undef MC_LUMA
#undef MC_LUMA_BI
157 | | |
158 | | void x264_mb_mc_8x8( x264_t *h, int i8 ) |
159 | 0 | { |
160 | 0 | int x = 2*(i8&1); |
161 | 0 | int y = 2*(i8>>1); |
162 | |
|
163 | 0 | if( h->sh.i_type == SLICE_TYPE_P ) |
164 | 0 | { |
165 | 0 | switch( h->mb.i_sub_partition[i8] ) |
166 | 0 | { |
167 | 0 | case D_L0_8x8: |
168 | 0 | mb_mc_0xywh( h, x, y, 2, 2 ); |
169 | 0 | break; |
170 | 0 | case D_L0_8x4: |
171 | 0 | mb_mc_0xywh( h, x, y+0, 2, 1 ); |
172 | 0 | mb_mc_0xywh( h, x, y+1, 2, 1 ); |
173 | 0 | break; |
174 | 0 | case D_L0_4x8: |
175 | 0 | mb_mc_0xywh( h, x+0, y, 1, 2 ); |
176 | 0 | mb_mc_0xywh( h, x+1, y, 1, 2 ); |
177 | 0 | break; |
178 | 0 | case D_L0_4x4: |
179 | 0 | mb_mc_0xywh( h, x+0, y+0, 1, 1 ); |
180 | 0 | mb_mc_0xywh( h, x+1, y+0, 1, 1 ); |
181 | 0 | mb_mc_0xywh( h, x+0, y+1, 1, 1 ); |
182 | 0 | mb_mc_0xywh( h, x+1, y+1, 1, 1 ); |
183 | 0 | break; |
184 | 0 | } |
185 | 0 | } |
186 | 0 | else |
187 | 0 | { |
188 | 0 | int scan8 = x264_scan8[0] + x + 8*y; |
189 | |
|
190 | 0 | if( h->mb.cache.ref[0][scan8] >= 0 ) |
191 | 0 | if( h->mb.cache.ref[1][scan8] >= 0 ) |
192 | 0 | mb_mc_01xywh( h, x, y, 2, 2 ); |
193 | 0 | else |
194 | 0 | mb_mc_0xywh( h, x, y, 2, 2 ); |
195 | 0 | else |
196 | 0 | mb_mc_1xywh( h, x, y, 2, 2 ); |
197 | 0 | } |
198 | 0 | } Unexecuted instantiation: x264_8_mb_mc_8x8 Unexecuted instantiation: x264_10_mb_mc_8x8 |
199 | | |
200 | | void x264_mb_mc( x264_t *h ) |
201 | 0 | { |
202 | 0 | if( h->mb.i_partition == D_8x8 ) |
203 | 0 | { |
204 | 0 | for( int i = 0; i < 4; i++ ) |
205 | 0 | x264_mb_mc_8x8( h, i ); |
206 | 0 | } |
207 | 0 | else |
208 | 0 | { |
209 | 0 | int ref0a = h->mb.cache.ref[0][x264_scan8[ 0]]; |
210 | 0 | int ref0b = h->mb.cache.ref[0][x264_scan8[12]]; |
211 | 0 | int ref1a = h->mb.cache.ref[1][x264_scan8[ 0]]; |
212 | 0 | int ref1b = h->mb.cache.ref[1][x264_scan8[12]]; |
213 | |
|
214 | 0 | if( h->mb.i_partition == D_16x16 ) |
215 | 0 | { |
216 | 0 | if( ref0a >= 0 ) |
217 | 0 | if( ref1a >= 0 ) mb_mc_01xywh( h, 0, 0, 4, 4 ); |
218 | 0 | else mb_mc_0xywh ( h, 0, 0, 4, 4 ); |
219 | 0 | else mb_mc_1xywh ( h, 0, 0, 4, 4 ); |
220 | 0 | } |
221 | 0 | else if( h->mb.i_partition == D_16x8 ) |
222 | 0 | { |
223 | 0 | if( ref0a >= 0 ) |
224 | 0 | if( ref1a >= 0 ) mb_mc_01xywh( h, 0, 0, 4, 2 ); |
225 | 0 | else mb_mc_0xywh ( h, 0, 0, 4, 2 ); |
226 | 0 | else mb_mc_1xywh ( h, 0, 0, 4, 2 ); |
227 | |
|
228 | 0 | if( ref0b >= 0 ) |
229 | 0 | if( ref1b >= 0 ) mb_mc_01xywh( h, 0, 2, 4, 2 ); |
230 | 0 | else mb_mc_0xywh ( h, 0, 2, 4, 2 ); |
231 | 0 | else mb_mc_1xywh ( h, 0, 2, 4, 2 ); |
232 | 0 | } |
233 | 0 | else if( h->mb.i_partition == D_8x16 ) |
234 | 0 | { |
235 | 0 | if( ref0a >= 0 ) |
236 | 0 | if( ref1a >= 0 ) mb_mc_01xywh( h, 0, 0, 2, 4 ); |
237 | 0 | else mb_mc_0xywh ( h, 0, 0, 2, 4 ); |
238 | 0 | else mb_mc_1xywh ( h, 0, 0, 2, 4 ); |
239 | |
|
240 | 0 | if( ref0b >= 0 ) |
241 | 0 | if( ref1b >= 0 ) mb_mc_01xywh( h, 2, 0, 2, 4 ); |
242 | 0 | else mb_mc_0xywh ( h, 2, 0, 2, 4 ); |
243 | 0 | else mb_mc_1xywh ( h, 2, 0, 2, 4 ); |
244 | 0 | } |
245 | 0 | } |
246 | 0 | } Unexecuted instantiation: x264_8_mb_mc Unexecuted instantiation: x264_10_mb_mc |
247 | | |
/* Allocate all per-encoder macroblock arrays. The PREALLOC macros batch the
 * individual requests and PREALLOC_END performs the combined allocation into
 * h->mb.base (freed by x264_macroblock_cache_free); the macros jump to the
 * `fail` label on allocation failure. Returns 0 on success, -1 on failure. */
int x264_macroblock_cache_allocate( x264_t *h )
{
    int i_mb_count = h->mb.i_mb_count;

    /* Strides in MB / 8x8-block / 4x4-block units. */
    h->mb.i_mb_stride = h->mb.i_mb_width;
    h->mb.i_b8_stride = h->mb.i_mb_width * 2;
    h->mb.i_b4_stride = h->mb.i_mb_width * 4;

    h->mb.b_interlaced = PARAM_INTERLACED;

    PREALLOC_INIT

    PREALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
    PREALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
    PREALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
    PREALLOC( h->mb.slice_table, i_mb_count * sizeof(int32_t) );

    /* 0 -> 3 top(4), 4 -> 6 : left(3) */
    PREALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );

    /* all coeffs */
    PREALLOC( h->mb.non_zero_count, i_mb_count * 48 * sizeof(uint8_t) );

    if( h->param.b_cabac )
    {
        /* CABAC-only state: skip bitpattern, chroma modes, MV deltas. */
        PREALLOC( h->mb.skipbp, i_mb_count * sizeof(int8_t) );
        PREALLOC( h->mb.chroma_pred_mode, i_mb_count * sizeof(int8_t) );
        PREALLOC( h->mb.mvd[0], i_mb_count * sizeof( **h->mb.mvd ) );
        if( h->param.i_bframe )
            PREALLOC( h->mb.mvd[1], i_mb_count * sizeof( **h->mb.mvd ) );
    }

    /* Per-list, per-reference MV field; one extra entry is allocated so the
     * pointer can later be advanced past a zeroed sentinel (see below). */
    for( int i = 0; i < 2; i++ )
    {
        int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << PARAM_INTERLACED;
        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
            i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit

        for( int j = !i; j < i_refs; j++ )
            PREALLOC( h->mb.mvr[i][j], 2 * (i_mb_count + 1) * sizeof(int16_t) );
    }

    if( h->param.analyse.i_weighted_pred )
    {
        int i_padv = PADV << PARAM_INTERLACED;
        int luma_plane_size = 0;
        int numweightbuf;

        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE )
        {
            // only need buffer for lookahead
            if( !h->param.i_sync_lookahead || h == h->thread[h->param.i_threads] )
            {
                // Fake analysis only works on lowres
                luma_plane_size = h->fdec->i_stride_lowres * (h->mb.i_mb_height*8+2*i_padv);
                // Only need 1 buffer for analysis
                numweightbuf = 1;
            }
            else
                numweightbuf = 0;
        }
        else
        {
            /* Both ref and fenc is stored for 4:2:0 and 4:2:2 which means that 4:2:0 and 4:4:4
             * needs the same amount of space and 4:2:2 needs twice that much */
            luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*(16<<(CHROMA_FORMAT==CHROMA_422))+2*i_padv);

            if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
                //smart can weight one ref and one offset -1 in 8-bit
                numweightbuf = 1 + (BIT_DEPTH == 8);
            else
                //simple only has one weighted ref
                numweightbuf = 1;
        }

        for( int i = 0; i < numweightbuf; i++ )
            PREALLOC( h->mb.p_weight_buf[i], luma_plane_size * SIZEOF_PIXEL );
    }

    PREALLOC_END( h->mb.base );

    memset( h->mb.slice_table, -1, i_mb_count * sizeof(int32_t) );

    /* Second pass over mvr (same ref-count computation as above): write a
     * zero MV at index 0, then advance the pointer so that index -1 is a
     * valid, zeroed entry. */
    for( int i = 0; i < 2; i++ )
    {
        int i_refs = X264_MIN(X264_REF_MAX, (i ? 1 + !!h->param.i_bframe_pyramid : h->param.i_frame_reference) ) << PARAM_INTERLACED;
        if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
            i_refs = X264_MIN(X264_REF_MAX, i_refs + 1 + (BIT_DEPTH == 8)); //smart weights add two duplicate frames, one in >8-bit

        for( int j = !i; j < i_refs; j++ )
        {
            M32( h->mb.mvr[i][j][0] ) = 0;
            h->mb.mvr[i][j]++;
        }
    }

    return 0;
fail:
    return -1;
}
/* Release the combined allocation made by x264_macroblock_cache_allocate;
 * all PREALLOC'd arrays live inside h->mb.base. */
void x264_macroblock_cache_free( x264_t *h )
{
    x264_free( h->mb.base );
}
352 | | |
/* Allocate per-thread macroblock buffers (intra border backup, deblock
 * strength, scratch buffers). CHECKED_MALLOC jumps to `fail` on failure.
 * Lookahead threads only need the scratch buffers. Returns 0 / -1. */
int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
{
    if( !b_lookahead )
    {
        /* 5 backup rows are needed under MBAFF, 2 otherwise; one row per
         * plane, padded by 16 pixels on each side (pointer advanced by 16
         * below, and rewound in x264_macroblock_thread_free). */
        for( int i = 0; i < (PARAM_INTERLACED ? 5 : 2); i++ )
            for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ )
            {
                CHECKED_MALLOC( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * SIZEOF_PIXEL );
                h->intra_border_backup[i][j] += 16;
            }
        for( int i = 0; i <= PARAM_INTERLACED; i++ )
        {
            if( h->param.b_sliced_threads )
            {
                /* Only allocate the first one, and allocate it for the whole frame, because we
                 * won't be deblocking until after the frame is fully encoded. */
                if( h == h->thread[0] && !i )
                    CHECKED_MALLOC( h->deblock_strength[0], sizeof(**h->deblock_strength) * h->mb.i_mb_count );
                else
                    h->deblock_strength[i] = h->thread[0]->deblock_strength[0];
            }
            else
                CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
            /* When the loop runs only once (progressive), this aliases
             * [1] to [0] so both indices are always valid. */
            h->deblock_strength[1] = h->deblock_strength[i];
        }
    }

    /* Allocate scratch buffer */
    int scratch_size = 0;
    if( !b_lookahead )
    {
        /* Scratch is shared by hpel filtering, SSIM, and exhaustive ME;
         * size to the largest consumer. */
        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48+32) * sizeof(int16_t);
        int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
        int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
        int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
            ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
        scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
    }
    int buf_mbtree = h->param.rc.b_mb_tree * ALIGN( h->mb.i_mb_width * sizeof(int16_t), NATIVE_ALIGN );
    scratch_size = X264_MAX( scratch_size, buf_mbtree );
    if( scratch_size )
        CHECKED_MALLOC( h->scratch_buffer, scratch_size );
    else
        h->scratch_buffer = NULL;

    int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2;
    int buf_mbtree2 = buf_mbtree * 12; /* size of the internal propagate_list asm buffer */
    scratch_size = X264_MAX( buf_lookahead_threads, buf_mbtree2 );
    CHECKED_MALLOC( h->scratch_buffer2, scratch_size );

    return 0;
fail:
    return -1;
}
407 | | |
408 | | void x264_macroblock_thread_free( x264_t *h, int b_lookahead ) |
409 | 0 | { |
410 | 0 | if( !b_lookahead ) |
411 | 0 | { |
412 | 0 | for( int i = 0; i <= PARAM_INTERLACED; i++ ) |
413 | 0 | if( !h->param.b_sliced_threads || (h == h->thread[0] && !i) ) |
414 | 0 | x264_free( h->deblock_strength[i] ); |
415 | 0 | for( int i = 0; i < (PARAM_INTERLACED ? 5 : 2); i++ ) |
416 | 0 | for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ ) |
417 | 0 | x264_free( h->intra_border_backup[i][j] - 16 ); |
418 | 0 | } |
419 | 0 | x264_free( h->scratch_buffer ); |
420 | 0 | x264_free( h->scratch_buffer2 ); |
421 | 0 | } Unexecuted instantiation: x264_8_macroblock_thread_free Unexecuted instantiation: x264_10_macroblock_thread_free |
422 | | |
/* Per-slice macroblock initialization: point the MB state at the current
 * frame's arrays, record reference POCs, build the temporal-direct and
 * deblocking lookup tables, and precompute neighbor availability masks. */
void x264_macroblock_slice_init( x264_t *h )
{
    h->mb.mv[0] = h->fdec->mv[0];
    h->mb.mv[1] = h->fdec->mv[1];
    h->mb.mvr[0][0] = h->fdec->mv16x16;
    h->mb.ref[0] = h->fdec->ref[0];
    h->mb.ref[1] = h->fdec->ref[1];
    h->mb.type = h->fdec->mb_type;
    h->mb.partition = h->fdec->mb_partition;
    h->mb.field = h->fdec->field;

    h->fdec->i_ref[0] = h->i_ref[0];
    h->fdec->i_ref[1] = h->i_ref[1];
    for( int i = 0; i < h->i_ref[0]; i++ )
        h->fdec->ref_poc[0][i] = h->fref[0][i]->i_poc;
    if( h->sh.i_type == SLICE_TYPE_B )
    {
        for( int i = 0; i < h->i_ref[1]; i++ )
            h->fdec->ref_poc[1][i] = h->fref[1][i]->i_poc;

        /* Map the colocated picture's list-0 refs to this slice's list 0 by
         * matching POC; -2 marks "no match", -1/-2 sentinels map to
         * themselves. */
        map_col_to_list0(-1) = -1;
        map_col_to_list0(-2) = -2;
        for( int i = 0; i < h->fref[1][0]->i_ref[0]; i++ )
        {
            int poc = h->fref[1][0]->ref_poc[0][i];
            map_col_to_list0(i) = -2;
            for( int j = 0; j < h->i_ref[0]; j++ )
                if( h->fref[0][j]->i_poc == poc )
                {
                    map_col_to_list0(i) = j;
                    break;
                }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_P )
    {
        if( h->sh.i_disable_deblocking_filter_idc != 1 && h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
        {
            deblock_ref_table(-2) = -2;
            deblock_ref_table(-1) = -1;
            for( int i = 0; i < h->i_ref[0] << SLICE_MBAFF; i++ )
            {
                /* Mask off high bits to avoid frame num collisions with -1/-2.
                 * In current x264 frame num values don't cover a range of more
                 * than 32, so 6 bits is enough for uniqueness. */
                if( !MB_INTERLACED )
                    deblock_ref_table(i) = h->fref[0][i]->i_frame_num&63;
                else
                    deblock_ref_table(i) = ((h->fref[0][i>>1]->i_frame_num&63)<<1) + (i&1);
            }
        }
    }

    /* init with not available (for top right idx=7,15) */
    memset( h->mb.cache.ref, -2, sizeof( h->mb.cache.ref ) );

    /* Rounded fixed-point reciprocal of the POC distance to ref 0:
     * inv_ref_poc ~= 256/delta, used per field when MBAFF. */
    if( h->i_ref[0] > 0 )
        for( int field = 0; field <= SLICE_MBAFF; field++ )
        {
            int curpoc = h->fdec->i_poc + h->fdec->i_delta_poc[field];
            int refpoc = h->fref[0][0]->i_poc + h->fref[0][0]->i_delta_poc[field];
            int delta = curpoc - refpoc;

            h->fdec->inv_ref_poc[field] = (256 + delta/2) / delta;
        }

    /* Availability masks for 4x4/8x8 blocks whose neighbors all lie inside
     * the macroblock and are therefore always present. */
    h->mb.i_neighbour4[6] =
    h->mb.i_neighbour4[9] =
    h->mb.i_neighbour4[12] =
    h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT;
    h->mb.i_neighbour4[3] =
    h->mb.i_neighbour4[7] =
    h->mb.i_neighbour4[11] =
    h->mb.i_neighbour4[13] =
    h->mb.i_neighbour4[15] =
    h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
}
500 | | |
501 | | void x264_macroblock_thread_init( x264_t *h ) |
502 | 0 | { |
503 | 0 | h->mb.i_me_method = h->param.analyse.i_me_method; |
504 | 0 | h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine; |
505 | 0 | if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) ) |
506 | 0 | h->mb.i_subpel_refine--; |
507 | 0 | h->mb.b_chroma_me = h->param.analyse.b_chroma_me && |
508 | 0 | ((h->sh.i_type == SLICE_TYPE_P && h->mb.i_subpel_refine >= 5) || |
509 | 0 | (h->sh.i_type == SLICE_TYPE_B && h->mb.i_subpel_refine >= 9)); |
510 | 0 | h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B || |
511 | 0 | (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I); |
512 | 0 | h->mb.i_mb_prev_xy = -1; |
513 | | |
514 | | /* 4:2:0 4:2:2 4:4:4 |
515 | | * fdec fenc fdec fenc fdec fenc |
516 | | * y y y y y y y Y Y Y Y y y y y y y y Y Y Y Y y y y y y y y Y Y Y Y |
517 | | * y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y |
518 | | * y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y |
519 | | * y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y |
520 | | * y Y Y Y Y U U V V y Y Y Y Y U U V V y Y Y Y Y U U U U |
521 | | * u u u v v v U U V V u u u v v v U U V V u u u u u u u U U U U |
522 | | * u U U v V V u U U v V V U U V V u U U U U U U U U |
523 | | * u U U v V V u U U v V V U U V V u U U U U U U U U |
524 | | * u U U v V V u U U U U V V V V |
525 | | * u U U v V V u U U U U V V V V |
526 | | * v v v v v v v V V V V |
527 | | * v V V V V V V V V |
528 | | * v V V V V |
529 | | * v V V V V |
530 | | * v V V V V |
531 | | */ |
532 | 0 | h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf; |
533 | 0 | h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE; |
534 | 0 | if( CHROMA_FORMAT ) |
535 | 0 | { |
536 | 0 | h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE; |
537 | 0 | h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE; |
538 | 0 | if( CHROMA444 ) |
539 | 0 | { |
540 | 0 | h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE; |
541 | 0 | h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 38*FDEC_STRIDE; |
542 | 0 | } |
543 | 0 | else |
544 | 0 | { |
545 | 0 | h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8; |
546 | 0 | h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16; |
547 | 0 | } |
548 | 0 | } |
549 | 0 | } Unexecuted instantiation: x264_8_macroblock_thread_init Unexecuted instantiation: x264_10_macroblock_thread_init |
550 | | |
551 | | void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ) |
552 | 0 | { |
553 | 0 | int stride_y = fenc->i_stride[0]; |
554 | 0 | int stride_uv = fenc->i_stride[1]; |
555 | 0 | int off_y = 16 * i_mb_x + 16 * i_mb_y * stride_y; |
556 | 0 | int off_uv = 16 * i_mb_x + (16 * i_mb_y * stride_uv >> CHROMA_V_SHIFT); |
557 | 0 | h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y, |
558 | 0 | fenc->plane[1] != NULL ? fenc->plane[1]+off_uv : NULL, stride_uv, i_mb_x ); |
559 | 0 | } Unexecuted instantiation: x264_8_prefetch_fenc Unexecuted instantiation: x264_10_prefetch_fenc |
560 | | |
561 | | NOINLINE void x264_copy_column8( pixel *dst, pixel *src ) |
562 | 0 | { |
563 | | // input pointers are offset by 4 rows because that's faster (smaller instruction size on x86) |
564 | 0 | for( int i = -4; i < 4; i++ ) |
565 | 0 | dst[i*FDEC_STRIDE] = src[i*FDEC_STRIDE]; |
566 | 0 | } Unexecuted instantiation: x264_8_copy_column8 Unexecuted instantiation: x264_10_copy_column8 |
567 | | |
/* Load the pixel pointers for one plane of the current macroblock: copy the
 * source pixels into the per-MB fenc buffer, restore the intra-prediction
 * border from the backup row, and set up reference-plane pointers for both
 * lists. `i` selects the plane, `b_chroma` its interleaved-chroma handling,
 * `b_mbaff` enables the field/frame (MBAFF) logic. */
static ALWAYS_INLINE void macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
{
    int mb_interlaced = b_mbaff && MB_INTERLACED;
    int height = b_chroma ? 16 >> CHROMA_V_SHIFT : 16;
    int i_stride = h->fdec->i_stride[i];
    /* Field macroblocks read every other line: double the stride. */
    int i_stride2 = i_stride << mb_interlaced;
    /* Field MBs of a pair start at the top of the pair, offset one line for
     * the bottom field. */
    int i_pix_offset = mb_interlaced
        ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
        : 16 * mb_x + height * mb_y * i_stride;
    pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
    /* Select which intra border backup row applies for this MB parity /
     * interlacing combination. */
    int fdec_idx = b_mbaff ? (mb_interlaced ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : !(mb_y&1);
    pixel *intra_fdec = &h->intra_border_backup[fdec_idx][i][mb_x*16];
    int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
    /* ref_pix_offset[0] references the current field and [1] the opposite field. */
    if( mb_interlaced )
        ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride;
    h->mb.pic.i_stride[i] = i_stride2;
    h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
    if( b_chroma )
    {
        /* Split the interleaved CbCr source into the two fenc chroma planes,
         * then restore the top border (U at +0, V at +8 in the backup row)
         * plus the top-left corner pixels. */
        h->mc.load_deinterleave_chroma_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2, height );
        memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*SIZEOF_PIXEL );
        memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*SIZEOF_PIXEL );
        h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = intra_fdec[-1-8];
        h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = intra_fdec[-1];
    }
    else
    {
        /* Luma (or 4:4:4 chroma): copy the 16x16 source block and restore
         * 24 border pixels (16 top + 8 top-right) plus the top-left pixel. */
        h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, h->mb.pic.p_fenc_plane[i], i_stride2, 16 );
        memcpy( h->mb.pic.p_fdec[i]-FDEC_STRIDE, intra_fdec, 24*SIZEOF_PIXEL );
        h->mb.pic.p_fdec[i][-FDEC_STRIDE-1] = intra_fdec[-1];
    }
    /* The left border must be refetched from the frame when MBAFF or when
     * re-encoding this MB, since the per-MB buffer may be stale. */
    if( b_mbaff || h->mb.b_reencode_mb )
    {
        for( int j = 0; j < height; j++ )
            if( b_chroma )
            {
                h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2];
                h->mb.pic.p_fdec[2][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
            }
            else
                h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
    }
    /* Reference pointers for list 0: under field coding each frame supplies
     * two field references (hence j>>1 and the j&1 parity selection). */
    pixel *plane_src, **filtered_src;
    for( int j = 0; j < h->mb.pic.i_fref[0]; j++ )
    {
        // Interpolate between pixels in same field.
        if( mb_interlaced )
        {
            plane_src = h->fref[0][j>>1]->plane_fld[i];
            filtered_src = h->fref[0][j>>1]->filtered_fld[i];
        }
        else
        {
            plane_src = h->fref[0][j]->plane[i];
            filtered_src = h->fref[0][j]->filtered[i];
        }
        h->mb.pic.p_fref[0][j][i*4] = plane_src + ref_pix_offset[j&1];

        if( !b_chroma )
        {
            /* Slots 1..3 hold the half-pel filtered planes for subpel ME. */
            if( h->param.analyse.i_subpel_refine )
                for( int k = 1; k < 4; k++ )
                    h->mb.pic.p_fref[0][j][i*4+k] = filtered_src[k] + ref_pix_offset[j&1];
            if( !i )
            {
                /* Weighted-prediction ME uses a pre-weighted copy of the ref
                 * when a weight function is active for this reference. */
                if( h->sh.weight[j][0].weightfn )
                    h->mb.pic.p_fref_w[j] = &h->fenc->weighted[j >> mb_interlaced][ref_pix_offset[j&1]];
                else
                    h->mb.pic.p_fref_w[j] = h->mb.pic.p_fref[0][j][0];
            }
        }
    }
    /* List 1 (B slices): same layout, but never weighted. */
    if( h->sh.i_type == SLICE_TYPE_B )
        for( int j = 0; j < h->mb.pic.i_fref[1]; j++ )
        {
            if( mb_interlaced )
            {
                plane_src = h->fref[1][j>>1]->plane_fld[i];
                filtered_src = h->fref[1][j>>1]->filtered_fld[i];
            }
            else
            {
                plane_src = h->fref[1][j]->plane[i];
                filtered_src = h->fref[1][j]->filtered[i];
            }
            h->mb.pic.p_fref[1][j][i*4] = plane_src + ref_pix_offset[j&1];

            if( !b_chroma && h->param.analyse.i_subpel_refine )
                for( int k = 1; k < 4; k++ )
                    h->mb.pic.p_fref[1][j][i*4+k] = filtered_src[k] + ref_pix_offset[j&1];
        }
}
661 | | |
/* Lookup tables for addressing the left-neighbor macroblock's cached data
 * under MBAFF. One entry per combination of current/left field parity; the
 * columns appear to hold per-row indices into the intra-mode, non-zero-count
 * and related caches — NOTE(review): exact column semantics depend on
 * x264_left_table_t's field layout, defined elsewhere. */
static const x264_left_table_t left_indices[4] =
{
    /* Current is progressive */
    {{ 4, 4, 5, 5}, { 3,  3,  7,  7}, {16+1, 16+1, 32+1, 32+1}, {0, 0, 1, 1}, {0, 0, 0, 0}},
    {{ 6, 6, 3, 3}, {11, 11, 15, 15}, {16+5, 16+5, 32+5, 32+5}, {2, 2, 3, 3}, {1, 1, 1, 1}},
    /* Current is interlaced */
    {{ 4, 6, 4, 6}, { 3, 11,  3, 11}, {16+1, 16+1, 32+1, 32+1}, {0, 2, 0, 2}, {0, 1, 0, 1}},
    /* Both same */
    {{ 4, 5, 6, 3}, { 3,  7, 11, 15}, {16+1, 16+5, 32+1, 32+5}, {0, 1, 2, 3}, {0, 0, 1, 1}}
};
672 | | |
673 | | static ALWAYS_INLINE void macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y, int b_interlaced ) |
674 | 0 | { |
675 | 0 | const int mb_interlaced = b_interlaced && MB_INTERLACED; |
676 | 0 | int top_y = mb_y - (1 << mb_interlaced); |
677 | 0 | int top = top_y * h->mb.i_mb_stride + mb_x; |
678 | |
|
679 | 0 | h->mb.i_mb_x = mb_x; |
680 | 0 | h->mb.i_mb_y = mb_y; |
681 | 0 | h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x; |
682 | 0 | h->mb.i_b8_xy = 2*(mb_y * h->mb.i_b8_stride + mb_x); |
683 | 0 | h->mb.i_b4_xy = 4*(mb_y * h->mb.i_b4_stride + mb_x); |
684 | 0 | h->mb.left_b8[0] = |
685 | 0 | h->mb.left_b8[1] = -1; |
686 | 0 | h->mb.left_b4[0] = |
687 | 0 | h->mb.left_b4[1] = -1; |
688 | 0 | h->mb.i_neighbour = 0; |
689 | 0 | h->mb.i_neighbour_intra = 0; |
690 | 0 | h->mb.i_neighbour_frame = 0; |
691 | 0 | h->mb.i_mb_top_xy = -1; |
692 | 0 | h->mb.i_mb_top_y = -1; |
693 | 0 | h->mb.i_mb_left_xy[0] = h->mb.i_mb_left_xy[1] = -1; |
694 | 0 | h->mb.i_mb_topleft_xy = -1; |
695 | 0 | h->mb.i_mb_topright_xy = -1; |
696 | 0 | h->mb.i_mb_type_top = -1; |
697 | 0 | h->mb.i_mb_type_left[0] = h->mb.i_mb_type_left[1] = -1; |
698 | 0 | h->mb.i_mb_type_topleft = -1; |
699 | 0 | h->mb.i_mb_type_topright = -1; |
700 | 0 | h->mb.left_index_table = &left_indices[3]; |
701 | 0 | h->mb.topleft_partition = 0; |
702 | |
|
703 | 0 | int topleft_y = top_y; |
704 | 0 | int topright_y = top_y; |
705 | 0 | int left[2]; |
706 | |
|
707 | 0 | left[0] = left[1] = h->mb.i_mb_xy - 1; |
708 | 0 | h->mb.left_b8[0] = h->mb.left_b8[1] = h->mb.i_b8_xy - 2; |
709 | 0 | h->mb.left_b4[0] = h->mb.left_b4[1] = h->mb.i_b4_xy - 4; |
710 | |
|
711 | 0 | if( b_interlaced ) |
712 | 0 | { |
713 | 0 | h->mb.i_mb_top_mbpair_xy = h->mb.i_mb_xy - 2*h->mb.i_mb_stride; |
714 | 0 | h->mb.i_mb_topleft_y = -1; |
715 | 0 | h->mb.i_mb_topright_y = -1; |
716 | |
|
717 | 0 | if( mb_y&1 ) |
718 | 0 | { |
719 | 0 | if( mb_x && mb_interlaced != h->mb.field[h->mb.i_mb_xy-1] ) |
720 | 0 | { |
721 | 0 | left[0] = left[1] = h->mb.i_mb_xy - 1 - h->mb.i_mb_stride; |
722 | 0 | h->mb.left_b8[0] = h->mb.left_b8[1] = h->mb.i_b8_xy - 2 - 2*h->mb.i_b8_stride; |
723 | 0 | h->mb.left_b4[0] = h->mb.left_b4[1] = h->mb.i_b4_xy - 4 - 4*h->mb.i_b4_stride; |
724 | |
|
725 | 0 | if( mb_interlaced ) |
726 | 0 | { |
727 | 0 | h->mb.left_index_table = &left_indices[2]; |
728 | 0 | left[1] += h->mb.i_mb_stride; |
729 | 0 | h->mb.left_b8[1] += 2*h->mb.i_b8_stride; |
730 | 0 | h->mb.left_b4[1] += 4*h->mb.i_b4_stride; |
731 | 0 | } |
732 | 0 | else |
733 | 0 | { |
734 | 0 | h->mb.left_index_table = &left_indices[1]; |
735 | 0 | topleft_y++; |
736 | 0 | h->mb.topleft_partition = 1; |
737 | 0 | } |
738 | 0 | } |
739 | 0 | if( !mb_interlaced ) |
740 | 0 | topright_y = -1; |
741 | 0 | } |
742 | 0 | else |
743 | 0 | { |
744 | 0 | if( mb_interlaced && top >= 0 ) |
745 | 0 | { |
746 | 0 | if( !h->mb.field[top] ) |
747 | 0 | { |
748 | 0 | top += h->mb.i_mb_stride; |
749 | 0 | top_y++; |
750 | 0 | } |
751 | 0 | if( mb_x ) |
752 | 0 | topleft_y += !h->mb.field[h->mb.i_mb_stride*topleft_y + mb_x - 1]; |
753 | 0 | if( mb_x < h->mb.i_mb_width-1 ) |
754 | 0 | topright_y += !h->mb.field[h->mb.i_mb_stride*topright_y + mb_x + 1]; |
755 | 0 | } |
756 | 0 | if( mb_x && mb_interlaced != h->mb.field[h->mb.i_mb_xy-1] ) |
757 | 0 | { |
758 | 0 | if( mb_interlaced ) |
759 | 0 | { |
760 | 0 | h->mb.left_index_table = &left_indices[2]; |
761 | 0 | left[1] += h->mb.i_mb_stride; |
762 | 0 | h->mb.left_b8[1] += 2*h->mb.i_b8_stride; |
763 | 0 | h->mb.left_b4[1] += 4*h->mb.i_b4_stride; |
764 | 0 | } |
765 | 0 | else |
766 | 0 | h->mb.left_index_table = &left_indices[0]; |
767 | 0 | } |
768 | 0 | } |
769 | 0 | } |
770 | |
|
771 | 0 | if( mb_x > 0 ) |
772 | 0 | { |
773 | 0 | h->mb.i_neighbour_frame |= MB_LEFT; |
774 | 0 | h->mb.i_mb_left_xy[0] = left[0]; |
775 | 0 | h->mb.i_mb_left_xy[1] = left[1]; |
776 | 0 | h->mb.i_mb_type_left[0] = h->mb.type[h->mb.i_mb_left_xy[0]]; |
777 | 0 | h->mb.i_mb_type_left[1] = h->mb.type[h->mb.i_mb_left_xy[1]]; |
778 | 0 | if( h->mb.slice_table[left[0]] == h->sh.i_first_mb ) |
779 | 0 | { |
780 | 0 | h->mb.i_neighbour |= MB_LEFT; |
781 | | |
782 | | // FIXME: We don't currently support constrained intra + mbaff. |
783 | 0 | if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_left[0] ) ) |
784 | 0 | h->mb.i_neighbour_intra |= MB_LEFT; |
785 | 0 | } |
786 | 0 | } |
787 | | |
788 | | /* We can't predict from the previous threadslice since it hasn't been encoded yet. */ |
789 | 0 | if( (h->i_threadslice_start >> mb_interlaced) != (mb_y >> mb_interlaced) ) |
790 | 0 | { |
791 | 0 | if( top >= 0 ) |
792 | 0 | { |
793 | 0 | h->mb.i_neighbour_frame |= MB_TOP; |
794 | 0 | h->mb.i_mb_top_xy = top; |
795 | 0 | h->mb.i_mb_top_y = top_y; |
796 | 0 | h->mb.i_mb_type_top = h->mb.type[h->mb.i_mb_top_xy]; |
797 | 0 | if( h->mb.slice_table[top] == h->sh.i_first_mb ) |
798 | 0 | { |
799 | 0 | h->mb.i_neighbour |= MB_TOP; |
800 | |
|
801 | 0 | if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) ) |
802 | 0 | h->mb.i_neighbour_intra |= MB_TOP; |
803 | | |
804 | | /* We only need to prefetch the top blocks because the left was just written |
805 | | * to as part of the previous cache_save. Since most target CPUs use write-allocate |
806 | | * caches, left blocks are near-guaranteed to be in L1 cache. Top--not so much. */ |
807 | 0 | x264_prefetch( &h->mb.cbp[top] ); |
808 | 0 | x264_prefetch( h->mb.intra4x4_pred_mode[top] ); |
809 | 0 | x264_prefetch( &h->mb.non_zero_count[top][12] ); |
810 | 0 | x264_prefetch( &h->mb.mb_transform_size[top] ); |
811 | 0 | if( h->param.b_cabac ) |
812 | 0 | x264_prefetch( &h->mb.skipbp[top] ); |
813 | 0 | } |
814 | 0 | } |
815 | |
|
816 | 0 | if( mb_x > 0 && topleft_y >= 0 ) |
817 | 0 | { |
818 | 0 | h->mb.i_neighbour_frame |= MB_TOPLEFT; |
819 | 0 | h->mb.i_mb_topleft_xy = h->mb.i_mb_stride*topleft_y + mb_x - 1; |
820 | 0 | h->mb.i_mb_topleft_y = topleft_y; |
821 | 0 | h->mb.i_mb_type_topleft = h->mb.type[h->mb.i_mb_topleft_xy]; |
822 | 0 | if( h->mb.slice_table[h->mb.i_mb_topleft_xy] == h->sh.i_first_mb ) |
823 | 0 | { |
824 | 0 | h->mb.i_neighbour |= MB_TOPLEFT; |
825 | |
|
826 | 0 | if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topleft ) ) |
827 | 0 | h->mb.i_neighbour_intra |= MB_TOPLEFT; |
828 | 0 | } |
829 | 0 | } |
830 | |
|
831 | 0 | if( mb_x < h->mb.i_mb_width - 1 && topright_y >= 0 ) |
832 | 0 | { |
833 | 0 | h->mb.i_neighbour_frame |= MB_TOPRIGHT; |
834 | 0 | h->mb.i_mb_topright_xy = h->mb.i_mb_stride*topright_y + mb_x + 1; |
835 | 0 | h->mb.i_mb_topright_y = topright_y; |
836 | 0 | h->mb.i_mb_type_topright = h->mb.type[h->mb.i_mb_topright_xy]; |
837 | 0 | if( h->mb.slice_table[h->mb.i_mb_topright_xy] == h->sh.i_first_mb ) |
838 | 0 | { |
839 | 0 | h->mb.i_neighbour |= MB_TOPRIGHT; |
840 | |
|
841 | 0 | if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_topright ) ) |
842 | 0 | h->mb.i_neighbour_intra |= MB_TOPRIGHT; |
843 | 0 | } |
844 | 0 | } |
845 | 0 | } |
846 | 0 | } |
847 | | |
848 | 0 | #define LTOP 0 |
849 | | #if HAVE_INTERLACED |
850 | 0 | # define LBOT 1 |
851 | | #else |
852 | | # define LBOT 0 |
853 | | #endif |
854 | | |
855 | | static ALWAYS_INLINE void macroblock_cache_load( x264_t *h, int mb_x, int mb_y, int b_mbaff ) |
856 | 0 | { |
857 | 0 | macroblock_cache_load_neighbours( h, mb_x, mb_y, b_mbaff ); |
858 | |
|
859 | 0 | int *left = h->mb.i_mb_left_xy; |
860 | 0 | int top = h->mb.i_mb_top_xy; |
861 | 0 | int top_y = h->mb.i_mb_top_y; |
862 | 0 | int s8x8 = h->mb.i_b8_stride; |
863 | 0 | int s4x4 = h->mb.i_b4_stride; |
864 | 0 | int top_8x8 = (2*top_y+1) * s8x8 + 2*mb_x; |
865 | 0 | int top_4x4 = (4*top_y+3) * s4x4 + 4*mb_x; |
866 | 0 | int lists = (1 << h->sh.i_type) & 3; |
867 | | |
868 | | /* GCC pessimizes direct loads from heap-allocated arrays due to aliasing. */ |
869 | | /* By only dereferencing them once, we avoid this issue. */ |
870 | 0 | int8_t (*i4x4)[8] = h->mb.intra4x4_pred_mode; |
871 | 0 | uint8_t (*nnz)[48] = h->mb.non_zero_count; |
872 | 0 | int16_t *cbp = h->mb.cbp; |
873 | |
|
874 | 0 | const x264_left_table_t *left_index_table = h->mb.left_index_table; |
875 | |
|
876 | 0 | h->mb.cache.deblock_strength = h->deblock_strength[mb_y&1][h->param.b_sliced_threads?h->mb.i_mb_xy:mb_x]; |
877 | | |
878 | | /* load cache */ |
879 | 0 | if( h->mb.i_neighbour & MB_TOP ) |
880 | 0 | { |
881 | 0 | h->mb.cache.i_cbp_top = cbp[top]; |
882 | | /* load intra4x4 */ |
883 | 0 | CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &i4x4[top][0] ); |
884 | | |
885 | | /* load non_zero_count */ |
886 | 0 | CP32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8], &nnz[top][12] ); |
887 | 0 | CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16-4 + (16>>CHROMA_V_SHIFT)] ); |
888 | 0 | CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>CHROMA_V_SHIFT)] ); |
889 | | |
890 | | /* Finish the prefetching */ |
891 | 0 | for( int l = 0; l < lists; l++ ) |
892 | 0 | { |
893 | 0 | x264_prefetch( &h->mb.mv[l][top_4x4-1] ); |
894 | | /* Top right being not in the same cacheline as top left will happen |
895 | | * once every 4 MBs, so one extra prefetch is worthwhile */ |
896 | 0 | x264_prefetch( &h->mb.mv[l][top_4x4+4] ); |
897 | 0 | x264_prefetch( &h->mb.ref[l][top_8x8-1] ); |
898 | 0 | if( h->param.b_cabac ) |
899 | 0 | x264_prefetch( &h->mb.mvd[l][top] ); |
900 | 0 | } |
901 | 0 | } |
902 | 0 | else |
903 | 0 | { |
904 | 0 | h->mb.cache.i_cbp_top = -1; |
905 | | |
906 | | /* load intra4x4 */ |
907 | 0 | M32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] ) = 0xFFFFFFFFU; |
908 | | |
909 | | /* load non_zero_count */ |
910 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8] ) = 0x80808080U; |
911 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8] ) = 0x80808080U; |
912 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8] ) = 0x80808080U; |
913 | 0 | } |
914 | |
|
915 | 0 | if( h->mb.i_neighbour & MB_LEFT ) |
916 | 0 | { |
917 | 0 | int ltop = left[LTOP]; |
918 | 0 | int lbot = b_mbaff ? left[LBOT] : ltop; |
919 | 0 | if( b_mbaff ) |
920 | 0 | { |
921 | 0 | const int16_t top_luma = (cbp[ltop] >> (left_index_table->mv[0]&(~1))) & 2; |
922 | 0 | const int16_t bot_luma = (cbp[lbot] >> (left_index_table->mv[2]&(~1))) & 2; |
923 | 0 | h->mb.cache.i_cbp_left = (cbp[ltop] & 0xfff0) | (bot_luma<<2) | top_luma; |
924 | 0 | } |
925 | 0 | else |
926 | 0 | h->mb.cache.i_cbp_left = cbp[ltop]; |
927 | | |
928 | | /* load intra4x4 */ |
929 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[ 0] - 1] = i4x4[ltop][left_index_table->intra[0]]; |
930 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[ 2] - 1] = i4x4[ltop][left_index_table->intra[1]]; |
931 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[ 8] - 1] = i4x4[lbot][left_index_table->intra[2]]; |
932 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = i4x4[lbot][left_index_table->intra[3]]; |
933 | | |
934 | | /* load non_zero_count */ |
935 | 0 | h->mb.cache.non_zero_count[x264_scan8[ 0] - 1] = nnz[ltop][left_index_table->nnz[0]]; |
936 | 0 | h->mb.cache.non_zero_count[x264_scan8[ 2] - 1] = nnz[ltop][left_index_table->nnz[1]]; |
937 | 0 | h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] = nnz[lbot][left_index_table->nnz[2]]; |
938 | 0 | h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[lbot][left_index_table->nnz[3]]; |
939 | |
|
940 | 0 | if( CHROMA_FORMAT >= CHROMA_422 ) |
941 | 0 | { |
942 | 0 | int offset = (4>>CHROMA_H_SHIFT) - 4; |
943 | 0 | h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16+offset]; |
944 | 0 | h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16+offset]; |
945 | 0 | h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16+offset]; |
946 | 0 | h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16+offset]; |
947 | 0 | h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32+offset]; |
948 | 0 | h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32+offset]; |
949 | 0 | h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32+offset]; |
950 | 0 | h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32+offset]; |
951 | 0 | } |
952 | 0 | else |
953 | 0 | { |
954 | 0 | h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz_chroma[0]]; |
955 | 0 | h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[lbot][left_index_table->nnz_chroma[1]]; |
956 | 0 | h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz_chroma[2]]; |
957 | 0 | h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[lbot][left_index_table->nnz_chroma[3]]; |
958 | 0 | } |
959 | 0 | } |
960 | 0 | else |
961 | 0 | { |
962 | 0 | h->mb.cache.i_cbp_left = -1; |
963 | |
|
964 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[ 0] - 1] = |
965 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[ 2] - 1] = |
966 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[ 8] - 1] = |
967 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[10] - 1] = -1; |
968 | | |
969 | | /* load non_zero_count */ |
970 | 0 | h->mb.cache.non_zero_count[x264_scan8[ 0] - 1] = |
971 | 0 | h->mb.cache.non_zero_count[x264_scan8[ 2] - 1] = |
972 | 0 | h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] = |
973 | 0 | h->mb.cache.non_zero_count[x264_scan8[10] - 1] = |
974 | 0 | h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = |
975 | 0 | h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = |
976 | 0 | h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = |
977 | 0 | h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = 0x80; |
978 | 0 | if( CHROMA_FORMAT >= CHROMA_422 ) |
979 | 0 | { |
980 | 0 | h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = |
981 | 0 | h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = |
982 | 0 | h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = |
983 | 0 | h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = 0x80; |
984 | 0 | } |
985 | 0 | } |
986 | |
|
987 | 0 | if( h->pps->b_transform_8x8_mode ) |
988 | 0 | { |
989 | 0 | h->mb.cache.i_neighbour_transform_size = |
990 | 0 | ( (h->mb.i_neighbour & MB_LEFT) && h->mb.mb_transform_size[left[0]] ) |
991 | 0 | + ( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] ); |
992 | 0 | } |
993 | |
|
994 | 0 | if( b_mbaff ) |
995 | 0 | { |
996 | 0 | h->mb.pic.i_fref[0] = h->i_ref[0] << MB_INTERLACED; |
997 | 0 | h->mb.pic.i_fref[1] = h->i_ref[1] << MB_INTERLACED; |
998 | 0 | } |
999 | |
|
1000 | 0 | if( !b_mbaff ) |
1001 | 0 | { |
1002 | 0 | x264_copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE ); |
1003 | 0 | x264_copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE ); |
1004 | 0 | macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0, 0 ); |
1005 | 0 | if( CHROMA444 ) |
1006 | 0 | { |
1007 | 0 | x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+15+ 4*FDEC_STRIDE ); |
1008 | 0 | x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+15+12*FDEC_STRIDE ); |
1009 | 0 | x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+15+ 4*FDEC_STRIDE ); |
1010 | 0 | x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+15+12*FDEC_STRIDE ); |
1011 | 0 | macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0, 0 ); |
1012 | 0 | macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 0, 0 ); |
1013 | 0 | } |
1014 | 0 | else if( CHROMA_FORMAT ) |
1015 | 0 | { |
1016 | 0 | x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE ); |
1017 | 0 | x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE ); |
1018 | 0 | if( CHROMA_FORMAT == CHROMA_422 ) |
1019 | 0 | { |
1020 | 0 | x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+12*FDEC_STRIDE ); |
1021 | 0 | x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+12*FDEC_STRIDE ); |
1022 | 0 | } |
1023 | 0 | macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 0 ); |
1024 | 0 | } |
1025 | 0 | } |
1026 | 0 | else |
1027 | 0 | { |
1028 | 0 | macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0, 1 ); |
1029 | 0 | if( CHROMA444 ) |
1030 | 0 | { |
1031 | 0 | macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0, 1 ); |
1032 | 0 | macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 0, 1 ); |
1033 | 0 | } |
1034 | 0 | else if( CHROMA_FORMAT ) |
1035 | 0 | macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 1 ); |
1036 | 0 | } |
1037 | |
|
1038 | 0 | if( h->fdec->integral ) |
1039 | 0 | { |
1040 | 0 | int offset = 16 * (mb_x + mb_y * h->fdec->i_stride[0]); |
1041 | 0 | for( int list = 0; list < 2; list++ ) |
1042 | 0 | for( int i = 0; i < h->mb.pic.i_fref[list]; i++ ) |
1043 | 0 | h->mb.pic.p_integral[list][i] = &h->fref[list][i]->integral[offset]; |
1044 | 0 | } |
1045 | |
|
1046 | 0 | x264_prefetch_fenc( h, h->fenc, mb_x, mb_y ); |
1047 | | |
1048 | | /* load ref/mv/mvd */ |
1049 | 0 | for( int l = 0; l < lists; l++ ) |
1050 | 0 | { |
1051 | 0 | int16_t (*mv)[2] = h->mb.mv[l]; |
1052 | 0 | int8_t *ref = h->mb.ref[l]; |
1053 | |
|
1054 | 0 | int i8 = x264_scan8[0] - 1 - 1*8; |
1055 | 0 | if( h->mb.i_neighbour & MB_TOPLEFT ) |
1056 | 0 | { |
1057 | 0 | int ir = b_mbaff ? 2*(s8x8*h->mb.i_mb_topleft_y + mb_x-1)+1+s8x8 : top_8x8 - 1; |
1058 | 0 | int iv = b_mbaff ? 4*(s4x4*h->mb.i_mb_topleft_y + mb_x-1)+3+3*s4x4 : top_4x4 - 1; |
1059 | 0 | if( b_mbaff && h->mb.topleft_partition ) |
1060 | 0 | { |
1061 | | /* Take motion vector from the middle of macroblock instead of |
1062 | | * the bottom right as usual. */ |
1063 | 0 | iv -= 2*s4x4; |
1064 | 0 | ir -= s8x8; |
1065 | 0 | } |
1066 | 0 | h->mb.cache.ref[l][i8] = ref[ir]; |
1067 | 0 | CP32( h->mb.cache.mv[l][i8], mv[iv] ); |
1068 | 0 | } |
1069 | 0 | else |
1070 | 0 | { |
1071 | 0 | h->mb.cache.ref[l][i8] = -2; |
1072 | 0 | M32( h->mb.cache.mv[l][i8] ) = 0; |
1073 | 0 | } |
1074 | |
|
1075 | 0 | i8 = x264_scan8[0] - 8; |
1076 | 0 | if( h->mb.i_neighbour & MB_TOP ) |
1077 | 0 | { |
1078 | 0 | h->mb.cache.ref[l][i8+0] = |
1079 | 0 | h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0]; |
1080 | 0 | h->mb.cache.ref[l][i8+2] = |
1081 | 0 | h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1]; |
1082 | 0 | CP128( h->mb.cache.mv[l][i8], mv[top_4x4] ); |
1083 | 0 | } |
1084 | 0 | else |
1085 | 0 | { |
1086 | 0 | M128( h->mb.cache.mv[l][i8] ) = M128_ZERO; |
1087 | 0 | M32( &h->mb.cache.ref[l][i8] ) = (uint8_t)(-2) * 0x01010101U; |
1088 | 0 | } |
1089 | |
|
1090 | 0 | i8 = x264_scan8[0] + 4 - 1*8; |
1091 | 0 | if( h->mb.i_neighbour & MB_TOPRIGHT ) |
1092 | 0 | { |
1093 | 0 | int ir = b_mbaff ? 2*(s8x8*h->mb.i_mb_topright_y + (mb_x+1))+s8x8 : top_8x8 + 2; |
1094 | 0 | int iv = b_mbaff ? 4*(s4x4*h->mb.i_mb_topright_y + (mb_x+1))+3*s4x4 : top_4x4 + 4; |
1095 | 0 | h->mb.cache.ref[l][i8] = ref[ir]; |
1096 | 0 | CP32( h->mb.cache.mv[l][i8], mv[iv] ); |
1097 | 0 | } |
1098 | 0 | else |
1099 | 0 | h->mb.cache.ref[l][i8] = -2; |
1100 | |
|
1101 | 0 | i8 = x264_scan8[0] - 1; |
1102 | 0 | if( h->mb.i_neighbour & MB_LEFT ) |
1103 | 0 | { |
1104 | 0 | if( b_mbaff ) |
1105 | 0 | { |
1106 | 0 | h->mb.cache.ref[l][i8+0*8] = ref[h->mb.left_b8[LTOP] + 1 + s8x8*left_index_table->ref[0]]; |
1107 | 0 | h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[LTOP] + 1 + s8x8*left_index_table->ref[1]]; |
1108 | 0 | h->mb.cache.ref[l][i8+2*8] = ref[h->mb.left_b8[LBOT] + 1 + s8x8*left_index_table->ref[2]]; |
1109 | 0 | h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[LBOT] + 1 + s8x8*left_index_table->ref[3]]; |
1110 | |
|
1111 | 0 | CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[LTOP] + 3 + s4x4*left_index_table->mv[0]] ); |
1112 | 0 | CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[LTOP] + 3 + s4x4*left_index_table->mv[1]] ); |
1113 | 0 | CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[LBOT] + 3 + s4x4*left_index_table->mv[2]] ); |
1114 | 0 | CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[LBOT] + 3 + s4x4*left_index_table->mv[3]] ); |
1115 | 0 | } |
1116 | 0 | else |
1117 | 0 | { |
1118 | 0 | const int ir = h->mb.i_b8_xy - 1; |
1119 | 0 | const int iv = h->mb.i_b4_xy - 1; |
1120 | 0 | h->mb.cache.ref[l][i8+0*8] = |
1121 | 0 | h->mb.cache.ref[l][i8+1*8] = ref[ir + 0*s8x8]; |
1122 | 0 | h->mb.cache.ref[l][i8+2*8] = |
1123 | 0 | h->mb.cache.ref[l][i8+3*8] = ref[ir + 1*s8x8]; |
1124 | |
|
1125 | 0 | CP32( h->mb.cache.mv[l][i8+0*8], mv[iv + 0*s4x4] ); |
1126 | 0 | CP32( h->mb.cache.mv[l][i8+1*8], mv[iv + 1*s4x4] ); |
1127 | 0 | CP32( h->mb.cache.mv[l][i8+2*8], mv[iv + 2*s4x4] ); |
1128 | 0 | CP32( h->mb.cache.mv[l][i8+3*8], mv[iv + 3*s4x4] ); |
1129 | 0 | } |
1130 | 0 | } |
1131 | 0 | else |
1132 | 0 | { |
1133 | 0 | for( int i = 0; i < 4; i++ ) |
1134 | 0 | { |
1135 | 0 | h->mb.cache.ref[l][i8+i*8] = -2; |
1136 | 0 | M32( h->mb.cache.mv[l][i8+i*8] ) = 0; |
1137 | 0 | } |
1138 | 0 | } |
1139 | | |
1140 | | /* Extra logic for top right mv in mbaff. |
1141 | | * . . . d . . a . |
1142 | | * . . . e . . . . |
1143 | | * . . . f b . c . |
1144 | | * . . . . . . . . |
1145 | | * |
1146 | | * If the top right of the 4x4 partitions labeled a, b and c in the |
1147 | | * above diagram do not exist, but the entries d, e and f exist (in |
1148 | | * the macroblock to the left) then use those instead. |
1149 | | */ |
1150 | 0 | if( b_mbaff && (h->mb.i_neighbour & MB_LEFT) ) |
1151 | 0 | { |
1152 | 0 | if( MB_INTERLACED && !h->mb.field[h->mb.i_mb_xy-1] ) |
1153 | 0 | { |
1154 | 0 | h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*0]; |
1155 | 0 | h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*1]; |
1156 | 0 | h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[1] + 1 + s8x8*0]; |
1157 | 0 | CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*(left_index_table->mv[0]+1)] ); |
1158 | 0 | CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*(left_index_table->mv[1]+1)] ); |
1159 | 0 | CP32( h->mb.cache.topright_mv[l][2], mv[h->mb.left_b4[1] + 3 + s4x4*(left_index_table->mv[2]+1)] ); |
1160 | 0 | } |
1161 | 0 | else if( !MB_INTERLACED && h->mb.field[h->mb.i_mb_xy-1] ) |
1162 | 0 | { |
1163 | | // Looking at the bottom field so always take the bottom macroblock of the pair. |
1164 | 0 | h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]]; |
1165 | 0 | h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[1]]; |
1166 | 0 | h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[2]]; |
1167 | 0 | CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[0]] ); |
1168 | 0 | CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[1]] ); |
1169 | 0 | CP32( h->mb.cache.topright_mv[l][2], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[2]] ); |
1170 | 0 | } |
1171 | 0 | } |
1172 | |
|
1173 | 0 | if( h->param.b_cabac ) |
1174 | 0 | { |
1175 | 0 | uint8_t (*mvd)[8][2] = h->mb.mvd[l]; |
1176 | 0 | if( h->mb.i_neighbour & MB_TOP ) |
1177 | 0 | CP64( h->mb.cache.mvd[l][x264_scan8[0] - 8], mvd[top][0] ); |
1178 | 0 | else |
1179 | 0 | M64( h->mb.cache.mvd[l][x264_scan8[0] - 8] ) = 0; |
1180 | |
|
1181 | 0 | if( h->mb.i_neighbour & MB_LEFT && (!b_mbaff || h->mb.cache.ref[l][x264_scan8[0]-1] >= 0) ) |
1182 | 0 | { |
1183 | 0 | CP16( h->mb.cache.mvd[l][x264_scan8[0 ] - 1], mvd[left[LTOP]][left_index_table->intra[0]] ); |
1184 | 0 | CP16( h->mb.cache.mvd[l][x264_scan8[2 ] - 1], mvd[left[LTOP]][left_index_table->intra[1]] ); |
1185 | 0 | } |
1186 | 0 | else |
1187 | 0 | { |
1188 | 0 | M16( h->mb.cache.mvd[l][x264_scan8[0]-1+0*8] ) = 0; |
1189 | 0 | M16( h->mb.cache.mvd[l][x264_scan8[0]-1+1*8] ) = 0; |
1190 | 0 | } |
1191 | 0 | if( h->mb.i_neighbour & MB_LEFT && (!b_mbaff || h->mb.cache.ref[l][x264_scan8[0]-1+2*8] >= 0) ) |
1192 | 0 | { |
1193 | 0 | CP16( h->mb.cache.mvd[l][x264_scan8[8 ] - 1], mvd[left[LBOT]][left_index_table->intra[2]] ); |
1194 | 0 | CP16( h->mb.cache.mvd[l][x264_scan8[10] - 1], mvd[left[LBOT]][left_index_table->intra[3]] ); |
1195 | 0 | } |
1196 | 0 | else |
1197 | 0 | { |
1198 | 0 | M16( h->mb.cache.mvd[l][x264_scan8[0]-1+2*8] ) = 0; |
1199 | 0 | M16( h->mb.cache.mvd[l][x264_scan8[0]-1+3*8] ) = 0; |
1200 | 0 | } |
1201 | 0 | } |
1202 | | |
1203 | | /* If motion vectors are cached from frame macroblocks but this |
1204 | | * macroblock is a field macroblock then the motion vector must be |
1205 | | * halved. Similarly, motion vectors from field macroblocks are doubled. */ |
1206 | 0 | if( b_mbaff ) |
1207 | 0 | { |
1208 | 0 | #define MAP_MVS\ |
1209 | 0 | if( FIELD_DIFFERENT(h->mb.i_mb_topleft_xy) )\ |
1210 | 0 | MAP_F2F(mv, ref, x264_scan8[0] - 1 - 1*8)\ |
1211 | 0 | if( FIELD_DIFFERENT(top) )\ |
1212 | 0 | {\ |
1213 | 0 | MAP_F2F(mv, ref, x264_scan8[0] + 0 - 1*8)\ |
1214 | 0 | MAP_F2F(mv, ref, x264_scan8[0] + 1 - 1*8)\ |
1215 | 0 | MAP_F2F(mv, ref, x264_scan8[0] + 2 - 1*8)\ |
1216 | 0 | MAP_F2F(mv, ref, x264_scan8[0] + 3 - 1*8)\ |
1217 | 0 | }\ |
1218 | 0 | if( FIELD_DIFFERENT(h->mb.i_mb_topright_xy) )\ |
1219 | 0 | MAP_F2F(mv, ref, x264_scan8[0] + 4 - 1*8)\ |
1220 | 0 | if( FIELD_DIFFERENT(left[0]) )\ |
1221 | 0 | {\ |
1222 | 0 | MAP_F2F(mv, ref, x264_scan8[0] - 1 + 0*8)\ |
1223 | 0 | MAP_F2F(mv, ref, x264_scan8[0] - 1 + 1*8)\ |
1224 | 0 | MAP_F2F(mv, ref, x264_scan8[0] - 1 + 2*8)\ |
1225 | 0 | MAP_F2F(mv, ref, x264_scan8[0] - 1 + 3*8)\ |
1226 | 0 | MAP_F2F(topright_mv, topright_ref, 0)\ |
1227 | 0 | MAP_F2F(topright_mv, topright_ref, 1)\ |
1228 | 0 | MAP_F2F(topright_mv, topright_ref, 2)\ |
1229 | 0 | } |
1230 | |
|
1231 | 0 | if( MB_INTERLACED ) |
1232 | 0 | { |
1233 | 0 | #define FIELD_DIFFERENT(macroblock) (macroblock >= 0 && !h->mb.field[macroblock]) |
1234 | 0 | #define MAP_F2F(varmv, varref, index)\ |
1235 | 0 | if( h->mb.cache.varref[l][index] >= 0 )\ |
1236 | 0 | {\ |
1237 | 0 | h->mb.cache.varref[l][index] <<= 1;\ |
1238 | 0 | h->mb.cache.varmv[l][index][1] /= 2;\ |
1239 | 0 | h->mb.cache.mvd[l][index][1] >>= 1;\ |
1240 | 0 | } |
1241 | 0 | MAP_MVS |
1242 | 0 | #undef MAP_F2F |
1243 | 0 | #undef FIELD_DIFFERENT |
1244 | 0 | } |
1245 | 0 | else |
1246 | 0 | { |
1247 | 0 | #define FIELD_DIFFERENT(macroblock) (macroblock >= 0 && h->mb.field[macroblock]) |
1248 | 0 | #define MAP_F2F(varmv, varref, index)\ |
1249 | 0 | if( h->mb.cache.varref[l][index] >= 0 )\ |
1250 | 0 | {\ |
1251 | 0 | h->mb.cache.varref[l][index] >>= 1;\ |
1252 | 0 | h->mb.cache.varmv[l][index][1] *= 2;\ |
1253 | 0 | h->mb.cache.mvd[l][index][1] <<= 1;\ |
1254 | 0 | } |
1255 | 0 | MAP_MVS |
1256 | 0 | #undef MAP_F2F |
1257 | 0 | #undef FIELD_DIFFERENT |
1258 | 0 | } |
1259 | 0 | } |
1260 | 0 | } |
1261 | |
|
1262 | 0 | if( b_mbaff && mb_x == 0 && !(mb_y&1) ) |
1263 | 0 | { |
1264 | 0 | if( h->mb.i_mb_top_xy >= h->sh.i_first_mb ) |
1265 | 0 | h->mb.field_decoding_flag = h->mb.field[h->mb.i_mb_top_xy]; |
1266 | 0 | else |
1267 | 0 | h->mb.field_decoding_flag = 0; |
1268 | 0 | } |
1269 | | |
1270 | | /* Check whether skip here would cause decoder to predict interlace mode incorrectly. |
1271 | | * FIXME: It might be better to change the interlace type rather than forcing a skip to be non-skip. */ |
1272 | 0 | h->mb.b_allow_skip = 1; |
1273 | 0 | if( b_mbaff ) |
1274 | 0 | { |
1275 | 0 | if( MB_INTERLACED != h->mb.field_decoding_flag && |
1276 | 0 | (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) ) |
1277 | 0 | h->mb.b_allow_skip = 0; |
1278 | 0 | } |
1279 | |
|
1280 | 0 | if( h->param.b_cabac ) |
1281 | 0 | { |
1282 | 0 | if( b_mbaff ) |
1283 | 0 | { |
1284 | 0 | int left_xy, top_xy; |
1285 | | /* Neighbours here are calculated based on field_decoding_flag */ |
1286 | 0 | int mb_xy = mb_x + (mb_y&~1)*h->mb.i_mb_stride; |
1287 | 0 | left_xy = mb_xy - 1; |
1288 | 0 | if( (mb_y&1) && mb_x > 0 && h->mb.field_decoding_flag == h->mb.field[left_xy] ) |
1289 | 0 | left_xy += h->mb.i_mb_stride; |
1290 | 0 | if( h->mb.field_decoding_flag ) |
1291 | 0 | { |
1292 | 0 | top_xy = mb_xy - h->mb.i_mb_stride; |
1293 | 0 | if( !(mb_y&1) && top_xy >= 0 && h->mb.slice_table[top_xy] == h->sh.i_first_mb && h->mb.field[top_xy] ) |
1294 | 0 | top_xy -= h->mb.i_mb_stride; |
1295 | 0 | } |
1296 | 0 | else |
1297 | 0 | top_xy = mb_x + (mb_y-1)*h->mb.i_mb_stride; |
1298 | |
|
1299 | 0 | h->mb.cache.i_neighbour_skip = (mb_x > 0 && h->mb.slice_table[left_xy] == h->sh.i_first_mb && !IS_SKIP( h->mb.type[left_xy] )) |
1300 | 0 | + (top_xy >= 0 && h->mb.slice_table[top_xy] == h->sh.i_first_mb && !IS_SKIP( h->mb.type[top_xy] )); |
1301 | 0 | } |
1302 | 0 | else |
1303 | 0 | { |
1304 | 0 | h->mb.cache.i_neighbour_skip = ((h->mb.i_neighbour & MB_LEFT) && !IS_SKIP( h->mb.i_mb_type_left[0] )) |
1305 | 0 | + ((h->mb.i_neighbour & MB_TOP) && !IS_SKIP( h->mb.i_mb_type_top )); |
1306 | 0 | } |
1307 | 0 | } |
1308 | | |
1309 | | /* load skip */ |
1310 | 0 | if( h->sh.i_type == SLICE_TYPE_B ) |
1311 | 0 | { |
1312 | 0 | h->mb.bipred_weight = h->mb.bipred_weight_buf[MB_INTERLACED][MB_INTERLACED&(mb_y&1)]; |
1313 | 0 | h->mb.dist_scale_factor = h->mb.dist_scale_factor_buf[MB_INTERLACED][MB_INTERLACED&(mb_y&1)]; |
1314 | 0 | if( h->param.b_cabac ) |
1315 | 0 | { |
1316 | 0 | uint8_t skipbp; |
1317 | 0 | x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 ); |
1318 | 0 | if( b_mbaff ) |
1319 | 0 | { |
1320 | 0 | skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[LTOP]] : 0; |
1321 | 0 | h->mb.cache.skip[x264_scan8[0] - 1] = (skipbp >> (1+(left_index_table->mv[0]&~1))) & 1; |
1322 | 0 | skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[LBOT]] : 0; |
1323 | 0 | h->mb.cache.skip[x264_scan8[8] - 1] = (skipbp >> (1+(left_index_table->mv[2]&~1))) & 1; |
1324 | 0 | } |
1325 | 0 | else |
1326 | 0 | { |
1327 | 0 | skipbp = (h->mb.i_neighbour & MB_LEFT) ? h->mb.skipbp[left[0]] : 0; |
1328 | 0 | h->mb.cache.skip[x264_scan8[0] - 1] = skipbp & 0x2; |
1329 | 0 | h->mb.cache.skip[x264_scan8[8] - 1] = skipbp & 0x8; |
1330 | 0 | } |
1331 | 0 | skipbp = (h->mb.i_neighbour & MB_TOP) ? h->mb.skipbp[top] : 0; |
1332 | 0 | h->mb.cache.skip[x264_scan8[0] - 8] = skipbp & 0x4; |
1333 | 0 | h->mb.cache.skip[x264_scan8[4] - 8] = skipbp & 0x8; |
1334 | 0 | } |
1335 | 0 | } |
1336 | |
|
1337 | 0 | if( h->sh.i_type == SLICE_TYPE_P ) |
1338 | 0 | x264_mb_predict_mv_pskip( h, h->mb.cache.pskip_mv ); |
1339 | |
|
1340 | 0 | h->mb.i_neighbour4[0] = |
1341 | 0 | h->mb.i_neighbour8[0] = (h->mb.i_neighbour_intra & (MB_TOP|MB_LEFT|MB_TOPLEFT)) |
1342 | 0 | | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOPRIGHT : 0); |
1343 | 0 | h->mb.i_neighbour4[4] = |
1344 | 0 | h->mb.i_neighbour4[1] = MB_LEFT | ((h->mb.i_neighbour_intra & MB_TOP) ? (MB_TOP|MB_TOPLEFT|MB_TOPRIGHT) : 0); |
1345 | 0 | h->mb.i_neighbour4[2] = |
1346 | 0 | h->mb.i_neighbour4[8] = |
1347 | 0 | h->mb.i_neighbour4[10] = |
1348 | 0 | h->mb.i_neighbour8[2] = MB_TOP|MB_TOPRIGHT | ((h->mb.i_neighbour_intra & MB_LEFT) ? (MB_LEFT|MB_TOPLEFT) : 0); |
1349 | 0 | h->mb.i_neighbour4[5] = |
1350 | 0 | h->mb.i_neighbour8[1] = MB_LEFT | (h->mb.i_neighbour_intra & MB_TOPRIGHT) |
1351 | 0 | | ((h->mb.i_neighbour_intra & MB_TOP) ? MB_TOP|MB_TOPLEFT : 0); |
1352 | 0 | } |
1353 | | |
1354 | | void x264_macroblock_cache_load_progressive( x264_t *h, int mb_x, int mb_y ) |
1355 | 0 | { |
1356 | 0 | macroblock_cache_load( h, mb_x, mb_y, 0 ); |
1357 | 0 | } Unexecuted instantiation: x264_8_macroblock_cache_load_progressive Unexecuted instantiation: x264_10_macroblock_cache_load_progressive |
1358 | | |
1359 | | void x264_macroblock_cache_load_interlaced( x264_t *h, int mb_x, int mb_y ) |
1360 | 0 | { |
1361 | 0 | macroblock_cache_load( h, mb_x, mb_y, 1 ); |
1362 | 0 | } Unexecuted instantiation: x264_8_macroblock_cache_load_interlaced Unexecuted instantiation: x264_10_macroblock_cache_load_interlaced |
1363 | | |
/* MBAFF-specific fixup of deblocking boundary strengths.
 * When the left or top neighbour MB (pair) is coded in the opposite
 * field/frame mode from the current MB, the generic strength computation is
 * wrong for those cross-MB edges, so they are recomputed here directly from
 * non-zero-count (nnz) data: strength 2 if either side of the edge has
 * nonzero coefficients, else 1.  bs[0][*] appears to hold the strengths for
 * one filtering direction and bs[1][*] the other (left vs. top edges) —
 * NOTE(review): confirm the direction mapping against the deblock code. */
1364 | | static void macroblock_deblock_strength_mbaff( x264_t *h, uint8_t (*bs)[8][4] )
1365 | 0 | {
/* Left edge: only needs recomputing when the left neighbour's field flag
 * differs from the current MB's. */
1366 | 0 | if( (h->mb.i_neighbour & MB_LEFT) && h->mb.field[h->mb.i_mb_left_xy[0]] != MB_INTERLACED )
1367 | 0 | {
/* offset[MB_INTERLACED][mb_y&1][i]: which nnz row (0..3) of the left
 * neighbour faces strength slot i, accounting for field/frame pairing
 * and which half of the MB pair we are in. */
1368 | 0 | static const uint8_t offset[2][2][8] =
1369 | 0 | { { { 0, 0, 0, 0, 1, 1, 1, 1 },
1370 | 0 | { 2, 2, 2, 2, 3, 3, 3, 3 }, },
1371 | 0 | { { 0, 1, 2, 3, 0, 1, 2, 3 },
1372 | 0 | { 0, 1, 2, 3, 0, 1, 2, 3 }, }
1373 | 0 | };
1374 | 0 | ALIGNED_ARRAY_8( uint8_t, tmpbs, [8] );
1375 | |
|
1376 | 0 | const uint8_t *off = offset[MB_INTERLACED][h->mb.i_mb_y&1];
1377 | 0 | uint8_t (*nnz)[48] = h->mb.non_zero_count;
1378 | |
|
1379 | 0 | for( int i = 0; i < 8; i++ )
1380 | 0 | {
/* Frame MB vs. field pair: the 8 edge slots alternate between the two
 * left MBs differently depending on the current MB's field mode. */
1381 | 0 | int left = h->mb.i_mb_left_xy[MB_INTERLACED ? i>>2 : i&1];
1382 | 0 | int nnz_this = h->mb.cache.non_zero_count[x264_scan8[0]+8*(i>>1)];
/* nnz column 3 = rightmost 4x4 blocks of the left neighbour. */
1383 | 0 | int nnz_left = nnz[left][3 + 4*off[i]];
/* CAVLC + 8x8 transform: per-4x4 nnz is not individually meaningful,
 * so OR together the nnz of the whole 8x8 block instead. */
1384 | 0 | if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
1385 | 0 | {
1386 | 0 | int j = off[i]&~1;
1387 | 0 | if( h->mb.mb_transform_size[left] )
1388 | 0 | nnz_left = !!(M16( &nnz[left][2+4*j] ) | M16( &nnz[left][2+4*(1+j)] ));
1389 | 0 | }
1390 | 0 | tmpbs[i] = (nnz_left || nnz_this) ? 2 : 1;
1391 | 0 | }
1392 | |
|
/* Scatter the 8 computed strengths into bs[0]: field MBs take them in
 * order; frame MBs interleave them between edge rows 0 and 4. */
1393 | 0 | if( MB_INTERLACED )
1394 | 0 | {
1395 | 0 | CP32( bs[0][0], &tmpbs[0] );
1396 | 0 | CP32( bs[0][4], &tmpbs[4] );
1397 | 0 | }
1398 | 0 | else
1399 | 0 | {
1400 | 0 | for( int i = 0; i < 4; i++ ) bs[0][0][i] = tmpbs[2*i];
1401 | 0 | for( int i = 0; i < 4; i++ ) bs[0][4][i] = tmpbs[1+2*i];
1402 | 0 | }
1403 | 0 | }
1404 | |
|
/* Top edge: recompute when the top neighbour's field flag differs. */
1405 | 0 | if( (h->mb.i_neighbour & MB_TOP) && MB_INTERLACED != h->mb.field[h->mb.i_mb_top_xy] )
1406 | 0 | {
1407 | 0 | if( !(h->mb.i_mb_y&1) && !MB_INTERLACED )
1408 | 0 | {
1409 | | /* Need to filter both fields (even for frame macroblocks).
1410 | | * Filter top two rows using the top macroblock of the above
1411 | | * pair and then the bottom one. */
1412 | 0 | int mbn_xy = h->mb.i_mb_xy - 2 * h->mb.i_mb_stride;
1413 | 0 | uint8_t *nnz_cur = &h->mb.cache.non_zero_count[x264_scan8[0]];
1414 | |
|
1415 | 0 | for( int j = 0; j < 2; j++, mbn_xy += h->mb.i_mb_stride )
1416 | 0 | {
1417 | 0 | uint8_t (*nnz)[48] = h->mb.non_zero_count;
1418 | |
|
/* Bottom nnz row (row 3) of the neighbour above. */
1419 | 0 | ALIGNED_4( uint8_t nnz_top[4] );
1420 | 0 | CP32( nnz_top, &nnz[mbn_xy][3*4] );
1421 | |
|
/* Same CAVLC + 8x8dct merge as for the left edge. */
1422 | 0 | if( !h->param.b_cabac && h->pps->b_transform_8x8_mode && h->mb.mb_transform_size[mbn_xy] )
1423 | 0 | {
1424 | 0 | nnz_top[0] = nnz_top[1] = M16( &nnz[mbn_xy][ 8] ) || M16( &nnz[mbn_xy][12] );
1425 | 0 | nnz_top[2] = nnz_top[3] = M16( &nnz[mbn_xy][10] ) || M16( &nnz[mbn_xy][14] );
1426 | 0 | }
1427 | |
|
1428 | 0 | for( int i = 0; i < 4; i++ )
1429 | 0 | bs[1][4*j][i] = (nnz_cur[i] || nnz_top[i]) ? 2 : 1;
1430 | 0 | }
1431 | 0 | }
1432 | 0 | else
/* Other field/frame combinations: just make sure the strength is at
 * least 1 so the edge gets filtered. */
1433 | 0 | for( int i = 0; i < 4; i++ )
1434 | 0 | bs[1][0][i] = X264_MAX( bs[1][0][i], 1 );
1435 | 0 | }
1436 | 0 | }
1437 | | |
/* Compute the deblocking boundary strengths for the current macroblock and
 * store them into h->mb.cache.deblock_strength (bs).  The function first
 * handles fast special cases (intra, all-nonzero 8x8), then patches the
 * neighbour caches so the optimized h->loopf.deblock_strength kernel sees
 * the data deblocking needs (which can differ from what encoding used,
 * e.g. across slice boundaries or for MBAFF left neighbours). */
1438 | | void x264_macroblock_deblock_strength( x264_t *h )
1439 | 0 | {
1440 | 0 | uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength;
/* Intra MB: all internal edges use strength 3.  Edge row 0 (the MB
 * boundary) is not written here — presumably handled separately by the
 * deblock filter itself for intra MBs; confirm against deblock code. */
1441 | 0 | if( IS_INTRA( h->mb.i_type ) )
1442 | 0 | {
1443 | 0 | M32( bs[0][1] ) = 0x03030303;
1444 | 0 | M64( bs[0][2] ) = 0x0303030303030303ULL;
1445 | 0 | M32( bs[1][1] ) = 0x03030303;
1446 | 0 | M64( bs[1][2] ) = 0x0303030303030303ULL;
1447 | 0 | return;
1448 | 0 | }
1449 | |
1450 | | /* Early termination: in this case, nnz guarantees all edges use strength 2.*/
1451 | 0 | if( h->mb.b_transform_8x8 && !CHROMA444 )
1452 | 0 | {
1453 | 0 | int cbp_mask = 0xf >> CHROMA_V_SHIFT;
1454 | 0 | if( (h->mb.i_cbp_luma&cbp_mask) == cbp_mask )
1455 | 0 | {
1456 | 0 | M32( bs[0][0] ) = 0x02020202;
1457 | 0 | M32( bs[0][2] ) = 0x02020202;
1458 | 0 | M32( bs[0][4] ) = 0x02020202;
1459 | 0 | M64( bs[1][0] ) = 0x0202020202020202ULL; /* [1][1] and [1][3] has to be set for 4:2:2 */
1460 | 0 | M64( bs[1][2] ) = 0x0202020202020202ULL;
1461 | 0 | M32( bs[1][4] ) = 0x02020202;
1462 | 0 | return;
1463 | 0 | }
1464 | 0 | }
1465 | |
/* Unless deblocking is disabled across slice edges (idc == 2), deblocking
 * may use neighbours from other slices that encoding ignored; remember
 * which ones newly became available so their caches can be reloaded. */
1466 | 0 | int neighbour_changed = 0;
1467 | 0 | if( h->sh.i_disable_deblocking_filter_idc != 2 )
1468 | 0 | {
1469 | 0 | neighbour_changed = h->mb.i_neighbour_frame&~h->mb.i_neighbour;
1470 | 0 | h->mb.i_neighbour = h->mb.i_neighbour_frame;
1471 | 0 | }
1472 | |
1473 | | /* MBAFF deblock uses different left neighbors from encoding */
1474 | 0 | if( SLICE_MBAFF && (h->mb.i_neighbour & MB_LEFT) && (h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED) )
1475 | 0 | {
1476 | 0 | h->mb.i_mb_left_xy[1] =
1477 | 0 | h->mb.i_mb_left_xy[0] = h->mb.i_mb_xy - 1;
1478 | 0 | if( h->mb.i_mb_y&1 )
1479 | 0 | h->mb.i_mb_left_xy[0] -= h->mb.i_mb_stride;
1480 | 0 | else
1481 | 0 | h->mb.i_mb_left_xy[1] += h->mb.i_mb_stride;
1482 | 0 | }
1483 | |
1484 | | /* If we have multiple slices and we're deblocking on slice edges, we
1485 | | * have to reload neighbour data. */
1486 | 0 | if( neighbour_changed )
1487 | 0 | {
1488 | 0 | int top_y = h->mb.i_mb_top_y;
1489 | 0 | int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*h->mb.i_mb_x;
1490 | 0 | int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*h->mb.i_mb_x;
1491 | 0 | int s8x8 = h->mb.i_b8_stride;
1492 | 0 | int s4x4 = h->mb.i_b4_stride;
1493 | |
|
1494 | 0 | uint8_t (*nnz)[48] = h->mb.non_zero_count;
1495 | 0 | const x264_left_table_t *left_index_table = SLICE_MBAFF ? h->mb.left_index_table : &left_indices[3];
1496 | |
|
/* Reload the top/left nnz rows of the cache from the frame arrays. */
1497 | 0 | if( neighbour_changed & MB_TOP )
1498 | 0 | CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &nnz[h->mb.i_mb_top_xy][12] );
1499 | |
|
1500 | 0 | if( neighbour_changed & MB_LEFT )
1501 | 0 | {
1502 | 0 | int *left = h->mb.i_mb_left_xy;
1503 | 0 | h->mb.cache.non_zero_count[x264_scan8[0 ] - 1] = nnz[left[0]][left_index_table->nnz[0]];
1504 | 0 | h->mb.cache.non_zero_count[x264_scan8[2 ] - 1] = nnz[left[0]][left_index_table->nnz[1]];
1505 | 0 | h->mb.cache.non_zero_count[x264_scan8[8 ] - 1] = nnz[left[1]][left_index_table->nnz[2]];
1506 | 0 | h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[left[1]][left_index_table->nnz[3]];
1507 | 0 | }
1508 | |
|
/* Reload neighbour refs and MVs for each active reference list
 * (list 1 only exists in B slices). */
1509 | 0 | for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
1510 | 0 | {
1511 | 0 | int16_t (*mv)[2] = h->mb.mv[l];
1512 | 0 | int8_t *ref = h->mb.ref[l];
1513 | |
|
1514 | 0 | int i8 = x264_scan8[0] - 8;
1515 | 0 | if( neighbour_changed & MB_TOP )
1516 | 0 | {
1517 | 0 | h->mb.cache.ref[l][i8+0] =
1518 | 0 | h->mb.cache.ref[l][i8+1] = ref[top_8x8 + 0];
1519 | 0 | h->mb.cache.ref[l][i8+2] =
1520 | 0 | h->mb.cache.ref[l][i8+3] = ref[top_8x8 + 1];
1521 | 0 | CP128( h->mb.cache.mv[l][i8], mv[top_4x4] );
1522 | 0 | }
1523 | |
|
1524 | 0 | i8 = x264_scan8[0] - 1;
1525 | 0 | if( neighbour_changed & MB_LEFT )
1526 | 0 | {
1527 | 0 | h->mb.cache.ref[l][i8+0*8] =
1528 | 0 | h->mb.cache.ref[l][i8+1*8] = ref[h->mb.left_b8[0] + 1 + s8x8*left_index_table->ref[0]];
1529 | 0 | h->mb.cache.ref[l][i8+2*8] =
1530 | 0 | h->mb.cache.ref[l][i8+3*8] = ref[h->mb.left_b8[1] + 1 + s8x8*left_index_table->ref[2]];
1531 | |
|
1532 | 0 | CP32( h->mb.cache.mv[l][i8+0*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[0]] );
1533 | 0 | CP32( h->mb.cache.mv[l][i8+1*8], mv[h->mb.left_b4[0] + 3 + s4x4*left_index_table->mv[1]] );
1534 | 0 | CP32( h->mb.cache.mv[l][i8+2*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[2]] );
1535 | 0 | CP32( h->mb.cache.mv[l][i8+3*8], mv[h->mb.left_b4[1] + 3 + s4x4*left_index_table->mv[3]] );
1536 | 0 | }
1537 | 0 | }
1538 | 0 | }
1539 | |
|
/* With smart weighted prediction, a reference frame can appear multiple
 * times in the list (duplicates with different weights).  Map every cached
 * ref index through deblock_ref_table so duplicates compare equal when the
 * strength kernel compares reference indices. */
1540 | 0 | if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->sh.i_type == SLICE_TYPE_P )
1541 | 0 | {
1542 | | /* Handle reference frame duplicates */
1543 | 0 | int i8 = x264_scan8[0] - 8;
1544 | 0 | h->mb.cache.ref[0][i8+0] =
1545 | 0 | h->mb.cache.ref[0][i8+1] = deblock_ref_table(h->mb.cache.ref[0][i8+0]);
1546 | 0 | h->mb.cache.ref[0][i8+2] =
1547 | 0 | h->mb.cache.ref[0][i8+3] = deblock_ref_table(h->mb.cache.ref[0][i8+2]);
1548 | |
|
1549 | 0 | i8 = x264_scan8[0] - 1;
1550 | 0 | h->mb.cache.ref[0][i8+0*8] =
1551 | 0 | h->mb.cache.ref[0][i8+1*8] = deblock_ref_table(h->mb.cache.ref[0][i8+0*8]);
1552 | 0 | h->mb.cache.ref[0][i8+2*8] =
1553 | 0 | h->mb.cache.ref[0][i8+3*8] = deblock_ref_table(h->mb.cache.ref[0][i8+2*8]);
1554 | |
|
/* Broadcast the remapped per-8x8 refs across all 16 cache slots. */
1555 | 0 | int ref0 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 0]]);
1556 | 0 | int ref1 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 4]]);
1557 | 0 | int ref2 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[ 8]]);
1558 | 0 | int ref3 = deblock_ref_table(h->mb.cache.ref[0][x264_scan8[12]]);
1559 | 0 | uint32_t reftop = pack16to32( (uint8_t)ref0, (uint8_t)ref1 ) * 0x0101;
1560 | 0 | uint32_t refbot = pack16to32( (uint8_t)ref2, (uint8_t)ref3 ) * 0x0101;
1561 | |
|
1562 | 0 | M32( &h->mb.cache.ref[0][x264_scan8[0]+8*0] ) = reftop;
1563 | 0 | M32( &h->mb.cache.ref[0][x264_scan8[0]+8*1] ) = reftop;
1564 | 0 | M32( &h->mb.cache.ref[0][x264_scan8[0]+8*2] ) = refbot;
1565 | 0 | M32( &h->mb.cache.ref[0][x264_scan8[0]+8*3] ) = refbot;
1566 | 0 | }
1567 | |
1568 | | /* Munge NNZ for cavlc + 8x8dct */
/* With CAVLC + 8x8 transform, the stored per-4x4 nnz values don't reflect
 * the 8x8 coefficient blocks; merge them so each 4x4 entry is nonzero iff
 * its enclosing 8x8 block has coefficients.  Applied to the top/left
 * neighbour rows and, for an 8x8-coded current MB, to its own 16 slots. */
1569 | 0 | if( !h->param.b_cabac && h->pps->b_transform_8x8_mode )
1570 | 0 | {
1571 | 0 | uint8_t (*nnz)[48] = h->mb.non_zero_count;
1572 | 0 | int top = h->mb.i_mb_top_xy;
1573 | 0 | int *left = h->mb.i_mb_left_xy;
1574 | |
|
1575 | 0 | if( (h->mb.i_neighbour & MB_TOP) && h->mb.mb_transform_size[top] )
1576 | 0 | {
1577 | 0 | int i8 = x264_scan8[0] - 8;
1578 | 0 | int nnz_top0 = M16( &nnz[top][8] ) | M16( &nnz[top][12] );
1579 | 0 | int nnz_top1 = M16( &nnz[top][10] ) | M16( &nnz[top][14] );
1580 | 0 | M16( &h->mb.cache.non_zero_count[i8+0] ) = nnz_top0 ? 0x0101 : 0;
1581 | 0 | M16( &h->mb.cache.non_zero_count[i8+2] ) = nnz_top1 ? 0x0101 : 0;
1582 | 0 | }
1583 | |
|
1584 | 0 | if( h->mb.i_neighbour & MB_LEFT )
1585 | 0 | {
1586 | 0 | int i8 = x264_scan8[0] - 1;
1587 | 0 | if( h->mb.mb_transform_size[left[0]] )
1588 | 0 | {
1589 | 0 | int nnz_left0 = M16( &nnz[left[0]][2] ) | M16( &nnz[left[0]][6] );
1590 | 0 | h->mb.cache.non_zero_count[i8+8*0] = !!nnz_left0;
1591 | 0 | h->mb.cache.non_zero_count[i8+8*1] = !!nnz_left0;
1592 | 0 | }
1593 | 0 | if( h->mb.mb_transform_size[left[1]] )
1594 | 0 | {
1595 | 0 | int nnz_left1 = M16( &nnz[left[1]][10] ) | M16( &nnz[left[1]][14] );
1596 | 0 | h->mb.cache.non_zero_count[i8+8*2] = !!nnz_left1;
1597 | 0 | h->mb.cache.non_zero_count[i8+8*3] = !!nnz_left1;
1598 | 0 | }
1599 | 0 | }
1600 | |
|
1601 | 0 | if( h->mb.b_transform_8x8 )
1602 | 0 | {
1603 | 0 | int nnz0 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
1604 | 0 | int nnz1 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 4]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[ 6]] );
1605 | 0 | int nnz2 = M16( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[10]] );
1606 | 0 | int nnz3 = M16( &h->mb.cache.non_zero_count[x264_scan8[12]] ) | M16( &h->mb.cache.non_zero_count[x264_scan8[14]] );
1607 | 0 | uint32_t nnztop = pack16to32( !!nnz0, !!nnz1 ) * 0x0101;
1608 | 0 | uint32_t nnzbot = pack16to32( !!nnz2, !!nnz3 ) * 0x0101;
1609 | |
|
1610 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*0] ) = nnztop;
1611 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*1] ) = nnztop;
1612 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*2] ) = nnzbot;
1613 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[0]+8*3] ) = nnzbot;
1614 | 0 | }
1615 | 0 | }
1616 | |
|
/* Run the (possibly asm-optimized) strength kernel on the prepared caches.
 * The 4>>MB_INTERLACED argument is presumably the MV-difference threshold
 * (4 for frame MBs, 2 for field MBs, in quarter-pel units) — confirm
 * against the kernel's signature. */
1617 | 0 | h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv,
1618 | 0 | bs, 4 >> MB_INTERLACED, h->sh.i_type == SLICE_TYPE_B );
1619 | |
|
1620 | 0 | if( SLICE_MBAFF )
1621 | 0 | macroblock_deblock_strength_mbaff( h, bs );
1622 | 0 | } Unexecuted instantiation: x264_8_macroblock_deblock_strength Unexecuted instantiation: x264_10_macroblock_deblock_strength
1623 | | |
/* Copy the reconstructed macroblock pixels from the per-MB scratch buffer
 * (h->mb.pic.p_fdec, FDEC_STRIDE) back into frame plane i of h->fdec.
 *   i        - plane index (0 = luma; chroma planes when b_chroma/CHROMA444)
 *   b_chroma - nonzero: store both chroma half-planes interleaved into plane 1
 *   b_mbaff  - nonzero: MBAFF mode; field MBs use doubled stride and a
 *              field-offset start so the two fields interleave correctly. */
1624 | | static ALWAYS_INLINE void macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
1625 | 0 | {
1626 | 0 | int height = b_chroma ? 16>>CHROMA_V_SHIFT : 16;
1627 | 0 | int i_stride = h->fdec->i_stride[i];
/* Field macroblocks write every other line: double the stride... */
1628 | 0 | int i_stride2 = i_stride << (b_mbaff && MB_INTERLACED);
/* ...and start on the top or bottom field line of the MB pair. */
1629 | 0 | int i_pix_offset = (b_mbaff && MB_INTERLACED)
1630 | 0 | ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
1631 | 0 | : 16 * mb_x + height * mb_y * i_stride;
1632 | 0 | if( b_chroma )
1633 | 0 | h->mc.store_interleave_chroma( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], height );
1634 | 0 | else
1635 | 0 | h->mc.copy[PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, h->mb.pic.p_fdec[i], FDEC_STRIDE, 16 );
1636 | 0 | }
1637 | | |
/* Save the bottom rows of the just-reconstructed MB into
 * h->intra_border_backup so the next MB row can use them as top-neighbour
 * samples for intra prediction.  For non-4:4:4 chroma the two half-width
 * chroma rows share one 16-sample backup slot ([0..7] = Cb, [8..15] = Cr). */
1638 | | static ALWAYS_INLINE void macroblock_backup_intra( x264_t *h, int mb_x, int mb_y, int b_mbaff )
1639 | 0 | {
1640 | | /* In MBAFF we store the last two rows in intra_border_backup[0] and [1].
1641 | | * For progressive mbs this is the bottom two rows, and for interlaced the
1642 | | * bottom row of each field. We also store samples needed for the next
1643 | | * mbpair in intra_border_backup[2]. */
1644 | 0 | int backup_dst = !b_mbaff ? (mb_y&1) : (mb_y&1) ? 1 : MB_INTERLACED ? 0 : 2;
1645 | 0 | memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
1646 | 0 | if( CHROMA444 )
1647 | 0 | {
1648 | 0 | memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
1649 | 0 | memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+FDEC_STRIDE*15, 16*SIZEOF_PIXEL );
1650 | 0 | }
1651 | 0 | else if( CHROMA_FORMAT )
1652 | 0 | {
/* Last chroma row: row 7 for 4:2:0 (height 8), row 15 for 4:2:2. */
1653 | 0 | int backup_src = (15>>CHROMA_V_SHIFT) * FDEC_STRIDE;
1654 | 0 | memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
1655 | 0 | memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
1656 | 0 | }
1657 | 0 | if( b_mbaff )
1658 | 0 | {
/* Bottom MB of a pair: additionally back up the row the NEXT mbpair
 * needs (second-to-last frame row, or last top-field row). */
1659 | 0 | if( mb_y&1 )
1660 | 0 | {
1661 | 0 | int backup_src = (MB_INTERLACED ? 7 : 14) * FDEC_STRIDE;
1662 | 0 | backup_dst = MB_INTERLACED ? 2 : 0;
1663 | 0 | memcpy( &h->intra_border_backup[backup_dst][0][mb_x*16 ], h->mb.pic.p_fdec[0]+backup_src, 16*SIZEOF_PIXEL );
1664 | 0 | if( CHROMA444 )
1665 | 0 | {
1666 | 0 | memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 16*SIZEOF_PIXEL );
1667 | 0 | memcpy( &h->intra_border_backup[backup_dst][2][mb_x*16 ], h->mb.pic.p_fdec[2]+backup_src, 16*SIZEOF_PIXEL );
1668 | 0 | }
1669 | 0 | else if( CHROMA_FORMAT )
1670 | 0 | {
/* 4:2:0 chroma is half height, so halve the source row index. */
1671 | 0 | if( CHROMA_FORMAT == CHROMA_420 )
1672 | 0 | backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
1673 | 0 | memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*SIZEOF_PIXEL );
1674 | 0 | memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*SIZEOF_PIXEL );
1675 | 0 | }
1676 | 0 | }
1677 | 0 | }
1678 | 0 | }
1679 | | |
/* Write the current macroblock's final state back from the per-MB cache
 * (h->mb.cache.*) into the frame-wide arrays (h->mb.type, qp, non_zero_count,
 * ref, mv, mvd, ...), and store the reconstructed pixels and intra border
 * samples.  Called once per macroblock after encoding it; later MBs and the
 * deblock/save stages read these arrays as neighbour data. */
1680 | | void x264_macroblock_cache_save( x264_t *h )
1681 | 0 | {
1682 | 0 | const int i_mb_xy = h->mb.i_mb_xy;
/* Canonicalize the MB type (e.g. fold skip variants) before storing. */
1683 | 0 | const int i_mb_type = x264_mb_type_fix[h->mb.i_type];
1684 | 0 | const int s8x8 = h->mb.i_b8_stride;
1685 | 0 | const int s4x4 = h->mb.i_b4_stride;
1686 | 0 | const int i_mb_4x4 = h->mb.i_b4_xy;
1687 | 0 | const int i_mb_8x8 = h->mb.i_b8_xy;
1688 | |
1689 | | /* GCC pessimizes direct stores to heap-allocated arrays due to aliasing. */
1690 | | /* By only dereferencing them once, we avoid this issue. */
1691 | 0 | int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy];
1692 | 0 | uint8_t *nnz = h->mb.non_zero_count[i_mb_xy];
1693 | |
|
/* Store reconstructed pixels back into the decoded frame (luma always;
 * chroma as separate planes for 4:4:4, interleaved otherwise). */
1694 | 0 | if( SLICE_MBAFF )
1695 | 0 | {
1696 | 0 | macroblock_backup_intra( h, h->mb.i_mb_x, h->mb.i_mb_y, 1 );
1697 | 0 | macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0, 1 );
1698 | 0 | if( CHROMA444 )
1699 | 0 | {
1700 | 0 | macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0, 1 );
1701 | 0 | macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 2, 0, 1 );
1702 | 0 | }
1703 | 0 | else if( CHROMA_FORMAT )
1704 | 0 | macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1, 1 );
1705 | 0 | }
1706 | 0 | else
1707 | 0 | {
1708 | 0 | macroblock_backup_intra( h, h->mb.i_mb_x, h->mb.i_mb_y, 0 );
1709 | 0 | macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0, 0 );
1710 | 0 | if( CHROMA444 )
1711 | 0 | {
1712 | 0 | macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0, 0 );
1713 | 0 | macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 2, 0, 0 );
1714 | 0 | }
1715 | 0 | else if( CHROMA_FORMAT )
1716 | 0 | macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1, 0 );
1717 | 0 | }
1718 | |
|
1719 | 0 | x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
1720 | |
|
1721 | 0 | h->mb.type[i_mb_xy] = i_mb_type;
1722 | 0 | h->mb.slice_table[i_mb_xy] = h->sh.i_first_mb;
1723 | 0 | h->mb.partition[i_mb_xy] = IS_INTRA( i_mb_type ) ? D_16x16 : h->mb.i_partition;
1724 | 0 | h->mb.i_mb_prev_xy = i_mb_xy;
1725 | |
1726 | | /* save intra4x4 */
/* Only the 7 border prediction modes matter to neighbours; non-I4x4 MBs
 * store DC (or -1 when constrained intra forbids predicting from inter). */
1727 | 0 | if( i_mb_type == I_4x4 )
1728 | 0 | {
1729 | 0 | CP32( &i4x4[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
1730 | 0 | M32( &i4x4[4] ) = pack8to32( h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
1731 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
1732 | 0 | h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
1733 | 0 | }
1734 | 0 | else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
1735 | 0 | M64( i4x4 ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
1736 | 0 | else
1737 | 0 | M64( i4x4 ) = (uint8_t)(-1) * 0x0101010101010101ULL;
1738 | |
1739 | |
|
/* I_PCM: qp is irrelevant (stored as 0), all blocks count as coded.
 * CABAC treats nnz as a flag (1), CAVLC as a coefficient count (16). */
1740 | 0 | if( i_mb_type == I_PCM )
1741 | 0 | {
1742 | 0 | h->mb.qp[i_mb_xy] = 0;
1743 | 0 | h->mb.i_last_dqp = 0;
1744 | 0 | h->mb.i_cbp_chroma = CHROMA444 ? 0 : 2;
1745 | 0 | h->mb.i_cbp_luma = 0xf;
1746 | 0 | h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x1700;
1747 | 0 | h->mb.b_transform_8x8 = 0;
1748 | 0 | for( int i = 0; i < 48; i++ )
1749 | 0 | h->mb.cache.non_zero_count[x264_scan8[i]] = h->param.b_cabac ? 1 : 16;
1750 | 0 | }
1751 | 0 | else
1752 | 0 | {
/* No coded coefficients and not I16x16: no dqp is transmitted, so the
 * effective qp reverts to the previous MB's. */
1753 | 0 | if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
1754 | 0 | h->mb.i_qp = h->mb.i_last_qp;
1755 | 0 | h->mb.qp[i_mb_xy] = h->mb.i_qp;
1756 | 0 | h->mb.i_last_dqp = h->mb.i_qp - h->mb.i_last_qp;
1757 | 0 | h->mb.i_last_qp = h->mb.i_qp;
1758 | 0 | }
1759 | |
1760 | | /* save non zero count */
1761 | 0 | CP32( &nnz[ 0+0*4], &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
1762 | 0 | CP32( &nnz[ 0+1*4], &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
1763 | 0 | CP32( &nnz[ 0+2*4], &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
1764 | 0 | CP32( &nnz[ 0+3*4], &h->mb.cache.non_zero_count[x264_scan8[10]] );
1765 | 0 | CP32( &nnz[16+0*4], &h->mb.cache.non_zero_count[x264_scan8[16+0]] );
1766 | 0 | CP32( &nnz[16+1*4], &h->mb.cache.non_zero_count[x264_scan8[16+2]] );
1767 | 0 | CP32( &nnz[32+0*4], &h->mb.cache.non_zero_count[x264_scan8[32+0]] );
1768 | 0 | CP32( &nnz[32+1*4], &h->mb.cache.non_zero_count[x264_scan8[32+2]] );
/* 4:2:2 / 4:4:4 have extra chroma rows to save. */
1769 | 0 | if( CHROMA_FORMAT >= CHROMA_422 )
1770 | 0 | {
1771 | 0 | CP32( &nnz[16+2*4], &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] );
1772 | 0 | CP32( &nnz[16+3*4], &h->mb.cache.non_zero_count[x264_scan8[16+10]] );
1773 | 0 | CP32( &nnz[32+2*4], &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] );
1774 | 0 | CP32( &nnz[32+3*4], &h->mb.cache.non_zero_count[x264_scan8[32+10]] );
1775 | 0 | }
1776 | |
|
1777 | 0 | if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 )
1778 | 0 | h->mb.b_transform_8x8 = 0;
1779 | 0 | h->mb.mb_transform_size[i_mb_xy] = h->mb.b_transform_8x8;
1780 | |
|
/* Save per-8x8 reference indices and per-4x4 motion vectors; intra MBs
 * store ref = -1 and zero MVs. List 1 only exists in B slices. */
1781 | 0 | if( h->sh.i_type != SLICE_TYPE_I )
1782 | 0 | {
1783 | 0 | int16_t (*mv0)[2] = &h->mb.mv[0][i_mb_4x4];
1784 | 0 | int8_t *ref0 = &h->mb.ref[0][i_mb_8x8];
1785 | 0 | if( !IS_INTRA( i_mb_type ) )
1786 | 0 | {
1787 | 0 | ref0[0+0*s8x8] = h->mb.cache.ref[0][x264_scan8[0]];
1788 | 0 | ref0[1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
1789 | 0 | ref0[0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
1790 | 0 | ref0[1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
1791 | 0 | CP128( &mv0[0*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*0] );
1792 | 0 | CP128( &mv0[1*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*1] );
1793 | 0 | CP128( &mv0[2*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*2] );
1794 | 0 | CP128( &mv0[3*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*3] );
1795 | 0 | if( h->sh.i_type == SLICE_TYPE_B )
1796 | 0 | {
1797 | 0 | int16_t (*mv1)[2] = &h->mb.mv[1][i_mb_4x4];
1798 | 0 | int8_t *ref1 = &h->mb.ref[1][i_mb_8x8];
1799 | 0 | ref1[0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
1800 | 0 | ref1[1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
1801 | 0 | ref1[0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
1802 | 0 | ref1[1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
1803 | 0 | CP128( &mv1[0*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*0] );
1804 | 0 | CP128( &mv1[1*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*1] );
1805 | 0 | CP128( &mv1[2*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*2] );
1806 | 0 | CP128( &mv1[3*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*3] );
1807 | 0 | }
1808 | 0 | }
1809 | 0 | else
1810 | 0 | {
1811 | 0 | M16( &ref0[0*s8x8] ) = (uint8_t)(-1) * 0x0101;
1812 | 0 | M16( &ref0[1*s8x8] ) = (uint8_t)(-1) * 0x0101;
1813 | 0 | M128( &mv0[0*s4x4] ) = M128_ZERO;
1814 | 0 | M128( &mv0[1*s4x4] ) = M128_ZERO;
1815 | 0 | M128( &mv0[2*s4x4] ) = M128_ZERO;
1816 | 0 | M128( &mv0[3*s4x4] ) = M128_ZERO;
1817 | 0 | if( h->sh.i_type == SLICE_TYPE_B )
1818 | 0 | {
1819 | 0 | int16_t (*mv1)[2] = &h->mb.mv[1][i_mb_4x4];
1820 | 0 | int8_t *ref1 = &h->mb.ref[1][i_mb_8x8];
1821 | 0 | M16( &ref1[0*s8x8] ) = (uint8_t)(-1) * 0x0101;
1822 | 0 | M16( &ref1[1*s8x8] ) = (uint8_t)(-1) * 0x0101;
1823 | 0 | M128( &mv1[0*s4x4] ) = M128_ZERO;
1824 | 0 | M128( &mv1[1*s4x4] ) = M128_ZERO;
1825 | 0 | M128( &mv1[2*s4x4] ) = M128_ZERO;
1826 | 0 | M128( &mv1[3*s4x4] ) = M128_ZERO;
1827 | 0 | }
1828 | 0 | }
1829 | 0 | }
1830 | |
|
/* CABAC-only context state: chroma prediction mode, MV deltas (mvd)
 * for MV-delta context modelling, and the B-slice skip bitmap. */
1831 | 0 | if( h->param.b_cabac )
1832 | 0 | {
1833 | 0 | uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy];
1834 | 0 | if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
1835 | 0 | h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode];
1836 | 0 | else
1837 | 0 | h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC;
1838 | |
|
1839 | 0 | if( (0x3FF30 >> i_mb_type) & 1 ) /* !INTRA && !SKIP && !DIRECT */
1840 | 0 | {
/* Save only the border mvds neighbours can reference. */
1841 | 0 | CP64( mvd0[0], h->mb.cache.mvd[0][x264_scan8[10]] );
1842 | 0 | CP16( mvd0[4], h->mb.cache.mvd[0][x264_scan8[5 ]] );
1843 | 0 | CP16( mvd0[5], h->mb.cache.mvd[0][x264_scan8[7 ]] );
1844 | 0 | CP16( mvd0[6], h->mb.cache.mvd[0][x264_scan8[13]] );
1845 | 0 | if( h->sh.i_type == SLICE_TYPE_B )
1846 | 0 | {
1847 | 0 | uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy];
1848 | 0 | CP64( mvd1[0], h->mb.cache.mvd[1][x264_scan8[10]] );
1849 | 0 | CP16( mvd1[4], h->mb.cache.mvd[1][x264_scan8[5 ]] );
1850 | 0 | CP16( mvd1[5], h->mb.cache.mvd[1][x264_scan8[7 ]] );
1851 | 0 | CP16( mvd1[6], h->mb.cache.mvd[1][x264_scan8[13]] );
1852 | 0 | }
1853 | 0 | }
1854 | 0 | else
1855 | 0 | {
1856 | 0 | M128( mvd0[0] ) = M128_ZERO;
1857 | 0 | if( h->sh.i_type == SLICE_TYPE_B )
1858 | 0 | {
1859 | 0 | uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy];
1860 | 0 | M128( mvd1[0] ) = M128_ZERO;
1861 | 0 | }
1862 | 0 | }
1863 | |
|
/* skipbp: per-8x8 bitmap of direct/skip sub-partitions in B slices. */
1864 | 0 | if( h->sh.i_type == SLICE_TYPE_B )
1865 | 0 | {
1866 | 0 | if( i_mb_type == B_SKIP || i_mb_type == B_DIRECT )
1867 | 0 | h->mb.skipbp[i_mb_xy] = 0xf;
1868 | 0 | else if( i_mb_type == B_8x8 )
1869 | 0 | {
1870 | 0 | int skipbp = ( h->mb.i_sub_partition[0] == D_DIRECT_8x8 ) << 0;
1871 | 0 | skipbp |= ( h->mb.i_sub_partition[1] == D_DIRECT_8x8 ) << 1;
1872 | 0 | skipbp |= ( h->mb.i_sub_partition[2] == D_DIRECT_8x8 ) << 2;
1873 | 0 | skipbp |= ( h->mb.i_sub_partition[3] == D_DIRECT_8x8 ) << 3;
1874 | 0 | h->mb.skipbp[i_mb_xy] = skipbp;
1875 | 0 | }
1876 | 0 | else
1877 | 0 | h->mb.skipbp[i_mb_xy] = 0;
1878 | 0 | }
1879 | 0 | }
1880 | 0 | } Unexecuted instantiation: x264_8_macroblock_cache_save Unexecuted instantiation: x264_10_macroblock_cache_save
1881 | | |
1882 | | |
/* Precompute, for every (list0 ref, list1 ref) pair, the temporal distance
 * scale factor and the implicit bi-prediction weight, per the H.264 implicit
 * weighted / temporal-direct derivation (tb, td, tx).  Results go into
 * h->mb.dist_scale_factor_buf and h->mb.bipred_weight_buf, duplicated across
 * the frame/field (mbfield) and field-parity dimensions used by MBAFF
 * (SLICE_MBAFF extends the loops and doubles the ref counts). */
1883 | | void x264_macroblock_bipred_init( x264_t *h )
1884 | 0 | {
1885 | 0 | for( int mbfield = 0; mbfield <= SLICE_MBAFF; mbfield++ )
1886 | 0 | for( int field = 0; field <= SLICE_MBAFF; field++ )
1887 | 0 | for( int i_ref0 = 0; i_ref0 < (h->i_ref[0]<<mbfield); i_ref0++ )
1888 | 0 | {
/* In field mode, each frame ref becomes two field refs; adjust POC
 * by the per-field delta chosen by the ref index's parity. */
1889 | 0 | x264_frame_t *l0 = h->fref[0][i_ref0>>mbfield];
1890 | 0 | int poc0 = l0->i_poc + mbfield*l0->i_delta_poc[field^(i_ref0&1)];
1891 | 0 | for( int i_ref1 = 0; i_ref1 < (h->i_ref[1]<<mbfield); i_ref1++ )
1892 | 0 | {
1893 | 0 | x264_frame_t *l1 = h->fref[1][i_ref1>>mbfield];
1894 | 0 | int cur_poc = h->fdec->i_poc + mbfield*h->fdec->i_delta_poc[field];
1895 | 0 | int poc1 = l1->i_poc + mbfield*l1->i_delta_poc[field^(i_ref1&1)];
/* td: clipped POC distance between the two references. */
1896 | 0 | int td = x264_clip3( poc1 - poc0, -128, 127 );
1897 | 0 | if( td == 0 /* || pic0 is a long-term ref */ )
1898 | 0 | {
/* Equal distance: neutral scale (256 = 1.0) and equal weights. */
1899 | 0 | h->mb.dist_scale_factor_buf[mbfield][field][i_ref0][i_ref1] = 256;
1900 | 0 | h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 32;
1901 | 0 | }
1902 | 0 | else
1903 | 0 | {
1904 | 0 | int tb = x264_clip3( cur_poc - poc0, -128, 127 );
1905 | 0 | int tx = (16384 + (abs(td) >> 1)) / td;
1906 | 0 | int dist_scale_factor = x264_clip3( (tb * tx + 32) >> 6, -1024, 1023 );
1907 | |
|
1908 | 0 | h->mb.dist_scale_factor_buf[mbfield][field][i_ref0][i_ref1] = dist_scale_factor;
1909 | |
|
/* Implicit weight = 64 - (dist_scale_factor >> 2), used only
 * inside the spec's valid range; otherwise fall back to 32/32. */
1910 | 0 | dist_scale_factor >>= 2;
1911 | 0 | if( h->param.analyse.b_weighted_bipred /* && pic1 is not a long-term ref */
1912 | 0 | && dist_scale_factor >= -64
1913 | 0 | && dist_scale_factor <= 128 )
1914 | 0 | {
1915 | 0 | h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 64 - dist_scale_factor;
1916 | | // ssse3 implementation of biweight doesn't support the extrema.
1917 | | // if we ever generate them, we'll have to drop that optimization.
1918 | 0 | assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
1919 | 0 | }
1920 | 0 | else
1921 | 0 | h->mb.bipred_weight_buf[mbfield][field][i_ref0][i_ref1] = 32;
1922 | 0 | }
1923 | 0 | }
1924 | 0 | }
1925 | 0 | } Unexecuted instantiation: x264_8_macroblock_bipred_init Unexecuted instantiation: x264_10_macroblock_bipred_init
1926 | | |