/src/x264/encoder/macroblock.c
Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * macroblock.c: macroblock encoding |
3 | | ***************************************************************************** |
4 | | * Copyright (C) 2003-2025 x264 project |
5 | | * |
6 | | * Authors: Laurent Aimar <fenrir@via.ecp.fr> |
7 | | * Loren Merritt <lorenm@u.washington.edu> |
8 | | * Fiona Glaser <fiona@x264.com> |
9 | | * Henrik Gramner <henrik@gramner.com> |
10 | | * |
11 | | * This program is free software; you can redistribute it and/or modify |
12 | | * it under the terms of the GNU General Public License as published by |
13 | | * the Free Software Foundation; either version 2 of the License, or |
14 | | * (at your option) any later version. |
15 | | * |
16 | | * This program is distributed in the hope that it will be useful, |
17 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
18 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
19 | | * GNU General Public License for more details. |
20 | | * |
21 | | * You should have received a copy of the GNU General Public License |
22 | | * along with this program; if not, write to the Free Software |
23 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. |
24 | | * |
25 | | * This program is also available under a commercial proprietary license. |
26 | | * For more information, contact us at licensing@x264.com. |
27 | | *****************************************************************************/ |
28 | | |
29 | | #include "common/common.h" |
30 | | #include "macroblock.h" |
31 | | |
32 | | /* These chroma DC functions don't have assembly versions and are only used here. */ |
33 | | |
34 | 0 | #define ZIG(i,y,x) level[i] = dct[x*2+y]; |
35 | | static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] ) |
36 | 0 | { |
37 | 0 | ZIG(0,0,0) |
38 | 0 | ZIG(1,0,1) |
39 | 0 | ZIG(2,1,0) |
40 | 0 | ZIG(3,1,1) |
41 | 0 | } |
42 | | #undef ZIG |
43 | | |
44 | | static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] ) |
45 | 0 | { |
46 | 0 | level[0] = dct[0]; |
47 | 0 | level[1] = dct[2]; |
48 | 0 | level[2] = dct[1]; |
49 | 0 | level[3] = dct[4]; |
50 | 0 | level[4] = dct[6]; |
51 | 0 | level[5] = dct[3]; |
52 | 0 | level[6] = dct[5]; |
53 | 0 | level[7] = dct[7]; |
54 | 0 | } |
55 | | |
56 | | #define IDCT_DEQUANT_2X2_START \ |
57 | 0 | int d0 = dct[0] + dct[1]; \ |
58 | 0 | int d1 = dct[2] + dct[3]; \ |
59 | 0 | int d2 = dct[0] - dct[1]; \ |
60 | 0 | int d3 = dct[2] - dct[3]; \ |
61 | 0 | int dmf = dequant_mf[i_qp%6][0] << i_qp/6; |
62 | | |
63 | | static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp ) |
64 | 0 | { |
65 | 0 | IDCT_DEQUANT_2X2_START |
66 | 0 | dct4x4[0][0] = (d0 + d1) * dmf >> 5; |
67 | 0 | dct4x4[1][0] = (d0 - d1) * dmf >> 5; |
68 | 0 | dct4x4[2][0] = (d2 + d3) * dmf >> 5; |
69 | 0 | dct4x4[3][0] = (d2 - d3) * dmf >> 5; |
70 | 0 | } |
71 | | |
72 | | static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp ) |
73 | 0 | { |
74 | 0 | IDCT_DEQUANT_2X2_START |
75 | 0 | dct[0] = (d0 + d1) * dmf >> 5; |
76 | 0 | dct[1] = (d0 - d1) * dmf >> 5; |
77 | 0 | dct[2] = (d2 + d3) * dmf >> 5; |
78 | 0 | dct[3] = (d2 - d3) * dmf >> 5; |
79 | 0 | } |
80 | | #undef IDCT_2X2_DEQUANT_START |
81 | | |
82 | | static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] ) |
83 | 0 | { |
84 | 0 | int d0 = dct4x4[0][0] + dct4x4[1][0]; |
85 | 0 | int d1 = dct4x4[2][0] + dct4x4[3][0]; |
86 | 0 | int d2 = dct4x4[0][0] - dct4x4[1][0]; |
87 | 0 | int d3 = dct4x4[2][0] - dct4x4[3][0]; |
88 | 0 | d[0] = d0 + d1; |
89 | 0 | d[2] = d2 + d3; |
90 | 0 | d[1] = d0 - d1; |
91 | 0 | d[3] = d2 - d3; |
92 | 0 | dct4x4[0][0] = 0; |
93 | 0 | dct4x4[1][0] = 0; |
94 | 0 | dct4x4[2][0] = 0; |
95 | 0 | dct4x4[3][0] = 0; |
96 | 0 | } |
97 | | |
98 | | static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count ) |
99 | 0 | { |
100 | 0 | if( WORD_SIZE == 8 ) |
101 | 0 | { |
102 | 0 | for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) ) |
103 | 0 | if( M64( &v[i] ) ) |
104 | 0 | return 1; |
105 | 0 | } |
106 | 0 | else |
107 | 0 | { |
108 | 0 | for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) ) |
109 | 0 | if( M32( &v[i] ) ) |
110 | 0 | return 1; |
111 | 0 | } |
112 | 0 | return 0; |
113 | 0 | } |
114 | | |
115 | | /* All encoding functions must output the correct CBP and NNZ values. |
116 | | * The entropy coding functions will check CBP first, then NNZ, before |
117 | | * actually reading the DCT coefficients. NNZ still must be correct even |
118 | | * if CBP is zero because of the use of NNZ values for context selection. |
119 | | * "NNZ" need only be 0 or 1 rather than the exact coefficient count because |
120 | | * that is only needed in CAVLC, and will be calculated by CAVLC's residual |
121 | | * coding and stored as necessary. */ |
122 | | |
123 | | /* This means that decimation can be done merely by adjusting the CBP and NNZ |
124 | | * rather than memsetting the coefficients. */ |
125 | | |
/* Encode one plane of an I_16x16 macroblock: predict, transform, quantize,
 * and reconstruct into fdec.
 *
 * h    - encoder context
 * p    - plane index (0 = luma; >0 = chroma plane when 4:4:4 encodes
 *        chroma with the luma path)
 * i_qp - quantizer for this plane
 *
 * Side effects: updates h->dct.luma4x4 / h->dct.luma16x16_dc coefficients,
 * the non_zero_count cache, h->mb.i_cbp_luma, and the fdec reconstruction. */
static void mb_encode_i16x16( x264_t *h, int p, int i_qp )
{
    pixel *p_src = h->mb.pic.p_fenc[p];
    pixel *p_dst = h->mb.pic.p_fdec[p];

    ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
    ALIGNED_ARRAY_64( dctcoef, dct_dc4x4,[16] );

    int nz, block_cbp = 0;
    /* decimate_score starts at 9 (>= threshold 6) when decimation is
     * disabled, so the decimation branch below can never trigger. */
    int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
    int i_quant_cat = p ? CQM_4IC : CQM_4IY;
    int i_mode = h->mb.i_intra16x16_pred_mode;

    if( h->mb.b_lossless )
        x264_predict_lossless_16x16( h, p, i_mode );
    else
        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );

    if( h->mb.b_lossless )
    {
        /* Lossless: no transform/quant — zigzag the raw residual directly,
         * splitting DC terms into dct_dc4x4. */
        for( int i = 0; i < 16; i++ )
        {
            int oe = block_idx_xy_fenc[i];
            int od = block_idx_xy_fdec[i];
            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16*p+i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
            h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
            block_cbp |= nz;
        }
        h->mb.i_cbp_luma |= block_cbp * 0xf;
        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 );
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
        return;
    }

    CLEAR_16x16_NNZ( p );

    h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );

    if( h->mb.b_noise_reduction )
        for( int idx = 0; idx < 16; idx++ )
            h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );

    /* Extract the 16 DC coefficients into their own block (they get a second
     * 4x4 Hadamard transform below) and zero them out of the AC blocks. */
    for( int idx = 0; idx < 16; idx++ )
    {
        dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0];
        dct4x4[idx][0] = 0;
    }

    /* AC coefficients: quantize each 4x4 block, and for every non-zero block
     * zigzag + dequantize it and accumulate the decimation score. */
    if( h->mb.b_trellis )
    {
        for( int idx = 0; idx < 16; idx++ )
            if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) )
            {
                block_cbp = 0xf;
                h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
                if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
                h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
            }
    }
    else
    {
        for( int i8x8 = 0; i8x8 < 4; i8x8++ )
        {
            /* quant_4x4x4 quantizes four blocks at once; nz is a bitmask of
             * which of the four came out non-zero. */
            nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
            if( nz )
            {
                block_cbp = 0xf;
                FOREACH_BIT( idx, i8x8*4, nz )
                {
                    h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
                    h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
                    if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
                    h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
                }
            }
        }
    }

    /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
    /* More useful with CAVLC, but still useful with CABAC. */
    if( decimate_score < 6 )
    {
        CLEAR_16x16_NNZ( p );
        block_cbp = 0;
    }
    else
        h->mb.i_cbp_luma |= block_cbp;

    /* DC coefficients: second-stage 4x4 Hadamard, then quantize. */
    h->dctf.dct4x4dc( dct_dc4x4 );
    if( h->mb.b_trellis )
        nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p );
    else
        /* DC uses half the quant scale / double the bias relative to AC. */
        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );

    h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = nz;
    if( nz )
    {
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );

        /* output samples to fdec */
        h->dctf.idct4x4dc( dct_dc4x4 );
        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[i_quant_cat], i_qp ); /* XXX not inversed */
        /* Re-scatter the reconstructed DC values into the AC blocks so the
         * full-block IDCT below produces the complete reconstruction. */
        if( block_cbp )
            for( int i = 0; i < 16; i++ )
                dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
    }

    /* put pixels to fdec */
    if( block_cbp )
        h->dctf.add16x16_idct( p_dst, dct4x4 );
    else if( nz )
        h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}
240 | | |
241 | | /* Round down coefficients losslessly in DC-only chroma blocks. |
242 | | * Unlike luma blocks, this can't be done with a lookup table or |
243 | | * other shortcut technique because of the interdependencies |
244 | | * between the coefficients due to the chroma DC transform. */ |
245 | | static ALWAYS_INLINE int mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 ) |
246 | 0 | { |
247 | 0 | int dmf = dequant_mf[i_qp%6][0] << i_qp/6; |
248 | | |
249 | | /* If the QP is too high, there's no benefit to rounding optimization. */ |
250 | 0 | if( dmf > 32*64 ) |
251 | 0 | return 1; |
252 | | |
253 | 0 | if( chroma422 ) |
254 | 0 | return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf ); |
255 | 0 | else |
256 | 0 | return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf ); |
257 | 0 | } |
258 | | |
/* Encode both chroma planes of a macroblock (4:2:0 or 4:2:2).
 *
 * h         - encoder context
 * b_inter   - non-zero for inter macroblocks (selects inter CQM tables and
 *             enables residual decimation)
 * i_qp      - chroma quantizer; the DC path uses i_qp+3 in 4:2:2
 * chroma422 - compile-time specializer: 1 for 4:2:2, 0 for 4:2:0 (the
 *             function is ALWAYS_INLINE so each call site gets a
 *             specialized version)
 *
 * Updates chroma coefficients in h->dct, the non_zero_count cache,
 * h->mb.i_cbp_chroma (0 = none, 1 = DC only, 2 = DC+AC), and the fdec
 * reconstruction. */
static ALWAYS_INLINE void mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 )
{
    int nz, nz_dc;
    int b_decimate = b_inter && h->mb.b_dct_decimate;
    int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter];
    ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
    h->mb.i_cbp_chroma = 0;
    h->nr_count[2] += h->mb.b_noise_reduction * 4;

    /* Clear the chroma AC NNZ cache entries for both planes up front. */
    M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
    M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
    M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
    M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
    if( chroma422 )
    {
        /* 4:2:2 has a second 8x8 of blocks per plane. */
        M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
        M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
        M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
        M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
    }

    /* Early termination: check variance of chroma residual before encoding.
     * Don't bother trying early termination at low QPs.
     * Values are experimentally derived. */
    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
    {
        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
        ALIGNED_ARRAY_8( int, ssd,[2] );
        int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;

        /* var2 returns combined variance of both planes and writes the
         * per-plane SSDs to ssd[]. Below threshold, encode DC only. */
        if( h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], ssd ) < thresh*4 )
        {
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;

            for( int ch = 0; ch < 2; ch++ )
            {
                /* Skip a plane entirely when its own residual is small. */
                if( ssd[ch] > thresh )
                {
                    pixel *p_src = h->mb.pic.p_fenc[1+ch];
                    pixel *p_dst = h->mb.pic.p_fdec[1+ch];

                    if( chroma422 )
                        /* Cannot be replaced by two calls to sub8x8_dct_dc since the hadamard transform is different */
                        h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
                    else
                        h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );

                    if( h->mb.b_trellis )
                        nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
                    else
                    {
                        nz_dc = 0;
                        for( int i = 0; i <= chroma422; i++ )
                            nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
                                                             h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
                    }

                    if( nz_dc )
                    {
                        /* Rounding optimization may eliminate the block;
                         * if so, leave this plane's NNZ/CBP untouched. */
                        if( !mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
                            continue;
                        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1;
                        if( chroma422 )
                        {
                            zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                            h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
                        }
                        else
                        {
                            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                            idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
                        }

                        for( int i = 0; i <= chroma422; i++ )
                            h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
                        h->mb.i_cbp_chroma = 1;
                    }
                }
            }
            return;
        }
    }

    /* Full encode path: both planes, AC + DC. */
    for( int ch = 0; ch < 2; ch++ )
    {
        pixel *p_src = h->mb.pic.p_fenc[1+ch];
        pixel *p_dst = h->mb.pic.p_fdec[1+ch];
        /* 7 (>= threshold) disables decimation of this plane. */
        int i_decimate_score = b_decimate ? 0 : 7;
        int nz_ac = 0;

        ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] );

        if( h->mb.b_lossless )
        {
            /* Lossless: zigzag raw residual; DC terms go to chroma_dc in
             * the 4:2:2-specific scan order when applicable. */
            static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 };

            for( int i = 0; i < (chroma422?8:4); i++ )
            {
                int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE;
                int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE;
                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od,
                                           &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] );
                h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
                h->mb.i_cbp_chroma |= nz;
            }
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 );
            continue;
        }

        for( int i = 0; i <= chroma422; i++ )
            h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );

        if( h->mb.b_noise_reduction )
            for( int i = 0; i < (chroma422?8:4); i++ )
                h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 );

        /* Pull the DC terms out into dct_dc (clearing them in dct4x4). */
        if( chroma422 )
            h->dctf.dct2x4dc( dct_dc, dct4x4 );
        else
            dct2x2dc( dct_dc, dct4x4 );

        /* calculate dct coeffs */
        for( int i8x8 = 0; i8x8 < (chroma422?2:1); i8x8++ )
        {
            if( h->mb.b_trellis )
            {
                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                {
                    if( x264_quant_4x4_trellis( h, dct4x4[i8x8*4+i4x4], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ) )
                    {
                        int idx = 16+ch*16+i8x8*8+i4x4;
                        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
                        h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
                        if( i_decimate_score < 7 )
                            i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
                        h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
                        nz_ac = 1;
                    }
                }
            }
            else
            {
                nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4IC+b_inter][i_qp],
                                            h->quant4_bias[CQM_4IC+b_inter][i_qp] );
                nz_ac |= nz;

                /* nz is a bitmask of the non-zero 4x4 blocks. */
                FOREACH_BIT( i4x4, 0, nz )
                {
                    int idx = 16+ch*16+i8x8*8+i4x4;

                    h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
                    h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
                    if( i_decimate_score < 7 )
                        i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
                    h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
                }
            }
        }

        if( h->mb.b_trellis )
            nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
        else
        {
            nz_dc = 0;
            for( int i = 0; i <= chroma422; i++ )
                nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
                                                 h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
        }

        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;

        if( i_decimate_score < 7 || !nz_ac )
        {
            /* Decimate the block */
            M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0;
            if( chroma422 )
            {
                M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0;
                M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0;
            }

            if( !nz_dc ) /* Whole block is empty */
                continue;
            if( !mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
            {
                h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0;
                continue;
            }
            /* DC-only */
            if( chroma422 )
            {
                zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
            }
            else
            {
                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
            }

            for( int i = 0; i <= chroma422; i++ )
                h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
        }
        else
        {
            h->mb.i_cbp_chroma = 1;

            if( nz_dc )
            {
                /* Re-inject the reconstructed DC terms into the AC blocks
                 * so the full IDCT below reconstructs the whole residual. */
                if( chroma422 )
                {
                    zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                    h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 );
                }
                else
                {
                    zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                    idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp );
                }
            }

            for( int i = 0; i <= chroma422; i++ )
                h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] );
        }
    }

    /* 0 = none, 1 = DC only, 2 = DC+AC */
    h->mb.i_cbp_chroma += (h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] |
                           h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma);
}
491 | | |
492 | | void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp ) |
493 | 0 | { |
494 | 0 | if( CHROMA_FORMAT == CHROMA_420 ) |
495 | 0 | mb_encode_chroma_internal( h, b_inter, i_qp, 0 ); |
496 | 0 | else |
497 | 0 | mb_encode_chroma_internal( h, b_inter, i_qp, 1 ); |
498 | 0 | } Unexecuted instantiation: x264_8_mb_encode_chroma Unexecuted instantiation: x264_10_mb_encode_chroma |
499 | | |
500 | | static void macroblock_encode_skip( x264_t *h ) |
501 | 0 | { |
502 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0; |
503 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0; |
504 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0; |
505 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0; |
506 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 0]] ) = 0; |
507 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0; |
508 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0; |
509 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0; |
510 | 0 | if( CHROMA_FORMAT >= CHROMA_422 ) |
511 | 0 | { |
512 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0; |
513 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0; |
514 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] ) = 0; |
515 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[32+10]] ) = 0; |
516 | 0 | } |
517 | 0 | h->mb.i_cbp_luma = 0; |
518 | 0 | h->mb.i_cbp_chroma = 0; |
519 | 0 | h->mb.cbp[h->mb.i_mb_xy] = 0; |
520 | 0 | } |
521 | | |
522 | | /***************************************************************************** |
523 | | * Intra prediction for predictive lossless mode. |
524 | | *****************************************************************************/ |
525 | | |
526 | | void x264_predict_lossless_chroma( x264_t *h, int i_mode ) |
527 | 0 | { |
528 | 0 | int height = 16 >> CHROMA_V_SHIFT; |
529 | 0 | if( i_mode == I_PRED_CHROMA_V ) |
530 | 0 | { |
531 | 0 | h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height ); |
532 | 0 | h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height ); |
533 | 0 | memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*SIZEOF_PIXEL ); |
534 | 0 | memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*SIZEOF_PIXEL ); |
535 | 0 | } |
536 | 0 | else if( i_mode == I_PRED_CHROMA_H ) |
537 | 0 | { |
538 | 0 | h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height ); |
539 | 0 | h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height ); |
540 | 0 | x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 ); |
541 | 0 | x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 ); |
542 | 0 | if( CHROMA_FORMAT == CHROMA_422 ) |
543 | 0 | { |
544 | 0 | x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 ); |
545 | 0 | x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 ); |
546 | 0 | } |
547 | 0 | } |
548 | 0 | else |
549 | 0 | { |
550 | 0 | h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); |
551 | 0 | h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); |
552 | 0 | } |
553 | 0 | } Unexecuted instantiation: x264_8_predict_lossless_chroma Unexecuted instantiation: x264_10_predict_lossless_chroma |
554 | | |
555 | | void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode ) |
556 | 0 | { |
557 | 0 | int stride = h->fenc->i_stride[p] << MB_INTERLACED; |
558 | 0 | pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride; |
559 | |
|
560 | 0 | if( i_mode == I_PRED_4x4_V ) |
561 | 0 | { |
562 | 0 | h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 ); |
563 | 0 | memcpy( p_dst, p_dst-FDEC_STRIDE, 4*SIZEOF_PIXEL ); |
564 | 0 | } |
565 | 0 | else if( i_mode == I_PRED_4x4_H ) |
566 | 0 | { |
567 | 0 | h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 ); |
568 | 0 | for( int i = 0; i < 4; i++ ) |
569 | 0 | p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1]; |
570 | 0 | } |
571 | 0 | else |
572 | 0 | h->predict_4x4[i_mode]( p_dst ); |
573 | 0 | } Unexecuted instantiation: x264_8_predict_lossless_4x4 Unexecuted instantiation: x264_10_predict_lossless_4x4 |
574 | | |
575 | | void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] ) |
576 | 0 | { |
577 | 0 | int stride = h->fenc->i_stride[p] << MB_INTERLACED; |
578 | 0 | pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride; |
579 | |
|
580 | 0 | if( i_mode == I_PRED_8x8_V ) |
581 | 0 | { |
582 | 0 | h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 ); |
583 | 0 | memcpy( p_dst, &edge[16], 8*SIZEOF_PIXEL ); |
584 | 0 | } |
585 | 0 | else if( i_mode == I_PRED_8x8_H ) |
586 | 0 | { |
587 | 0 | h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 ); |
588 | 0 | for( int i = 0; i < 8; i++ ) |
589 | 0 | p_dst[i*FDEC_STRIDE] = edge[14-i]; |
590 | 0 | } |
591 | 0 | else |
592 | 0 | h->predict_8x8[i_mode]( p_dst, edge ); |
593 | 0 | } Unexecuted instantiation: x264_8_predict_lossless_8x8 Unexecuted instantiation: x264_10_predict_lossless_8x8 |
594 | | |
595 | | void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode ) |
596 | 0 | { |
597 | 0 | int stride = h->fenc->i_stride[p] << MB_INTERLACED; |
598 | 0 | pixel *p_dst = h->mb.pic.p_fdec[p]; |
599 | |
|
600 | 0 | if( i_mode == I_PRED_16x16_V ) |
601 | 0 | { |
602 | 0 | h->mc.copy[PIXEL_16x16]( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 ); |
603 | 0 | memcpy( p_dst, p_dst-FDEC_STRIDE, 16*SIZEOF_PIXEL ); |
604 | 0 | } |
605 | 0 | else if( i_mode == I_PRED_16x16_H ) |
606 | 0 | { |
607 | 0 | h->mc.copy_16x16_unaligned( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 ); |
608 | 0 | for( int i = 0; i < 16; i++ ) |
609 | 0 | p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1]; |
610 | 0 | } |
611 | 0 | else |
612 | 0 | h->predict_16x16[i_mode]( p_dst ); |
613 | 0 | } Unexecuted instantiation: x264_8_predict_lossless_16x16 Unexecuted instantiation: x264_10_predict_lossless_16x16 |
614 | | |
615 | | /***************************************************************************** |
616 | | * x264_macroblock_encode: |
617 | | *****************************************************************************/ |
618 | | static ALWAYS_INLINE void macroblock_encode_internal( x264_t *h, int plane_count, int chroma ) |
619 | 0 | { |
620 | 0 | int i_qp = h->mb.i_qp; |
621 | 0 | int b_decimate = h->mb.b_dct_decimate; |
622 | 0 | int b_force_no_skip = 0; |
623 | 0 | int nz; |
624 | 0 | h->mb.i_cbp_luma = 0; |
625 | 0 | for( int p = 0; p < plane_count; p++ ) |
626 | 0 | h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0; |
627 | |
|
628 | 0 | if( h->mb.i_type == I_PCM ) |
629 | 0 | { |
630 | | /* if PCM is chosen, we need to store reconstructed frame data */ |
631 | 0 | for( int p = 0; p < plane_count; p++ ) |
632 | 0 | h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 ); |
633 | 0 | if( chroma ) |
634 | 0 | { |
635 | 0 | int height = 16 >> CHROMA_V_SHIFT; |
636 | 0 | h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height ); |
637 | 0 | h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height ); |
638 | 0 | } |
639 | 0 | return; |
640 | 0 | } |
641 | | |
642 | 0 | if( !h->mb.b_allow_skip ) |
643 | 0 | { |
644 | 0 | b_force_no_skip = 1; |
645 | 0 | if( IS_SKIP(h->mb.i_type) ) |
646 | 0 | { |
647 | 0 | if( h->mb.i_type == P_SKIP ) |
648 | 0 | h->mb.i_type = P_L0; |
649 | 0 | else if( h->mb.i_type == B_SKIP ) |
650 | 0 | h->mb.i_type = B_DIRECT; |
651 | 0 | } |
652 | 0 | } |
653 | |
|
654 | 0 | if( h->mb.i_type == P_SKIP ) |
655 | 0 | { |
656 | | /* don't do pskip motion compensation if it was already done in macroblock_analyse */ |
657 | 0 | if( !h->mb.b_skip_mc ) |
658 | 0 | { |
659 | 0 | int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0], |
660 | 0 | h->mb.mv_min[0], h->mb.mv_max[0] ); |
661 | 0 | int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1], |
662 | 0 | h->mb.mv_min[1], h->mb.mv_max[1] ); |
663 | |
|
664 | 0 | for( int p = 0; p < plane_count; p++ ) |
665 | 0 | h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE, |
666 | 0 | &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p], |
667 | 0 | mvx, mvy, 16, 16, &h->sh.weight[0][p] ); |
668 | |
|
669 | 0 | if( chroma ) |
670 | 0 | { |
671 | 0 | int v_shift = CHROMA_V_SHIFT; |
672 | 0 | int height = 16 >> v_shift; |
673 | | |
674 | | /* Special case for mv0, which is (of course) very common in P-skip mode. */ |
675 | 0 | if( mvx | mvy ) |
676 | 0 | h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, |
677 | 0 | h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], |
678 | 0 | mvx, 2*mvy>>v_shift, 8, height ); |
679 | 0 | else |
680 | 0 | h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], |
681 | 0 | h->mb.pic.i_stride[1], height ); |
682 | |
|
683 | 0 | if( h->sh.weight[0][1].weightfn ) |
684 | 0 | h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE, |
685 | 0 | h->mb.pic.p_fdec[1], FDEC_STRIDE, |
686 | 0 | &h->sh.weight[0][1], height ); |
687 | 0 | if( h->sh.weight[0][2].weightfn ) |
688 | 0 | h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE, |
689 | 0 | h->mb.pic.p_fdec[2], FDEC_STRIDE, |
690 | 0 | &h->sh.weight[0][2], height ); |
691 | 0 | } |
692 | 0 | } |
693 | |
|
694 | 0 | macroblock_encode_skip( h ); |
695 | 0 | return; |
696 | 0 | } |
697 | 0 | if( h->mb.i_type == B_SKIP ) |
698 | 0 | { |
699 | | /* don't do bskip motion compensation if it was already done in macroblock_analyse */ |
700 | 0 | if( !h->mb.b_skip_mc ) |
701 | 0 | x264_mb_mc( h ); |
702 | 0 | macroblock_encode_skip( h ); |
703 | 0 | return; |
704 | 0 | } |
705 | | |
706 | 0 | if( h->mb.i_type == I_16x16 ) |
707 | 0 | { |
708 | 0 | h->mb.b_transform_8x8 = 0; |
709 | |
|
710 | 0 | for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) |
711 | 0 | mb_encode_i16x16( h, p, i_qp ); |
712 | 0 | } |
713 | 0 | else if( h->mb.i_type == I_8x8 ) |
714 | 0 | { |
715 | 0 | h->mb.b_transform_8x8 = 1; |
716 | | /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */ |
717 | 0 | if( h->mb.i_skip_intra ) |
718 | 0 | { |
719 | 0 | h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 ); |
720 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0]; |
721 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1]; |
722 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2]; |
723 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3]; |
724 | 0 | h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp; |
725 | | /* In RD mode, restore the now-overwritten DCT data. */ |
726 | 0 | if( h->mb.i_skip_intra == 2 ) |
727 | 0 | h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) ); |
728 | 0 | } |
729 | 0 | for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) |
730 | 0 | { |
731 | 0 | for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0; i < 4; i++ ) |
732 | 0 | { |
733 | 0 | int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]]; |
734 | 0 | x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 ); |
735 | 0 | } |
736 | 0 | } |
737 | 0 | } |
738 | 0 | else if( h->mb.i_type == I_4x4 ) |
739 | 0 | { |
740 | 0 | h->mb.b_transform_8x8 = 0; |
741 | | /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */ |
742 | 0 | if( h->mb.i_skip_intra ) |
743 | 0 | { |
744 | 0 | h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 ); |
745 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0]; |
746 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1]; |
747 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2]; |
748 | 0 | M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3]; |
749 | 0 | h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp; |
750 | | /* In RD mode, restore the now-overwritten DCT data. */ |
751 | 0 | if( h->mb.i_skip_intra == 2 ) |
752 | 0 | h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) ); |
753 | 0 | } |
754 | 0 | for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) |
755 | 0 | { |
756 | 0 | for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0; i < 16; i++ ) |
757 | 0 | { |
758 | 0 | pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i]]; |
759 | 0 | int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]; |
760 | |
|
761 | 0 | if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP ) |
762 | | /* emulate missing topright samples */ |
763 | 0 | MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] ); |
764 | |
|
765 | 0 | x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 ); |
766 | 0 | } |
767 | 0 | } |
768 | 0 | } |
769 | 0 | else /* Inter MB */ |
770 | 0 | { |
771 | 0 | int i_decimate_mb = 0; |
772 | | |
773 | | /* Don't repeat motion compensation if it was already done in non-RD transform analysis */ |
774 | 0 | if( !h->mb.b_skip_mc ) |
775 | 0 | x264_mb_mc( h ); |
776 | |
|
777 | 0 | if( h->mb.b_lossless ) |
778 | 0 | { |
779 | 0 | if( h->mb.b_transform_8x8 ) |
780 | 0 | for( int p = 0; p < plane_count; p++ ) |
781 | 0 | for( int i8x8 = 0; i8x8 < 4; i8x8++ ) |
782 | 0 | { |
783 | 0 | int x = i8x8&1; |
784 | 0 | int y = i8x8>>1; |
785 | 0 | nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+i8x8], h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE, |
786 | 0 | h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE ); |
787 | 0 | STORE_8x8_NNZ( p, i8x8, nz ); |
788 | 0 | h->mb.i_cbp_luma |= nz << i8x8; |
789 | 0 | } |
790 | 0 | else |
791 | 0 | for( int p = 0; p < plane_count; p++ ) |
792 | 0 | for( int i4x4 = 0; i4x4 < 16; i4x4++ ) |
793 | 0 | { |
794 | 0 | nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4x4], |
795 | 0 | h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4x4], |
796 | 0 | h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4x4] ); |
797 | 0 | h->mb.cache.non_zero_count[x264_scan8[p*16+i4x4]] = nz; |
798 | 0 | h->mb.i_cbp_luma |= nz << (i4x4>>2); |
799 | 0 | } |
800 | 0 | } |
801 | 0 | else if( h->mb.b_transform_8x8 ) |
802 | 0 | { |
803 | 0 | ALIGNED_ARRAY_64( dctcoef, dct8x8,[4],[64] ); |
804 | 0 | b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC |
805 | |
|
806 | 0 | for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) |
807 | 0 | { |
808 | 0 | int quant_cat = p ? CQM_8PC : CQM_8PY; |
809 | 0 | CLEAR_16x16_NNZ( p ); |
810 | 0 | h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] ); |
811 | 0 | h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4; |
812 | |
|
813 | 0 | int plane_cbp = 0; |
814 | 0 | for( int idx = 0; idx < 4; idx++ ) |
815 | 0 | { |
816 | 0 | nz = x264_quant_8x8( h, dct8x8[idx], i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, idx ); |
817 | |
|
818 | 0 | if( nz ) |
819 | 0 | { |
820 | 0 | h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8[idx] ); |
821 | 0 | if( b_decimate ) |
822 | 0 | { |
823 | 0 | int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[p*4+idx] ); |
824 | 0 | i_decimate_mb += i_decimate_8x8; |
825 | 0 | if( i_decimate_8x8 >= 4 ) |
826 | 0 | plane_cbp |= 1<<idx; |
827 | 0 | } |
828 | 0 | else |
829 | 0 | plane_cbp |= 1<<idx; |
830 | 0 | } |
831 | 0 | } |
832 | |
|
833 | 0 | if( i_decimate_mb >= 6 || !b_decimate ) |
834 | 0 | { |
835 | 0 | h->mb.i_cbp_luma |= plane_cbp; |
836 | 0 | FOREACH_BIT( idx, 0, plane_cbp ) |
837 | 0 | { |
838 | 0 | h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[quant_cat], i_qp ); |
839 | 0 | h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*(idx&1) + 8*(idx>>1)*FDEC_STRIDE], dct8x8[idx] ); |
840 | 0 | STORE_8x8_NNZ( p, idx, 1 ); |
841 | 0 | } |
842 | 0 | } |
843 | 0 | } |
844 | 0 | } |
845 | 0 | else |
846 | 0 | { |
847 | 0 | ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] ); |
848 | 0 | for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) |
849 | 0 | { |
850 | 0 | int quant_cat = p ? CQM_4PC : CQM_4PY; |
851 | 0 | CLEAR_16x16_NNZ( p ); |
852 | 0 | h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] ); |
853 | |
|
854 | 0 | if( h->mb.b_noise_reduction ) |
855 | 0 | { |
856 | 0 | h->nr_count[0+!!p*2] += 16; |
857 | 0 | for( int idx = 0; idx < 16; idx++ ) |
858 | 0 | h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 ); |
859 | 0 | } |
860 | |
|
861 | 0 | int plane_cbp = 0; |
862 | 0 | for( int i8x8 = 0; i8x8 < 4; i8x8++ ) |
863 | 0 | { |
864 | 0 | int i_decimate_8x8 = b_decimate ? 0 : 6; |
865 | 0 | int nnz8x8 = 0; |
866 | 0 | if( h->mb.b_trellis ) |
867 | 0 | { |
868 | 0 | for( int i4x4 = 0; i4x4 < 4; i4x4++ ) |
869 | 0 | { |
870 | 0 | int idx = i8x8*4+i4x4; |
871 | 0 | if( x264_quant_4x4_trellis( h, dct4x4[idx], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, p*16+idx ) ) |
872 | 0 | { |
873 | 0 | h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] ); |
874 | 0 | h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[quant_cat], i_qp ); |
875 | 0 | if( i_decimate_8x8 < 6 ) |
876 | 0 | i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] ); |
877 | 0 | h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1; |
878 | 0 | nnz8x8 = 1; |
879 | 0 | } |
880 | 0 | } |
881 | 0 | } |
882 | 0 | else |
883 | 0 | { |
884 | 0 | nnz8x8 = nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] ); |
885 | 0 | if( nz ) |
886 | 0 | { |
887 | 0 | FOREACH_BIT( idx, i8x8*4, nz ) |
888 | 0 | { |
889 | 0 | h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] ); |
890 | 0 | h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[quant_cat], i_qp ); |
891 | 0 | if( i_decimate_8x8 < 6 ) |
892 | 0 | i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] ); |
893 | 0 | h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1; |
894 | 0 | } |
895 | 0 | } |
896 | 0 | } |
897 | 0 | if( nnz8x8 ) |
898 | 0 | { |
899 | 0 | i_decimate_mb += i_decimate_8x8; |
900 | 0 | if( i_decimate_8x8 < 4 ) |
901 | 0 | STORE_8x8_NNZ( p, i8x8, 0 ); |
902 | 0 | else |
903 | 0 | plane_cbp |= 1<<i8x8; |
904 | 0 | } |
905 | 0 | } |
906 | |
|
907 | 0 | if( i_decimate_mb < 6 ) |
908 | 0 | { |
909 | 0 | plane_cbp = 0; |
910 | 0 | CLEAR_16x16_NNZ( p ); |
911 | 0 | } |
912 | 0 | else |
913 | 0 | { |
914 | 0 | h->mb.i_cbp_luma |= plane_cbp; |
915 | 0 | FOREACH_BIT( i8x8, 0, plane_cbp ) |
916 | 0 | { |
917 | 0 | h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] ); |
918 | 0 | } |
919 | 0 | } |
920 | 0 | } |
921 | 0 | } |
922 | 0 | } |
923 | | |
924 | | /* encode chroma */ |
925 | 0 | if( chroma ) |
926 | 0 | { |
927 | 0 | if( IS_INTRA( h->mb.i_type ) ) |
928 | 0 | { |
929 | 0 | int i_mode = h->mb.i_chroma_pred_mode; |
930 | 0 | if( h->mb.b_lossless ) |
931 | 0 | x264_predict_lossless_chroma( h, i_mode ); |
932 | 0 | else |
933 | 0 | { |
934 | 0 | h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); |
935 | 0 | h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); |
936 | 0 | } |
937 | 0 | } |
938 | | |
939 | | /* encode the 8x8 blocks */ |
940 | 0 | x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp ); |
941 | 0 | } |
942 | 0 | else |
943 | 0 | h->mb.i_cbp_chroma = 0; |
944 | | |
945 | | /* store cbp */ |
946 | 0 | int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma; |
947 | 0 | if( h->param.b_cabac ) |
948 | 0 | cbp |= h->mb.cache.non_zero_count[x264_scan8[LUMA_DC ]] << 8 |
949 | 0 | | h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] << 9 |
950 | 0 | | h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] << 10; |
951 | 0 | h->mb.cbp[h->mb.i_mb_xy] = cbp; |
952 | | |
953 | | /* Check for P_SKIP |
954 | | * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account |
955 | | * (if multiple mv give same result)*/ |
956 | 0 | if( !b_force_no_skip ) |
957 | 0 | { |
958 | 0 | if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 && |
959 | 0 | !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) && |
960 | 0 | M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv ) |
961 | 0 | && h->mb.cache.ref[0][x264_scan8[0]] == 0 ) |
962 | 0 | { |
963 | 0 | h->mb.i_type = P_SKIP; |
964 | 0 | } |
965 | | |
966 | | /* Check for B_SKIP */ |
967 | 0 | if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) ) |
968 | 0 | { |
969 | 0 | h->mb.i_type = B_SKIP; |
970 | 0 | } |
971 | 0 | } |
972 | 0 | } |
973 | | |
974 | | void x264_macroblock_encode( x264_t *h ) |
975 | 0 | { |
976 | 0 | if( CHROMA444 ) |
977 | 0 | macroblock_encode_internal( h, 3, 0 ); |
978 | 0 | else if( CHROMA_FORMAT ) |
979 | 0 | macroblock_encode_internal( h, 1, 1 ); |
980 | 0 | else |
981 | 0 | macroblock_encode_internal( h, 1, 0 ); |
982 | 0 | } Unexecuted instantiation: x264_8_macroblock_encode Unexecuted instantiation: x264_10_macroblock_encode |
983 | | |
984 | | /***************************************************************************** |
985 | | * x264_macroblock_probe_skip: |
986 | | * Check if the current MB could be encoded as a [PB]_SKIP |
987 | | *****************************************************************************/ |
static ALWAYS_INLINE int macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
{
    /* Returns 1 if the MB would code as [PB]_SKIP (and sets b_skip_mc, since the
     * motion-compensated prediction left in fdec is then valid and reusable),
     * 0 as soon as enough significant coefficients survive that decimation
     * would keep them.  For b_bidir, MC was already performed by the caller. */
    ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] );
    ALIGNED_ARRAY_64( dctcoef, dctscan,[16] );
    ALIGNED_4( int16_t mvp[2] );
    int i_qp = h->mb.i_qp;

    for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
    {
        int quant_cat = p ? CQM_4PC : CQM_4PY;
        if( !b_bidir )
        {
            /* Get the MV: the predicted skip MV, clamped to the legal motion range. */
            mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
            mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );

            /* Motion compensation */
            h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
                           &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
                           mvp[0], mvp[1], 16, 16, &h->sh.weight[0][p] );
        }

        /* Transform and quantize the residual 8x8 block at a time; bail out as
         * soon as the accumulated decimation score says the coefficients would
         * not be decimated away (same threshold as the encode path). */
        for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
        {
            int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
            int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;

            h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset,
                                h->mb.pic.p_fdec[p] + fdec_offset );

            if( h->mb.b_noise_reduction )
                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );

            int nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
            /* Only sub-blocks whose bit is set in nz have nonzero coefficients. */
            FOREACH_BIT( idx, 0, nz )
            {
                h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
                i_decimate_mb += h->quantf.decimate_score16( dctscan );
                if( i_decimate_mb >= 6 )
                    return 0;
            }
        }
    }

    if( chroma == CHROMA_420 || chroma == CHROMA_422 )
    {
        i_qp = h->mb.i_chroma_qp;
        int chroma422 = chroma == CHROMA_422;
        /* lambda-scaled SSD threshold below which the chroma residual check is skipped
         * (4:2:2 blocks are twice as tall, hence the different normalization) */
        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
        int ssd;
        ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );

        if( !b_bidir )
        {
            /* Special case for mv0, which is (of course) very common in P-skip mode. */
            if( M32( mvp ) )
                h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                 h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                                 mvp[0], mvp[1] * (1<<chroma422), 8, chroma422?16:8 );
            else
                h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
                                                     h->mb.pic.i_stride[1], chroma422?16:8 );
        }

        for( int ch = 0; ch < 2; ch++ )
        {
            pixel *p_src = h->mb.pic.p_fenc[1+ch];
            pixel *p_dst = h->mb.pic.p_fdec[1+ch];

            if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
                h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      &h->sh.weight[0][1+ch], chroma422?16:8 );

            /* there is almost never a termination during chroma, but we can't avoid the check entirely */
            /* so instead we check SSD and skip the actual check if the score is low enough. */
            ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
            if( ssd < thresh )
                continue;

            /* The vast majority of chroma checks will terminate during the DC check or the higher
             * threshold check, so we can save time by doing a DC-only DCT. */
            if( h->mb.b_noise_reduction )
            {
                /* With noise reduction the full DCT is needed so the AC coefficients
                 * can be denoised before the later AC check; DC is extracted and cleared. */
                for( int i = 0; i <= chroma422; i++ )
                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );

                for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ )
                {
                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                    dct_dc[i4x4] = dct4x4[i4x4][0];
                    dct4x4[i4x4][0] = 0;
                }
            }
            else
            {
                if( chroma422 )
                    h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
                else
                    h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
            }

            /* Any nonzero quantized chroma DC forbids skip.
             * NOTE(review): scale>>1 / bias<<1 presumably compensates for the DC
             * transform's amplitude scaling — confirm against the quant implementation. */
            for( int i = 0; i <= chroma422; i++ )
                if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
                                            h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
                    return 0;

            /* If there wasn't a termination in DC, we can check against a much higher threshold. */
            if( ssd < thresh*4 )
                continue;

            if( !h->mb.b_noise_reduction )
                for( int i = 0; i <= chroma422; i++ )
                {
                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
                    /* DC was already checked above; zero it so only AC is scored. */
                    dct4x4[i*4+0][0] = 0;
                    dct4x4[i*4+1][0] = 0;
                    dct4x4[i*4+2][0] = 0;
                    dct4x4[i*4+3][0] = 0;
                }

            /* calculate dct coeffs */
            for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < (chroma422?2:1); i8x8++ )
            {
                int nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
                FOREACH_BIT( idx, i8x8*4, nz )
                {
                    h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
                    /* score15: DC position excluded (AC-only block) */
                    i_decimate_mb += h->quantf.decimate_score15( dctscan );
                    if( i_decimate_mb >= 7 )
                        return 0;
                }
            }
        }
    }

    /* Skip is viable; the MC done above is valid, so the encoder need not redo it. */
    h->mb.b_skip_mc = 1;
    return 1;
}
1128 | | |
1129 | | int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) |
1130 | 0 | { |
1131 | 0 | if( CHROMA_FORMAT == CHROMA_420 ) |
1132 | 0 | return macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 ); |
1133 | 0 | else if( CHROMA_FORMAT == CHROMA_422 ) |
1134 | 0 | return macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 ); |
1135 | 0 | else if( CHROMA_FORMAT == CHROMA_444 ) |
1136 | 0 | return macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 ); |
1137 | 0 | else |
1138 | 0 | return macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_400 ); |
1139 | 0 | } Unexecuted instantiation: x264_8_macroblock_probe_skip Unexecuted instantiation: x264_10_macroblock_probe_skip |
1140 | | |
1141 | | /**************************************************************************** |
1142 | | * DCT-domain noise reduction / adaptive deadzone |
1143 | | * from libavcodec |
1144 | | ****************************************************************************/ |
1145 | | |
1146 | | void x264_noise_reduction_update( x264_t *h ) |
1147 | 0 | { |
1148 | 0 | h->nr_offset = h->nr_offset_denoise; |
1149 | 0 | h->nr_residual_sum = h->nr_residual_sum_buf[0]; |
1150 | 0 | h->nr_count = h->nr_count_buf[0]; |
1151 | 0 | for( int cat = 0; cat < 3 + CHROMA444; cat++ ) |
1152 | 0 | { |
1153 | 0 | int dct8x8 = cat&1; |
1154 | 0 | int size = dct8x8 ? 64 : 16; |
1155 | 0 | const uint32_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab; |
1156 | |
|
1157 | 0 | if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) ) |
1158 | 0 | { |
1159 | 0 | for( int i = 0; i < size; i++ ) |
1160 | 0 | h->nr_residual_sum[cat][i] >>= 1; |
1161 | 0 | h->nr_count[cat] >>= 1; |
1162 | 0 | } |
1163 | |
|
1164 | 0 | for( int i = 0; i < size; i++ ) |
1165 | 0 | h->nr_offset[cat][i] = |
1166 | 0 | ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat] |
1167 | 0 | + h->nr_residual_sum[cat][i]/2) |
1168 | 0 | / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1); |
1169 | | |
1170 | | /* Don't denoise DC coefficients */ |
1171 | 0 | h->nr_offset[cat][0] = 0; |
1172 | 0 | } |
1173 | 0 | } Unexecuted instantiation: x264_8_noise_reduction_update Unexecuted instantiation: x264_10_noise_reduction_update |
1174 | | |
1175 | | /***************************************************************************** |
1176 | | * RD only; 4 calls to this do not make up for one macroblock_encode. |
1177 | | * doesn't transform chroma dc. |
1178 | | *****************************************************************************/ |
static ALWAYS_INLINE void macroblock_encode_p8x8_internal( x264_t *h, int i8, int plane_count, int chroma )
{
    /* Encode one inter 8x8 partition (i8 in 0..3) of the current MB:
     * MC (unless cached), transform, quantize, reconstruct, and update
     * nnz/cbp state.  Chroma DC is intentionally left untransformed here
     * (see the function's header comment). */
    int b_decimate = h->mb.b_dct_decimate;
    int i_qp = h->mb.i_qp;
    int x = i8&1;   /* partition column (0/1) */
    int y = i8>>1;  /* partition row (0/1) */
    int nz;
    int chroma422 = chroma == CHROMA_422;

    /* Reset the state this partition contributes to; it is rebuilt below. */
    h->mb.i_cbp_chroma = 0;
    h->mb.i_cbp_luma &= ~(1 << i8);

    if( !h->mb.b_skip_mc )
        x264_mb_mc_8x8( h, i8 );

    if( h->mb.b_lossless )
    {
        /* Lossless: residual is zigzagged directly from the pixel difference,
         * with no transform or quantization. */
        for( int p = 0; p < plane_count; p++ )
        {
            pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
            pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
            int nnz8x8 = 0;
            if( h->mb.b_transform_8x8 )
            {
                nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[4*p+i8], p_fenc, p_fdec );
                STORE_8x8_NNZ( p, i8, nnz8x8 );
            }
            else
            {
                for( int i4 = i8*4; i4 < i8*4+4; i4++ )
                {
                    nz = h->zigzagf.sub_4x4( h->dct.luma4x4[16*p+i4],
                                             h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4],
                                             h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4] );
                    h->mb.cache.non_zero_count[x264_scan8[16*p+i4]] = nz;
                    nnz8x8 |= nz;
                }
            }
            h->mb.i_cbp_luma |= nnz8x8 << i8;
        }
        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
        {
            for( int ch = 0; ch < 2; ch++ )
            {
                /* dc receives the separated DC coefficient; it is discarded here
                 * since this RD path does not transform chroma DC. */
                dctcoef dc;
                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;

                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
                {
                    /* 4:2:2 has two stacked 4x4 chroma blocks per partition;
                     * offset maps them into the luma4x4 coefficient layout. */
                    int offset = chroma422 ? 8*y + 2*i4x4 + x : i8;
                    nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc );
                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
                }
            }
            h->mb.i_cbp_chroma = 0x02;
        }
    }
    else
    {
        if( h->mb.b_transform_8x8 )
        {
            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            {
                int quant_cat = p ? CQM_8PC : CQM_8PY;
                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
                ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );

                h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
                int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
                if( nnz8x8 )
                {
                    h->zigzagf.scan_8x8( h->dct.luma8x8[4*p+i8], dct8x8 );

                    /* Trellis quantization already decimates optimally, so the
                     * explicit decimation score is only applied without trellis. */
                    if( b_decimate && !h->mb.b_trellis )
                        nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[4*p+i8] );

                    if( nnz8x8 )
                    {
                        /* Reconstruct: dequantize and add the inverse transform to fdec. */
                        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp );
                        h->dctf.add8x8_idct8( p_fdec, dct8x8 );
                        STORE_8x8_NNZ( p, i8, 1 );
                        h->mb.i_cbp_luma |= 1 << i8;
                    }
                    else
                        STORE_8x8_NNZ( p, i8, 0 );
                }
                else
                    STORE_8x8_NNZ( p, i8, 0 );
            }
        }
        else
        {
            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            {
                int quant_cat = p ? CQM_4PC : CQM_4PY;
                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
                /* Starting at 4 when decimation is off guarantees the "keep" branch. */
                int i_decimate_8x8 = b_decimate ? 0 : 4;
                ALIGNED_ARRAY_64( dctcoef, dct4x4,[4],[16] );
                int nnz8x8 = 0;

                h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
                STORE_8x8_NNZ( p, i8, 0 );

                if( h->mb.b_noise_reduction )
                    for( int idx = 0; idx < 4; idx++ )
                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );

                if( h->mb.b_trellis )
                {
                    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                    {
                        if( x264_quant_4x4_trellis( h, dct4x4[i4x4], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, i8*4+i4x4+p*16 ) )
                        {
                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] );
                            h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp );
                            /* Stop scoring once the keep threshold (4) is reached. */
                            if( i_decimate_8x8 < 4 )
                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] );
                            h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1;
                            nnz8x8 = 1;
                        }
                    }
                }
                else
                {
                    /* quant_4x4x4 quantizes all four sub-blocks at once and returns
                     * a bitmask of those with nonzero coefficients. */
                    nnz8x8 = nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
                    if( nz )
                    {
                        FOREACH_BIT( i4x4, 0, nz )
                        {
                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] );
                            h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp );
                            if( i_decimate_8x8 < 4 )
                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] );
                            h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1;
                        }
                    }
                }
                if( nnz8x8 )
                {
                    /* decimate this 8x8 block */
                    if( i_decimate_8x8 < 4 )
                        STORE_8x8_NNZ( p, i8, 0 );
                    else
                    {
                        h->dctf.add8x8_idct( p_fdec, dct4x4 );
                        h->mb.i_cbp_luma |= 1 << i8;
                    }
                }
            }
        }

        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
        {
            i_qp = h->mb.i_chroma_qp;
            for( int ch = 0; ch < 2; ch++ )
            {
                ALIGNED_ARRAY_64( dctcoef, dct4x4,[2],[16] );
                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;

                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
                {
                    h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE );

                    if( h->mb.b_noise_reduction )
                        h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                    /* Chroma DC is not transformed in this RD path; code AC only. */
                    dct4x4[i4x4][0] = 0;

                    if( h->mb.b_trellis )
                        nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
                    else
                        nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );

                    /* NOTE(review): (5*i8)&0x09 maps i8 0..3 to 0,5,8,13 masked to
                     * the 4:2:2 coefficient layout — confirm against the scan tables. */
                    int offset = chroma422 ? ((5*i8) & 0x09) + 2*i4x4 : i8;
                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
                    if( nz )
                    {
                        h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] );
                        h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp );
                        h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] );
                    }
                }
            }
            h->mb.i_cbp_chroma = 0x02;
        }
    }
}
1369 | | |
1370 | | void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) |
1371 | 0 | { |
1372 | 0 | if( CHROMA_FORMAT == CHROMA_420 ) |
1373 | 0 | macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 ); |
1374 | 0 | else if( CHROMA_FORMAT == CHROMA_422 ) |
1375 | 0 | macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 ); |
1376 | 0 | else if( CHROMA_FORMAT == CHROMA_444 ) |
1377 | 0 | macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 ); |
1378 | 0 | else |
1379 | 0 | macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_400 ); |
1380 | 0 | } Unexecuted instantiation: x264_8_macroblock_encode_p8x8 Unexecuted instantiation: x264_10_macroblock_encode_p8x8 |
1381 | | |
1382 | | /***************************************************************************** |
1383 | | * RD only, luma only (for 4:2:0) |
1384 | | *****************************************************************************/ |
1385 | | static ALWAYS_INLINE void macroblock_encode_p4x4_internal( x264_t *h, int i4, int plane_count ) |
1386 | 0 | { |
1387 | 0 | int i_qp = h->mb.i_qp; |
1388 | |
|
1389 | 0 | for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) |
1390 | 0 | { |
1391 | 0 | int quant_cat = p ? CQM_4PC : CQM_4PY; |
1392 | 0 | pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]]; |
1393 | 0 | pixel *p_fdec = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i4]]; |
1394 | 0 | int nz; |
1395 | | |
1396 | | /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */ |
1397 | |
|
1398 | 0 | if( h->mb.b_lossless ) |
1399 | 0 | { |
1400 | 0 | nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4], p_fenc, p_fdec ); |
1401 | 0 | h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz; |
1402 | 0 | } |
1403 | 0 | else |
1404 | 0 | { |
1405 | 0 | ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] ); |
1406 | 0 | h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); |
1407 | 0 | nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 ); |
1408 | 0 | h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz; |
1409 | 0 | if( nz ) |
1410 | 0 | { |
1411 | 0 | h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i4], dct4x4 ); |
1412 | 0 | h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[quant_cat], i_qp ); |
1413 | 0 | h->dctf.add4x4_idct( p_fdec, dct4x4 ); |
1414 | 0 | } |
1415 | 0 | } |
1416 | 0 | } |
1417 | 0 | } |
1418 | | |
1419 | | void x264_macroblock_encode_p4x4( x264_t *h, int i8 ) |
1420 | 0 | { |
1421 | 0 | if( CHROMA444 ) |
1422 | 0 | macroblock_encode_p4x4_internal( h, i8, 3 ); |
1423 | 0 | else |
1424 | 0 | macroblock_encode_p4x4_internal( h, i8, 1 ); |
1425 | 0 | } Unexecuted instantiation: x264_8_macroblock_encode_p4x4 Unexecuted instantiation: x264_10_macroblock_encode_p4x4 |