/src/theora/lib/analyze.c
Line | Count | Source |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | | * by the Xiph.Org Foundation http://www.xiph.org/ * |
10 | | * * |
11 | | ******************************************************************** |
12 | | |
13 | | function: mode selection code |
14 | | last mod: $Id$ |
15 | | |
16 | | ********************************************************************/ |
17 | | #include <limits.h> |
18 | | #include <string.h> |
19 | | #include "encint.h" |
20 | | #include "modedec.h" |
21 | | #if defined(OC_COLLECT_METRICS) |
22 | | # include "collect.c" |
23 | | #endif |
24 | | |
25 | | |
26 | | |
27 | | typedef struct oc_rd_metric oc_rd_metric; |
28 | | typedef struct oc_mode_choice oc_mode_choice; |
29 | | |
30 | | |
31 | | |
32 | | /*There are 8 possible schemes used to encode macro block modes. |
33 | | Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes. |
34 | | The same set of Huffman codes is used for each of these 7 schemes, but the |
35 | | mode assigned to each codeword varies. |
36 | | Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream, |
37 | | while schemes 1-6 have a fixed mapping. |
38 | | Scheme 7 just encodes each mode directly in 3 bits.*/ |
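
For a concrete sense of the trade-off, here is a minimal standalone sketch (not libtheora code) comparing the total mode-coding cost of scheme 7's fixed 3 bits per mode against a maximally-skewed code. The codeword lengths {1,2,3,4,5,6,7,7} and the mode histogram are assumptions for illustration:

```c
#include <stdio.h>

/*Assumed codeword lengths of the maximally-skewed code, with the most
   frequent mode ranked first; scheme 7 always spends 3 bits.*/
static const int SKEW_LEN[8]={1,2,3,4,5,6,7,7};

int main(void){
  /*A hypothetical mode histogram, already sorted by decreasing count.*/
  static const int COUNTS[8]={120,40,20,10,5,3,1,1};
  int skew_bits=0;
  int flat_bits=0;
  int mi;
  for(mi=0;mi<8;mi++){
    skew_bits+=COUNTS[mi]*SKEW_LEN[mi];
    flat_bits+=COUNTS[mi]*3;
  }
  /*Prints "skewed: 357 bits, flat: 600 bits".*/
  printf("skewed: %d bits, flat: %d bits\n",skew_bits,flat_bits);
  return 0;
}
```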
39 | | |
40 | | /*The mode orderings for the various mode coding schemes. |
41 | | Scheme 0 uses a custom alphabet, which is not stored in this table. |
42 | | This is the inverse of the equivalent table OC_MODE_ALPHABETS in the |
43 | | decoder.*/ |
44 | | static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={ |
45 | | /*Last MV dominates.*/ |
46 | | /*L P M N I G GM 4*/ |
47 | | {3,4,2,0,1,5,6,7}, |
48 | | /*L P N M I G GM 4*/ |
49 | | {2,4,3,0,1,5,6,7}, |
50 | | /*L M P N I G GM 4*/ |
51 | | {3,4,1,0,2,5,6,7}, |
52 | | /*L M N P I G GM 4*/ |
53 | | {2,4,1,0,3,5,6,7}, |
54 | | /*No MV dominates.*/ |
55 | | /*N L P M I G GM 4*/ |
56 | | {0,4,3,1,2,5,6,7}, |
57 | | /*N G L P M I GM 4*/ |
58 | | {0,5,4,2,3,1,6,7}, |
59 | | /*Default ordering.*/ |
60 | | /*N I M L P G GM 4*/ |
61 | | {0,1,2,3,4,5,6,7} |
62 | | }; |
63 | | |
64 | | |
65 | | |
66 | | /*Initialize the mode scheme chooser. |
67 | | This need only be called once per encoder.*/ |
68 | 3.44k | void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){ |
69 | 3.44k | int si; |
70 | 3.44k | _chooser->mode_ranks[0]=_chooser->scheme0_ranks; |
71 | 27.5k | for(si=1;si<8;si++)_chooser->mode_ranks[si]=OC_MODE_RANKS[si-1]; |
72 | 3.44k | } |
73 | | |
74 | | /*Reset the mode scheme chooser. |
75 | | This needs to be called once for each frame, including the first.*/ |
76 | 39.4k | static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){ |
77 | 39.4k | int si; |
78 | 39.4k | memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts)); |
79 | | /*Scheme 0 starts with 24 bits to store the mode list in.*/ |
80 | 39.4k | _chooser->scheme_bits[0]=24; |
81 | 39.4k | memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits)); |
82 | 354k | for(si=0;si<8;si++){ |
83 | | /*Scheme 7 should always start first, and scheme 0 should always start |
84 | | last.*/ |
85 | 315k | _chooser->scheme_list[si]=7-si; |
86 | 315k | _chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si; |
87 | 315k | } |
88 | 39.4k | } |
89 | | |
90 | | /*Return the cost of coding _mb_mode in the specified scheme.*/ |
91 | | static int oc_mode_scheme_chooser_scheme_mb_cost( |
92 | 13.3M | const oc_mode_scheme_chooser *_chooser,int _scheme,int _mb_mode){ |
93 | 13.3M | int codebook; |
94 | 13.3M | int ri; |
95 | 13.3M | codebook=_scheme+1>>3; |
96 | | /*For any scheme except 0, we can just use the bit cost of the mode's rank |
97 | | in that scheme.*/ |
98 | 13.3M | ri=_chooser->mode_ranks[_scheme][_mb_mode]; |
99 | 13.3M | if(_scheme==0){ |
100 | 2.06M | int mc; |
101 | | /*For scheme 0, incrementing the mode count could potentially change the |
102 | | mode's rank. |
103 | | Find the index where the mode would be moved to in the optimal list, |
104 | | and use its bit cost instead of the one for the mode's current |
105 | | position in the list.*/ |
106 | | /*We don't actually reorder the list; this is for computing opportunity |
107 | | cost, not an update.*/ |
108 | 2.06M | mc=_chooser->mode_counts[_mb_mode]; |
109 | 5.27M | while(ri>0&&mc>=_chooser->mode_counts[_chooser->scheme0_list[ri-1]])ri--; |
110 | 2.06M | } |
111 | 13.3M | return OC_MODE_BITS[codebook][ri]; |
112 | 13.3M | } |
113 | | |
114 | | /*This is the real purpose of this data structure: not actually selecting a |
115 | | mode scheme, but estimating the cost of coding a given mode given all the |
116 | | modes selected so far. |
117 | | This is done via opportunity cost: the cost is defined as the number of bits |
118 | | required to encode all the modes selected so far including the current one |
119 | | using the best possible scheme, minus the number of bits required to encode |
120 | | all the modes selected so far not including the current one using the best |
121 | | possible scheme. |
122 | | The computational expense of doing this probably makes it overkill. |
123 | | Just be happy we take a greedy approach instead of trying to solve the |
124 | | global mode-selection problem (which is NP-hard). |
125 | | _mb_mode: The mode to determine the cost of. |
126 | | Return: The number of bits required to code this mode.*/ |
127 | | static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser, |
128 | 4.26M | int _mb_mode){ |
129 | 4.26M | int scheme0; |
130 | 4.26M | int scheme1; |
131 | 4.26M | int best_bits; |
132 | 4.26M | int mode_bits; |
133 | 4.26M | int si; |
134 | 4.26M | int scheme0_bits; |
135 | 4.26M | int scheme1_bits; |
136 | 4.26M | scheme0=_chooser->scheme_list[0]; |
137 | 4.26M | scheme1=_chooser->scheme_list[1]; |
138 | 4.26M | scheme0_bits=_chooser->scheme_bits[scheme0]; |
139 | 4.26M | scheme1_bits=_chooser->scheme_bits[scheme1]; |
140 | 4.26M | mode_bits=oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme0,_mb_mode); |
141 | | /*Typical case: If the difference between the best scheme and the next best |
142 | | is greater than 6 bits, then adding just one mode cannot change which |
143 | | scheme we use.*/ |
144 | 4.26M | if(scheme1_bits-scheme0_bits>6)return mode_bits; |
145 | | /*Otherwise, check to see if adding this mode selects a different scheme as |
146 | | the best.*/ |
147 | 1.93M | si=1; |
148 | 1.93M | best_bits=scheme0_bits+mode_bits; |
149 | 9.06M | do{ |
150 | 9.06M | int cur_bits; |
151 | 9.06M | cur_bits=scheme1_bits+ |
152 | 9.06M | oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme1,_mb_mode); |
153 | 9.06M | if(cur_bits<best_bits)best_bits=cur_bits; |
154 | 9.06M | if(++si>=8)break; |
155 | 9.06M | scheme1=_chooser->scheme_list[si]; |
156 | 9.06M | scheme1_bits=_chooser->scheme_bits[scheme1]; |
157 | 9.06M | } |
158 | 9.06M | while(scheme1_bits-scheme0_bits<=6); |
159 | 0 | return best_bits-scheme0_bits; |
160 | 4.26M | } |
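
A sketch of how the two halves of the chooser fit together during mode decision; `estimate_mode_bits` is a hypothetical wrapper (the real encoder folds the returned bit count into a larger R-D cost first), and oc_mode_scheme_chooser_update() is the real update routine defined just below:

```c
/*Hypothetical query-then-commit wrapper around the chooser.*/
static int estimate_mode_bits(oc_mode_scheme_chooser *_chooser,int _mb_mode){
  int bits;
  /*How many extra bits would coding _mb_mode cost, given the modes chosen
     so far?*/
  bits=oc_mode_scheme_chooser_cost(_chooser,_mb_mode);
  /*Once the mode survives the R-D comparison, commit it so future queries
     see the updated counts and scheme ordering.*/
  oc_mode_scheme_chooser_update(_chooser,_mb_mode);
  return bits;
}
```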
161 | | |
162 | | /*Incrementally update the mode counts and per-scheme bit counts and re-order |
163 | | the scheme lists once a mode has been selected. |
164 | | _mb_mode: The mode that was chosen.*/ |
165 | | static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser, |
166 | 346k | int _mb_mode){ |
167 | 346k | int ri; |
168 | 346k | int si; |
169 | 346k | _chooser->mode_counts[_mb_mode]++; |
170 | | /*Re-order the scheme0 mode list if necessary.*/ |
171 | 483k | for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){ |
172 | 211k | int pmode; |
173 | 211k | pmode=_chooser->scheme0_list[ri-1]; |
174 | 211k | if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break; |
175 | | /*Reorder the mode ranking.*/ |
176 | 136k | _chooser->scheme0_ranks[pmode]++; |
177 | 136k | _chooser->scheme0_list[ri]=pmode; |
178 | 136k | } |
179 | 346k | _chooser->scheme0_ranks[_mb_mode]=ri; |
180 | 346k | _chooser->scheme0_list[ri]=_mb_mode; |
181 | | /*Now add the bit cost for the mode to each scheme.*/ |
182 | 3.11M | for(si=0;si<8;si++){ |
183 | 2.77M | _chooser->scheme_bits[si]+= |
184 | 2.77M | OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]]; |
185 | 2.77M | } |
186 | | /*Finally, re-order the list of schemes.*/ |
187 | 2.77M | for(si=1;si<8;si++){ |
188 | 2.42M | int sj; |
189 | 2.42M | int scheme0; |
190 | 2.42M | int bits0; |
191 | 2.42M | sj=si; |
192 | 2.42M | scheme0=_chooser->scheme_list[si]; |
193 | 2.42M | bits0=_chooser->scheme_bits[scheme0]; |
194 | 2.70M | do{ |
195 | 2.70M | int scheme1; |
196 | 2.70M | scheme1=_chooser->scheme_list[sj-1]; |
197 | 2.70M | if(bits0>=_chooser->scheme_bits[scheme1])break; |
198 | 305k | _chooser->scheme_list[sj]=scheme1; |
199 | 305k | } |
200 | 2.42M | while(--sj>0); |
201 | 0 | _chooser->scheme_list[sj]=scheme0; |
202 | 2.42M | } |
203 | 346k | } |
204 | | |
205 | | |
206 | | |
207 | | /*The number of bits required to encode a super block run. |
208 | | _run_count: The desired run count; must be positive and less than 4130.*/ |
209 | 157M | static int oc_sb_run_bits(int _run_count){ |
210 | 157M | int i; |
211 | 552M | for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++); |
212 | 157M | return OC_SB_RUN_CODE_NBITS[i]; |
213 | 157M | } |
214 | | |
215 | | /*The number of bits required to encode a block run. |
216 | | _run_count: The desired run count; must be positive and less than 30.*/ |
217 | 27.2M | static int oc_block_run_bits(int _run_count){ |
218 | 27.2M | return OC_BLOCK_RUN_CODE_NBITS[_run_count-1]; |
219 | 27.2M | } |
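
Both helpers are straight lookups into the format's run-length code tables. The self-contained sketch below restates the super block run lengths from the Theora specification as an assumption (the encoder's real tables, OC_SB_RUN_VAL_MIN and OC_SB_RUN_CODE_NBITS, live elsewhere) and prints the incremental cost of extending a run, which is exactly the delta the oc_fr_state functions below track:

```c
#include <stdio.h>

/*Assumed from the Theora spec: runs of 1, 2-3, 4-5, 6-9, 10-17, 18-33,
   and 34-4129 cost 1, 3, 4, 6, 8, 10, and 18 bits, which is why run
   counts are capped at 4129 throughout this file.*/
static const int SB_RUN_VAL_MIN[8]={1,2,4,6,10,18,34,4130};
static const int SB_RUN_CODE_NBITS[7]={1,3,4,6,8,10,18};

static int sb_run_bits(int run_count){
  int i;
  for(i=0;run_count>=SB_RUN_VAL_MIN[i+1];i++);
  return SB_RUN_CODE_NBITS[i];
}

int main(void){
  int n;
  /*Extending a run from n to n+1 costs sb_run_bits(n+1)-sb_run_bits(n)
     extra bits.*/
  for(n=1;n<8;n++){
    printf("extend %d->%d: %+d bits\n",n,n+1,sb_run_bits(n+1)-sb_run_bits(n));
  }
  return 0;
}
```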
220 | | |
221 | | |
222 | | |
223 | 186k | static void oc_fr_state_init(oc_fr_state *_fr){ |
224 | 186k | _fr->bits=0; |
225 | 186k | _fr->sb_partial_count=0; |
226 | 186k | _fr->sb_full_count=0; |
227 | 186k | _fr->b_coded_count_prev=0; |
228 | 186k | _fr->b_coded_count=0; |
229 | 186k | _fr->b_count=0; |
230 | 186k | _fr->sb_prefer_partial=0; |
231 | 186k | _fr->sb_bits=0; |
232 | 186k | _fr->sb_partial=-1; |
233 | 186k | _fr->sb_full=-1; |
234 | 186k | _fr->b_coded_prev=-1; |
235 | 186k | _fr->b_coded=-1; |
236 | 186k | } |
237 | | |
238 | | |
239 | | static int oc_fr_state_sb_cost(const oc_fr_state *_fr, |
240 | 14.7M | int _sb_partial,int _sb_full){ |
241 | 14.7M | int bits; |
242 | 14.7M | int sb_partial_count; |
243 | 14.7M | int sb_full_count; |
244 | 14.7M | bits=0; |
245 | 14.7M | sb_partial_count=_fr->sb_partial_count; |
246 | | /*Extend the sb_partial run, or start a new one.*/ |
247 | 14.7M | if(_fr->sb_partial==_sb_partial){ |
248 | 3.26M | if(sb_partial_count>=4129){ |
249 | 0 | bits++; |
250 | 0 | sb_partial_count=0; |
251 | 0 | } |
252 | 3.26M | else bits-=oc_sb_run_bits(sb_partial_count); |
253 | 3.26M | } |
254 | 11.5M | else sb_partial_count=0; |
255 | 14.7M | bits+=oc_sb_run_bits(++sb_partial_count); |
256 | 14.7M | if(!_sb_partial){ |
257 | | /*Extend the sb_full run, or start a new one.*/ |
258 | 3.77M | sb_full_count=_fr->sb_full_count; |
259 | 3.77M | if(_fr->sb_full==_sb_full){ |
260 | 1.60M | if(sb_full_count>=4129){ |
261 | 0 | bits++; |
262 | 0 | sb_full_count=0; |
263 | 0 | } |
264 | 1.60M | else bits-=oc_sb_run_bits(sb_full_count); |
265 | 1.60M | } |
266 | 2.16M | else sb_full_count=0; |
267 | 3.77M | bits+=oc_sb_run_bits(++sb_full_count); |
268 | 3.77M | } |
269 | 14.7M | return bits; |
270 | 14.7M | } |
271 | | |
272 | | static void oc_fr_state_advance_sb(oc_fr_state *_fr, |
273 | 255k | int _sb_partial,int _sb_full){ |
274 | 255k | int sb_partial_count; |
275 | 255k | int sb_full_count; |
276 | 255k | sb_partial_count=_fr->sb_partial_count; |
277 | 255k | if(_fr->sb_partial!=_sb_partial||sb_partial_count>=4129)sb_partial_count=0; |
278 | 255k | sb_partial_count++; |
279 | 255k | if(!_sb_partial){ |
280 | 179k | sb_full_count=_fr->sb_full_count; |
281 | 179k | if(_fr->sb_full!=_sb_full||sb_full_count>=4129)sb_full_count=0; |
282 | 179k | sb_full_count++; |
283 | 179k | _fr->sb_full_count=sb_full_count; |
284 | 179k | _fr->sb_full=_sb_full; |
285 | | /*Roll back the partial block state.*/ |
286 | 179k | _fr->b_coded=_fr->b_coded_prev; |
287 | 179k | _fr->b_coded_count=_fr->b_coded_count_prev; |
288 | 179k | } |
289 | 75.6k | else{ |
290 | | /*Commit back the partial block state.*/ |
291 | 75.6k | _fr->b_coded_prev=_fr->b_coded; |
292 | 75.6k | _fr->b_coded_count_prev=_fr->b_coded_count; |
293 | 75.6k | } |
294 | 255k | _fr->sb_partial_count=sb_partial_count; |
295 | 255k | _fr->sb_partial=_sb_partial; |
296 | 255k | _fr->b_count=0; |
297 | 255k | _fr->sb_prefer_partial=0; |
298 | 255k | _fr->sb_bits=0; |
299 | 255k | } |
300 | | |
301 | | /*Commit the state of the current super block and advance to the next.*/ |
302 | 255k | static void oc_fr_state_flush_sb(oc_fr_state *_fr){ |
303 | 255k | int sb_partial; |
304 | 255k | int sb_full; |
305 | 255k | int b_coded_count; |
306 | 255k | int b_count; |
307 | 255k | b_count=_fr->b_count; |
308 | 255k | b_coded_count=_fr->b_coded_count; |
309 | 255k | sb_full=_fr->b_coded; |
310 | 255k | sb_partial=b_coded_count<b_count; |
311 | 255k | if(!sb_partial){ |
312 | | /*If the super block is fully coded/uncoded...*/ |
313 | 180k | if(_fr->sb_prefer_partial){ |
314 | | /*So far coding this super block as partial was cheaper anyway.*/ |
315 | 1.96k | if(b_coded_count>15||_fr->b_coded_prev<0){ |
316 | 1.38k | int sb_bits; |
317 | | /*If the block run is too long, this will limit how far it can be |
318 | | extended into the next partial super block. |
319 | | If we need to extend it farther, we don't want to have to roll all |
320 | | the way back here (since there could be many full SBs between now |
321 | | and then), so we disallow this. |
322 | |          Similarly, if this is the start of a stripe, we don't know the
323 | |          length of the outstanding block run from the previous stripe.*/
324 | 1.38k | sb_bits=oc_fr_state_sb_cost(_fr,sb_partial,sb_full); |
325 | 1.38k | _fr->bits+=sb_bits-_fr->sb_bits; |
326 | 1.38k | _fr->sb_bits=sb_bits; |
327 | 1.38k | } |
328 | 579 | else sb_partial=1; |
329 | 1.96k | } |
330 | 180k | } |
331 | 255k | oc_fr_state_advance_sb(_fr,sb_partial,sb_full); |
332 | 255k | } |
333 | | |
334 | 34.9M | static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){ |
335 | 34.9M | ptrdiff_t bits; |
336 | 34.9M | int sb_bits; |
337 | 34.9M | int b_coded_count; |
338 | 34.9M | int b_count; |
339 | 34.9M | int sb_prefer_partial; |
340 | 34.9M | sb_bits=_fr->sb_bits; |
341 | 34.9M | bits=_fr->bits-sb_bits; |
342 | 34.9M | b_count=_fr->b_count; |
343 | 34.9M | b_coded_count=_fr->b_coded_count; |
344 | 34.9M | sb_prefer_partial=_fr->sb_prefer_partial; |
345 | 34.9M | if(b_coded_count>=b_count){ |
346 | 26.1M | int sb_partial_bits; |
347 | | /*This super block is currently fully coded/uncoded.*/ |
348 | 26.1M | if(b_count<=0){ |
349 | | /*This is the first block in this SB.*/ |
350 | 2.84M | b_count=1; |
351 | | /*Check to see whether it's cheaper to code it partially or fully.*/ |
352 | 2.84M | if(_fr->b_coded==_b_coded){ |
353 | 608k | sb_partial_bits=-oc_block_run_bits(b_coded_count); |
354 | 608k | sb_partial_bits+=oc_block_run_bits(++b_coded_count); |
355 | 608k | } |
356 | 2.23M | else{ |
357 | 2.23M | b_coded_count=1; |
358 | 2.23M | sb_partial_bits=2; |
359 | 2.23M | } |
360 | 2.84M | sb_partial_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded); |
361 | 2.84M | sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded); |
362 | 2.84M | sb_prefer_partial=sb_partial_bits<sb_bits; |
363 | 2.84M | sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial; |
364 | 2.84M | } |
365 | 23.3M | else if(_fr->b_coded==_b_coded){ |
366 | 14.5M | b_coded_count++; |
367 | 14.5M | if(++b_count<16){ |
368 | 13.9M | if(sb_prefer_partial){ |
369 | | /*Check to see if it's cheaper to code it fully.*/ |
370 | 917k | sb_partial_bits=sb_bits; |
371 | 917k | sb_partial_bits+=oc_block_run_bits(b_coded_count); |
372 | 917k | if(b_coded_count>0){ |
373 | 917k | sb_partial_bits-=oc_block_run_bits(b_coded_count-1); |
374 | 917k | } |
375 | 917k | sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded); |
376 | 917k | sb_prefer_partial=sb_partial_bits<sb_bits; |
377 | 917k | sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial; |
378 | 917k | } |
379 | | /*There's no need to check the converse (whether it's cheaper to code |
380 | | this SB partially if we were coding it fully), since the cost to |
381 | | code a SB partially can only increase as we add more blocks, whereas |
382 | | the cost to code it fully stays constant.*/ |
383 | 13.9M | } |
384 | 660k | else{ |
385 | | /*If we get to the end and this SB is still full, then force it to be |
386 | | coded full. |
387 | | Otherwise we might not be able to extend the block run far enough |
388 | | into the next partial SB.*/ |
389 | 660k | if(sb_prefer_partial){ |
390 | 9.44k | sb_prefer_partial=0; |
391 | 9.44k | sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded); |
392 | 9.44k | } |
393 | 660k | } |
394 | 14.5M | } |
395 | 8.73M | else{ |
396 | | /*This SB was full, but now must be made partial.*/ |
397 | 8.73M | if(!sb_prefer_partial){ |
398 | 8.16M | sb_bits=oc_block_run_bits(b_coded_count); |
399 | 8.16M | if(b_coded_count>b_count){ |
400 | 1.79M | sb_bits-=oc_block_run_bits(b_coded_count-b_count); |
401 | 1.79M | } |
402 | 8.16M | sb_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded); |
403 | 8.16M | } |
404 | 8.73M | b_count++; |
405 | 8.73M | b_coded_count=1; |
406 | 8.73M | sb_prefer_partial=1; |
407 | 8.73M | sb_bits+=2; |
408 | 8.73M | } |
409 | 26.1M | } |
410 | 8.74M | else{ |
411 | 8.74M | b_count++; |
412 | 8.74M | if(_fr->b_coded==_b_coded)sb_bits-=oc_block_run_bits(b_coded_count); |
413 | 3.30M | else b_coded_count=0; |
414 | 8.74M | sb_bits+=oc_block_run_bits(++b_coded_count); |
415 | 8.74M | } |
416 | 34.9M | _fr->bits=bits+sb_bits; |
417 | 34.9M | _fr->b_coded_count=b_coded_count; |
418 | 34.9M | _fr->b_coded=_b_coded; |
419 | 34.9M | _fr->b_count=b_count; |
420 | 34.9M | _fr->sb_prefer_partial=sb_prefer_partial; |
421 | 34.9M | _fr->sb_bits=sb_bits; |
422 | 34.9M | } |
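
The `sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial` lines above are a branchless select. A minimal standalone check of the idiom:

```c
#include <assert.h>

/*a^=(b^a)&-prefer_b yields b when prefer_b==1 (the mask is all ones) and
   leaves a unchanged when prefer_b==0.*/
static int select_int(int a,int b,int prefer_b){
  a^=(b^a)&-prefer_b;
  return a;
}

int main(void){
  assert(select_int(5,9,0)==5);
  assert(select_int(5,9,1)==9);
  return 0;
}
```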
423 | | |
424 | 13.7M | static void oc_fr_skip_block(oc_fr_state *_fr){ |
425 | 13.7M | oc_fr_state_advance_block(_fr,0); |
426 | 13.7M | } |
427 | | |
428 | 21.2M | static void oc_fr_code_block(oc_fr_state *_fr){ |
429 | 21.2M | oc_fr_state_advance_block(_fr,1); |
430 | 21.2M | } |
431 | | |
432 | 2.14M | static int oc_fr_cost1(const oc_fr_state *_fr){ |
433 | 2.14M | oc_fr_state tmp; |
434 | 2.14M | ptrdiff_t bits; |
435 | 2.14M | *&tmp=*_fr; |
436 | 2.14M | oc_fr_skip_block(&tmp); |
437 | 2.14M | bits=tmp.bits; |
438 | 2.14M | *&tmp=*_fr; |
439 | 2.14M | oc_fr_code_block(&tmp); |
440 | 2.14M | return (int)(tmp.bits-bits); |
441 | 2.14M | } |
442 | | |
443 | 353k | static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){ |
444 | 353k | oc_fr_state tmp; |
445 | 353k | *&tmp=*_pre; |
446 | 353k | oc_fr_skip_block(&tmp); |
447 | 353k | oc_fr_skip_block(&tmp); |
448 | 353k | oc_fr_skip_block(&tmp); |
449 | 353k | oc_fr_skip_block(&tmp); |
450 | 353k | return (int)(_post->bits-tmp.bits); |
451 | 353k | } |
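
Both cost functions price coded-block flags by running the state machine forward on a scratch copy and differencing the bit counters. A sketch of how oc_fr_cost1() feeds the per-block skip test, mirroring the logic in oc_enc_block_transform_quantize() further down (`skip_block_test` is a hypothetical distillation):

```c
/*Skip when the SSD saved by coding does not pay for the AC bits plus the
   (non-negative) coded-flag overhead.*/
static int skip_block_test(const oc_fr_state *_fr,unsigned _uncoded_ssd,
 unsigned _coded_ssd,int _ac_bits,unsigned _lambda){
  int overhead_bits;
  overhead_bits=oc_fr_cost1(_fr);
  /*Negative overheads (coding is locally cheaper) are clamped to zero.*/
  if(overhead_bits<0)overhead_bits=0;
  return _uncoded_ssd<=_coded_ssd+(overhead_bits+_ac_bits)*_lambda;
}
```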
452 | | |
453 | | |
454 | | |
455 | 225k | static void oc_qii_state_init(oc_qii_state *_qs){ |
456 | 225k | _qs->bits=0; |
457 | 225k | _qs->qi01_count=0; |
458 | 225k | _qs->qi01=-1; |
459 | 225k | _qs->qi12_count=0; |
460 | 225k | _qs->qi12=-1; |
461 | 225k | } |
462 | | |
463 | | |
464 | | static void oc_qii_state_advance(oc_qii_state *_qd, |
465 | 62.3M | const oc_qii_state *_qs,int _qii){ |
466 | 62.3M | ptrdiff_t bits; |
467 | 62.3M | int qi01; |
468 | 62.3M | int qi01_count; |
469 | 62.3M | int qi12; |
470 | 62.3M | int qi12_count; |
471 | 62.3M | bits=_qs->bits; |
472 | 62.3M | qi01=_qii+1>>1; |
473 | 62.3M | qi01_count=_qs->qi01_count; |
474 | 62.3M | if(qi01==_qs->qi01){ |
475 | 35.2M | if(qi01_count>=4129){ |
476 | 2.07k | bits++; |
477 | 2.07k | qi01_count=0; |
478 | 2.07k | } |
479 | 35.2M | else bits-=oc_sb_run_bits(qi01_count); |
480 | 35.2M | } |
481 | 27.1M | else qi01_count=0; |
482 | 62.3M | qi01_count++; |
483 | 62.3M | bits+=oc_sb_run_bits(qi01_count); |
484 | 62.3M | qi12_count=_qs->qi12_count; |
485 | 62.3M | if(_qii){ |
486 | 23.4M | qi12=_qii>>1; |
487 | 23.4M | if(qi12==_qs->qi12){ |
488 | 13.5M | if(qi12_count>=4129){ |
489 | 1.02k | bits++; |
490 | 1.02k | qi12_count=0; |
491 | 1.02k | } |
492 | 13.5M | else bits-=oc_sb_run_bits(qi12_count); |
493 | 13.5M | } |
494 | 9.85M | else qi12_count=0; |
495 | 23.4M | qi12_count++; |
496 | 23.4M | bits+=oc_sb_run_bits(qi12_count); |
497 | 23.4M | } |
498 | 38.9M | else qi12=_qs->qi12; |
499 | 62.3M | _qd->bits=bits; |
500 | 62.3M | _qd->qi01=qi01; |
501 | 62.3M | _qd->qi01_count=qi01_count; |
502 | 62.3M | _qd->qi12=qi12; |
503 | 62.3M | _qd->qi12_count=qi12_count; |
504 | 62.3M | } |
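
The qi01/qi12 split run-codes a three-valued quantizer index as two binary sequences: qi01 flags a nonzero index, and qi12 (coded only when qi01 is set) picks between the two nonzero values. A quick standalone check of the mapping used above:

```c
#include <assert.h>

int main(void){
  int qii;
  /*qii indexes the frame's quantizer list; qi01 flags qii>0 and qi12
     distinguishes qii==2 from qii==1.*/
  for(qii=0;qii<3;qii++){
    int qi01=(qii+1)>>1;
    int qi12=qii>>1;
    assert(qi01==(qii>0));
    if(qii>0)assert(qi12==(qii>1));
  }
  return 0;
}
```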
505 | | |
506 | | |
507 | | |
508 | 62.0k | static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){ |
509 | 62.0k | ptrdiff_t *coded_fragis; |
510 | 62.0k | unsigned mcu_nvsbs; |
511 | 62.0k | ptrdiff_t mcu_nfrags; |
512 | 62.0k | int flimit; |
513 | 62.0k | int hdec; |
514 | 62.0k | int vdec; |
515 | 62.0k | int pli; |
516 | 62.0k | int nqis; |
517 | 62.0k | int qii; |
518 | 62.0k | int qi0; |
519 | 62.0k | int qti; |
520 | | /*Initialize the per-plane coded block flag trackers. |
521 | | These are used for bit-estimation purposes only; the real flag bits span |
522 | | all three planes, so we can't compute them in parallel.*/ |
523 | 248k | for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli); |
524 | 248k | for(pli=0;pli<3;pli++)oc_qii_state_init(_pipe->qs+pli); |
525 | | /*Set up the per-plane skip SSD storage pointers.*/ |
526 | 62.0k | mcu_nvsbs=_enc->mcu_nvsbs; |
527 | 62.0k | mcu_nfrags=mcu_nvsbs*_enc->state.fplanes[0].nhsbs*16; |
528 | 62.0k | hdec=!(_enc->state.info.pixel_fmt&1); |
529 | 62.0k | vdec=!(_enc->state.info.pixel_fmt&2); |
530 | 62.0k | _pipe->skip_ssd[0]=_enc->mcu_skip_ssd; |
531 | 62.0k | _pipe->skip_ssd[1]=_pipe->skip_ssd[0]+mcu_nfrags; |
532 | 62.0k | _pipe->skip_ssd[2]=_pipe->skip_ssd[1]+(mcu_nfrags>>hdec+vdec); |
533 | |  /*Set up per-plane pointers to the coded and uncoded fragment lists.
534 | |     Unlike the decoder, each plane's coded and uncoded fragment list is kept
535 | | separate during the analysis stage; we only make the coded list for all |
536 | | three planes contiguous right before the final packet is output |
537 | | (destroying the uncoded lists, which are no longer needed).*/ |
538 | 62.0k | coded_fragis=_enc->state.coded_fragis; |
539 | 248k | for(pli=0;pli<3;pli++){ |
540 | 186k | _pipe->coded_fragis[pli]=coded_fragis; |
541 | 186k | coded_fragis+=_enc->state.fplanes[pli].nfrags; |
542 | 186k | _pipe->uncoded_fragis[pli]=coded_fragis; |
543 | 186k | } |
544 | 62.0k | memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis)); |
545 | 62.0k | memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis)); |
546 | | /*Set up condensed quantizer tables.*/ |
547 | 62.0k | qi0=_enc->state.qis[0]; |
548 | 62.0k | nqis=_enc->state.nqis; |
549 | 248k | for(pli=0;pli<3;pli++){ |
550 | 493k | for(qii=0;qii<nqis;qii++){ |
551 | 307k | int qi; |
552 | 307k | qi=_enc->state.qis[qii]; |
553 | 922k | for(qti=0;qti<2;qti++){ |
554 | | /*Set the DC coefficient in the dequantization table.*/ |
555 | 614k | _enc->state.dequant_tables[qi][pli][qti][0]= |
556 | 614k | _enc->dequant_dc[qi0][pli][qti]; |
557 | 614k | _enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti]; |
558 | | /*Copy over the quantization table.*/ |
559 | 614k | memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti], |
560 | 614k | _enc->opt_data.enquant_table_size); |
561 | 614k | } |
562 | 307k | } |
563 | 186k | } |
564 | | /*Fix up the DC coefficients in the quantization tables.*/ |
565 | 62.0k | oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis); |
566 | | /*Initialize the tokenization state.*/ |
567 | 248k | for(pli=0;pli<3;pli++){ |
568 | 186k | _pipe->ndct_tokens1[pli]=0; |
569 | 186k | _pipe->eob_run1[pli]=0; |
570 | 186k | } |
571 | | /*Initialize the bounding value array for the loop filter.*/ |
572 | 62.0k | flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]]; |
573 | 62.0k | _pipe->loop_filter=flimit!=0; |
574 | 62.0k | if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit); |
575 | | /*Clear the temporary DCT scratch space.*/ |
576 | 62.0k | memset(_pipe->dct_data,0,sizeof(_pipe->dct_data)); |
577 | 62.0k | } |
578 | | |
579 | | /*Sets the current MCU stripe to super block row _sby. |
580 | | Return: A non-zero value if this was the last MCU.*/ |
581 | | static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc, |
582 | 215k | oc_enc_pipeline_state *_pipe,int _sby){ |
583 | 215k | const oc_fragment_plane *fplane; |
584 | 215k | unsigned mcu_nvsbs; |
585 | 215k | int sby_end; |
586 | 215k | int notdone; |
587 | 215k | int vdec; |
588 | 215k | int pli; |
589 | 215k | mcu_nvsbs=_enc->mcu_nvsbs; |
590 | 215k | sby_end=_enc->state.fplanes[0].nvsbs; |
591 | 215k | notdone=_sby+mcu_nvsbs<sby_end; |
592 | 215k | if(notdone)sby_end=_sby+mcu_nvsbs; |
593 | 215k | vdec=0; |
594 | 863k | for(pli=0;pli<3;pli++){ |
595 | 647k | fplane=_enc->state.fplanes+pli; |
596 | 647k | _pipe->sbi0[pli]=fplane->sboffset+(_sby>>vdec)*fplane->nhsbs; |
597 | 647k | _pipe->fragy0[pli]=_sby<<2-vdec; |
598 | 647k | _pipe->froffset[pli]=fplane->froffset |
599 | 647k | +_pipe->fragy0[pli]*(ptrdiff_t)fplane->nhfrags; |
600 | 647k | if(notdone){ |
601 | 461k | _pipe->sbi_end[pli]=fplane->sboffset+(sby_end>>vdec)*fplane->nhsbs; |
602 | 461k | _pipe->fragy_end[pli]=sby_end<<2-vdec; |
603 | 461k | } |
604 | 186k | else{ |
605 | 186k | _pipe->sbi_end[pli]=fplane->sboffset+fplane->nsbs; |
606 | 186k | _pipe->fragy_end[pli]=fplane->nvfrags; |
607 | 186k | } |
608 | 647k | vdec=!(_enc->state.info.pixel_fmt&2); |
609 | 647k | } |
610 | 215k | return notdone; |
611 | 215k | } |
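
A note on the shifts above: `_sby<<2-vdec` parses as `_sby<<(2-vdec)`, since binary minus binds tighter than shift in C, giving 4 fragment rows per super block row in luma and 2 in vertically subsampled chroma. A tiny check, assuming 4:2:0 (vdec=1 for the chroma planes):

```c
#include <assert.h>

int main(void){
  int sby=3;   /*Super block row index.*/
  int vdec=1;  /*Vertical chroma decimation, as in 4:2:0.*/
  /*Binary minus binds tighter than <<, so sby<<2-vdec is sby<<(2-vdec).*/
  assert((sby<<2-vdec)==(sby<<(2-vdec)));
  assert((sby<<2)==12);      /*Luma: 4 fragment rows per SB row.*/
  assert((sby<<2-vdec)==6);  /*Subsampled chroma: 2 fragment rows per SB row.*/
  return 0;
}
```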
612 | | |
613 | | static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc, |
614 | 647k | oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){ |
615 | | /*Copy over all the uncoded fragments from this plane and advance the uncoded |
616 | | fragment list.*/ |
617 | 647k | if(_pipe->nuncoded_fragis[_pli]>0){ |
618 | 58.9k | _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli]; |
619 | 58.9k | oc_frag_copy_list(&_enc->state, |
620 | 58.9k | _enc->state.ref_frame_data[OC_FRAME_SELF], |
621 | 58.9k | _enc->state.ref_frame_data[OC_FRAME_PREV], |
622 | 58.9k | _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli], |
623 | 58.9k | _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs); |
624 | 58.9k | _pipe->nuncoded_fragis[_pli]=0; |
625 | 58.9k | } |
626 | | /*Perform DC prediction.*/ |
627 | 647k | oc_enc_pred_dc_frag_rows(_enc,_pli, |
628 | 647k | _pipe->fragy0[_pli],_pipe->fragy_end[_pli]); |
629 | | /*Finish DC tokenization.*/ |
630 | 647k | oc_enc_tokenize_dc_frag_list(_enc,_pli, |
631 | 647k | _pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli], |
632 | 647k | _pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]); |
633 | 647k | _pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1]; |
634 | 647k | _pipe->eob_run1[_pli]=_enc->eob_run[_pli][1]; |
635 | | /*And advance the coded fragment list.*/ |
636 | 647k | _enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli]; |
637 | 647k | _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli]; |
638 | 647k | _pipe->ncoded_fragis[_pli]=0; |
639 | | /*Apply the loop filter if necessary.*/ |
640 | 647k | if(_pipe->loop_filter){ |
641 | 444k | oc_state_loop_filter_frag_rows(&_enc->state, |
642 | 444k | _pipe->bounding_values,OC_FRAME_SELF,_pli, |
643 | 444k | _pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay); |
644 | 444k | } |
645 | 202k | else _sdelay=_edelay=0; |
646 | | /*To fill borders, we have an additional two pixel delay, since a fragment |
647 | | in the next row could filter its top edge, using two pixels from a |
648 | | fragment in this row. |
649 | | But there's no reason to delay a full fragment between the two.*/ |
650 | 647k | oc_state_borders_fill_rows(&_enc->state, |
651 | 647k | _enc->state.ref_frame_idx[OC_FRAME_SELF],_pli, |
652 | 647k | (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1), |
653 | 647k | (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1)); |
654 | 647k | } |
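
A worked instance of the delay arithmetic above: with a one-fragment-row edge delay, `(fragy_end-edelay<<3)-(edelay<<1)` stops the border fill 8+2=10 pixel rows early, 8 for the delayed fragment row and 2 for the filter margin described in the comment. The numbers below are illustrative:

```c
#include <assert.h>

int main(void){
  /*Illustrative numbers: a plane 16 fragment rows tall with a
     one-fragment-row edge delay.*/
  int fragy_end=16;
  int edelay=1;
  /*(fragy_end-edelay<<3)-(edelay<<1) parses as
     ((fragy_end-edelay)<<3)-(edelay<<1).*/
  assert(((fragy_end-edelay<<3)-(edelay<<1))==(fragy_end<<3)-10);
  return 0;
}
```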
655 | | |
656 | | |
657 | | |
658 | | /*Cost information about the coded blocks in a MB.*/ |
659 | | struct oc_rd_metric{ |
660 | | int uncoded_ac_ssd; |
661 | | int coded_ac_ssd; |
662 | | int ac_bits; |
663 | | int dc_flag; |
664 | | }; |
665 | | |
666 | | |
667 | | |
668 | | static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc, |
669 | | oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi, |
670 | | unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo, |
671 | 25.1M | oc_fr_state *_fr,oc_token_checkpoint **_stack){ |
672 | 25.1M | ogg_int16_t *data; |
673 | 25.1M | ogg_int16_t *dct; |
674 | 25.1M | ogg_int16_t *idct; |
675 | 25.1M | oc_qii_state qs; |
676 | 25.1M | const ogg_uint16_t *dequant; |
677 | 25.1M | ogg_uint16_t dequant_dc; |
678 | 25.1M | ptrdiff_t frag_offs; |
679 | 25.1M | int ystride; |
680 | 25.1M | const unsigned char *src; |
681 | 25.1M | const unsigned char *ref; |
682 | 25.1M | unsigned char *dst; |
683 | 25.1M | int nonzero; |
684 | 25.1M | unsigned uncoded_ssd; |
685 | 25.1M | unsigned coded_ssd; |
686 | 25.1M | oc_token_checkpoint *checkpoint; |
687 | 25.1M | oc_fragment *frags; |
688 | 25.1M | int mb_mode; |
689 | 25.1M | int refi; |
690 | 25.1M | int mv_offs[2]; |
691 | 25.1M | int nmv_offs; |
692 | 25.1M | int ac_bits; |
693 | 25.1M | int borderi; |
694 | 25.1M | int nqis; |
695 | 25.1M | int qti; |
696 | 25.1M | int qii; |
697 | 25.1M | int dc; |
698 | 25.1M | nqis=_enc->state.nqis; |
699 | 25.1M | frags=_enc->state.frags; |
700 | 25.1M | frag_offs=_enc->state.frag_buf_offs[_fragi]; |
701 | 25.1M | ystride=_enc->state.ref_ystride[_pli]; |
702 | 25.1M | src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs; |
703 | 25.1M | borderi=frags[_fragi].borderi; |
704 | 25.1M | qii=frags[_fragi].qii; |
705 | 25.1M | data=_enc->pipe.dct_data; |
706 | 25.1M | dct=data+64; |
707 | 25.1M | idct=data+128; |
708 | 25.1M | if(qii&~3){ |
709 | 507k | #if !defined(OC_COLLECT_METRICS) |
710 | 507k | if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){ |
711 | | /*Enable early skip detection.*/ |
712 | 507k | frags[_fragi].coded=0; |
713 | 507k | frags[_fragi].refi=OC_FRAME_NONE; |
714 | 507k | oc_fr_skip_block(_fr); |
715 | 507k | return 0; |
716 | 507k | } |
717 | 0 | #endif |
718 | | /*Try and code this block anyway.*/ |
719 | 0 | qii&=3; |
720 | 0 | } |
721 | 24.6M | refi=frags[_fragi].refi; |
722 | 24.6M | mb_mode=frags[_fragi].mb_mode; |
723 | 24.6M | ref=_enc->state.ref_frame_data[refi]+frag_offs; |
724 | 24.6M | dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs; |
725 | | /*Motion compensation:*/ |
726 | 24.6M | switch(mb_mode){ |
727 | 23.7M | case OC_MODE_INTRA:{ |
728 | 23.7M | nmv_offs=0; |
729 | 23.7M | oc_enc_frag_sub_128(_enc,data,src,ystride); |
730 | 23.7M | }break; |
731 | 55.1k | case OC_MODE_GOLDEN_NOMV: |
732 | 274k | case OC_MODE_INTER_NOMV:{ |
733 | 274k | nmv_offs=1; |
734 | 274k | mv_offs[0]=0; |
735 | 274k | oc_enc_frag_sub(_enc,data,src,ref,ystride); |
736 | 274k | }break; |
737 | 560k | default:{ |
738 | 560k | const oc_mv *frag_mvs; |
739 | 560k | frag_mvs=_enc->state.frag_mvs; |
740 | 560k | nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs, |
741 | 560k | _pli,frag_mvs[_fragi]); |
742 | 560k | if(nmv_offs>1){ |
743 | 408k | oc_enc_frag_copy2(_enc,dst, |
744 | 408k | ref+mv_offs[0],ref+mv_offs[1],ystride); |
745 | 408k | oc_enc_frag_sub(_enc,data,src,dst,ystride); |
746 | 408k | } |
747 | 151k | else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride); |
748 | 560k | }break; |
749 | 24.6M | } |
750 | | #if defined(OC_COLLECT_METRICS) |
751 | | { |
752 | | unsigned sad; |
753 | | unsigned satd; |
754 | | switch(nmv_offs){ |
755 | | case 0:{ |
756 | | sad=oc_enc_frag_intra_sad(_enc,src,ystride); |
757 | | satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride); |
758 | | }break; |
759 | | case 1:{ |
760 | | sad=oc_enc_frag_sad_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX); |
761 | | satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride); |
762 | | satd+=abs(dc); |
763 | | }break; |
764 | | default:{ |
765 | | sad=oc_enc_frag_sad_thresh(_enc,src,dst,ystride,UINT_MAX); |
766 | | satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride); |
767 | | satd+=abs(dc); |
768 | | }break; |
769 | | } |
770 | | _enc->frag_sad[_fragi]=sad; |
771 | | _enc->frag_satd[_fragi]=satd; |
772 | | } |
773 | | #endif |
774 | | /*Transform:*/ |
775 | 24.6M | oc_enc_fdct8x8(_enc,dct,data); |
776 | | /*Quantize:*/ |
777 | 24.6M | qti=mb_mode!=OC_MODE_INTRA; |
778 | 24.6M | dequant=_enc->dequant[_pli][qii][qti]; |
779 | 24.6M | nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]); |
780 | 24.6M | dc=data[0]; |
781 | | /*Tokenize.*/ |
782 | 24.6M | checkpoint=*_stack; |
783 | 24.6M | if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ |
784 | 24.6M | ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct, |
785 | 24.6M | nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3); |
786 | 24.6M | } |
787 | 0 | else{ |
788 | 0 | ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct, |
789 | 0 | nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3); |
790 | 0 | } |
791 | | /*Reconstruct. |
792 | | TODO: nonzero may need to be adjusted after tokenization.*/ |
793 | 24.6M | dequant_dc=dequant[0]; |
794 | 24.6M | if(nonzero==0){ |
795 | 19.5M | ogg_int16_t p; |
796 | 19.5M | int ci; |
797 | 19.5M | int qi01; |
798 | 19.5M | int qi12; |
799 | | /*We round this dequant product (and not any of the others) because there's |
800 | | no iDCT rounding.*/ |
801 | 19.5M | p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5); |
802 | | /*LOOP VECTORIZES.*/ |
803 | 1.27G | for(ci=0;ci<64;ci++)data[ci]=p; |
804 | | /*We didn't code any AC coefficients, so don't change the quantizer.*/ |
805 | 19.5M | qi01=_pipe->qs[_pli].qi01; |
806 | 19.5M | qi12=_pipe->qs[_pli].qi12; |
807 | 19.5M | if(qi01>0)qii=1+qi12; |
808 | 17.7M | else if(qi01>=0)qii=0; |
809 | 19.5M | } |
810 | 5.06M | else{ |
811 | 5.06M | idct[0]=dc*dequant_dc; |
812 | | /*Note: This clears idct[] back to zero for the next block.*/ |
813 | 5.06M | oc_idct8x8(&_enc->state,data,idct,nonzero+1); |
814 | 5.06M | } |
815 | 24.6M | frags[_fragi].qii=qii; |
816 | 24.6M | if(nqis>1){ |
817 | 7.26M | oc_qii_state_advance(&qs,_pipe->qs+_pli,qii); |
818 | 7.26M | ac_bits+=qs.bits-_pipe->qs[_pli].bits; |
819 | 7.26M | } |
820 | 24.6M | if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data); |
821 | 834k | else{ |
822 | 834k | oc_enc_frag_recon_inter(_enc,dst, |
823 | 834k | nmv_offs==1?ref+mv_offs[0]:dst,ystride,data); |
824 | 834k | } |
825 | | /*If _fr is NULL, then this is an INTRA frame, and we can't skip blocks.*/ |
826 | 24.6M | #if !defined(OC_COLLECT_METRICS) |
827 | 24.6M | if(_fr!=NULL) |
828 | 2.14M | #endif |
829 | 2.14M | { |
830 | | /*In retrospect, should we have skipped this block?*/ |
831 | 2.14M | if(borderi<0){ |
832 | 1.63M | coded_ssd=oc_enc_frag_ssd(_enc,src,dst,ystride); |
833 | 1.63M | } |
834 | 510k | else{ |
835 | 510k | coded_ssd=oc_enc_frag_border_ssd(_enc,src,dst,ystride, |
836 | 510k | _enc->state.borders[borderi].mask); |
837 | 510k | } |
838 | | /*Scale to match DCT domain.*/ |
839 | 2.14M | coded_ssd<<=4; |
840 | | #if defined(OC_COLLECT_METRICS) |
841 | | _enc->frag_ssd[_fragi]=coded_ssd; |
842 | | } |
843 | | if(_fr!=NULL){ |
844 | | #endif |
845 | 2.14M | coded_ssd=OC_RD_SCALE(coded_ssd,_rd_scale); |
846 | 2.14M | uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]]; |
847 | 2.14M | if(uncoded_ssd<UINT_MAX&& |
848 | | /*Don't allow luma blocks to be skipped in 4MV mode when VP3 compatibility |
849 | | is enabled.*/ |
850 | 2.14M | (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){ |
851 | 2.14M | int overhead_bits; |
852 | 2.14M | overhead_bits=oc_fr_cost1(_fr); |
853 | | /*Although the fragment coding overhead determination is accurate, it is |
854 | | greedy, using very coarse-grained local information. |
855 | | Allowing it to mildly discourage coding turns out to be beneficial, but |
856 | | it's not clear that allowing it to encourage coding through negative |
857 | | coding overhead deltas is useful. |
858 | | For that reason, we disallow negative coding overheads.*/ |
859 | 2.14M | if(overhead_bits<0)overhead_bits=0; |
860 | 2.14M | if(uncoded_ssd<=coded_ssd+(overhead_bits+ac_bits)*_enc->lambda){ |
861 | | /*Hm, not worth it; roll back.*/ |
862 | 163k | oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint); |
863 | 163k | *_stack=checkpoint; |
864 | 163k | frags[_fragi].coded=0; |
865 | 163k | frags[_fragi].refi=OC_FRAME_NONE; |
866 | 163k | oc_fr_skip_block(_fr); |
867 | 163k | return 0; |
868 | 163k | } |
869 | 2.14M | } |
870 | 0 | else _mo->dc_flag=1; |
871 | 1.98M | _mo->uncoded_ac_ssd+=uncoded_ssd; |
872 | 1.98M | _mo->coded_ac_ssd+=coded_ssd; |
873 | 1.98M | _mo->ac_bits+=ac_bits; |
874 | 1.98M | oc_fr_code_block(_fr); |
875 | 1.98M | } |
876 | | /*GCC 4.4.4 generates a warning here because it can't tell that |
877 | | the init code in the nqis check above will run anytime this |
878 | | line runs.*/ |
879 | 24.4M | if(nqis>1)*(_pipe->qs+_pli)=*&qs; |
880 | 24.4M | frags[_fragi].dc=dc; |
881 | 24.4M | frags[_fragi].coded=1; |
882 | 24.4M | return 1; |
883 | 24.6M | } |
884 | | |
885 | | static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc, |
886 | | oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead, |
887 | 411k | const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){ |
888 | | /*Worst case token stack usage for 4 fragments.*/ |
889 | 411k | oc_token_checkpoint stack[64*4]; |
890 | 411k | oc_token_checkpoint *stackptr; |
891 | 411k | const oc_sb_map *sb_maps; |
892 | 411k | signed char *mb_modes; |
893 | 411k | oc_fragment *frags; |
894 | 411k | ptrdiff_t *coded_fragis; |
895 | 411k | ptrdiff_t ncoded_fragis; |
896 | 411k | ptrdiff_t *uncoded_fragis; |
897 | 411k | ptrdiff_t nuncoded_fragis; |
898 | 411k | oc_rd_metric mo; |
899 | 411k | oc_fr_state fr_checkpoint; |
900 | 411k | oc_qii_state qs_checkpoint; |
901 | 411k | int mb_mode; |
902 | 411k | int refi; |
903 | 411k | int ncoded; |
904 | 411k | ptrdiff_t fragi; |
905 | 411k | int bi; |
906 | 411k | *&fr_checkpoint=*(_pipe->fr+0); |
907 | 411k | *&qs_checkpoint=*(_pipe->qs+0); |
908 | 411k | sb_maps=(const oc_sb_map *)_enc->state.sb_maps; |
909 | 411k | mb_modes=_enc->state.mb_modes; |
910 | 411k | frags=_enc->state.frags; |
911 | 411k | coded_fragis=_pipe->coded_fragis[0]; |
912 | 411k | ncoded_fragis=_pipe->ncoded_fragis[0]; |
913 | 411k | uncoded_fragis=_pipe->uncoded_fragis[0]; |
914 | 411k | nuncoded_fragis=_pipe->nuncoded_fragis[0]; |
915 | 411k | mb_mode=mb_modes[_mbi]; |
916 | 411k | refi=OC_FRAME_FOR_MODE(mb_mode); |
917 | 411k | ncoded=0; |
918 | 411k | stackptr=stack; |
919 | 411k | memset(&mo,0,sizeof(mo)); |
920 | 2.05M | for(bi=0;bi<4;bi++){ |
921 | 1.64M | fragi=sb_maps[_mbi>>2][_mbi&3][bi]; |
922 | 1.64M | frags[fragi].refi=refi; |
923 | 1.64M | frags[fragi].mb_mode=mb_mode; |
924 | 1.64M | if(oc_enc_block_transform_quantize(_enc,_pipe,0,fragi, |
925 | 1.64M | _rd_scale[bi],_rd_iscale[bi],&mo,_pipe->fr+0,&stackptr)){ |
926 | 1.27M | coded_fragis[ncoded_fragis++]=fragi; |
927 | 1.27M | ncoded++; |
928 | 1.27M | } |
929 | 368k | else *(uncoded_fragis-++nuncoded_fragis)=fragi; |
930 | 1.64M | } |
931 | 411k | if(ncoded>0&&!mo.dc_flag){ |
932 | 353k | int cost; |
933 | | /*Some individual blocks were worth coding. |
934 | | See if that's still true when accounting for mode and MV overhead.*/ |
935 | 353k | cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits |
936 | 353k | +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead); |
937 | 353k | if(mo.uncoded_ac_ssd<=cost){ |
938 | | /*Taking macroblock overhead into account, it is not worth coding this |
939 | | MB.*/ |
940 | 6.74k | oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack); |
941 | 6.74k | *(_pipe->fr+0)=*&fr_checkpoint; |
942 | 6.74k | *(_pipe->qs+0)=*&qs_checkpoint; |
943 | 33.7k | for(bi=0;bi<4;bi++){ |
944 | 26.9k | fragi=sb_maps[_mbi>>2][_mbi&3][bi]; |
945 | 26.9k | if(frags[fragi].coded){ |
946 | 10.8k | *(uncoded_fragis-++nuncoded_fragis)=fragi; |
947 | 10.8k | frags[fragi].coded=0; |
948 | 10.8k | frags[fragi].refi=OC_FRAME_NONE; |
949 | 10.8k | } |
950 | 26.9k | oc_fr_skip_block(_pipe->fr+0); |
951 | 26.9k | } |
952 | 6.74k | ncoded_fragis-=ncoded; |
953 | 6.74k | ncoded=0; |
954 | 6.74k | } |
955 | 353k | } |
956 | | /*If no luma blocks coded, the mode is forced.*/ |
957 | 411k | if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV; |
958 | | /*Assume that a 1MV with a single coded block is always cheaper than a 4MV |
959 | | with a single coded block. |
960 | | This may not be strictly true: a 4MV computes chroma MVs using (0,0) for |
961 | | skipped blocks, while a 1MV does not.*/ |
962 | 346k | else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){ |
963 | 66 | mb_modes[_mbi]=OC_MODE_INTER_MV; |
964 | 66 | } |
965 | 411k | _pipe->ncoded_fragis[0]=ncoded_fragis; |
966 | 411k | _pipe->nuncoded_fragis[0]=nuncoded_fragis; |
967 | 411k | return ncoded; |
968 | 411k | } |
969 | | |
970 | | static void oc_enc_sb_transform_quantize_inter_chroma(oc_enc_ctx *_enc, |
971 | 84.9k | oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){ |
972 | 84.9k | const ogg_uint16_t *mcu_rd_scale; |
973 | 84.9k | const ogg_uint16_t *mcu_rd_iscale; |
974 | 84.9k | const oc_sb_map *sb_maps; |
975 | 84.9k | oc_sb_flags *sb_flags; |
976 | 84.9k | oc_fr_state *fr; |
977 | 84.9k | ptrdiff_t *coded_fragis; |
978 | 84.9k | ptrdiff_t ncoded_fragis; |
979 | 84.9k | ptrdiff_t *uncoded_fragis; |
980 | 84.9k | ptrdiff_t nuncoded_fragis; |
981 | 84.9k | ptrdiff_t froffset; |
982 | 84.9k | int sbi; |
983 | 84.9k | fr=_pipe->fr+_pli; |
984 | 84.9k | mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale; |
985 | 84.9k | mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale; |
986 | 84.9k | sb_maps=(const oc_sb_map *)_enc->state.sb_maps; |
987 | 84.9k | sb_flags=_enc->state.sb_flags; |
988 | 84.9k | coded_fragis=_pipe->coded_fragis[_pli]; |
989 | 84.9k | ncoded_fragis=_pipe->ncoded_fragis[_pli]; |
990 | 84.9k | uncoded_fragis=_pipe->uncoded_fragis[_pli]; |
991 | 84.9k | nuncoded_fragis=_pipe->nuncoded_fragis[_pli]; |
992 | 84.9k | froffset=_pipe->froffset[_pli]; |
993 | 216k | for(sbi=_sbi_start;sbi<_sbi_end;sbi++){ |
994 | | /*Worst case token stack usage for 1 fragment.*/ |
995 | 131k | oc_token_checkpoint stack[64]; |
996 | 131k | oc_rd_metric mo; |
997 | 131k | int quadi; |
998 | 131k | int bi; |
999 | 131k | memset(&mo,0,sizeof(mo)); |
1000 | 2.63M | for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){ |
1001 | 2.10M | ptrdiff_t fragi; |
1002 | 2.10M | fragi=sb_maps[sbi][quadi][bi]; |
1003 | 2.10M | if(fragi>=0){ |
1004 | 1.00M | oc_token_checkpoint *stackptr; |
1005 | 1.00M | unsigned rd_scale; |
1006 | 1.00M | unsigned rd_iscale; |
1007 | 1.00M | rd_scale=mcu_rd_scale[fragi-froffset]; |
1008 | 1.00M | rd_iscale=mcu_rd_iscale[fragi-froffset]; |
1009 | 1.00M | stackptr=stack; |
1010 | 1.00M | if(oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi, |
1011 | 1.00M | rd_scale,rd_iscale,&mo,fr,&stackptr)){ |
1012 | 704k | coded_fragis[ncoded_fragis++]=fragi; |
1013 | 704k | } |
1014 | 302k | else *(uncoded_fragis-++nuncoded_fragis)=fragi; |
1015 | 1.00M | } |
1016 | 2.10M | } |
1017 | 131k | oc_fr_state_flush_sb(fr); |
1018 | 131k | sb_flags[sbi].coded_fully=fr->sb_full; |
1019 | 131k | sb_flags[sbi].coded_partially=fr->sb_partial; |
1020 | 131k | } |
1021 | 84.9k | _pipe->ncoded_fragis[_pli]=ncoded_fragis; |
1022 | 84.9k | _pipe->nuncoded_fragis[_pli]=nuncoded_fragis; |
1023 | 84.9k | } |
1024 | | |
1025 | | /*Mode decision is done by exhaustively examining all potential choices. |
1026 | | Obviously, doing the motion compensation, fDCT, tokenization, and then |
1027 | | counting the bits each token uses is computationally expensive. |
1028 | | Theora's EOB runs can also split the cost of these tokens across multiple |
1029 | | fragments, and naturally we don't know what the optimal choice of Huffman |
1030 | | codes will be until we know all the tokens we're going to encode in all the |
1031 | | fragments. |
1032 | | So we use a simple approach to estimating the bit cost and distortion of each |
1033 | | mode based upon the SATD value of the residual before coding. |
1034 | | The mathematics behind the technique are outlined by Kim \cite{Kim03}, but |
1035 | | the process (modified somewhat from that of the paper) is very simple. |
1036 | | We build a non-linear regression of the mappings from |
1037 | | (pre-transform+quantization) SATD to (post-transform+quantization) bits and |
1038 | | SSD for each qi. |
1039 | | A separate set of mappings is kept for each quantization type and color |
1040 | | plane. |
1041 | | The mappings are constructed by partitioning the SATD values into a small |
1042 | | number of bins (currently 24) and using a linear regression in each bin |
1043 | | (as opposed to the 0th-order regression used by Kim). |
1044 | | The bit counts and SSD measurements are obtained by examining actual encoded |
1045 | | frames, with appropriate lambda values and optimal Huffman codes selected. |
1046 | | EOB bits are assigned to the fragment that started the EOB run (as opposed to |
1047 | | dividing them among all the blocks in the run; the latter approach seems |
1048 | | more theoretically correct, but Monty's testing showed a small improvement |
1049 | | with the former, though that may have been merely statistical noise). |
1050 | | |
1051 | | @ARTICLE{Kim03, |
1052 | | author="Hyun Mun Kim", |
1053 | | title="Adaptive Rate Control Using Nonlinear Regression", |
1054 | | journal="IEEE Transactions on Circuits and Systems for Video Technology", |
1055 | | volume=13, |
1056 | | number=5, |
1057 | | pages="432--439", |
1058 | | month=May, |
1059 | | year=2003 |
1060 | | }*/ |
1061 | | |
1062 | | /*Computes (_ssd+_lambda*_rate)/(1<<OC_BIT_SCALE) with rounding, avoiding |
1063 | | overflow for large lambda values.*/ |
1064 | | #define OC_MODE_RD_COST(_ssd,_rate,_lambda) \ |
1065 | 97.9M | ((_ssd)>>OC_BIT_SCALE)+((_rate)>>OC_BIT_SCALE)*(_lambda) \ |
1066 | 97.9M | +(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \ |
1067 | 97.9M | +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE) |
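
The macro splits both operands into high and low OC_BIT_SCALE-bit parts so that the rate-lambda product never sees the full-scale values; algebraically the result is exactly the rounded quotient (_ssd+_rate*_lambda+2^(OC_BIT_SCALE-1))>>OC_BIT_SCALE. A standalone check (OC_BIT_SCALE's real value lives in a header; 6 is assumed here purely for illustration):

```c
#include <assert.h>

/*Assumed scale factor for illustration only.*/
#define BIT_SCALE 6
#define RD_COST(ssd,rate,lambda) \
  (((ssd)>>BIT_SCALE)+((rate)>>BIT_SCALE)*(lambda) \
   +(((ssd)&(1<<BIT_SCALE)-1)+((rate)&(1<<BIT_SCALE)-1)*(lambda) \
   +((1<<BIT_SCALE)>>1)>>BIT_SCALE))

int main(void){
  long ssd=1000;
  long rate=300;
  long lambda=7;
  /*The split form equals the direct rounded quotient whenever the latter
     doesn't overflow; the point of the split is that it also works when
     rate*lambda on full-scale values would.*/
  assert(RD_COST(ssd,rate,lambda)==
   (ssd+rate*lambda+(1<<BIT_SCALE-1)>>BIT_SCALE));
  return 0;
}
```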
1068 | | |
1069 | 62.0k | static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){ |
1070 | 62.0k | #if !defined(OC_COLLECT_METRICS) |
1071 | 62.0k | const |
1072 | 62.0k | #endif |
1073 | 62.0k | oc_mode_rd (*oc_mode_rd_table)[3][2][OC_COMP_BINS]= |
1074 | 62.0k | _enc->sp_level<OC_SP_LEVEL_NOSATD?OC_MODE_RD_SATD:OC_MODE_RD_SAD; |
1075 | 62.0k | int qii; |
1076 | | #if defined(OC_COLLECT_METRICS) |
1077 | | oc_enc_mode_metrics_load(_enc); |
1078 | | #endif |
1079 | 164k | for(qii=0;qii<_enc->state.nqis;qii++){ |
1080 | 102k | int qi; |
1081 | 102k | int pli; |
1082 | 102k | qi=_enc->state.qis[qii]; |
1083 | 409k | for(pli=0;pli<3;pli++){ |
1084 | 307k | int qti; |
1085 | 922k | for(qti=0;qti<2;qti++){ |
1086 | 614k | int log_plq; |
1087 | 614k | int modeline; |
1088 | 614k | int bin; |
1089 | 614k | int dx; |
1090 | 614k | int dq; |
1091 | 614k | log_plq=_enc->log_plq[qi][pli][qti]; |
1092 | | /*Find the pair of rows in the mode table that bracket this quantizer. |
1093 | | If it falls outside the range the table covers, then we just use a |
1094 | | pair on the edge for linear extrapolation.*/ |
1095 | 2.89M | for(modeline=0;modeline<OC_LOGQ_BINS-1&& |
1096 | 2.89M | OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++); |
1097 | | /*Interpolate a row for this quantizer.*/ |
1098 | 614k | dx=OC_MODE_LOGQ[modeline][pli][qti]-log_plq; |
1099 | 614k | dq=OC_MODE_LOGQ[modeline][pli][qti]-OC_MODE_LOGQ[modeline+1][pli][qti]; |
1100 | 614k | if(dq==0)dq=1; |
1101 | 15.3M | for(bin=0;bin<OC_COMP_BINS;bin++){ |
1102 | 14.7M | int y0; |
1103 | 14.7M | int z0; |
1104 | 14.7M | int dy; |
1105 | 14.7M | int dz; |
1106 | 14.7M | y0=oc_mode_rd_table[modeline][pli][qti][bin].rate; |
1107 | 14.7M | z0=oc_mode_rd_table[modeline][pli][qti][bin].rmse; |
1108 | 14.7M | dy=oc_mode_rd_table[modeline+1][pli][qti][bin].rate-y0; |
1109 | 14.7M | dz=oc_mode_rd_table[modeline+1][pli][qti][bin].rmse-z0; |
1110 | 14.7M | _enc->mode_rd[qii][pli][qti][bin].rate= |
1111 | 14.7M | (ogg_int16_t)OC_CLAMPI(-32768,y0+(dy*dx+(dq>>1))/dq,32767); |
1112 | 14.7M | _enc->mode_rd[qii][pli][qti][bin].rmse= |
1113 | 14.7M | (ogg_int16_t)OC_CLAMPI(-32768,z0+(dz*dx+(dq>>1))/dq,32767); |
1114 | 14.7M | } |
1115 | 614k | } |
1116 | 307k | } |
1117 | 102k | } |
1118 | 62.0k | } |
1119 | | |
1120 | | /*Estimate the R-D cost of the DCT coefficients given the SATD of a block after |
1121 | | prediction.*/ |
1122 | | static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd, |
1123 | 62.1M | int _qii,int _pli,int _qti,int _satd){ |
1124 | 62.1M | unsigned rmse; |
1125 | 62.1M | int shift; |
1126 | 62.1M | int bin; |
1127 | 62.1M | int dx; |
1128 | 62.1M | int y0; |
1129 | 62.1M | int z0; |
1130 | 62.1M | int dy; |
1131 | 62.1M | int dz; |
1132 | | /*SATD metrics for chroma planes vary much less than luma, so we scale them |
1133 | | by 4 to distribute them into the mode decision bins more evenly.*/ |
1134 | 62.1M | _satd<<=_pli+1&2; |
1135 | 62.1M | shift=_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_SATD_SHIFT:OC_SAD_SHIFT; |
1136 | 62.1M | bin=OC_MINI(_satd>>shift,OC_COMP_BINS-2); |
1137 | 62.1M | dx=_satd-(bin<<shift); |
1138 | 62.1M | y0=_enc->mode_rd[_qii][_pli][_qti][bin].rate; |
1139 | 62.1M | z0=_enc->mode_rd[_qii][_pli][_qti][bin].rmse; |
1140 | 62.1M | dy=_enc->mode_rd[_qii][_pli][_qti][bin+1].rate-y0; |
1141 | 62.1M | dz=_enc->mode_rd[_qii][_pli][_qti][bin+1].rmse-z0; |
1142 | 62.1M | rmse=OC_MAXI(z0+(dz*dx>>shift),0); |
1143 | 62.1M | *_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE; |
1144 | 62.1M | return OC_MAXI(y0+(dy*dx>>shift),0); |
1145 | 62.1M | } |
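
A worked instance of the binned lookup above, with invented numbers (shift=8 and the two bin rates are assumptions, not values from the real tables): a SATD of 700 lands in bin 2 with dx=188, and the rate interpolates linearly between the bin endpoints.

```c
#include <assert.h>

int main(void){
  /*Invented numbers: shift=8, bin 2 has rate 40 and bin 3 has rate 56 (in
     units of 2**-OC_BIT_SCALE bits).*/
  int shift=8;
  int satd=700;
  int bin=satd>>shift;       /*==2*/
  int dx=satd-(bin<<shift);  /*==188*/
  int y0=40;
  int dy=56-40;
  /*Interpolated rate: 40+(16*188>>8)==40+11==51.*/
  assert((y0+(dy*dx>>shift))==51);
  return 0;
}
```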
1146 | | |
1147 | | /*activity_avg must be positive, or flat regions could get a zero weight, which |
1148 | | confounds analysis. |
1149 | | We set the minimum to this value so that it also avoids the need for divide |
1150 | | by zero checks in oc_mb_masking().*/ |
1151 | | # define OC_ACTIVITY_AVG_MIN (1<<OC_RD_SCALE_BITS) |
1152 | | |
1153 | | static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi, |
1154 | 3.23M | unsigned _activity[4]){ |
1155 | 3.23M | const unsigned char *src; |
1156 | 3.23M | const ptrdiff_t *frag_buf_offs; |
1157 | 3.23M | const ptrdiff_t *sb_map; |
1158 | 3.23M | unsigned luma; |
1159 | 3.23M | int ystride; |
1160 | 3.23M | ptrdiff_t frag_offs; |
1161 | 3.23M | ptrdiff_t fragi; |
1162 | 3.23M | int bi; |
1163 | 3.23M | frag_buf_offs=_enc->state.frag_buf_offs; |
1164 | 3.23M | sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3]; |
1165 | 3.23M | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
1166 | 3.23M | ystride=_enc->state.ref_ystride[0]; |
1167 | 3.23M | luma=0; |
1168 | 16.1M | for(bi=0;bi<4;bi++){ |
1169 | 12.9M | const unsigned char *s; |
1170 | 12.9M | unsigned x; |
1171 | 12.9M | unsigned x2; |
1172 | 12.9M | unsigned act; |
1173 | 12.9M | int i; |
1174 | 12.9M | int j; |
1175 | 12.9M | fragi=sb_map[bi]; |
1176 | 12.9M | frag_offs=frag_buf_offs[fragi]; |
1177 | | /*TODO: This could be replaced with SATD^2, since we already have to |
1178 | | compute SATD.*/ |
1179 | 12.9M | x=x2=0; |
1180 | 12.9M | s=src+frag_offs; |
1181 | 116M | for(i=0;i<8;i++){ |
1182 | 930M | for(j=0;j<8;j++){ |
1183 | 827M | unsigned c; |
1184 | 827M | c=s[j]; |
1185 | 827M | x+=c; |
1186 | 827M | x2+=c*c; |
1187 | 827M | } |
1188 | 103M | s+=ystride; |
1189 | 103M | } |
1190 | 12.9M | luma+=x; |
1191 | 12.9M | act=(x2<<6)-x*x; |
1192 | 12.9M | if(act<8<<12){ |
1193 | | /*The region is flat.*/ |
1194 | 8.97M | act=OC_MINI(act,5<<12); |
1195 | 8.97M | } |
1196 | 3.94M | else{ |
1197 | 3.94M | unsigned e1; |
1198 | 3.94M | unsigned e2; |
1199 | 3.94M | unsigned e3; |
1200 | 3.94M | unsigned e4; |
1201 | | /*Test for an edge. |
1202 | | TODO: There are probably much simpler ways to do this (e.g., it could |
1203 | | probably be combined with the SATD calculation). |
1204 | | Alternatively, we could split the block around the mean and compute the |
1205 | | reduction in variance in each half. |
1206 | | For a Gaussian source the reduction should be |
1207 | | (1-2/pi) ~= 0.36338022763241865692446494650994. |
1208 | | Significantly more reduction is a good indication of a bi-level image. |
1209 | | This has the advantage of identifying, in addition to straight edges, |
1210 | | small text regions, which would otherwise be classified as "texture".*/ |
1211 | 3.94M | e1=e2=e3=e4=0; |
1212 | 3.94M | s=src+frag_offs-1; |
1213 | 35.5M | for(i=0;i<8;i++){ |
1214 | 284M | for(j=0;j<8;j++){ |
1215 | 252M | e1+=abs((s[j+2]-s[j]<<1)+(s-ystride)[j+2]-(s-ystride)[j] |
1216 | 252M | +(s+ystride)[j+2]-(s+ystride)[j]); |
1217 | 252M | e2+=abs(((s+ystride)[j+1]-(s-ystride)[j+1]<<1) |
1218 | 252M | +(s+ystride)[j]-(s-ystride)[j]+(s+ystride)[j+2]-(s-ystride)[j+2]); |
1219 | 252M | e3+=abs(((s+ystride)[j+2]-(s-ystride)[j]<<1) |
1220 | 252M | +(s+ystride)[j+1]-s[j]+s[j+2]-(s-ystride)[j+1]); |
1221 | 252M | e4+=abs(((s+ystride)[j]-(s-ystride)[j+2]<<1) |
1222 | 252M | +(s+ystride)[j+1]-s[j+2]+s[j]-(s-ystride)[j+1]); |
1223 | 252M | } |
1224 | 31.5M | s+=ystride; |
1225 | 31.5M | } |
1226 | | /*If the largest component of the edge energy is at least 40% of the |
1227 | | total, then classify the block as an edge block.*/ |
1228 | 3.94M | if(5*OC_MAXI(OC_MAXI(e1,e2),OC_MAXI(e3,e4))>2*(e1+e2+e3+e4)){ |
1229 | | /*act=act_th*(act/act_th)**0.7 |
1230 | | =exp(log(act_th)+0.7*(log(act)-log(act_th))). |
1231 | | Here act_th=5.0 and 0x394A=oc_blog32_q10(5<<12).*/ |
1232 | 44.5k | act=oc_bexp32_q10(0x394A+(7*(oc_blog32_q10(act)-0x394A+5)/10)); |
1233 | 44.5k | } |
1234 | 3.94M | } |
1235 | 12.9M | _activity[bi]=act; |
1236 | 12.9M | } |
1237 | 3.23M | return luma; |
1238 | 3.23M | } |
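
The activity measure `(x2<<6)-x*x` equals 64*sum(c^2)-(sum(c))^2 = 64*64*variance, i.e. the block's pixel variance in Q12, which is why the flatness threshold 8<<12 reads as "variance below 8". A quick arithmetic check:

```c
#include <assert.h>

int main(void){
  /*An 8x8 block with 32 pixels at 10 and 32 at 12 has variance 1, so the
     activity measure should be exactly 1<<12.*/
  unsigned x=32*10+32*12;     /*Sum of pixels: 704.*/
  unsigned x2=32*100+32*144;  /*Sum of squares: 7808.*/
  assert((x2<<6)-x*x==1<<12);
  return 0;
}
```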
1239 | | |
1240 | | static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi, |
1241 | 0 | unsigned _activity[4],const unsigned _intra_satd[12]){ |
1242 | 0 | int bi; |
1243 | 0 | for(bi=0;bi<4;bi++){ |
1244 | 0 | unsigned act; |
1245 | 0 | act=(11*_intra_satd[bi]>>8)*_intra_satd[bi]; |
1246 | 0 | if(act<8<<12){ |
1247 | | /*The region is flat.*/ |
1248 | 0 | act=OC_MINI(act,5<<12); |
1249 | 0 | } |
1250 | 0 | _activity[bi]=act; |
1251 | 0 | } |
1252 | 0 | } |
1253 | | |
1254 | | /*Compute the masking scales for the blocks in a macro block. |
1255 | | All masking is computed from the luma blocks. |
1256 | | We derive scaling factors for the chroma blocks from these, and use the same |
1257 | | ones for all chroma blocks, regardless of the subsampling. |
1258 | | It's possible for luma to be perfectly flat and yet have high chroma energy, |
1259 | | but this is unlikely in non-artificial images, and not a case that has been |
1260 | | addressed by any research to my knowledge. |
1261 | | The output of the masking process is two scale factors, which are fed into |
1262 | | the various R-D optimizations. |
1263 | | The first, rd_scale, is applied to D in the equation |
1264 | | D*rd_scale+lambda*R. |
1265 | | This is the form that must be used to properly combine scores from multiple |
1266 | | blocks, and can be interpreted as scaling distortions by their visibility. |
1267 | | The inverse, rd_iscale, is applied to lambda in the equation |
1268 | | D+rd_iscale*lambda*R. |
1269 | | This is equivalent to the first form within a single block, but much faster |
1270 | | to use when evaluating many possible distortions (e.g., during actual |
1271 | | quantization, where separate distortions are evaluated for every |
1272 | | coefficient). |
1273 | | The two macros OC_RD_SCALE(rd_scale,d) and OC_RD_ISCALE(rd_iscale,lambda) are |
1274 | | used to perform the multiplications with the proper re-scaling for the range |
1275 | | of the scaling factors. |
1276 | | Many researchers apply masking values directly to the quantizers used, and |
1277 | | not to the R-D cost. |
1278 | | Since we generally use MSE for D, rd_scale must use the square of their |
1279 | | values to generate an equivalent effect.*/ |
1280 | | static unsigned oc_mb_masking(unsigned _rd_scale[5],unsigned _rd_iscale[5], |
1281 | | const ogg_uint16_t _chroma_rd_scale[2],const unsigned _activity[4], |
1282 | 3.23M | unsigned _activity_avg,unsigned _luma,unsigned _luma_avg){ |
1283 | 3.23M | unsigned activity_sum; |
1284 | 3.23M | unsigned la; |
1285 | 3.23M | unsigned lb; |
1286 | 3.23M | unsigned d; |
1287 | 3.23M | int bi; |
1288 | 3.23M | int bi_min; |
1289 | 3.23M | int bi_min2; |
1290 | | /*The ratio lb/la is meant to approximate |
1291 | | ((((_luma-16)/219)*(255/128))**0.649**0.4**2), which is the |
1292 | | effective luminance masking from~\cite{LKW06} (including the self-masking |
1293 | | deflator). |
1294 | | The following actually turns out to be a pretty good approximation for |
1295 | | _luma>75 or so. |
1296 | | For smaller values luminance does not really follow Weber's Law anyway, and |
1297 | | this approximation gives a much less aggressive bitrate boost in this |
1298 | | region. |
1299 | | Though some researchers claim that contrast sensitivity actually decreases |
1300 | | for very low luminance values, in my experience excessive brightness on |
1301 | | LCDs or buggy color conversions (e.g., treating Y' as full-range instead |
1302 | | of the CCIR 601 range) make artifacts in such regions extremely visible. |
1303 | | We substitute _luma_avg for 128 to allow the strength of the masking to |
1304 | | vary with the actual average image luminance, within certain limits (the |
1305 | | caller has clamped _luma_avg to the range [90,160], inclusive). |
1306 | | @ARTICLE{LKW06, |
1307 | | author="Zhen Liu and Lina J. Karam and Andrew B. Watson", |
1308 | | title="{JPEG2000} Encoding With Perceptual Distortion Control", |
1309 | | journal="{IEEE} Transactions on Image Processing", |
1310 | | volume=15, |
1311 | | number=7, |
1312 | | pages="1763--1778", |
1313 | | month=Jul, |
1314 | | year=2006 |
1315 | | }*/ |
1316 | | #if 0 |
1317 | | la=_luma+4*_luma_avg; |
1318 | | lb=4*_luma+_luma_avg; |
1319 | | #else |
1320 | | /*Disable luminance masking.*/ |
1321 | 3.23M | la=lb=1; |
1322 | 3.23M | #endif |
1323 | 3.23M | activity_sum=0; |
1324 | 16.1M | for(bi=0;bi<4;bi++){ |
1325 | 12.9M | unsigned a; |
1326 | 12.9M | unsigned b; |
1327 | 12.9M | activity_sum+=_activity[bi]; |
1328 | | /*Apply activity masking.*/ |
1329 | 12.9M | a=_activity[bi]+4*_activity_avg; |
1330 | 12.9M | b=4*_activity[bi]+_activity_avg; |
1331 | 12.9M | d=OC_RD_SCALE(b,1); |
1332 | | /*And luminance masking.*/ |
1333 | 12.9M | d=(a+(d>>1))/d; |
1334 | 12.9M | _rd_scale[bi]=(d*la+(lb>>1))/lb; |
1335 | | /*And now the inverse.*/ |
1336 | 12.9M | d=OC_MAXI(OC_RD_ISCALE(a,1),1); |
1337 | 12.9M | d=(b+(d>>1))/d; |
1338 | 12.9M | _rd_iscale[bi]=(d*lb+(la>>1))/la; |
1339 | 12.9M | } |
1340 | | /*Now compute scaling factors for chroma blocks. |
1341 | | We start by finding the two smallest iscales from the luma blocks.*/ |
1342 | 3.23M | bi_min=_rd_iscale[1]<_rd_iscale[0]; |
1343 | 3.23M | bi_min2=1-bi_min; |
1344 | 9.69M | for(bi=2;bi<4;bi++){ |
1345 | 6.46M | if(_rd_iscale[bi]<_rd_iscale[bi_min]){ |
1346 | 692k | bi_min2=bi_min; |
1347 | 692k | bi_min=bi; |
1348 | 692k | } |
1349 | 5.76M | else if(_rd_iscale[bi]<_rd_iscale[bi_min2])bi_min2=bi; |
1350 | 6.46M | } |
1351 | | /*If the minimum iscale is less than 1.0, use the second smallest instead, |
1352 | | and force the value to at least 1.0 (inflating chroma is a waste).*/ |
1353 | 3.23M | if(_rd_iscale[bi_min]<(1<<OC_RD_ISCALE_BITS))bi_min=bi_min2; |
1354 | 3.23M | d=OC_MINI(_rd_scale[bi_min],1<<OC_RD_SCALE_BITS); |
1355 | 3.23M | _rd_scale[4]=OC_RD_SCALE(d,_chroma_rd_scale[0]); |
1356 | 3.23M | d=OC_MAXI(_rd_iscale[bi_min],1<<OC_RD_ISCALE_BITS); |
1357 | 3.23M | _rd_iscale[4]=OC_RD_ISCALE(d,_chroma_rd_scale[1]); |
1358 | 3.23M | return activity_sum; |
1359 | 3.23M | } |
1360 | | |
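To make the rd_scale/rd_iscale comment above concrete: within a single block the two forms rank candidate (D,R) pairs identically, because one is a positive constant multiple of the other; only when scores from several blocks with different scales are summed does the first form become mandatory. A plain-double sketch (OC_RD_SCALE() and OC_RD_ISCALE() are the fixed-point versions of these multiplies):

/*Form used when combining scores across blocks.*/
static double oc_cost_scaled(double d,double r,double lambda,double rd_scale){
  return d*rd_scale+lambda*r;
}

/*Cheaper per-candidate form used within one block.*/
static double oc_cost_iscaled(double d,double r,double lambda,
 double rd_iscale){
  return d+rd_iscale*lambda*r;
}

/*With rd_iscale==1/rd_scale, oc_cost_iscaled() equals
   oc_cost_scaled()/rd_scale, a positive per-block constant, so the argmin
   over candidates within the block is unchanged.*/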
1361 | | static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi, |
1362 | 411k | unsigned _frag_satd[12]){ |
1363 | 411k | const unsigned char *src; |
1364 | 411k | const ptrdiff_t *frag_buf_offs; |
1365 | 411k | const ptrdiff_t *sb_map; |
1366 | 411k | const oc_mb_map_plane *mb_map; |
1367 | 411k | const unsigned char *map_idxs; |
1368 | 411k | int map_nidxs; |
1369 | 411k | int mapii; |
1370 | 411k | int mapi; |
1371 | 411k | int ystride; |
1372 | 411k | int pli; |
1373 | 411k | int bi; |
1374 | 411k | ptrdiff_t fragi; |
1375 | 411k | ptrdiff_t frag_offs; |
1376 | 411k | unsigned luma; |
1377 | 411k | int dc; |
1378 | 411k | frag_buf_offs=_enc->state.frag_buf_offs; |
1379 | 411k | sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3]; |
1380 | 411k | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
1381 | 411k | ystride=_enc->state.ref_ystride[0]; |
1382 | 411k | luma=0; |
1383 | 2.05M | for(bi=0;bi<4;bi++){ |
1384 | 1.64M | fragi=sb_map[bi]; |
1385 | 1.64M | frag_offs=frag_buf_offs[fragi]; |
1386 | 1.64M | _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride); |
1387 | 1.64M | luma+=dc; |
1388 | 1.64M | } |
1389 | 411k | mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi]; |
1390 | 411k | map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt]; |
1391 | 411k | map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt]; |
1392 | | /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/ |
1393 | 411k | ystride=_enc->state.ref_ystride[1]; |
1394 | 1.41M | for(mapii=4;mapii<map_nidxs;mapii++){ |
1395 | 1.00M | mapi=map_idxs[mapii]; |
1396 | 1.00M | pli=mapi>>2; |
1397 | 1.00M | bi=mapi&3; |
1398 | 1.00M | fragi=mb_map[pli][bi]; |
1399 | 1.00M | frag_offs=frag_buf_offs[fragi]; |
1400 | 1.00M | _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride); |
1401 | 1.00M | } |
1402 | 411k | return luma; |
1403 | 411k | } |
1404 | | |
1405 | | /*Select luma block-level quantizers for a MB in an INTRA frame.*/ |
1406 | | static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc, |
1407 | 2.81M | const oc_qii_state *_qs,unsigned _mbi,const unsigned _rd_scale[4]){ |
1408 | 2.81M | const unsigned char *src; |
1409 | 2.81M | const ptrdiff_t *frag_buf_offs; |
1410 | 2.81M | const oc_sb_map *sb_maps; |
1411 | 2.81M | oc_fragment *frags; |
1412 | 2.81M | ptrdiff_t frag_offs; |
1413 | 2.81M | ptrdiff_t fragi; |
1414 | 2.81M | oc_qii_state qs[4][3]; |
1415 | 2.81M | unsigned cost[4][3]; |
1416 | 2.81M | unsigned ssd[4][3]; |
1417 | 2.81M | unsigned rate[4][3]; |
1418 | 2.81M | int prev[3][3]; |
1419 | 2.81M | unsigned satd; |
1420 | 2.81M | int dc; |
1421 | 2.81M | unsigned best_cost; |
1422 | 2.81M | unsigned best_ssd; |
1423 | 2.81M | unsigned best_rate; |
1424 | 2.81M | int best_qii; |
1425 | 2.81M | int qii; |
1426 | 2.81M | int lambda; |
1427 | 2.81M | int ystride; |
1428 | 2.81M | int nqis; |
1429 | 2.81M | int bi; |
1430 | 2.81M | frag_buf_offs=_enc->state.frag_buf_offs; |
1431 | 2.81M | sb_maps=(const oc_sb_map *)_enc->state.sb_maps; |
1432 | 2.81M | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
1433 | 2.81M | ystride=_enc->state.ref_ystride[0]; |
1434 | 2.81M | fragi=sb_maps[_mbi>>2][_mbi&3][0]; |
1435 | 2.81M | frag_offs=frag_buf_offs[fragi]; |
1436 | 2.81M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
1437 | 2.81M | satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride); |
1438 | 2.81M | } |
1439 | 0 | else{ |
1440 | 0 | satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride); |
1441 | 0 | } |
1442 | 2.81M | nqis=_enc->state.nqis; |
1443 | 2.81M | lambda=_enc->lambda; |
1444 | 7.18M | for(qii=0;qii<nqis;qii++){ |
1445 | 4.36M | oc_qii_state_advance(qs[0]+qii,_qs,qii); |
1446 | 4.36M | rate[0][qii]=oc_dct_cost2(_enc,ssd[0]+qii,qii,0,0,satd) |
1447 | 4.36M | +(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE); |
1448 | 4.36M | ssd[0][qii]=OC_RD_SCALE(ssd[0][qii],_rd_scale[0]); |
1449 | 4.36M | cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda); |
1450 | 4.36M | } |
1451 | 11.2M | for(bi=1;bi<4;bi++){ |
1452 | 8.45M | fragi=sb_maps[_mbi>>2][_mbi&3][bi]; |
1453 | 8.45M | frag_offs=frag_buf_offs[fragi]; |
1454 | 8.45M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
1455 | 8.45M | satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride); |
1456 | 8.45M | } |
1457 | 0 | else{ |
1458 | 0 | satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride); |
1459 | 0 | } |
1460 | 21.5M | for(qii=0;qii<nqis;qii++){ |
1461 | 13.0M | oc_qii_state qt[3]; |
1462 | 13.0M | unsigned cur_ssd; |
1463 | 13.0M | unsigned cur_rate; |
1464 | 13.0M | int best_qij; |
1465 | 13.0M | int qij; |
1466 | 13.0M | oc_qii_state_advance(qt+0,qs[bi-1]+0,qii); |
1467 | 13.0M | cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,0,satd); |
1468 | 13.0M | cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]); |
1469 | 13.0M | best_ssd=ssd[bi-1][0]+cur_ssd; |
1470 | 13.0M | best_rate=rate[bi-1][0]+cur_rate |
1471 | 13.0M | +(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE); |
1472 | 13.0M | best_cost=OC_MODE_RD_COST(best_ssd,best_rate,lambda); |
1473 | 13.0M | best_qij=0; |
1474 | 26.3M | for(qij=1;qij<nqis;qij++){ |
1475 | 13.2M | unsigned chain_ssd; |
1476 | 13.2M | unsigned chain_rate; |
1477 | 13.2M | unsigned chain_cost; |
1478 | 13.2M | oc_qii_state_advance(qt+qij,qs[bi-1]+qij,qii); |
1479 | 13.2M | chain_ssd=ssd[bi-1][qij]+cur_ssd; |
1480 | 13.2M | chain_rate=rate[bi-1][qij]+cur_rate |
1481 | 13.2M | +(qt[qij].bits-qs[bi-1][qij].bits<<OC_BIT_SCALE); |
1482 | 13.2M | chain_cost=OC_MODE_RD_COST(chain_ssd,chain_rate,lambda); |
1483 | 13.2M | if(chain_cost<best_cost){ |
1484 | 5.39M | best_cost=chain_cost; |
1485 | 5.39M | best_ssd=chain_ssd; |
1486 | 5.39M | best_rate=chain_rate; |
1487 | 5.39M | best_qij=qij; |
1488 | 5.39M | } |
1489 | 13.2M | } |
1490 | 13.0M | *(qs[bi]+qii)=*(qt+best_qij); |
1491 | 13.0M | cost[bi][qii]=best_cost; |
1492 | 13.0M | ssd[bi][qii]=best_ssd; |
1493 | 13.0M | rate[bi][qii]=best_rate; |
1494 | 13.0M | prev[bi-1][qii]=best_qij; |
1495 | 13.0M | } |
1496 | 8.45M | } |
1497 | 2.81M | best_qii=0; |
1498 | 2.81M | best_cost=cost[3][0]; |
1499 | 4.36M | for(qii=1;qii<nqis;qii++){ |
1500 | 1.54M | if(cost[3][qii]<best_cost){ |
1501 | 634k | best_cost=cost[3][qii]; |
1502 | 634k | best_qii=qii; |
1503 | 634k | } |
1504 | 1.54M | } |
1505 | 2.81M | frags=_enc->state.frags; |
1506 | 11.2M | for(bi=3;;){ |
1507 | 11.2M | fragi=sb_maps[_mbi>>2][_mbi&3][bi]; |
1508 | 11.2M | frags[fragi].qii=best_qii; |
1509 | 11.2M | if(bi--<=0)break; |
1510 | 8.45M | best_qii=prev[bi][best_qii]; |
1511 | 8.45M | } |
1512 | 2.81M | return best_cost; |
1513 | 2.81M | } |
1514 | | |
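The function above is a small Viterbi-style dynamic program: cost[bi][qii] tracks the best cost of coding luma blocks 0..bi with block bi at quantizer index qii, prev[][] records back-pointers, and the final loop walks them backwards to assign frags[fragi].qii. A stripped-down sketch with the SATD/oc_dct_cost2 terms abstracted into per-block costs and the qii flag-coding state collapsed into a simple pairwise transition cost (an approximation; the real state is richer):

#define NBLOCKS 4
#define NQIS    3

static void oc_qii_viterbi_sketch(const double _blk_cost[NBLOCKS][NQIS],
 const double _trans_cost[NQIS][NQIS],int _qii_out[NBLOCKS]){
  double cost[NBLOCKS][NQIS];
  int    prev[NBLOCKS][NQIS];
  int    best_qii;
  int    bi;
  int    qii;
  int    qij;
  for(qii=0;qii<NQIS;qii++)cost[0][qii]=_blk_cost[0][qii];
  for(bi=1;bi<NBLOCKS;bi++)for(qii=0;qii<NQIS;qii++){
    double best;
    int    best_qij;
    best=cost[bi-1][0]+_trans_cost[0][qii];
    best_qij=0;
    for(qij=1;qij<NQIS;qij++){
      double c;
      c=cost[bi-1][qij]+_trans_cost[qij][qii];
      if(c<best){
        best=c;
        best_qij=qij;
      }
    }
    cost[bi][qii]=best+_blk_cost[bi][qii];
    prev[bi][qii]=best_qij;
  }
  /*Pick the cheapest terminal state, then trace the back-pointers.*/
  best_qii=0;
  for(qii=1;qii<NQIS;qii++){
    if(cost[NBLOCKS-1][qii]<cost[NBLOCKS-1][best_qii])best_qii=qii;
  }
  for(bi=NBLOCKS;bi-->0;){
    _qii_out[bi]=best_qii;
    if(bi>0)best_qii=prev[bi][best_qii];
  }
}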
1515 | | /*Select a block-level quantizer for a single chroma block in an INTRA frame.*/ |
1516 | | static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc, |
1517 | 11.2M | const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi,unsigned _rd_scale){ |
1518 | 11.2M | const unsigned char *src; |
1519 | 11.2M | oc_fragment *frags; |
1520 | 11.2M | ptrdiff_t frag_offs; |
1521 | 11.2M | oc_qii_state qt[3]; |
1522 | 11.2M | unsigned cost[3]; |
1523 | 11.2M | unsigned satd; |
1524 | 11.2M | int dc; |
1525 | 11.2M | unsigned best_cost; |
1526 | 11.2M | int best_qii; |
1527 | 11.2M | int qii; |
1528 | 11.2M | int lambda; |
1529 | 11.2M | int ystride; |
1530 | 11.2M | int nqis; |
1531 | 11.2M | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
1532 | 11.2M | ystride=_enc->state.ref_ystride[_pli]; |
1533 | 11.2M | frag_offs=_enc->state.frag_buf_offs[_fragi]; |
1534 | 11.2M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
1535 | 11.2M | satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride); |
1536 | 11.2M | } |
1537 | 0 | else{ |
1538 | 0 | satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride); |
1539 | 0 | } |
1540 | | /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not |
1541 | | worth spending the bits to change the AC quantizer. |
1542 | | TODO: This may be worth revisiting when we separate out DC and AC |
1543 | | predictions from SATD.*/ |
1544 | | #if 0 |
1545 | | nqis=_enc->state.nqis; |
1546 | | #else |
1547 | 11.2M | nqis=1; |
1548 | 11.2M | #endif |
1549 | 11.2M | lambda=_enc->lambda; |
1550 | 11.2M | best_qii=0; |
1551 | 22.4M | for(qii=0;qii<nqis;qii++){ |
1552 | 11.2M | unsigned cur_rate; |
1553 | 11.2M | unsigned cur_ssd; |
1554 | 11.2M | oc_qii_state_advance(qt+qii,_qs,qii); |
1555 | 11.2M | cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,_pli,0,satd) |
1556 | 11.2M | +(qt[qii].bits-_qs->bits<<OC_BIT_SCALE); |
1557 | 11.2M | cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale); |
1558 | 11.2M | cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda); |
1559 | 11.2M | } |
1560 | 11.2M | best_cost=cost[0]; |
1561 | 11.2M | for(qii=1;qii<nqis;qii++){ |
1562 | 0 | if(cost[qii]<best_cost){ |
1563 | 0 | best_cost=cost[qii]; |
1564 | 0 | best_qii=qii; |
1565 | 0 | } |
1566 | 0 | } |
1567 | 11.2M | frags=_enc->state.frags; |
1568 | 11.2M | frags[_fragi].qii=best_qii; |
1569 | 11.2M | return best_cost; |
1570 | 11.2M | } |
1571 | | |
1572 | | static void oc_enc_mb_transform_quantize_intra_luma(oc_enc_ctx *_enc, |
1573 | | oc_enc_pipeline_state *_pipe,unsigned _mbi, |
1574 | 2.81M | const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){ |
1575 | | /*Worst case token stack usage for 4 fragments.*/ |
1576 | 2.81M | oc_token_checkpoint stack[64*4]; |
1577 | 2.81M | oc_token_checkpoint *stackptr; |
1578 | 2.81M | const oc_sb_map *sb_maps; |
1579 | 2.81M | oc_fragment *frags; |
1580 | 2.81M | ptrdiff_t *coded_fragis; |
1581 | 2.81M | ptrdiff_t ncoded_fragis; |
1582 | 2.81M | ptrdiff_t fragi; |
1583 | 2.81M | int bi; |
1584 | 2.81M | sb_maps=(const oc_sb_map *)_enc->state.sb_maps; |
1585 | 2.81M | frags=_enc->state.frags; |
1586 | 2.81M | coded_fragis=_pipe->coded_fragis[0]; |
1587 | 2.81M | ncoded_fragis=_pipe->ncoded_fragis[0]; |
1588 | 2.81M | stackptr=stack; |
1589 | 14.0M | for(bi=0;bi<4;bi++){ |
1590 | 11.2M | fragi=sb_maps[_mbi>>2][_mbi&3][bi]; |
1591 | 11.2M | frags[fragi].refi=OC_FRAME_SELF; |
1592 | 11.2M | frags[fragi].mb_mode=OC_MODE_INTRA; |
1593 | 11.2M | oc_enc_block_transform_quantize(_enc,_pipe,0,fragi, |
1594 | 11.2M | _rd_scale[bi],_rd_iscale[bi],NULL,NULL,&stackptr); |
1595 | 11.2M | coded_fragis[ncoded_fragis++]=fragi; |
1596 | 11.2M | } |
1597 | 2.81M | _pipe->ncoded_fragis[0]=ncoded_fragis; |
1598 | 2.81M | } |
1599 | | |
1600 | | static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc, |
1601 | 346k | oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){ |
1602 | 346k | const ogg_uint16_t *mcu_rd_scale; |
1603 | 346k | const ogg_uint16_t *mcu_rd_iscale; |
1604 | 346k | const oc_sb_map *sb_maps; |
1605 | 346k | ptrdiff_t *coded_fragis; |
1606 | 346k | ptrdiff_t ncoded_fragis; |
1607 | 346k | ptrdiff_t froffset; |
1608 | 346k | int sbi; |
1609 | 346k | mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale; |
1610 | 346k | mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale; |
1611 | 346k | sb_maps=(const oc_sb_map *)_enc->state.sb_maps; |
1612 | 346k | coded_fragis=_pipe->coded_fragis[_pli]; |
1613 | 346k | ncoded_fragis=_pipe->ncoded_fragis[_pli]; |
1614 | 346k | froffset=_pipe->froffset[_pli]; |
1615 | 2.06M | for(sbi=_sbi_start;sbi<_sbi_end;sbi++){ |
1616 | | /*Worst case token stack usage for 1 fragment.*/ |
1617 | 1.72M | oc_token_checkpoint stack[64]; |
1618 | 1.72M | int quadi; |
1619 | 1.72M | int bi; |
1620 | 34.4M | for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){ |
1621 | 27.5M | ptrdiff_t fragi; |
1622 | 27.5M | fragi=sb_maps[sbi][quadi][bi]; |
1623 | 27.5M | if(fragi>=0){ |
1624 | 11.2M | oc_token_checkpoint *stackptr; |
1625 | 11.2M | unsigned rd_scale; |
1626 | 11.2M | unsigned rd_iscale; |
1627 | 11.2M | rd_scale=mcu_rd_scale[fragi-froffset]; |
1628 | 11.2M | rd_iscale=mcu_rd_iscale[fragi-froffset]; |
1629 | 11.2M | oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi,rd_scale); |
1630 | 11.2M | stackptr=stack; |
1631 | 11.2M | oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi, |
1632 | 11.2M | rd_scale,rd_iscale,NULL,NULL,&stackptr); |
1633 | 11.2M | coded_fragis[ncoded_fragis++]=fragi; |
1634 | 11.2M | } |
1635 | 27.5M | } |
1636 | 1.72M | } |
1637 | 346k | _pipe->ncoded_fragis[_pli]=ncoded_fragis; |
1638 | 346k | } |
1639 | | |
1640 | | /*Analysis stage for an INTRA frame.*/ |
1641 | 22.6k | void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){ |
1642 | 22.6k | ogg_int64_t activity_sum; |
1643 | 22.6k | ogg_int64_t luma_sum; |
1644 | 22.6k | unsigned activity_avg; |
1645 | 22.6k | unsigned luma_avg; |
1646 | 22.6k | const ogg_uint16_t *chroma_rd_scale; |
1647 | 22.6k | ogg_uint16_t *mcu_rd_scale; |
1648 | 22.6k | ogg_uint16_t *mcu_rd_iscale; |
1649 | 22.6k | const unsigned char *map_idxs; |
1650 | 22.6k | int nmap_idxs; |
1651 | 22.6k | oc_sb_flags *sb_flags; |
1652 | 22.6k | signed char *mb_modes; |
1653 | 22.6k | const oc_mb_map *mb_maps; |
1654 | 22.6k | const oc_sb_map *sb_maps; |
1655 | 22.6k | oc_fragment *frags; |
1656 | 22.6k | unsigned stripe_sby; |
1657 | 22.6k | unsigned mcu_nvsbs; |
1658 | 22.6k | int notstart; |
1659 | 22.6k | int notdone; |
1660 | 22.6k | int refi; |
1661 | 22.6k | int pli; |
1662 | 22.6k | _enc->state.frame_type=OC_INTRA_FRAME; |
1663 | 22.6k | oc_enc_tokenize_start(_enc); |
1664 | 22.6k | oc_enc_pipeline_init(_enc,&_enc->pipe); |
1665 | 22.6k | oc_enc_mode_rd_init(_enc); |
1666 | 22.6k | activity_sum=luma_sum=0; |
1667 | 22.6k | activity_avg=_enc->activity_avg; |
1668 | 22.6k | luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8); |
1669 | 22.6k | chroma_rd_scale=_enc->chroma_rd_scale[OC_INTRA_FRAME][_enc->state.qis[0]]; |
1670 | 22.6k | mcu_rd_scale=_enc->mcu_rd_scale; |
1671 | 22.6k | mcu_rd_iscale=_enc->mcu_rd_iscale; |
1672 | | /*Choose MVs and MB modes and quantize and code luma. |
1673 | | Must be done in Hilbert order.*/ |
1674 | 22.6k | map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt]; |
1675 | 22.6k | nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt]; |
1676 | 22.6k | _enc->state.ncoded_fragis[0]=0; |
1677 | 22.6k | _enc->state.ncoded_fragis[1]=0; |
1678 | 22.6k | _enc->state.ncoded_fragis[2]=0; |
1679 | 22.6k | sb_flags=_enc->state.sb_flags; |
1680 | 22.6k | mb_modes=_enc->state.mb_modes; |
1681 | 22.6k | mb_maps=(const oc_mb_map *)_enc->state.mb_maps; |
1682 | 22.6k | sb_maps=(const oc_sb_map *)_enc->state.sb_maps; |
1683 | 22.6k | frags=_enc->state.frags; |
1684 | 22.6k | notstart=0; |
1685 | 22.6k | notdone=1; |
1686 | 22.6k | mcu_nvsbs=_enc->mcu_nvsbs; |
1687 | 196k | for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){ |
1688 | 173k | ptrdiff_t cfroffset; |
1689 | 173k | unsigned sbi; |
1690 | 173k | unsigned sbi_end; |
1691 | 173k | notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby); |
1692 | 173k | sbi_end=_enc->pipe.sbi_end[0]; |
1693 | 173k | cfroffset=_enc->pipe.froffset[1]; |
1694 | 1.48M | for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){ |
1695 | 1.31M | int quadi; |
1696 |  | /*Mode addressing is through the Y plane; there are always 4 MBs per SB.*/
1697 | 6.56M | for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){ |
1698 | 2.81M | unsigned activity[4]; |
1699 | 2.81M | unsigned rd_scale[5]; |
1700 | 2.81M | unsigned rd_iscale[5]; |
1701 | 2.81M | unsigned luma; |
1702 | 2.81M | unsigned mbi; |
1703 | 2.81M | int mapii; |
1704 | 2.81M | int mapi; |
1705 | 2.81M | int bi; |
1706 | 2.81M | ptrdiff_t fragi; |
1707 | 2.81M | mbi=sbi<<2|quadi; |
1708 | | /*Activity masking.*/ |
1709 | 2.81M | if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ |
1710 | 2.81M | luma=oc_mb_activity(_enc,mbi,activity); |
1711 | 2.81M | } |
1712 | 0 | else{ |
1713 | 0 | unsigned intra_satd[12]; |
1714 | 0 | luma=oc_mb_intra_satd(_enc,mbi,intra_satd); |
1715 | 0 | oc_mb_activity_fast(_enc,mbi,activity,intra_satd); |
1716 | 0 | for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0; |
1717 | 0 | } |
1718 | 2.81M | activity_sum+=oc_mb_masking(rd_scale,rd_iscale, |
1719 | 2.81M | chroma_rd_scale,activity,activity_avg,luma,luma_avg); |
1720 | 2.81M | luma_sum+=luma; |
1721 | | /*Motion estimation: |
1722 | | We do a basic 1MV search for all macroblocks, coded or not, |
1723 | | keyframe or not, unless we aren't using motion estimation at all.*/ |
1724 | 2.81M | if(!_recode&&_enc->state.curframe_num>0&& |
1725 | 2.81M | _enc->sp_level<OC_SP_LEVEL_NOMC&&_enc->keyframe_frequency_force>1){ |
1726 | 648 | oc_mcenc_search(_enc,mbi); |
1727 | 648 | } |
1728 | 2.81M | if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ |
1729 | 2.81M | oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale); |
1730 | 2.81M | } |
1731 | 2.81M | mb_modes[mbi]=OC_MODE_INTRA; |
1732 | 2.81M | oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe, |
1733 | 2.81M | mbi,rd_scale,rd_iscale); |
1734 | | /*Propagate final MB mode and MVs to the chroma blocks.*/ |
1735 | 14.0M | for(mapii=4;mapii<nmap_idxs;mapii++){ |
1736 | 11.2M | mapi=map_idxs[mapii]; |
1737 | 11.2M | pli=mapi>>2; |
1738 | 11.2M | bi=mapi&3; |
1739 | 11.2M | fragi=mb_maps[mbi][pli][bi]; |
1740 | 11.2M | frags[fragi].refi=OC_FRAME_SELF; |
1741 | 11.2M | frags[fragi].mb_mode=OC_MODE_INTRA; |
1742 | 11.2M | } |
1743 | | /*Save masking scale factors for chroma blocks.*/ |
1744 | 8.42M | for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){ |
1745 | 5.60M | mapi=map_idxs[mapii]; |
1746 | 5.60M | bi=mapi&3; |
1747 | 5.60M | fragi=mb_maps[mbi][1][bi]; |
1748 | 5.60M | mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4]; |
1749 | 5.60M | mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4]; |
1750 | 5.60M | } |
1751 | 2.81M | } |
1752 | 1.31M | } |
1753 | 173k | oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone); |
1754 | | /*Code chroma planes.*/ |
1755 | 520k | for(pli=1;pli<3;pli++){ |
1756 | 346k | oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe, |
1757 | 346k | pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]); |
1758 | 346k | oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone); |
1759 | 346k | } |
1760 | 173k | notstart=1; |
1761 | 173k | } |
1762 | | /*Compute the average block activity and MB luma score for the frame.*/ |
1763 | 22.6k | _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN, |
1764 | 22.6k | (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/ |
1765 | 22.6k | _enc->state.fplanes[0].nfrags)); |
1766 | 22.6k | _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs); |
1767 | | /*Finish filling in the reference frame borders.*/ |
1768 | 22.6k | refi=_enc->state.ref_frame_idx[OC_FRAME_SELF]; |
1769 | 90.6k | for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli); |
1770 | 22.6k | _enc->state.ntotal_coded_fragis=_enc->state.nfrags; |
1771 | 22.6k | } |
1772 | | |
1773 | | |
1774 | | |
1775 | | /*Cost information about a MB mode.*/ |
1776 | | struct oc_mode_choice{ |
1777 | | unsigned cost; |
1778 | | unsigned ssd; |
1779 | | unsigned rate; |
1780 | | unsigned overhead; |
1781 | | unsigned char qii[12]; |
1782 | | }; |
1783 | | |
1784 | | |
1785 | | |
1786 | 5.60M | static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){ |
1787 | 5.60M | _modec->cost=OC_MODE_RD_COST(_modec->ssd, |
1788 | 5.60M | _modec->rate+_modec->overhead,_lambda); |
1789 | 5.60M | } |
1790 | | |
1791 |  | /*A set of skip SSDs used to disable early skipping.*/
1792 | | static const unsigned OC_NOSKIP[12]={ |
1793 | | UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX, |
1794 | | UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX, |
1795 | | UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX |
1796 | | }; |
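Each entry is UINT_MAX, which deliberately fails the _skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2) guard in oc_analyze_mb_mode_luma() below; passing OC_NOSKIP (as oc_cost_intra() does, and as the 4MV path does in vp3_compatible mode) therefore removes the skip branch from consideration entirely.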
1797 | | |
1798 | | /*The estimated number of bits used by a coded chroma block to specify the AC |
1799 | | quantizer. |
1800 | | TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression); |
1801 | | measurements suggest this is in the right ballpark, but it varies somewhat |
1802 | | with lambda.*/ |
1803 | 10.4M | #define OC_CHROMA_QII_RATE ((0xCAE00D1DU>>31-OC_BIT_SCALE)+1>>1) |
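A reading of the magic constant above, offered as a checked observation rather than normative documentation: 0xCAE00D1D is log2(3) in Q31 fixed point; shifting right by 31-OC_BIT_SCALE rescales it to the Q(OC_BIT_SCALE) units used for rates, and the trailing +1>>1 halves it with rounding, giving the 0.5*log2(3), roughly 0.79 bits, quoted in the comment.

#include <math.h>
#include <stdio.h>

/*Sanity check: log2(3) in Q31 should reproduce the constant.*/
int main(void){
  unsigned q31;
  q31=(unsigned)(log2(3.0)*2147483648.0+0.5);
  printf("0x%08X\n",q31);/*Prints 0xCAE00D1D.*/
  return 0;
}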
1804 | | |
1805 | | static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc, |
1806 | | oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs, |
1807 | | const unsigned _frag_satd[12],const unsigned _skip_ssd[12], |
1808 | 4.26M | const unsigned _rd_scale[4],int _qti){ |
1809 | 4.26M | oc_fr_state fr; |
1810 | 4.26M | oc_qii_state qs; |
1811 | 4.26M | unsigned ssd; |
1812 | 4.26M | unsigned rate; |
1813 | 4.26M | unsigned satd; |
1814 | 4.26M | unsigned best_ssd; |
1815 | 4.26M | unsigned best_rate; |
1816 | 4.26M | int best_fri; |
1817 | 4.26M | int best_qii; |
1818 | 4.26M | int lambda; |
1819 | 4.26M | int nqis; |
1820 | 4.26M | int nskipped; |
1821 | 4.26M | int bi; |
1822 | 4.26M | lambda=_enc->lambda; |
1823 | 4.26M | nqis=_enc->state.nqis; |
1824 | | /*We could do a trellis optimization here, but we don't make final skip |
1825 | | decisions until after transform+quantization, so the result wouldn't be |
1826 | | optimal anyway. |
1827 | | Instead we just use a greedy approach; for most SATD values, the |
1828 |  |     differences between the qiis are large enough to drown out the cost of
1829 |  |     coding the flags anyway.*/
1830 | 4.26M | *&fr=*_fr; |
1831 | 4.26M | *&qs=*_qs; |
1832 | 4.26M | ssd=rate=nskipped=0; |
1833 | 21.3M | for(bi=0;bi<4;bi++){ |
1834 | 17.0M | oc_fr_state ft[2]; |
1835 | 17.0M | oc_qii_state qt[3]; |
1836 | 17.0M | unsigned best_cost; |
1837 | 17.0M | unsigned cur_cost; |
1838 | 17.0M | unsigned cur_ssd; |
1839 | 17.0M | unsigned cur_rate; |
1840 | 17.0M | unsigned cur_overhead; |
1841 | 17.0M | int qii; |
1842 | 17.0M | satd=_frag_satd[bi]; |
1843 | 17.0M | *(ft+0)=*&fr; |
1844 | 17.0M | oc_fr_code_block(ft+0); |
1845 | 17.0M | cur_overhead=ft[0].bits-fr.bits; |
1846 | 17.0M | best_rate=oc_dct_cost2(_enc,&best_ssd,0,0,_qti,satd) |
1847 | 17.0M | +(cur_overhead<<OC_BIT_SCALE); |
1848 | 17.0M | if(nqis>1){ |
1849 | 5.56M | oc_qii_state_advance(qt+0,&qs,0); |
1850 | 5.56M | best_rate+=qt[0].bits-qs.bits<<OC_BIT_SCALE; |
1851 | 5.56M | } |
1852 | 17.0M | best_ssd=OC_RD_SCALE(best_ssd,_rd_scale[bi]); |
1853 | 17.0M | best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda); |
1854 | 17.0M | best_fri=0; |
1855 | 17.0M | best_qii=0; |
1856 | 23.0M | for(qii=1;qii<nqis;qii++){ |
1857 | 5.96M | oc_qii_state_advance(qt+qii,&qs,qii); |
1858 | 5.96M | cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,_qti,satd) |
1859 | 5.96M | +(cur_overhead+qt[qii].bits-qs.bits<<OC_BIT_SCALE); |
1860 | 5.96M | cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]); |
1861 | 5.96M | cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda); |
1862 | 5.96M | if(cur_cost<best_cost){ |
1863 | 1.67M | best_cost=cur_cost; |
1864 | 1.67M | best_ssd=cur_ssd; |
1865 | 1.67M | best_rate=cur_rate; |
1866 | 1.67M | best_qii=qii; |
1867 | 1.67M | } |
1868 | 5.96M | } |
1869 | 17.0M | if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)&&nskipped<3){ |
1870 | 9.45M | *(ft+1)=*&fr; |
1871 | 9.45M | oc_fr_skip_block(ft+1); |
1872 | 9.45M | cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE; |
1873 | 9.45M | cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE; |
1874 | 9.45M | cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda); |
1875 | 9.45M | if(cur_cost<=best_cost){ |
1876 | 2.67M | best_ssd=cur_ssd; |
1877 | 2.67M | best_rate=cur_overhead; |
1878 | 2.67M | best_fri=1; |
1879 | 2.67M | best_qii+=4; |
1880 | 2.67M | } |
1881 | 9.45M | } |
1882 | 17.0M | rate+=best_rate; |
1883 | 17.0M | ssd+=best_ssd; |
1884 | 17.0M | *&fr=*(ft+best_fri); |
1885 | 17.0M | if(best_fri==0)*&qs=*(qt+best_qii); |
1886 | 2.67M | else nskipped++; |
1887 | 17.0M | _modec->qii[bi]=best_qii; |
1888 | 17.0M | } |
1889 | 4.26M | _modec->ssd=ssd; |
1890 | 4.26M | _modec->rate=rate; |
1891 | 4.26M | } |
1892 | | |
1893 | | static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc, |
1894 | | oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs, |
1895 | | const unsigned _frag_satd[12],const unsigned _skip_ssd[12], |
1896 | 4.26M | unsigned _rd_scale,int _qti){ |
1897 | 4.26M | unsigned ssd; |
1898 | 4.26M | unsigned rate; |
1899 | 4.26M | unsigned satd; |
1900 | 4.26M | unsigned best_ssd; |
1901 | 4.26M | unsigned best_rate; |
1902 | 4.26M | int best_qii; |
1903 | 4.26M | unsigned cur_cost; |
1904 | 4.26M | unsigned cur_ssd; |
1905 | 4.26M | unsigned cur_rate; |
1906 | 4.26M | int lambda; |
1907 | 4.26M | int nblocks; |
1908 | 4.26M | int nqis; |
1909 | 4.26M | int pli; |
1910 | 4.26M | int bi; |
1911 | 4.26M | int qii; |
1912 | 4.26M | lambda=_enc->lambda; |
1913 | | /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not |
1914 | | worth spending the bits to change the AC quantizer. |
1915 | | TODO: This may be worth revisiting when we separate out DC and AC |
1916 | | predictions from SATD.*/ |
1917 | | #if 0 |
1918 | | nqis=_enc->state.nqis; |
1919 | | #else |
1920 | 4.26M | nqis=1; |
1921 | 4.26M | #endif |
1922 | 4.26M | ssd=_modec->ssd; |
1923 | 4.26M | rate=_modec->rate; |
1924 | | /*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded |
1925 | | order, we assume a constant overhead for coded block and qii flags.*/ |
1926 | 4.26M | nblocks=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt]; |
1927 | 4.26M | nblocks=(nblocks-4>>1)+4; |
1928 | 4.26M | bi=4; |
1929 | 12.8M | for(pli=1;pli<3;pli++){ |
1930 | 18.9M | for(;bi<nblocks;bi++){ |
1931 | 10.4M | unsigned best_cost; |
1932 | 10.4M | satd=_frag_satd[bi]; |
1933 | 10.4M | best_rate=oc_dct_cost2(_enc,&best_ssd,0,pli,_qti,satd) |
1934 | 10.4M | +OC_CHROMA_QII_RATE; |
1935 | 10.4M | best_ssd=OC_RD_SCALE(best_ssd,_rd_scale); |
1936 | 10.4M | best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda); |
1937 | 10.4M | best_qii=0; |
1938 | 10.4M | for(qii=1;qii<nqis;qii++){ |
1939 | 0 | cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,pli,_qti,satd) |
1940 | 0 | +OC_CHROMA_QII_RATE; |
1941 | 0 | cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale); |
1942 | 0 | cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda); |
1943 | 0 | if(cur_cost<best_cost){ |
1944 | 0 | best_cost=cur_cost; |
1945 | 0 | best_ssd=cur_ssd; |
1946 | 0 | best_rate=cur_rate; |
1947 | 0 | best_qii=qii; |
1948 | 0 | } |
1949 | 0 | } |
1950 | 10.4M | if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)){ |
1951 | 7.44M | cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE; |
1952 | 7.44M | cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda); |
1953 | 7.44M | if(cur_cost<=best_cost){ |
1954 | 3.06M | best_ssd=cur_ssd; |
1955 | 3.06M | best_rate=0; |
1956 | 3.06M | best_qii+=4; |
1957 | 3.06M | } |
1958 | 7.44M | } |
1959 | 10.4M | rate+=best_rate; |
1960 | 10.4M | ssd+=best_ssd; |
1961 | 10.4M | _modec->qii[bi]=best_qii; |
1962 | 10.4M | } |
1963 | 8.53M | nblocks=(nblocks-4<<1)+4; |
1964 | 8.53M | } |
1965 | 4.26M | _modec->ssd=ssd; |
1966 | 4.26M | _modec->rate=rate; |
1967 | 4.26M | } |
1968 | | |
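The (nblocks-4>>1)+4 and (nblocks-4<<1)+4 updates above split the chroma map indices evenly between the Cb and Cr passes of the loop. Worked out under the usual Theora map sizes (assumed here: OC_MB_MAP_NIDXS of 6 for 4:2:0, 8 for 4:2:2, and 12 for 4:4:4):

  pixel fmt   map_nidxs   Cb pass (bi)   Cr pass (bi)
  4:2:0            6          4              5
  4:2:2            8          4..5           6..7
  4:4:4           12          4..7           8..11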
1969 | | static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe, |
1970 | 411k | unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){ |
1971 | 411k | const unsigned char *src; |
1972 | 411k | const unsigned char *ref; |
1973 | 411k | int ystride; |
1974 | 411k | const oc_fragment *frags; |
1975 | 411k | const ptrdiff_t *frag_buf_offs; |
1976 | 411k | const ptrdiff_t *sb_map; |
1977 | 411k | const oc_mb_map_plane *mb_map; |
1978 | 411k | const unsigned char *map_idxs; |
1979 | 411k | oc_mv *mvs; |
1980 | 411k | int map_nidxs; |
1981 | 411k | unsigned uncoded_ssd; |
1982 | 411k | int mapii; |
1983 | 411k | int mapi; |
1984 | 411k | int pli; |
1985 | 411k | int bi; |
1986 | 411k | ptrdiff_t fragi; |
1987 | 411k | ptrdiff_t frag_offs; |
1988 | 411k | int borderi; |
1989 | 411k | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
1990 | 411k | ref=_enc->state.ref_frame_data[OC_FRAME_PREV]; |
1991 | 411k | ystride=_enc->state.ref_ystride[0]; |
1992 | 411k | frags=_enc->state.frags; |
1993 | 411k | frag_buf_offs=_enc->state.frag_buf_offs; |
1994 | 411k | sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3]; |
1995 | 411k | mvs=_enc->mb_info[_mbi].block_mv; |
1996 | 2.05M | for(bi=0;bi<4;bi++){ |
1997 | 1.64M | fragi=sb_map[bi]; |
1998 | 1.64M | borderi=frags[fragi].borderi; |
1999 | 1.64M | frag_offs=frag_buf_offs[fragi]; |
2000 | 1.64M | if(borderi<0){ |
2001 | 1.19M | uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride); |
2002 | 1.19M | } |
2003 | 449k | else{ |
2004 | 449k | uncoded_ssd=oc_enc_frag_border_ssd(_enc, |
2005 | 449k | src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask); |
2006 | 449k | } |
2007 | | /*Scale to match DCT domain and RD.*/ |
2008 | 1.64M | uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[bi]); |
2009 | | /*Motion is a special case; if there is more than a full-pixel motion |
2010 | | against the prior frame, penalize skipping. |
2011 | | TODO: The factor of two here is a kludge, but it tested out better than a |
2012 | | hard limit.*/ |
2013 | 1.64M | if(mvs[bi]!=0)uncoded_ssd*=2; |
2014 | 1.64M | _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd; |
2015 | 1.64M | } |
2016 | 411k | mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi]; |
2017 | 411k | map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt]; |
2018 | 411k | map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt]; |
2019 | 411k | map_nidxs=(map_nidxs-4>>1)+4; |
2020 | 411k | mapii=4; |
2021 | 411k | mvs=_enc->mb_info[_mbi].unref_mv; |
2022 | 1.23M | for(pli=1;pli<3;pli++){ |
2023 | 822k | ystride=_enc->state.ref_ystride[pli]; |
2024 | 1.82M | for(;mapii<map_nidxs;mapii++){ |
2025 | 1.00M | mapi=map_idxs[mapii]; |
2026 | 1.00M | bi=mapi&3; |
2027 | 1.00M | fragi=mb_map[pli][bi]; |
2028 | 1.00M | borderi=frags[fragi].borderi; |
2029 | 1.00M | frag_offs=frag_buf_offs[fragi]; |
2030 | 1.00M | if(borderi<0){ |
2031 | 716k | uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride); |
2032 | 716k | } |
2033 | 289k | else{ |
2034 | 289k | uncoded_ssd=oc_enc_frag_border_ssd(_enc, |
2035 | 289k | src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask); |
2036 | 289k | } |
2037 | | /*Scale to match DCT domain and RD.*/ |
2038 | 1.00M | uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[4]); |
2039 | | /*Motion is a special case; if there is more than a full-pixel motion |
2040 | | against the prior frame, penalize skipping. |
2041 | | TODO: The factor of two here is a kludge, but it tested out better than |
2042 |  |        a hard limit.*/
2043 | 1.00M | if(mvs[OC_FRAME_PREV]!=0)uncoded_ssd*=2; |
2044 | 1.00M | _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd; |
2045 | 1.00M | } |
2046 | 822k | map_nidxs=(map_nidxs-4<<1)+4; |
2047 | 822k | } |
2048 | 411k | } |
2049 | | |
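Condensed, the per-fragment rule implemented above: the cost of leaving a fragment uncoded is its SSD against the previous reconstruction (border-masked where needed), weighted by the block's visibility scale, and doubled when the fragment carries any motion so that moving content is not silently frozen. A plain-arithmetic sketch (OC_RD_SKIP_SCALE() is the fixed-point version of the multiply):

/*Sketch of the skip-SSD computation with the fixed-point scaling replaced by
   a plain multiply.*/
static unsigned oc_skip_ssd_sketch(unsigned _uncoded_ssd,double _rd_scale,
 int _mv_nonzero){
  double ssd;
  ssd=_uncoded_ssd*_rd_scale;/*Scale to match DCT domain and R-D.*/
  if(_mv_nonzero)ssd*=2;/*The factor-of-two motion penalty noted above.*/
  return (unsigned)ssd;
}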
2050 | | |
2051 | | static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec, |
2052 | | unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs, |
2053 | | const unsigned _frag_satd[12],const unsigned _skip_ssd[12], |
2054 | 822k | const unsigned _rd_scale[5]){ |
2055 | 822k | oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,_rd_scale,0); |
2056 | 822k | oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs, |
2057 | 822k | _frag_satd,_skip_ssd,_rd_scale[4],0); |
2058 | 822k | _modec->overhead= |
2059 | 822k | oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE; |
2060 | 822k | oc_mode_set_cost(_modec,_enc->lambda); |
2061 | 822k | } |
2062 | | |
2063 | | static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec, |
2064 | | unsigned _mbi,int _mb_mode,oc_mv _mv, |
2065 | | const oc_fr_state *_fr,const oc_qii_state *_qs, |
2066 | 2.98M | const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){ |
2067 | 2.98M | unsigned frag_satd[12]; |
2068 | 2.98M | const unsigned char *src; |
2069 | 2.98M | const unsigned char *ref; |
2070 | 2.98M | int ystride; |
2071 | 2.98M | const ptrdiff_t *frag_buf_offs; |
2072 | 2.98M | const ptrdiff_t *sb_map; |
2073 | 2.98M | const oc_mb_map_plane *mb_map; |
2074 | 2.98M | const unsigned char *map_idxs; |
2075 | 2.98M | int map_nidxs; |
2076 | 2.98M | int mapii; |
2077 | 2.98M | int mapi; |
2078 | 2.98M | int mv_offs[2]; |
2079 | 2.98M | int pli; |
2080 | 2.98M | int bi; |
2081 | 2.98M | ptrdiff_t fragi; |
2082 | 2.98M | ptrdiff_t frag_offs; |
2083 | 2.98M | int dc; |
2084 | 2.98M | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
2085 | 2.98M | ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)]; |
2086 | 2.98M | ystride=_enc->state.ref_ystride[0]; |
2087 | 2.98M | frag_buf_offs=_enc->state.frag_buf_offs; |
2088 | 2.98M | sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3]; |
2089 | 2.98M | _modec->rate=_modec->ssd=0; |
2090 | 2.98M | if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv)>1){ |
2091 | 2.87M | for(bi=0;bi<4;bi++){ |
2092 | 2.30M | fragi=sb_map[bi]; |
2093 | 2.30M | frag_offs=frag_buf_offs[fragi]; |
2094 | 2.30M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
2095 | 2.30M | frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs, |
2096 | 2.30M | ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride); |
2097 | 2.30M | frag_satd[bi]+=abs(dc); |
2098 | 2.30M | } |
2099 | 0 | else{ |
2100 | 0 | frag_satd[bi]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs, |
2101 | 0 | ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX); |
2102 | 0 | } |
2103 | 2.30M | } |
2104 | 575k | } |
2105 | 2.40M | else{ |
2106 | 12.0M | for(bi=0;bi<4;bi++){ |
2107 | 9.61M | fragi=sb_map[bi]; |
2108 | 9.61M | frag_offs=frag_buf_offs[fragi]; |
2109 | 9.61M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
2110 | 9.61M | frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs, |
2111 | 9.61M | ref+frag_offs+mv_offs[0],ystride); |
2112 | 9.61M | frag_satd[bi]+=abs(dc); |
2113 | 9.61M | } |
2114 | 0 | else{ |
2115 | 0 | frag_satd[bi]=oc_enc_frag_sad(_enc,src+frag_offs, |
2116 | 0 | ref+frag_offs+mv_offs[0],ystride); |
2117 | 0 | } |
2118 | 9.61M | } |
2119 | 2.40M | } |
2120 | 2.98M | mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi]; |
2121 | 2.98M | map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt]; |
2122 | 2.98M | map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt]; |
2123 | | /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/ |
2124 | 2.98M | ystride=_enc->state.ref_ystride[1]; |
2125 | 2.98M | if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,_mv)>1){ |
2126 | 3.50M | for(mapii=4;mapii<map_nidxs;mapii++){ |
2127 | 2.42M | mapi=map_idxs[mapii]; |
2128 | 2.42M | pli=mapi>>2; |
2129 | 2.42M | bi=mapi&3; |
2130 | 2.42M | fragi=mb_map[pli][bi]; |
2131 | 2.42M | frag_offs=frag_buf_offs[fragi]; |
2132 | 2.42M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
2133 | 2.42M | frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs, |
2134 | 2.42M | ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride); |
2135 | 2.42M | frag_satd[mapii]+=abs(dc); |
2136 | 2.42M | } |
2137 | 0 | else{ |
2138 | 0 | frag_satd[mapii]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs, |
2139 | 0 | ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX); |
2140 | 0 | } |
2141 | 2.42M | } |
2142 | 1.08M | } |
2143 | 1.89M | else{ |
2144 | 6.75M | for(mapii=4;mapii<map_nidxs;mapii++){ |
2145 | 4.85M | mapi=map_idxs[mapii]; |
2146 | 4.85M | pli=mapi>>2; |
2147 | 4.85M | bi=mapi&3; |
2148 | 4.85M | fragi=mb_map[pli][bi]; |
2149 | 4.85M | frag_offs=frag_buf_offs[fragi]; |
2150 | 4.85M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
2151 | 4.85M | frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs, |
2152 | 4.85M | ref+frag_offs+mv_offs[0],ystride); |
2153 | 4.85M | frag_satd[mapii]+=abs(dc); |
2154 | 4.85M | } |
2155 | 0 | else{ |
2156 | 0 | frag_satd[mapii]=oc_enc_frag_sad(_enc,src+frag_offs, |
2157 | 0 | ref+frag_offs+mv_offs[0],ystride); |
2158 | 0 | } |
2159 | 4.85M | } |
2160 | 1.89M | } |
2161 | 2.98M | oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1); |
2162 | 2.98M | oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs, |
2163 | 2.98M | frag_satd,_skip_ssd,_rd_scale[4],1); |
2164 | 2.98M | _modec->overhead= |
2165 | 2.98M | oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE; |
2166 | 2.98M | oc_mode_set_cost(_modec,_enc->lambda); |
2167 | 2.98M | } |
2168 | | |
2169 | | static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec, |
2170 | | unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs, |
2171 | 822k | const unsigned _skip_ssd[12],const unsigned _rd_scale[4]){ |
2172 | 822k | oc_cost_inter(_enc,_modec,_mbi,_mb_mode,0,_fr,_qs,_skip_ssd,_rd_scale); |
2173 | 822k | } |
2174 | | |
2175 | | static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec, |
2176 | | unsigned _mbi,int _mb_mode,oc_mv _mv, |
2177 | | const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12], |
2178 | 1.33M | const unsigned _rd_scale[4]){ |
2179 | 1.33M | int bits0; |
2180 | 1.33M | oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd,_rd_scale); |
2181 | 1.33M | bits0=OC_MV_BITS[0][OC_MV_X(_mv)+31]+OC_MV_BITS[0][OC_MV_Y(_mv)+31]; |
2182 | 1.33M | _modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12) |
2183 | 1.33M | -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE; |
2184 | 1.33M | oc_mode_set_cost(_modec,_enc->lambda); |
2185 | 1.33M | return bits0; |
2186 | 1.33M | } |
2187 | | |
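The overhead update in oc_cost_inter1mv() above charges this MV its marginal cost under whichever of the two MV coding schemes would currently be cheaper for the whole frame: bits0 is the variable-length cost from the OC_MV_BITS tables, and 12 bits is the fixed-length scheme's per-MV cost, as used above. Restated as a sketch:

/*Sketch: marginal bit cost of adding one motion vector, given running totals
   for the variable-length and fixed-length MV coding schemes.*/
static int oc_mv_marginal_bits(int _total_vlc,int _total_fixed,int _vlc_bits){
  int before;
  int after;
  before=_total_vlc<_total_fixed?_total_vlc:_total_fixed;
  after=_total_vlc+_vlc_bits<_total_fixed+12?
   _total_vlc+_vlc_bits:_total_fixed+12;
  return after-before;
}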
2188 | | /*A mapping from oc_mb_map (raster) ordering to oc_sb_map (Hilbert) ordering.*/ |
2189 | | static const unsigned char OC_MB_PHASE[4][4]={ |
2190 | | {0,1,3,2},{0,3,1,2},{0,3,1,2},{2,3,1,0} |
2191 | | }; |
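oc_cost_inter4mv() below iterates luma blocks in raster (mb_map) order but stores their SATDs at frag_satd[OC_MB_PHASE[_mbi&3][bi]], i.e. in the Hilbert order the rest of the analysis expects; the row is selected by the macro block's position within its super block. A small sketch of the permutation, reusing the table above:

/*Sketch: permute raster-ordered per-block values into Hilbert order.
  E.g., for quadrant 1, raster blocks 0..3 land at positions 0,3,1,2.*/
static void oc_raster_to_hilbert(int _quadrant,const unsigned _raster[4],
 unsigned _hilbert[4]){
  int bi;
  for(bi=0;bi<4;bi++)_hilbert[OC_MB_PHASE[_quadrant&3][bi]]=_raster[bi];
}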
2192 | | |
2193 | | static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec, |
2194 | | unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs, |
2195 | 465k | const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){ |
2196 | 465k | unsigned frag_satd[12]; |
2197 | 465k | oc_mv lbmvs[4]; |
2198 | 465k | oc_mv cbmvs[4]; |
2199 | 465k | const unsigned char *src; |
2200 | 465k | const unsigned char *ref; |
2201 | 465k | int ystride; |
2202 | 465k | const ptrdiff_t *frag_buf_offs; |
2203 | 465k | oc_mv *frag_mvs; |
2204 | 465k | const oc_mb_map_plane *mb_map; |
2205 | 465k | const unsigned char *map_idxs; |
2206 | 465k | int map_nidxs; |
2207 | 465k | int nqis; |
2208 | 465k | int mapii; |
2209 | 465k | int mapi; |
2210 | 465k | int mv_offs[2]; |
2211 | 465k | int pli; |
2212 | 465k | int bi; |
2213 | 465k | ptrdiff_t fragi; |
2214 | 465k | ptrdiff_t frag_offs; |
2215 | 465k | int bits0; |
2216 | 465k | int bits1; |
2217 | 465k | unsigned satd; |
2218 | 465k | int dc; |
2219 | 465k | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
2220 | 465k | ref=_enc->state.ref_frame_data[OC_FRAME_PREV]; |
2221 | 465k | ystride=_enc->state.ref_ystride[0]; |
2222 | 465k | frag_buf_offs=_enc->state.frag_buf_offs; |
2223 | 465k | frag_mvs=_enc->state.frag_mvs; |
2224 | 465k | mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi]; |
2225 | 465k | _modec->rate=_modec->ssd=0; |
2226 | 2.32M | for(bi=0;bi<4;bi++){ |
2227 | 1.86M | fragi=mb_map[0][bi]; |
2228 | | /*Save the block MVs as the current ones while we're here; we'll replace |
2229 | | them if we don't ultimately choose 4MV mode.*/ |
2230 | 1.86M | frag_mvs[fragi]=_mv[bi]; |
2231 | 1.86M | frag_offs=frag_buf_offs[fragi]; |
2232 | 1.86M | if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv[bi])>1){ |
2233 | 184k | satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs, |
2234 | 184k | ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride); |
2235 | 184k | } |
2236 | 1.67M | else{ |
2237 | 1.67M | satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs, |
2238 | 1.67M | ref+frag_offs+mv_offs[0],ystride); |
2239 | 1.67M | } |
2240 | 1.86M | frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+abs(dc); |
2241 | 1.86M | } |
2242 | 465k | oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd, |
2243 | 465k | _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1); |
2244 | | /*Figure out which blocks are being skipped and give them (0,0) MVs.*/ |
2245 | 465k | bits0=0; |
2246 | 465k | bits1=0; |
2247 | 465k | nqis=_enc->state.nqis; |
2248 | 2.32M | for(bi=0;bi<4;bi++){ |
2249 | 1.86M | if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis)lbmvs[bi]=0; |
2250 | 1.56M | else{ |
2251 | 1.56M | lbmvs[bi]=_mv[bi]; |
2252 | 1.56M | bits0+=OC_MV_BITS[0][OC_MV_X(_mv[bi])+31] |
2253 | 1.56M | +OC_MV_BITS[0][OC_MV_Y(_mv[bi])+31]; |
2254 | 1.56M | bits1+=12; |
2255 | 1.56M | } |
2256 | 1.86M | } |
2257 | 465k | (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,lbmvs); |
2258 | 465k | map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt]; |
2259 | 465k | map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt]; |
2260 | | /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/ |
2261 | 465k | ystride=_enc->state.ref_ystride[1]; |
2262 | 1.61M | for(mapii=4;mapii<map_nidxs;mapii++){ |
2263 | 1.14M | mapi=map_idxs[mapii]; |
2264 | 1.14M | pli=mapi>>2; |
2265 | 1.14M | bi=mapi&3; |
2266 | 1.14M | fragi=mb_map[pli][bi]; |
2267 | 1.14M | frag_offs=frag_buf_offs[fragi]; |
2268 | | /*TODO: We could save half these calls by re-using the results for the Cb |
2269 | | and Cr planes; is it worth it?*/ |
2270 | 1.14M | if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,cbmvs[bi])>1){ |
2271 | 673k | satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs, |
2272 | 673k | ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride); |
2273 | 673k | } |
2274 | 473k | else{ |
2275 | 473k | satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs, |
2276 | 473k | ref+frag_offs+mv_offs[0],ystride); |
2277 | 473k | } |
2278 | 1.14M | frag_satd[mapii]=satd+abs(dc); |
2279 | 1.14M | } |
2280 | 465k | oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs, |
2281 | 465k | frag_satd,_skip_ssd,_rd_scale[4],1); |
2282 | 465k | _modec->overhead= |
2283 | 465k | oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR) |
2284 | 465k | +OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1) |
2285 | 465k | -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE; |
2286 | 465k | oc_mode_set_cost(_modec,_enc->lambda); |
2287 | 465k | } |
2288 | | |
2289 | 39.4k | int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){ |
2290 | 39.4k | oc_set_chroma_mvs_func set_chroma_mvs; |
2291 | 39.4k | oc_qii_state intra_luma_qs; |
2292 | 39.4k | oc_mv last_mv; |
2293 | 39.4k | oc_mv prior_mv; |
2294 | 39.4k | ogg_int64_t interbits; |
2295 | 39.4k | ogg_int64_t intrabits; |
2296 | 39.4k | ogg_int64_t activity_sum; |
2297 | 39.4k | ogg_int64_t luma_sum; |
2298 | 39.4k | unsigned activity_avg; |
2299 | 39.4k | unsigned luma_avg; |
2300 | 39.4k | const ogg_uint16_t *chroma_rd_scale; |
2301 | 39.4k | ogg_uint16_t *mcu_rd_scale; |
2302 | 39.4k | ogg_uint16_t *mcu_rd_iscale; |
2303 | 39.4k | const unsigned char *map_idxs; |
2304 | 39.4k | int nmap_idxs; |
2305 | 39.4k | unsigned *coded_mbis; |
2306 | 39.4k | unsigned *uncoded_mbis; |
2307 | 39.4k | size_t ncoded_mbis; |
2308 | 39.4k | size_t nuncoded_mbis; |
2309 | 39.4k | oc_sb_flags *sb_flags; |
2310 | 39.4k | signed char *mb_modes; |
2311 | 39.4k | const oc_sb_map *sb_maps; |
2312 | 39.4k | const oc_mb_map *mb_maps; |
2313 | 39.4k | oc_mb_enc_info *embs; |
2314 | 39.4k | oc_fragment *frags; |
2315 | 39.4k | oc_mv *frag_mvs; |
2316 | 39.4k | unsigned stripe_sby; |
2317 | 39.4k | unsigned mcu_nvsbs; |
2318 | 39.4k | int notstart; |
2319 | 39.4k | int notdone; |
2320 | 39.4k | unsigned sbi; |
2321 | 39.4k | unsigned sbi_end; |
2322 | 39.4k | int refi; |
2323 | 39.4k | int pli; |
2324 | 39.4k | int sp_level; |
2325 | 39.4k | sp_level=_enc->sp_level; |
2326 | 39.4k | set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt]; |
2327 | 39.4k | _enc->state.frame_type=OC_INTER_FRAME; |
2328 | 39.4k | oc_mode_scheme_chooser_reset(&_enc->chooser); |
2329 | 39.4k | oc_enc_tokenize_start(_enc); |
2330 | 39.4k | oc_enc_pipeline_init(_enc,&_enc->pipe); |
2331 | 39.4k | oc_enc_mode_rd_init(_enc); |
2332 | 39.4k | if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs); |
2333 | 39.4k | _enc->mv_bits[0]=_enc->mv_bits[1]=0; |
2334 | 39.4k | interbits=intrabits=0; |
2335 | 39.4k | activity_sum=luma_sum=0; |
2336 | 39.4k | activity_avg=_enc->activity_avg; |
2337 | 39.4k | luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8); |
2338 | 39.4k | chroma_rd_scale=_enc->chroma_rd_scale[OC_INTER_FRAME][_enc->state.qis[0]]; |
2339 | 39.4k | mcu_rd_scale=_enc->mcu_rd_scale; |
2340 | 39.4k | mcu_rd_iscale=_enc->mcu_rd_iscale; |
2341 | 39.4k | last_mv=prior_mv=0; |
2342 | | /*Choose MVs and MB modes and quantize and code luma. |
2343 | | Must be done in Hilbert order.*/ |
2344 | 39.4k | map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt]; |
2345 | 39.4k | nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt]; |
2346 | 39.4k | coded_mbis=_enc->coded_mbis; |
2347 | 39.4k | uncoded_mbis=coded_mbis+_enc->state.nmbs; |
2348 | 39.4k | ncoded_mbis=0; |
2349 | 39.4k | nuncoded_mbis=0; |
2350 | 39.4k | _enc->state.ncoded_fragis[0]=0; |
2351 | 39.4k | _enc->state.ncoded_fragis[1]=0; |
2352 | 39.4k | _enc->state.ncoded_fragis[2]=0; |
2353 | 39.4k | sb_flags=_enc->state.sb_flags; |
2354 | 39.4k | mb_modes=_enc->state.mb_modes; |
2355 | 39.4k | sb_maps=(const oc_sb_map *)_enc->state.sb_maps; |
2356 | 39.4k | mb_maps=(const oc_mb_map *)_enc->state.mb_maps; |
2357 | 39.4k | embs=_enc->mb_info; |
2358 | 39.4k | frags=_enc->state.frags; |
2359 | 39.4k | frag_mvs=_enc->state.frag_mvs; |
2360 | 39.4k | notstart=0; |
2361 | 39.4k | notdone=1; |
2362 | 39.4k | mcu_nvsbs=_enc->mcu_nvsbs; |
2363 | 81.8k | for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){ |
2364 | 42.4k | ptrdiff_t cfroffset; |
2365 | 42.4k | notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby); |
2366 | 42.4k | sbi_end=_enc->pipe.sbi_end[0]; |
2367 | 42.4k | cfroffset=_enc->pipe.froffset[1]; |
2368 | 166k | for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){ |
2369 | 123k | int quadi; |
2370 |  |       /*Mode addressing is through the Y plane; there are always 4 MBs per SB.*/
2371 | 618k | for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){ |
2372 | 411k | oc_mode_choice modes[8]; |
2373 | 411k | unsigned activity[4]; |
2374 | 411k | unsigned rd_scale[5]; |
2375 | 411k | unsigned rd_iscale[5]; |
2376 | 411k | unsigned skip_ssd[12]; |
2377 | 411k | unsigned intra_satd[12]; |
2378 | 411k | unsigned luma; |
2379 | 411k | int mb_mv_bits_0; |
2380 | 411k | int mb_gmv_bits_0; |
2381 | 411k | int inter_mv_pref; |
2382 | 411k | int mb_mode; |
2383 | 411k | int refi; |
2384 | 411k | int mv; |
2385 | 411k | unsigned mbi; |
2386 | 411k | int mapii; |
2387 | 411k | int mapi; |
2388 | 411k | int bi; |
2389 | 411k | ptrdiff_t fragi; |
2390 | 411k | mbi=sbi<<2|quadi; |
2391 | 411k | luma=oc_mb_intra_satd(_enc,mbi,intra_satd); |
2392 | | /*Activity masking.*/ |
2393 | 411k | if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ |
2394 | 411k | oc_mb_activity(_enc,mbi,activity); |
2395 | 411k | } |
2396 | 0 | else oc_mb_activity_fast(_enc,mbi,activity,intra_satd); |
2397 | 411k | luma_sum+=luma; |
2398 | 411k | activity_sum+=oc_mb_masking(rd_scale,rd_iscale, |
2399 | 411k | chroma_rd_scale,activity,activity_avg,luma,luma_avg); |
2400 | | /*Motion estimation: |
2401 | | We always do a basic 1MV search for all macroblocks, coded or not, |
2402 | | keyframe or not.*/ |
2403 | 411k | if(!_recode&&sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi); |
2404 | 411k | mv=0; |
2405 | | /*Find the block choice with the lowest estimated coding cost. |
2406 |  |           If a Cb or Cr block from a macro block is coded but no Y' block is,
2407 |  |            then the mode MUST be OC_MODE_INTER_NOMV.
2408 |  |           This is the default state to which the mode data structure is
2409 |  |            initialized in both the encoder and decoder at the start of each frame.*/
2410 | | /*Block coding cost is estimated from correlated SATD metrics.*/ |
2411 | | /*At this point, all blocks that are in frame are still marked coded.*/ |
2412 | 411k | if(!_recode){ |
2413 | 313k | embs[mbi].unref_mv[OC_FRAME_GOLD]= |
2414 | 313k | embs[mbi].analysis_mv[0][OC_FRAME_GOLD]; |
2415 | 313k | embs[mbi].unref_mv[OC_FRAME_PREV]= |
2416 | 313k | embs[mbi].analysis_mv[0][OC_FRAME_PREV]; |
2417 | 313k | embs[mbi].refined=0; |
2418 | 313k | } |
2419 | | /*Estimate the cost of coding this MB in a keyframe.*/ |
2420 | 411k | if(_allow_keyframe){ |
2421 | 411k | oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi, |
2422 | 411k | _enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale); |
2423 | 411k | intrabits+=modes[OC_MODE_INTRA].rate; |
2424 | 2.05M | for(bi=0;bi<4;bi++){ |
2425 | 1.64M | oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs, |
2426 | 1.64M | modes[OC_MODE_INTRA].qii[bi]); |
2427 | 1.64M | } |
2428 | 411k | } |
2429 | | /*Estimate the cost in a delta frame for various modes.*/ |
2430 | 411k | oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd); |
2431 | 411k | if(sp_level<OC_SP_LEVEL_NOMC){ |
2432 | 411k | oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi, |
2433 | 411k | OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0, |
2434 | 411k | skip_ssd,rd_scale); |
2435 | 411k | oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi, |
2436 | 411k | _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale); |
2437 | 411k | mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi, |
2438 | 411k | OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV], |
2439 | 411k | _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale); |
2440 | 411k | oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi, |
2441 | 411k | OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0, |
2442 | 411k | skip_ssd,rd_scale); |
2443 | 411k | oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi, |
2444 | 411k | OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0, |
2445 | 411k | skip_ssd,rd_scale); |
2446 | 411k | oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi, |
2447 | 411k | OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0, |
2448 | 411k | skip_ssd,rd_scale); |
2449 | 411k | mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi, |
2450 | 411k | OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD], |
2451 | 411k | _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale); |
2452 | | /*The explicit MV modes (2,6,7) have not yet gone through halfpel |
2453 | | refinement. |
2454 | | We choose the explicit MV mode that's already furthest ahead on |
2455 | | R-D cost and refine only that one. |
2456 | | We have to be careful to remember which ones we've refined so that |
2457 | | we don't refine it again if we re-encode this frame.*/ |
2458 | 411k | inter_mv_pref=_enc->lambda*3; |
2459 | 411k | if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ |
2460 | 411k | oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi, |
2461 | 411k | embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0, |
2462 | 411k | skip_ssd,rd_scale); |
2463 | 411k | } |
2464 | 0 | else{ |
2465 | 0 | modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX; |
2466 | 0 | } |
2467 | 411k | if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&& |
2468 | 411k | modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){ |
2469 | 53.8k | if(!(embs[mbi].refined&0x80)){ |
2470 | 41.8k | oc_mcenc_refine4mv(_enc,mbi); |
2471 | 41.8k | embs[mbi].refined|=0x80; |
2472 | 41.8k | } |
2473 | 53.8k | oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi, |
2474 | 53.8k | embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0, |
2475 | 53.8k | skip_ssd,rd_scale); |
2476 | 53.8k | } |
2477 | 357k | else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref< |
2478 | 357k | modes[OC_MODE_INTER_MV].cost){ |
2479 | 100k | if(!(embs[mbi].refined&0x40)){ |
2480 | 86.6k | oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD); |
2481 | 86.6k | embs[mbi].refined|=0x40; |
2482 | 86.6k | } |
2483 | 100k | mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi, |
2484 | 100k | OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD], |
2485 | 100k | _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale); |
2486 | 100k | } |
2487 | 411k | if(!(embs[mbi].refined&0x04)){ |
2488 | 313k | oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV); |
2489 | 313k | embs[mbi].refined|=0x04; |
2490 | 313k | } |
2491 | 411k | mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi, |
2492 | 411k | OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV], |
2493 | 411k | _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale); |
2494 | | /*Finally, pick the mode with the cheapest estimated R-D cost.*/ |
2495 | 411k | mb_mode=OC_MODE_INTER_NOMV; |
2496 | 411k | if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){ |
2497 | 319k | mb_mode=OC_MODE_INTRA; |
2498 | 319k | } |
2499 | 411k | if(modes[OC_MODE_INTER_MV_LAST].cost<modes[mb_mode].cost){ |
2500 | 61.9k | mb_mode=OC_MODE_INTER_MV_LAST; |
2501 | 61.9k | } |
2502 | 411k | if(modes[OC_MODE_INTER_MV_LAST2].cost<modes[mb_mode].cost){ |
2503 | 10.9k | mb_mode=OC_MODE_INTER_MV_LAST2; |
2504 | 10.9k | } |
2505 | 411k | if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){ |
2506 | 16.5k | mb_mode=OC_MODE_GOLDEN_NOMV; |
2507 | 16.5k | } |
2508 | 411k | if(modes[OC_MODE_GOLDEN_MV].cost<modes[mb_mode].cost){ |
2509 | 33.0k | mb_mode=OC_MODE_GOLDEN_MV; |
2510 | 33.0k | } |
2511 | 411k | if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[mb_mode].cost){ |
2512 | 17.0k | mb_mode=OC_MODE_INTER_MV_FOUR; |
2513 | 17.0k | } |
2514 | | /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/ |
2515 | 411k | if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){ |
2516 | 53.6k | inter_mv_pref=0; |
2517 | 53.6k | } |
2518 | 411k | if(modes[OC_MODE_INTER_MV].cost<modes[mb_mode].cost+inter_mv_pref){ |
2519 | 31.0k | mb_mode=OC_MODE_INTER_MV; |
2520 | 31.0k | } |
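 | | /*Illustrative figures, not from the source: with _enc->lambda=100,
 | |    inter_mv_pref=300, so OC_MODE_INTER_MV displaces the current
 | |    best mode unless that mode is at least 300 cost units cheaper;
 | |    when LAST or LAST2 won above, the preference was zeroed and a
 | |    strict comparison applies.*/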
2521 | 411k | } |
2522 | 0 | else{ |
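 | | /*In this branch motion search is skipped entirely: only the
 | |    zero-MV inter modes and INTRA are costed.*/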
2523 | 0 | oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi, |
2524 | 0 | OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0, |
2525 | 0 | skip_ssd,rd_scale); |
2526 | 0 | oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi, |
2527 | 0 | _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale); |
2528 | 0 | oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi, |
2529 | 0 | OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0, |
2530 | 0 | skip_ssd,rd_scale); |
2531 | 0 | mb_mode=OC_MODE_INTER_NOMV; |
2532 | 0 | if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){ |
2533 | 0 | mb_mode=OC_MODE_INTRA; |
2534 | 0 | } |
2535 | 0 | if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){ |
2536 | 0 | mb_mode=OC_MODE_GOLDEN_NOMV; |
2537 | 0 | } |
2538 | 0 | mb_mv_bits_0=mb_gmv_bits_0=0; |
2539 | 0 | } |
2540 | 411k | mb_modes[mbi]=mb_mode; |
2541 | | /*Propagate the MVs to the luma blocks.*/ |
2542 | 411k | if(mb_mode!=OC_MODE_INTER_MV_FOUR){ |
2543 | 401k | switch(mb_mode){ |
2544 | 31.0k | case OC_MODE_INTER_MV:{ |
2545 | 31.0k | mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV]; |
2546 | 31.0k | }break; |
2547 | 42.2k | case OC_MODE_INTER_MV_LAST:mv=last_mv;break; |
2548 | 7.46k | case OC_MODE_INTER_MV_LAST2:mv=prior_mv;break; |
2549 | 23.2k | case OC_MODE_GOLDEN_MV:{ |
2550 | 23.2k | mv=embs[mbi].analysis_mv[0][OC_FRAME_GOLD]; |
2551 | 23.2k | }break; |
2552 | 401k | } |
2553 | 2.00M | for(bi=0;bi<4;bi++){ |
2554 | 1.60M | fragi=mb_maps[mbi][0][bi]; |
2555 | 1.60M | frag_mvs[fragi]=mv; |
2556 | 1.60M | } |
2557 | 401k | } |
2558 | 2.05M | for(bi=0;bi<4;bi++){ |
2559 | 1.64M | fragi=sb_maps[mbi>>2][mbi&3][bi]; |
2560 | 1.64M | frags[fragi].qii=modes[mb_mode].qii[bi]; |
2561 | 1.64M | } |
2562 | 411k | if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi, |
2563 | 411k | modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){ |
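 | | /*The transform/quantize call can revise the mode stored in
 | |    mb_modes[mbi] (e.g., backing out of 4MV when some blocks are
 | |    skipped), so re-read it and keep the original for comparison.*/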
2564 | 346k | int orig_mb_mode; |
2565 | 346k | orig_mb_mode=mb_mode; |
2566 | 346k | mb_mode=mb_modes[mbi]; |
2567 | 346k | refi=OC_FRAME_FOR_MODE(mb_mode); |
2568 | 346k | switch(mb_mode){ |
2569 | 18.5k | case OC_MODE_INTER_MV:{ |
2570 | 18.5k | prior_mv=last_mv; |
2571 | | /*If we're backing out from 4MV, find the MV we're actually |
2572 | | using.*/ |
2573 | 18.5k | if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){ |
2574 | 186 | for(bi=0;;bi++){ |
2575 | 186 | fragi=mb_maps[mbi][0][bi]; |
2576 | 186 | if(frags[fragi].coded){ |
2577 | 66 | mv=last_mv=frag_mvs[fragi]; |
2578 | 66 | break; |
2579 | 66 | } |
2580 | 186 | } |
2581 | 66 | mb_mv_bits_0=OC_MV_BITS[0][OC_MV_X(mv)+31] |
2582 | 66 | +OC_MV_BITS[0][OC_MV_Y(mv)+31]; |
2583 | 66 | } |
2584 | | /*Otherwise we used the original analysis MV.*/ |
2585 | 18.5k | else last_mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV]; |
2586 | 18.5k | _enc->mv_bits[0]+=mb_mv_bits_0; |
2587 | 18.5k | _enc->mv_bits[1]+=12; |
2588 | 18.5k | }break; |
2589 | 5.86k | case OC_MODE_INTER_MV_LAST2:{ |
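 | | /*Coding from the LAST2 predictor promotes that vector to LAST:
 | |    swap last_mv and prior_mv.*/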
2590 | 5.86k | oc_mv tmp_mv; |
2591 | 5.86k | tmp_mv=prior_mv; |
2592 | 5.86k | prior_mv=last_mv; |
2593 | 5.86k | last_mv=tmp_mv; |
2594 | 5.86k | }break; |
2595 | 19.0k | case OC_MODE_GOLDEN_MV:{ |
2596 | 19.0k | _enc->mv_bits[0]+=mb_gmv_bits_0; |
2597 | 19.0k | _enc->mv_bits[1]+=12; |
2598 | 19.0k | }break; |
2599 | 7.75k | case OC_MODE_INTER_MV_FOUR:{ |
2600 | 7.75k | oc_mv lbmvs[4]; |
2601 | 7.75k | oc_mv cbmvs[4]; |
2602 | 7.75k | prior_mv=last_mv; |
2603 | 38.7k | for(bi=0;bi<4;bi++){ |
2604 | 31.0k | fragi=mb_maps[mbi][0][bi]; |
2605 | 31.0k | if(frags[fragi].coded){ |
2606 | 29.5k | lbmvs[bi]=last_mv=frag_mvs[fragi]; |
2607 | 29.5k | _enc->mv_bits[0]+=OC_MV_BITS[0][OC_MV_X(last_mv)+31] |
2608 | 29.5k | +OC_MV_BITS[0][OC_MV_Y(last_mv)+31]; |
2609 | 29.5k | _enc->mv_bits[1]+=12; |
2610 | 29.5k | } |
2611 | | /*Replace the block MVs for not-coded blocks with (0,0).*/ |
2612 | 1.48k | else lbmvs[bi]=0; |
2613 | 31.0k | } |
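 | | /*Derive the chroma block MVs from the four luma MVs; the exact
 | |    averaging depends on the chroma subsampling, hence the
 | |    set_chroma_mvs function pointer.*/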
2614 | 7.75k | (*set_chroma_mvs)(cbmvs,lbmvs); |
2615 | 26.8k | for(mapii=4;mapii<nmap_idxs;mapii++){ |
2616 | 19.0k | mapi=map_idxs[mapii]; |
2617 | 19.0k | pli=mapi>>2; |
2618 | 19.0k | bi=mapi&3; |
2619 | 19.0k | fragi=mb_maps[mbi][pli][bi]; |
2620 | 19.0k | frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii]; |
2621 | 19.0k | frags[fragi].refi=refi; |
2622 | 19.0k | frags[fragi].mb_mode=mb_mode; |
2623 | 19.0k | frag_mvs[fragi]=cbmvs[bi]; |
2624 | 19.0k | } |
2625 | 7.75k | }break; |
2626 | 346k | } |
2627 | 346k | coded_mbis[ncoded_mbis++]=mbi; |
2628 | 346k | oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode); |
2629 | 346k | interbits+=modes[mb_mode].rate+modes[mb_mode].overhead; |
2630 | 346k | } |
2631 | 65.1k | else{ |
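 | | /*Uncoded MBs are listed behind uncoded_mbis, growing downward
 | |    (note the pre-decrement), so they stay separate from the coded
 | |    list built upward in coded_mbis.*/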
2632 | 65.1k | *(uncoded_mbis-++nuncoded_mbis)=mbi; |
2633 | 65.1k | mb_mode=OC_MODE_INTER_NOMV; |
2634 | 65.1k | refi=OC_FRAME_PREV; |
2635 | 65.1k | mv=0; |
2636 | 65.1k | } |
2637 | | /*Propagate final MB mode and MVs to the chroma blocks. |
2638 | | This has already been done for 4MV mode, since it requires individual |
2639 | | block motion vectors.*/ |
2640 | 411k | if(mb_mode!=OC_MODE_INTER_MV_FOUR){ |
2641 | 1.39M | for(mapii=4;mapii<nmap_idxs;mapii++){ |
2642 | 987k | mapi=map_idxs[mapii]; |
2643 | 987k | pli=mapi>>2; |
2644 | 987k | bi=mapi&3; |
2645 | 987k | fragi=mb_maps[mbi][pli][bi]; |
2646 | | /*If we switched from 4MV mode to INTER_MV mode, then the qii |
2647 | | values won't have been chosen with the right MV, but it's |
2648 | | probably not worth re-estimating them.*/
2649 | 987k | frags[fragi].qii=modes[mb_mode].qii[mapii]; |
2650 | 987k | frags[fragi].refi=refi; |
2651 | 987k | frags[fragi].mb_mode=mb_mode; |
2652 | 987k | frag_mvs[fragi]=mv; |
2653 | 987k | } |
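 | | /*rd_scale[4] holds the MB's chroma scale factor; the loop covers
 | |    only the first chroma plane's fragments, which presumably
 | |    suffices because both chroma planes share the same layout.*/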
2654 | 403k | } |
2655 | | /*Save masking scale factors for chroma blocks.*/ |
2656 | 914k | for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){ |
2657 | 503k | mapi=map_idxs[mapii]; |
2658 | 503k | bi=mapi&3; |
2659 | 503k | fragi=mb_maps[mbi][1][bi]; |
2660 | 503k | mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4]; |
2661 | 503k | mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4]; |
2662 | 503k | } |
2663 | 411k | } |
2664 | 123k | oc_fr_state_flush_sb(_enc->pipe.fr+0); |
2665 | 123k | sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full; |
2666 | 123k | sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial; |
2667 | 123k | } |
2668 | 42.4k | oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone); |
2669 | | /*Code chroma planes.*/ |
2670 | 127k | for(pli=1;pli<3;pli++){ |
2671 | 84.9k | oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe, |
2672 | 84.9k | pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]); |
2673 | 84.9k | oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone); |
2674 | 84.9k | } |
2675 | 42.4k | notstart=1; |
2676 | 42.4k | } |
2677 | | /*Update the average block activity and MB luma score for the frame. |
2678 | | We could use a Bessel follower here, but fast reaction is probably almost |
2679 | | always best.*/ |
2680 | 39.4k | _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN, |
2681 | 39.4k | (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/ |
2682 | 39.4k | _enc->state.fplanes[0].nfrags)); |
2683 | 39.4k | _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs); |
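 | | /*Adding half the divisor before dividing rounds to nearest;
 | |    activity_avg is additionally floored at OC_ACTIVITY_AVG_MIN.*/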
2684 | | /*Finish filling in the reference frame borders.*/ |
2685 | 39.4k | refi=_enc->state.ref_frame_idx[OC_FRAME_SELF]; |
2686 | 157k | for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli); |
2687 | | /*Finish adding flagging overhead costs to inter bit counts to determine if |
2688 | | we should have coded a key frame instead.*/ |
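 | | /*A nonzero return tells the caller to discard this frame and
 | |    re-encode it as a key frame.*/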
2689 | 39.4k | if(_allow_keyframe){ |
2690 | | /*Technically the chroma plane counts are over-estimations, because they |
2691 | | don't account for continuing runs from the luma planes, but the |
2692 | | inaccuracy is small. |
2693 | | We don't need to add the luma plane coding flag costs, because they are |
2694 | | already included in the MB rate estimates.*/ |
2695 | 118k | for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE; |
2696 | 39.4k | if(interbits>intrabits)return 1; |
2697 | 39.4k | } |
2698 | 24.7k | _enc->ncoded_mbis=ncoded_mbis; |
2699 | | /*Compact the coded fragment list.*/ |
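 | | /*Each plane's list was built starting at that plane's froffset in
 | |    the shared coded_fragis array; slide planes 1 and 2 down so the
 | |    three lists become contiguous.*/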
2700 | 24.7k | { |
2701 | 24.7k | ptrdiff_t ncoded_fragis; |
2702 | 24.7k | ncoded_fragis=_enc->state.ncoded_fragis[0]; |
2703 | 74.1k | for(pli=1;pli<3;pli++){ |
2704 | 49.4k | memmove(_enc->state.coded_fragis+ncoded_fragis, |
2705 | 49.4k | _enc->state.coded_fragis+_enc->state.fplanes[pli].froffset, |
2706 | 49.4k | _enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis)); |
2707 | 49.4k | ncoded_fragis+=_enc->state.ncoded_fragis[pli]; |
2708 | 49.4k | } |
2709 | 24.7k | _enc->state.ntotal_coded_fragis=ncoded_fragis; |
2710 | 24.7k | } |
2711 | 24.7k | return 0; |
2712 | 39.4k | } |