Coverage Report

Created: 2026-05-16 07:49

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/theora/lib/analyze.c
Line
Count
Source
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009,2025           *
9
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function: mode selection code
14
15
 ********************************************************************/
16
#include <limits.h>
17
#include <string.h>
18
#include "encint.h"
19
#include "modedec.h"
20
#if defined(OC_COLLECT_METRICS)
21
# include "collect.c"
22
#endif
23
24
25
26
/*Let these structure types be named without the struct keyword.*/
typedef struct oc_rd_metric          oc_rd_metric;
27
typedef struct oc_mode_choice        oc_mode_choice;
28
29
30
31
/*There are 8 possible schemes used to encode macro block modes.
32
  Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
33
  The same set of Huffman codes is used for each of these 7 schemes, but the
34
   mode assigned to each codeword varies.
35
  Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream,
36
   while schemes 1-6 have a fixed mapping.
37
  Scheme 7 just encodes each mode directly in 3 bits.*/
38
39
/*The mode orderings for the various mode coding schemes.
40
  Scheme 0 uses a custom alphabet, which is not stored in this table.
41
  This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
42
   decoder.*/
43
static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
44
  /*Last MV dominates.*/
45
  /*L P M N I G GM 4*/
46
  {3,4,2,0,1,5,6,7},
47
  /*L P N M I G GM 4*/
48
  {2,4,3,0,1,5,6,7},
49
  /*L M P N I G GM 4*/
50
  {3,4,1,0,2,5,6,7},
51
  /*L M N P I G GM 4*/
52
  {2,4,1,0,3,5,6,7},
53
  /*No MV dominates.*/
54
  /*N L P M I G GM 4*/
55
  {0,4,3,1,2,5,6,7},
56
  /*N G L P M I GM 4*/
57
  {0,5,4,2,3,1,6,7},
58
  /*Default ordering.*/
59
  /*N I M L P G GM 4*/
60
  {0,1,2,3,4,5,6,7}
61
};
62
63
64
65
/*Initialize the mode scheme chooser.
66
  This need only be called once per encoder.*/
67
2.87k
void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
68
2.87k
  int si;
69
2.87k
  _chooser->mode_ranks[0]=_chooser->scheme0_ranks;
70
23.0k
  for(si=1;si<8;si++)_chooser->mode_ranks[si]=OC_MODE_RANKS[si-1];
71
2.87k
}
72
73
/*Reset the mode scheme chooser.
74
  This needs to be called once for each frame, including the first.*/
75
27.9k
static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
76
27.9k
  int si;
77
27.9k
  memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
78
  /*Scheme 0 starts with 24 bits to store the mode list in.*/
79
27.9k
  _chooser->scheme_bits[0]=24;
80
27.9k
  memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
81
251k
  for(si=0;si<8;si++){
82
    /*Scheme 7 should always start first, and scheme 0 should always start
83
       last.*/
84
223k
    _chooser->scheme_list[si]=7-si;
85
223k
    _chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
86
223k
  }
87
27.9k
}
88
89
/*Return the cost of coding _mb_mode in the specified scheme.*/
90
static int oc_mode_scheme_chooser_scheme_mb_cost(
91
8.10M
 const oc_mode_scheme_chooser *_chooser,int _scheme,int _mb_mode){
92
8.10M
  int codebook;
93
8.10M
  int ri;
94
8.10M
  codebook=_scheme+1>>3;
95
  /*For any scheme except 0, we can just use the bit cost of the mode's rank
96
     in that scheme.*/
97
8.10M
  ri=_chooser->mode_ranks[_scheme][_mb_mode];
98
8.10M
  if(_scheme==0){
99
1.05M
    int mc;
100
    /*For scheme 0, incrementing the mode count could potentially change the
101
       mode's rank.
102
      Find the index where the mode would be moved to in the optimal list,
103
       and use its bit cost instead of the one for the mode's current
104
       position in the list.*/
105
    /*We don't actually reorder the list; this is for computing opportunity
106
       cost, not an update.*/
107
1.05M
    mc=_chooser->mode_counts[_mb_mode];
108
2.77M
    while(ri>0&&mc>=_chooser->mode_counts[_chooser->scheme0_list[ri-1]])ri--;
109
1.05M
  }
110
8.10M
  return OC_MODE_BITS[codebook][ri];
111
8.10M
}
112
113
/*This is the real purpose of this data structure: not actually selecting a
114
   mode scheme, but estimating the cost of coding a given mode given all the
115
   modes selected so far.
116
  This is done via opportunity cost: the cost is defined as the number of bits
117
   required to encode all the modes selected so far including the current one
118
   using the best possible scheme, minus the number of bits required to encode
119
   all the modes selected so far not including the current one using the best
120
   possible scheme.
121
  The computational expense of doing this probably makes it overkill.
122
  Just be happy we take a greedy approach instead of trying to solve the
123
   global mode-selection problem (which is NP-hard).
124
  _mb_mode: The mode to determine the cost of.
125
  Return: The number of bits required to code this mode.*/
126
static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
127
2.30M
 int _mb_mode){
128
2.30M
  int scheme0;
129
2.30M
  int scheme1;
130
2.30M
  int best_bits;
131
2.30M
  int mode_bits;
132
2.30M
  int si;
133
2.30M
  int scheme0_bits;
134
2.30M
  int scheme1_bits;
135
2.30M
  scheme0=_chooser->scheme_list[0];
136
2.30M
  scheme1=_chooser->scheme_list[1];
137
2.30M
  scheme0_bits=_chooser->scheme_bits[scheme0];
138
2.30M
  scheme1_bits=_chooser->scheme_bits[scheme1];
139
2.30M
  mode_bits=oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme0,_mb_mode);
140
  /*Typical case: If the difference between the best scheme and the next best
141
     is greater than 6 bits, then adding just one mode cannot change which
142
     scheme we use.*/
143
2.30M
  if(scheme1_bits-scheme0_bits>6)return mode_bits;
144
  /*Otherwise, check to see if adding this mode selects a different scheme as
145
     the best.*/
146
1.13M
  si=1;
147
1.13M
  best_bits=scheme0_bits+mode_bits;
148
5.79M
  do{
149
5.79M
    int cur_bits;
150
5.79M
    cur_bits=scheme1_bits+
151
5.79M
     oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme1,_mb_mode);
152
5.79M
    if(cur_bits<best_bits)best_bits=cur_bits;
153
5.79M
    if(++si>=8)break;
154
5.79M
    scheme1=_chooser->scheme_list[si];
155
5.79M
    scheme1_bits=_chooser->scheme_bits[scheme1];
156
5.79M
  }
157
5.79M
  while(scheme1_bits-scheme0_bits<=6);
158
1.13M
  return best_bits-scheme0_bits;
159
2.30M
}
160
161
/*Incrementally update the mode counts and per-scheme bit counts and re-order
162
   the scheme lists once a mode has been selected.
163
  _mb_mode: The mode that was chosen.*/
164
static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
165
180k
 int _mb_mode){
166
180k
  int ri;
167
180k
  int si;
168
180k
  _chooser->mode_counts[_mb_mode]++;
169
  /*Re-order the scheme0 mode list if necessary.*/
170
249k
  for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){
171
98.0k
    int pmode;
172
98.0k
    pmode=_chooser->scheme0_list[ri-1];
173
98.0k
    if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break;
174
    /*Reorder the mode ranking.*/
175
68.5k
    _chooser->scheme0_ranks[pmode]++;
176
68.5k
    _chooser->scheme0_list[ri]=pmode;
177
68.5k
  }
178
180k
  _chooser->scheme0_ranks[_mb_mode]=ri;
179
180k
  _chooser->scheme0_list[ri]=_mb_mode;
180
  /*Now add the bit cost for the mode to each scheme.*/
181
1.62M
  for(si=0;si<8;si++){
182
1.44M
    _chooser->scheme_bits[si]+=
183
1.44M
     OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]];
184
1.44M
  }
185
  /*Finally, re-order the list of schemes.*/
186
1.44M
  for(si=1;si<8;si++){
187
1.26M
    int sj;
188
1.26M
    int scheme0;
189
1.26M
    int bits0;
190
1.26M
    sj=si;
191
1.26M
    scheme0=_chooser->scheme_list[si];
192
1.26M
    bits0=_chooser->scheme_bits[scheme0];
193
1.41M
    do{
194
1.41M
      int scheme1;
195
1.41M
      scheme1=_chooser->scheme_list[sj-1];
196
1.41M
      if(bits0>=_chooser->scheme_bits[scheme1])break;
197
166k
      _chooser->scheme_list[sj]=scheme1;
198
166k
    }
199
1.26M
    while(--sj>0);
200
1.26M
    _chooser->scheme_list[sj]=scheme0;
201
1.26M
  }
202
180k
}
203
204
205
206
/*The number of bits required to encode a super block run.
207
  _run_count: The desired run count; must be positive and less than 4130.*/
208
141M
static int oc_sb_run_bits(int _run_count){
209
141M
  int i;
210
537M
  for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
211
141M
  return OC_SB_RUN_CODE_NBITS[i];
212
141M
}
213
214
/*The number of bits required to encode a block run.
215
  _run_count: The desired run count; must be positive and less than 30.*/
216
12.6M
static int oc_block_run_bits(int _run_count){
217
12.6M
  return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
218
12.6M
}
219
220
221
222
154k
static void oc_fr_state_init(oc_fr_state *_fr){
223
154k
  _fr->bits=0;
224
154k
  _fr->sb_partial_count=0;
225
154k
  _fr->sb_full_count=0;
226
154k
  _fr->b_coded_count_prev=0;
227
154k
  _fr->b_coded_count=0;
228
154k
  _fr->b_count=0;
229
154k
  _fr->sb_prefer_partial=0;
230
154k
  _fr->sb_bits=0;
231
154k
  _fr->sb_partial=-1;
232
154k
  _fr->sb_full=-1;
233
154k
  _fr->b_coded_prev=-1;
234
154k
  _fr->b_coded=-1;
235
154k
}
236
237
238
static int oc_fr_state_sb_cost(const oc_fr_state *_fr,
239
6.52M
 int _sb_partial,int _sb_full){
240
6.52M
  int bits;
241
6.52M
  int sb_partial_count;
242
6.52M
  int sb_full_count;
243
6.52M
  bits=0;
244
6.52M
  sb_partial_count=_fr->sb_partial_count;
245
  /*Extend the sb_partial run, or start a new one.*/
246
6.52M
  if(_fr->sb_partial==_sb_partial){
247
1.29M
    if(sb_partial_count>=4129){
248
0
      bits++;
249
0
      sb_partial_count=0;
250
0
    }
251
1.29M
    else bits-=oc_sb_run_bits(sb_partial_count);
252
1.29M
  }
253
5.22M
  else sb_partial_count=0;
254
6.52M
  bits+=oc_sb_run_bits(++sb_partial_count);
255
6.52M
  if(!_sb_partial){
256
    /*Extend the sb_full run, or start a new one.*/
257
1.87M
    sb_full_count=_fr->sb_full_count;
258
1.87M
    if(_fr->sb_full==_sb_full){
259
647k
      if(sb_full_count>=4129){
260
0
        bits++;
261
0
        sb_full_count=0;
262
0
      }
263
647k
      else bits-=oc_sb_run_bits(sb_full_count);
264
647k
    }
265
1.23M
    else sb_full_count=0;
266
1.87M
    bits+=oc_sb_run_bits(++sb_full_count);
267
1.87M
  }
268
6.52M
  return bits;
269
6.52M
}
270
271
static void oc_fr_state_advance_sb(oc_fr_state *_fr,
272
154k
 int _sb_partial,int _sb_full){
273
154k
  int sb_partial_count;
274
154k
  int sb_full_count;
275
154k
  sb_partial_count=_fr->sb_partial_count;
276
154k
  if(_fr->sb_partial!=_sb_partial||sb_partial_count>=4129)sb_partial_count=0;
277
154k
  sb_partial_count++;
278
154k
  if(!_sb_partial){
279
97.5k
    sb_full_count=_fr->sb_full_count;
280
97.5k
    if(_fr->sb_full!=_sb_full||sb_full_count>=4129)sb_full_count=0;
281
97.5k
    sb_full_count++;
282
97.5k
    _fr->sb_full_count=sb_full_count;
283
97.5k
    _fr->sb_full=_sb_full;
284
    /*Roll back the partial block state.*/
285
97.5k
    _fr->b_coded=_fr->b_coded_prev;
286
97.5k
    _fr->b_coded_count=_fr->b_coded_count_prev;
287
97.5k
  }
288
56.5k
  else{
289
    /*Commit back the partial block state.*/
290
56.5k
    _fr->b_coded_prev=_fr->b_coded;
291
56.5k
    _fr->b_coded_count_prev=_fr->b_coded_count;
292
56.5k
  }
293
154k
  _fr->sb_partial_count=sb_partial_count;
294
154k
  _fr->sb_partial=_sb_partial;
295
154k
  _fr->b_count=0;
296
154k
  _fr->sb_prefer_partial=0;
297
154k
  _fr->sb_bits=0;
298
154k
}
299
300
/*Commit the state of the current super block and advance to the next.*/
301
154k
static void oc_fr_state_flush_sb(oc_fr_state *_fr){
302
154k
  int sb_partial;
303
154k
  int sb_full;
304
154k
  int b_coded_count;
305
154k
  int b_count;
306
154k
  b_count=_fr->b_count;
307
154k
  b_coded_count=_fr->b_coded_count;
308
154k
  sb_full=_fr->b_coded;
309
154k
  sb_partial=b_coded_count<b_count;
310
154k
  if(!sb_partial){
311
    /*If the super block is fully coded/uncoded...*/
312
98.0k
    if(_fr->sb_prefer_partial){
313
      /*So far coding this super block as partial was cheaper anyway.*/
314
780
      if(b_coded_count>15||_fr->b_coded_prev<0){
315
372
        int sb_bits;
316
        /*If the block run is too long, this will limit how far it can be
317
           extended into the next partial super block.
318
          If we need to extend it farther, we don't want to have to roll all
319
           the way back here (since there could be many full SBs between now
320
           and then), so we disallow this.
321
          Similarly, if this is the start of a stripe, we don't know how the
322
           length of the outstanding block run from the previous stripe.*/
323
372
        sb_bits=oc_fr_state_sb_cost(_fr,sb_partial,sb_full);
324
372
        _fr->bits+=sb_bits-_fr->sb_bits;
325
372
        _fr->sb_bits=sb_bits;
326
372
      }
327
408
      else sb_partial=1;
328
780
    }
329
98.0k
  }
330
154k
  oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
331
154k
}
332
333
17.1M
static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){
334
17.1M
  ptrdiff_t bits;
335
17.1M
  int       sb_bits;
336
17.1M
  int       b_coded_count;
337
17.1M
  int       b_count;
338
17.1M
  int       sb_prefer_partial;
339
17.1M
  sb_bits=_fr->sb_bits;
340
17.1M
  bits=_fr->bits-sb_bits;
341
17.1M
  b_count=_fr->b_count;
342
17.1M
  b_coded_count=_fr->b_coded_count;
343
17.1M
  sb_prefer_partial=_fr->sb_prefer_partial;
344
17.1M
  if(b_coded_count>=b_count){
345
12.4M
    int sb_partial_bits;
346
    /*This super block is currently fully coded/uncoded.*/
347
12.4M
    if(b_count<=0){
348
      /*This is the first block in this SB.*/
349
1.57M
      b_count=1;
350
      /*Check to see whether it's cheaper to code it partially or fully.*/
351
1.57M
      if(_fr->b_coded==_b_coded){
352
272k
        sb_partial_bits=-oc_block_run_bits(b_coded_count);
353
272k
        sb_partial_bits+=oc_block_run_bits(++b_coded_count);
354
272k
      }
355
1.29M
      else{
356
1.29M
        b_coded_count=1;
357
1.29M
        sb_partial_bits=2;
358
1.29M
      }
359
1.57M
      sb_partial_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
360
1.57M
      sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
361
1.57M
      sb_prefer_partial=sb_partial_bits<sb_bits;
362
1.57M
      sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
363
1.57M
    }
364
10.8M
    else if(_fr->b_coded==_b_coded){
365
7.65M
      b_coded_count++;
366
7.65M
      if(++b_count<16){
367
7.33M
        if(sb_prefer_partial){
368
          /*Check to see if it's cheaper to code it fully.*/
369
305k
          sb_partial_bits=sb_bits;
370
305k
          sb_partial_bits+=oc_block_run_bits(b_coded_count);
371
305k
          if(b_coded_count>0){
372
305k
            sb_partial_bits-=oc_block_run_bits(b_coded_count-1);
373
305k
          }
374
305k
          sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
375
305k
          sb_prefer_partial=sb_partial_bits<sb_bits;
376
305k
          sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
377
305k
        }
378
        /*There's no need to check the converse (whether it's cheaper to code
379
           this SB partially if we were coding it fully), since the cost to
380
           code a SB partially can only increase as we add more blocks, whereas
381
           the cost to code it fully stays constant.*/
382
7.33M
      }
383
322k
      else{
384
        /*If we get to the end and this SB is still full, then force it to be
385
           coded full.
386
          Otherwise we might not be able to extend the block run far enough
387
           into the next partial SB.*/
388
322k
        if(sb_prefer_partial){
389
3.03k
          sb_prefer_partial=0;
390
3.03k
          sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
391
3.03k
        }
392
322k
      }
393
7.65M
    }
394
3.19M
    else{
395
      /*This SB was full, but now must be made partial.*/
396
3.19M
      if(!sb_prefer_partial){
397
3.07M
        sb_bits=oc_block_run_bits(b_coded_count);
398
3.07M
        if(b_coded_count>b_count){
399
563k
          sb_bits-=oc_block_run_bits(b_coded_count-b_count);
400
563k
        }
401
3.07M
        sb_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
402
3.07M
      }
403
3.19M
      b_count++;
404
3.19M
      b_coded_count=1;
405
3.19M
      sb_prefer_partial=1;
406
3.19M
      sb_bits+=2;
407
3.19M
    }
408
12.4M
  }
409
4.71M
  else{
410
4.71M
    b_count++;
411
4.71M
    if(_fr->b_coded==_b_coded)sb_bits-=oc_block_run_bits(b_coded_count);
412
1.53M
    else b_coded_count=0;
413
4.71M
    sb_bits+=oc_block_run_bits(++b_coded_count);
414
4.71M
  }
415
17.1M
  _fr->bits=bits+sb_bits;
416
17.1M
  _fr->b_coded_count=b_coded_count;
417
17.1M
  _fr->b_coded=_b_coded;
418
17.1M
  _fr->b_count=b_count;
419
17.1M
  _fr->sb_prefer_partial=sb_prefer_partial;
420
17.1M
  _fr->sb_bits=sb_bits;
421
17.1M
}
422
423
5.68M
static void oc_fr_skip_block(oc_fr_state *_fr){
424
5.68M
  oc_fr_state_advance_block(_fr,0);
425
5.68M
}
426
427
11.4M
static void oc_fr_code_block(oc_fr_state *_fr){
428
11.4M
  oc_fr_state_advance_block(_fr,1);
429
11.4M
}
430
431
1.23M
static int oc_fr_cost1(const oc_fr_state *_fr){
432
1.23M
  oc_fr_state tmp;
433
1.23M
  ptrdiff_t   bits;
434
1.23M
  *&tmp=*_fr;
435
1.23M
  oc_fr_skip_block(&tmp);
436
1.23M
  bits=tmp.bits;
437
1.23M
  *&tmp=*_fr;
438
1.23M
  oc_fr_code_block(&tmp);
439
1.23M
  return (int)(tmp.bits-bits);
440
1.23M
}
441
442
184k
static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
443
184k
  oc_fr_state tmp;
444
184k
  *&tmp=*_pre;
445
184k
  oc_fr_skip_block(&tmp);
446
184k
  oc_fr_skip_block(&tmp);
447
184k
  oc_fr_skip_block(&tmp);
448
184k
  oc_fr_skip_block(&tmp);
449
184k
  return (int)(_post->bits-tmp.bits);
450
184k
}
451
452
453
454
182k
static void oc_qii_state_init(oc_qii_state *_qs){
455
182k
  _qs->bits=0;
456
182k
  _qs->qi01_count=0;
457
182k
  _qs->qi01=-1;
458
182k
  _qs->qi12_count=0;
459
182k
  _qs->qi12=-1;
460
182k
}
461
462
463
static void oc_qii_state_advance(oc_qii_state *_qd,
464
58.4M
 const oc_qii_state *_qs,int _qii){
465
58.4M
  ptrdiff_t bits;
466
58.4M
  int       qi01;
467
58.4M
  int       qi01_count;
468
58.4M
  int       qi12;
469
58.4M
  int       qi12_count;
470
58.4M
  bits=_qs->bits;
471
58.4M
  qi01=_qii+1>>1;
472
58.4M
  qi01_count=_qs->qi01_count;
473
58.4M
  if(qi01==_qs->qi01){
474
34.8M
    if(qi01_count>=4129){
475
2.79k
      bits++;
476
2.79k
      qi01_count=0;
477
2.79k
    }
478
34.8M
    else bits-=oc_sb_run_bits(qi01_count);
479
34.8M
  }
480
23.5M
  else qi01_count=0;
481
58.4M
  qi01_count++;
482
58.4M
  bits+=oc_sb_run_bits(qi01_count);
483
58.4M
  qi12_count=_qs->qi12_count;
484
58.4M
  if(_qii){
485
24.3M
    qi12=_qii>>1;
486
24.3M
    if(qi12==_qs->qi12){
487
13.4M
      if(qi12_count>=4129){
488
14.8k
        bits++;
489
14.8k
        qi12_count=0;
490
14.8k
      }
491
13.4M
      else bits-=oc_sb_run_bits(qi12_count);
492
13.4M
    }
493
10.8M
    else qi12_count=0;
494
24.3M
    qi12_count++;
495
24.3M
    bits+=oc_sb_run_bits(qi12_count);
496
24.3M
  }
497
34.0M
  else qi12=_qs->qi12;
498
58.4M
  _qd->bits=bits;
499
58.4M
  _qd->qi01=qi01;
500
58.4M
  _qd->qi01_count=qi01_count;
501
58.4M
  _qd->qi12=qi12;
502
58.4M
  _qd->qi12_count=qi12_count;
503
58.4M
}
504
505
506
507
51.4k
static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
508
51.4k
  ptrdiff_t *coded_fragis;
509
51.4k
  unsigned   mcu_nvsbs;
510
51.4k
  ptrdiff_t  mcu_nfrags;
511
51.4k
  int        flimit;
512
51.4k
  int        hdec;
513
51.4k
  int        vdec;
514
51.4k
  int        pli;
515
51.4k
  int        nqis;
516
51.4k
  int        qii;
517
51.4k
  int        qi0;
518
51.4k
  int        qti;
519
  /*Initialize the per-plane coded block flag trackers.
520
    These are used for bit-estimation purposes only; the real flag bits span
521
     all three planes, so we can't compute them in parallel.*/
522
205k
  for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli);
523
205k
  for(pli=0;pli<3;pli++)oc_qii_state_init(_pipe->qs+pli);
524
  /*Set up the per-plane skip SSD storage pointers.*/
525
51.4k
  mcu_nvsbs=_enc->mcu_nvsbs;
526
51.4k
  mcu_nfrags=mcu_nvsbs*_enc->state.fplanes[0].nhsbs*16;
527
51.4k
  hdec=!(_enc->state.info.pixel_fmt&1);
528
51.4k
  vdec=!(_enc->state.info.pixel_fmt&2);
529
51.4k
  _pipe->skip_ssd[0]=_enc->mcu_skip_ssd;
530
51.4k
  _pipe->skip_ssd[1]=_pipe->skip_ssd[0]+mcu_nfrags;
531
51.4k
  _pipe->skip_ssd[2]=_pipe->skip_ssd[1]+(mcu_nfrags>>hdec+vdec);
532
  /*Set up per-plane pointers to the coded and uncoded fragments lists.
533
    Unlike the decoder, each planes' coded and uncoded fragment list is kept
534
     separate during the analysis stage; we only make the coded list for all
535
     three planes contiguous right before the final packet is output
536
     (destroying the uncoded lists, which are no longer needed).*/
537
51.4k
  coded_fragis=_enc->state.coded_fragis;
538
205k
  for(pli=0;pli<3;pli++){
539
154k
    _pipe->coded_fragis[pli]=coded_fragis;
540
154k
    coded_fragis+=_enc->state.fplanes[pli].nfrags;
541
154k
    _pipe->uncoded_fragis[pli]=coded_fragis;
542
154k
  }
543
51.4k
  memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
544
51.4k
  memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
545
  /*Set up condensed quantizer tables.*/
546
51.4k
  qi0=_enc->state.qis[0];
547
51.4k
  nqis=_enc->state.nqis;
548
205k
  for(pli=0;pli<3;pli++){
549
405k
    for(qii=0;qii<nqis;qii++){
550
250k
      int qi;
551
250k
      qi=_enc->state.qis[qii];
552
752k
      for(qti=0;qti<2;qti++){
553
        /*Set the DC coefficient in the dequantization table.*/
554
501k
        _enc->state.dequant_tables[qi][pli][qti][0]=
555
501k
         _enc->dequant_dc[qi0][pli][qti];
556
501k
        _enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
557
        /*Copy over the quantization table.*/
558
501k
        memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti],
559
501k
         _enc->opt_data.enquant_table_size);
560
501k
      }
561
250k
    }
562
154k
  }
563
  /*Fix up the DC coefficients in the quantization tables.*/
564
51.4k
  oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis);
565
  /*Initialize the tokenization state.*/
566
205k
  for(pli=0;pli<3;pli++){
567
154k
    _pipe->ndct_tokens1[pli]=0;
568
154k
    _pipe->eob_run1[pli]=0;
569
154k
  }
570
  /*Initialize the bounding value array for the loop filter.*/
571
51.4k
  flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]];
572
51.4k
  _pipe->loop_filter=flimit!=0;
573
51.4k
  if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit);
574
  /*Clear the temporary DCT scratch space.*/
575
51.4k
  memset(_pipe->dct_data,0,sizeof(_pipe->dct_data));
576
51.4k
}
577
578
/*Sets the current MCU stripe to super block row _sby.
579
  Return: A non-zero value if this was the last MCU.*/
580
static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
581
206k
 oc_enc_pipeline_state *_pipe,int _sby){
582
206k
  const oc_fragment_plane *fplane;
583
206k
  unsigned                 mcu_nvsbs;
584
206k
  int                      sby_end;
585
206k
  int                      notdone;
586
206k
  int                      vdec;
587
206k
  int                      pli;
588
206k
  mcu_nvsbs=_enc->mcu_nvsbs;
589
206k
  sby_end=_enc->state.fplanes[0].nvsbs;
590
206k
  notdone=_sby+mcu_nvsbs<sby_end;
591
206k
  if(notdone)sby_end=_sby+mcu_nvsbs;
592
206k
  vdec=0;
593
827k
  for(pli=0;pli<3;pli++){
594
620k
    fplane=_enc->state.fplanes+pli;
595
620k
    _pipe->sbi0[pli]=fplane->sboffset+(_sby>>vdec)*fplane->nhsbs;
596
620k
    _pipe->fragy0[pli]=_sby<<2-vdec;
597
620k
    _pipe->froffset[pli]=fplane->froffset
598
620k
     +_pipe->fragy0[pli]*(ptrdiff_t)fplane->nhfrags;
599
620k
    if(notdone){
600
466k
      _pipe->sbi_end[pli]=fplane->sboffset+(sby_end>>vdec)*fplane->nhsbs;
601
466k
      _pipe->fragy_end[pli]=sby_end<<2-vdec;
602
466k
    }
603
154k
    else{
604
154k
      _pipe->sbi_end[pli]=fplane->sboffset+fplane->nsbs;
605
154k
      _pipe->fragy_end[pli]=fplane->nvfrags;
606
154k
    }
607
620k
    vdec=!(_enc->state.info.pixel_fmt&2);
608
620k
  }
609
206k
  return notdone;
610
206k
}
611
612
static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
613
620k
 oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
614
  /*Copy over all the uncoded fragments from this plane and advance the uncoded
615
     fragment list.*/
616
620k
  if(_pipe->nuncoded_fragis[_pli]>0){
617
54.4k
    _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
618
54.4k
    oc_frag_copy_list(&_enc->state,
619
54.4k
     _enc->state.ref_frame_data[OC_FRAME_SELF],
620
54.4k
     _enc->state.ref_frame_data[OC_FRAME_PREV],
621
54.4k
     _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
622
54.4k
     _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
623
54.4k
    _pipe->nuncoded_fragis[_pli]=0;
624
54.4k
  }
625
  /*Perform DC prediction.*/
626
620k
  oc_enc_pred_dc_frag_rows(_enc,_pli,
627
620k
   _pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
628
  /*Finish DC tokenization.*/
629
620k
  oc_enc_tokenize_dc_frag_list(_enc,_pli,
630
620k
   _pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
631
620k
   _pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
632
620k
  _pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
633
620k
  _pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
634
  /*And advance the coded fragment list.*/
635
620k
  _enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
636
620k
  _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
637
620k
  _pipe->ncoded_fragis[_pli]=0;
638
  /*Apply the loop filter if necessary.*/
639
620k
  if(_pipe->loop_filter){
640
352k
    oc_state_loop_filter_frag_rows(&_enc->state,
641
352k
     _pipe->bounding_values,OC_FRAME_SELF,_pli,
642
352k
     _pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
643
352k
  }
644
267k
  else _sdelay=_edelay=0;
645
  /*To fill borders, we have an additional two pixel delay, since a fragment
646
     in the next row could filter its top edge, using two pixels from a
647
     fragment in this row.
648
    But there's no reason to delay a full fragment between the two.*/
649
620k
  oc_state_borders_fill_rows(&_enc->state,
650
620k
   _enc->state.ref_frame_idx[OC_FRAME_SELF],_pli,
651
620k
   (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
652
620k
   (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
653
620k
}
654
655
656
657
/*Cost information about the coded blocks in a MB.
  NOTE(review): the field notes below are inferred from the field names alone;
   confirm them against the code that fills in and reads this structure.*/
658
struct oc_rd_metric{
659
  /*Presumably the AC SSD if the blocks are left uncoded.*/
  int uncoded_ac_ssd;
660
  /*Presumably the AC SSD if the blocks are coded.*/
  int coded_ac_ssd;
661
  /*Presumably the number of bits spent on AC coefficients.*/
  int ac_bits;
662
  /*Presumably a flag related to the DC coefficients; exact meaning TBD.*/
  int dc_flag;
663
};
664
665
666
667
static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
668
 oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
669
 unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
670
20.6M
 oc_fr_state *_fr,oc_token_checkpoint **_stack){
671
20.6M
  ogg_int16_t            *data;
672
20.6M
  ogg_int16_t            *dct;
673
20.6M
  ogg_int16_t            *idct;
674
20.6M
  oc_qii_state            qs;
675
20.6M
  const ogg_uint16_t     *dequant;
676
20.6M
  ogg_uint16_t            dequant_dc;
677
20.6M
  ptrdiff_t               frag_offs;
678
20.6M
  int                     ystride;
679
20.6M
  const unsigned char    *src;
680
20.6M
  const unsigned char    *ref;
681
20.6M
  unsigned char          *dst;
682
20.6M
  int                     nonzero;
683
20.6M
  unsigned                uncoded_ssd;
684
20.6M
  unsigned                coded_ssd;
685
20.6M
  oc_token_checkpoint    *checkpoint;
686
20.6M
  oc_fragment            *frags;
687
20.6M
  int                     mb_mode;
688
20.6M
  int                     refi;
689
20.6M
  int                     mv_offs[2];
690
20.6M
  int                     nmv_offs;
691
20.6M
  int                     ac_bits;
692
20.6M
  int                     borderi;
693
20.6M
  int                     nqis;
694
20.6M
  int                     qti;
695
20.6M
  int                     qii;
696
20.6M
  int                     dc;
697
20.6M
  nqis=_enc->state.nqis;
698
20.6M
  frags=_enc->state.frags;
699
20.6M
  frag_offs=_enc->state.frag_buf_offs[_fragi];
700
20.6M
  ystride=_enc->state.ref_ystride[_pli];
701
20.6M
  src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
702
20.6M
  borderi=frags[_fragi].borderi;
703
20.6M
  qii=frags[_fragi].qii;
704
20.6M
  data=_enc->pipe.dct_data;
705
20.6M
  dct=data+64;
706
20.6M
  idct=data+128;
707
20.6M
  if(qii&~3){
708
144k
#if !defined(OC_COLLECT_METRICS)
709
144k
    if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
710
      /*Enable early skip detection.*/
711
144k
      frags[_fragi].coded=0;
712
144k
      frags[_fragi].refi=OC_FRAME_NONE;
713
144k
      oc_fr_skip_block(_fr);
714
144k
      return 0;
715
144k
    }
716
0
#endif
717
    /*Try and code this block anyway.*/
718
0
    qii&=3;
719
0
  }
720
20.4M
  refi=frags[_fragi].refi;
721
20.4M
  mb_mode=frags[_fragi].mb_mode;
722
20.4M
  ref=_enc->state.ref_frame_data[refi]+frag_offs;
723
20.4M
  dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs;
724
  /*Motion compensation:*/
725
20.4M
  switch(mb_mode){
726
20.0M
    case OC_MODE_INTRA:{
727
20.0M
      nmv_offs=0;
728
20.0M
      oc_enc_frag_sub_128(_enc,data,src,ystride);
729
20.0M
    }break;
730
13.8k
    case OC_MODE_GOLDEN_NOMV:
731
169k
    case OC_MODE_INTER_NOMV:{
732
169k
      nmv_offs=1;
733
169k
      mv_offs[0]=0;
734
169k
      oc_enc_frag_sub(_enc,data,src,ref,ystride);
735
169k
    }break;
736
270k
    default:{
737
270k
      const oc_mv *frag_mvs;
738
270k
      frag_mvs=_enc->state.frag_mvs;
739
270k
      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,
740
270k
       _pli,frag_mvs[_fragi]);
741
270k
      if(nmv_offs>1){
742
237k
        oc_enc_frag_copy2(_enc,dst,
743
237k
         ref+mv_offs[0],ref+mv_offs[1],ystride);
744
237k
        oc_enc_frag_sub(_enc,data,src,dst,ystride);
745
237k
      }
746
32.8k
      else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride);
747
270k
    }break;
748
20.4M
  }
749
#if defined(OC_COLLECT_METRICS)
750
  {
751
    unsigned sad;
752
    unsigned satd;
753
    switch(nmv_offs){
754
      case 0:{
755
        sad=oc_enc_frag_intra_sad(_enc,src,ystride);
756
        satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);
757
      }break;
758
      case 1:{
759
        sad=oc_enc_frag_sad_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
760
        satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride);
761
        satd+=abs(dc);
762
      }break;
763
      default:{
764
        sad=oc_enc_frag_sad_thresh(_enc,src,dst,ystride,UINT_MAX);
765
        satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride);
766
        satd+=abs(dc);
767
      }break;
768
    }
769
    _enc->frag_sad[_fragi]=sad;
770
    _enc->frag_satd[_fragi]=satd;
771
  }
772
#endif
773
  /*Transform:*/
774
20.4M
  oc_enc_fdct8x8(_enc,dct,data);
775
  /*Quantize:*/
776
20.4M
  qti=mb_mode!=OC_MODE_INTRA;
777
20.4M
  dequant=_enc->dequant[_pli][qii][qti];
778
20.4M
  nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]);
779
20.4M
  dc=data[0];
780
  /*Tokenize.*/
781
20.4M
  checkpoint=*_stack;
782
20.4M
  if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
783
20.4M
    ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct,
784
20.4M
     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
785
20.4M
  }
786
0
  else{
787
0
    ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct,
788
0
     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
789
0
  }
790
  /*Reconstruct.
791
    TODO: nonzero may need to be adjusted after tokenization.*/
792
20.4M
  dequant_dc=dequant[0];
793
20.4M
  if(nonzero==0){
794
17.6M
    ogg_int16_t p;
795
17.6M
    int         ci;
796
17.6M
    int         qi01;
797
17.6M
    int         qi12;
798
    /*We round this dequant product (and not any of the others) because there's
799
       no iDCT rounding.*/
800
17.6M
    p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5);
801
    /*LOOP VECTORIZES.*/
802
1.14G
    for(ci=0;ci<64;ci++)data[ci]=p;
803
    /*We didn't code any AC coefficients, so don't change the quantizer.*/
804
17.6M
    qi01=_pipe->qs[_pli].qi01;
805
17.6M
    qi12=_pipe->qs[_pli].qi12;
806
17.6M
    if(qi01>0)qii=1+qi12;
807
15.4M
    else if(qi01>=0)qii=0;
808
17.6M
  }
809
2.82M
  else{
810
2.82M
    idct[0]=dc*dequant_dc;
811
    /*Note: This clears idct[] back to zero for the next block.*/
812
2.82M
    oc_idct8x8(&_enc->state,data,idct,nonzero+1);
813
2.82M
  }
814
20.4M
  frags[_fragi].qii=qii;
815
20.4M
  if(nqis>1){
816
7.57M
    oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
817
7.57M
    ac_bits+=qs.bits-_pipe->qs[_pli].bits;
818
7.57M
  }
819
20.4M
  if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
820
439k
  else{
821
439k
    oc_enc_frag_recon_inter(_enc,dst,
822
439k
     nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
823
439k
  }
824
  /*If _fr is NULL, then this is an INTRA frame, and we can't skip blocks.*/
825
20.4M
#if !defined(OC_COLLECT_METRICS)
826
20.4M
  if(_fr!=NULL)
827
1.23M
#endif
828
1.23M
  {
829
    /*In retrospect, should we have skipped this block?*/
830
1.23M
    if(borderi<0){
831
758k
      coded_ssd=oc_enc_frag_ssd(_enc,src,dst,ystride);
832
758k
    }
833
478k
    else{
834
478k
      coded_ssd=oc_enc_frag_border_ssd(_enc,src,dst,ystride,
835
478k
       _enc->state.borders[borderi].mask);
836
478k
    }
837
    /*Scale to match DCT domain.*/
838
1.23M
    coded_ssd<<=4;
839
#if defined(OC_COLLECT_METRICS)
840
    _enc->frag_ssd[_fragi]=coded_ssd;
841
  }
842
  if(_fr!=NULL){
843
#endif
844
1.23M
    coded_ssd=OC_RD_SCALE(coded_ssd,_rd_scale);
845
1.23M
    uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]];
846
1.23M
    if(uncoded_ssd<UINT_MAX&&
847
     /*Don't allow luma blocks to be skipped in 4MV mode when VP3 compatibility
848
        is enabled.*/
849
1.23M
     (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
850
1.23M
      int overhead_bits;
851
1.23M
      overhead_bits=oc_fr_cost1(_fr);
852
      /*Although the fragment coding overhead determination is accurate, it is
853
         greedy, using very coarse-grained local information.
854
        Allowing it to mildly discourage coding turns out to be beneficial, but
855
         it's not clear that allowing it to encourage coding through negative
856
         coding overhead deltas is useful.
857
        For that reason, we disallow negative coding overheads.*/
858
1.23M
      if(overhead_bits<0)overhead_bits=0;
859
1.23M
      if(uncoded_ssd<=coded_ssd+(overhead_bits+ac_bits)*_enc->lambda){
860
        /*Hm, not worth it; roll back.*/
861
258k
        oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
862
258k
        *_stack=checkpoint;
863
258k
        frags[_fragi].coded=0;
864
258k
        frags[_fragi].refi=OC_FRAME_NONE;
865
258k
        oc_fr_skip_block(_fr);
866
258k
        return 0;
867
258k
      }
868
1.23M
    }
869
0
    else _mo->dc_flag=1;
870
979k
    _mo->uncoded_ac_ssd+=uncoded_ssd;
871
979k
    _mo->coded_ac_ssd+=coded_ssd;
872
979k
    _mo->ac_bits+=ac_bits;
873
979k
    oc_fr_code_block(_fr);
874
979k
  }
875
  /*GCC 4.4.4 generates a warning here because it can't tell that
876
     the init code in the nqis check above will run anytime this
877
     line runs.*/
878
20.2M
  if(nqis>1)*(_pipe->qs+_pli)=*&qs;
879
20.2M
  frags[_fragi].dc=dc;
880
20.2M
  frags[_fragi].coded=1;
881
20.2M
  return 1;
882
20.4M
}
883
884
static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc,
885
 oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead,
886
221k
 const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
887
  /*Worst case token stack usage for 4 fragments.*/
888
221k
  oc_token_checkpoint  stack[64*4];
889
221k
  oc_token_checkpoint *stackptr;
890
221k
  const oc_sb_map     *sb_maps;
891
221k
  signed char         *mb_modes;
892
221k
  oc_fragment         *frags;
893
221k
  ptrdiff_t           *coded_fragis;
894
221k
  ptrdiff_t            ncoded_fragis;
895
221k
  ptrdiff_t           *uncoded_fragis;
896
221k
  ptrdiff_t            nuncoded_fragis;
897
221k
  oc_rd_metric         mo;
898
221k
  oc_fr_state          fr_checkpoint;
899
221k
  oc_qii_state         qs_checkpoint;
900
221k
  int                  mb_mode;
901
221k
  int                  refi;
902
221k
  int                  ncoded;
903
221k
  ptrdiff_t            fragi;
904
221k
  int                  bi;
905
221k
  *&fr_checkpoint=*(_pipe->fr+0);
906
221k
  *&qs_checkpoint=*(_pipe->qs+0);
907
221k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
908
221k
  mb_modes=_enc->state.mb_modes;
909
221k
  frags=_enc->state.frags;
910
221k
  coded_fragis=_pipe->coded_fragis[0];
911
221k
  ncoded_fragis=_pipe->ncoded_fragis[0];
912
221k
  uncoded_fragis=_pipe->uncoded_fragis[0];
913
221k
  nuncoded_fragis=_pipe->nuncoded_fragis[0];
914
221k
  mb_mode=mb_modes[_mbi];
915
221k
  refi=OC_FRAME_FOR_MODE(mb_mode);
916
221k
  ncoded=0;
917
221k
  stackptr=stack;
918
221k
  memset(&mo,0,sizeof(mo));
919
1.10M
  for(bi=0;bi<4;bi++){
920
886k
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
921
886k
    frags[fragi].refi=refi;
922
886k
    frags[fragi].mb_mode=mb_mode;
923
886k
    if(oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
924
886k
     _rd_scale[bi],_rd_iscale[bi],&mo,_pipe->fr+0,&stackptr)){
925
643k
      coded_fragis[ncoded_fragis++]=fragi;
926
643k
      ncoded++;
927
643k
    }
928
242k
    else *(uncoded_fragis-++nuncoded_fragis)=fragi;
929
886k
  }
930
221k
  if(ncoded>0&&!mo.dc_flag){
931
184k
    int cost;
932
    /*Some individual blocks were worth coding.
933
      See if that's still true when accounting for mode and MV overhead.*/
934
184k
    cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
935
184k
     +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead);
936
184k
    if(mo.uncoded_ac_ssd<=cost){
937
      /*Taking macroblock overhead into account, it is not worth coding this
938
         MB.*/
939
3.36k
      oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
940
3.36k
      *(_pipe->fr+0)=*&fr_checkpoint;
941
3.36k
      *(_pipe->qs+0)=*&qs_checkpoint;
942
16.8k
      for(bi=0;bi<4;bi++){
943
13.4k
        fragi=sb_maps[_mbi>>2][_mbi&3][bi];
944
13.4k
        if(frags[fragi].coded){
945
5.32k
          *(uncoded_fragis-++nuncoded_fragis)=fragi;
946
5.32k
          frags[fragi].coded=0;
947
5.32k
          frags[fragi].refi=OC_FRAME_NONE;
948
5.32k
        }
949
13.4k
        oc_fr_skip_block(_pipe->fr+0);
950
13.4k
      }
951
3.36k
      ncoded_fragis-=ncoded;
952
3.36k
      ncoded=0;
953
3.36k
    }
954
184k
  }
955
  /*If no luma blocks coded, the mode is forced.*/
956
221k
  if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
957
  /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
958
     with a single coded block.
959
    This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
960
     skipped blocks, while a 1MV does not.*/
961
180k
  else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
962
161
    mb_modes[_mbi]=OC_MODE_INTER_MV;
963
161
  }
964
221k
  _pipe->ncoded_fragis[0]=ncoded_fragis;
965
221k
  _pipe->nuncoded_fragis[0]=nuncoded_fragis;
966
221k
  return ncoded;
967
221k
}
968
969
static void oc_enc_sb_transform_quantize_inter_chroma(oc_enc_ctx *_enc,
970
59.4k
 oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
971
59.4k
  const ogg_uint16_t *mcu_rd_scale;
972
59.4k
  const ogg_uint16_t *mcu_rd_iscale;
973
59.4k
  const oc_sb_map    *sb_maps;
974
59.4k
  oc_sb_flags        *sb_flags;
975
59.4k
  oc_fr_state        *fr;
976
59.4k
  ptrdiff_t          *coded_fragis;
977
59.4k
  ptrdiff_t           ncoded_fragis;
978
59.4k
  ptrdiff_t          *uncoded_fragis;
979
59.4k
  ptrdiff_t           nuncoded_fragis;
980
59.4k
  ptrdiff_t           froffset;
981
59.4k
  int                 sbi;
982
59.4k
  fr=_pipe->fr+_pli;
983
59.4k
  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
984
59.4k
  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
985
59.4k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
986
59.4k
  sb_flags=_enc->state.sb_flags;
987
59.4k
  coded_fragis=_pipe->coded_fragis[_pli];
988
59.4k
  ncoded_fragis=_pipe->ncoded_fragis[_pli];
989
59.4k
  uncoded_fragis=_pipe->uncoded_fragis[_pli];
990
59.4k
  nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
991
59.4k
  froffset=_pipe->froffset[_pli];
992
145k
  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
993
    /*Worst case token stack usage for 1 fragment.*/
994
86.0k
    oc_token_checkpoint stack[64];
995
86.0k
    oc_rd_metric        mo;
996
86.0k
    int                 quadi;
997
86.0k
    int                 bi;
998
86.0k
    memset(&mo,0,sizeof(mo));
999
1.72M
    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1000
1.37M
      ptrdiff_t fragi;
1001
1.37M
      fragi=sb_maps[sbi][quadi][bi];
1002
1.37M
      if(fragi>=0){
1003
496k
        oc_token_checkpoint *stackptr;
1004
496k
        unsigned             rd_scale;
1005
496k
        unsigned             rd_iscale;
1006
496k
        rd_scale=mcu_rd_scale[fragi-froffset];
1007
496k
        rd_iscale=mcu_rd_iscale[fragi-froffset];
1008
496k
        stackptr=stack;
1009
496k
        if(oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
1010
496k
         rd_scale,rd_iscale,&mo,fr,&stackptr)){
1011
335k
          coded_fragis[ncoded_fragis++]=fragi;
1012
335k
        }
1013
160k
        else *(uncoded_fragis-++nuncoded_fragis)=fragi;
1014
496k
      }
1015
1.37M
    }
1016
86.0k
    oc_fr_state_flush_sb(fr);
1017
86.0k
    sb_flags[sbi].coded_fully=fr->sb_full;
1018
86.0k
    sb_flags[sbi].coded_partially=fr->sb_partial;
1019
86.0k
  }
1020
59.4k
  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
1021
59.4k
  _pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
1022
59.4k
}
1023
1024
/*Mode decision is done by exhaustively examining all potential choices.
1025
  Obviously, doing the motion compensation, fDCT, tokenization, and then
1026
   counting the bits each token uses is computationally expensive.
1027
  Theora's EOB runs can also split the cost of these tokens across multiple
1028
   fragments, and naturally we don't know what the optimal choice of Huffman
1029
   codes will be until we know all the tokens we're going to encode in all the
1030
   fragments.
1031
  So we use a simple approach to estimating the bit cost and distortion of each
1032
   mode based upon the SATD value of the residual before coding.
1033
  The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
1034
   the process (modified somewhat from that of the paper) is very simple.
1035
  We build a non-linear regression of the mappings from
1036
   (pre-transform+quantization) SATD to (post-transform+quantization) bits and
1037
   SSD for each qi.
1038
  A separate set of mappings is kept for each quantization type and color
1039
   plane.
1040
  The mappings are constructed by partitioning the SATD values into a small
1041
   number of bins (currently 24) and using a linear regression in each bin
1042
   (as opposed to the 0th-order regression used by Kim).
1043
  The bit counts and SSD measurements are obtained by examining actual encoded
1044
   frames, with appropriate lambda values and optimal Huffman codes selected.
1045
  EOB bits are assigned to the fragment that started the EOB run (as opposed to
1046
   dividing them among all the blocks in the run; the latter approach seems
1047
   more theoretically correct, but Monty's testing showed a small improvement
1048
   with the former, though that may have been merely statistical noise).
1049
1050
  @ARTICLE{Kim03,
1051
    author="Hyun Mun Kim",
1052
    title="Adaptive Rate Control Using Nonlinear Regression",
1053
    journal="IEEE Transactions on Circuits and Systems for Video Technology",
1054
    volume=13,
1055
    number=5,
1056
    pages="432--439",
1057
    month=May,
1058
    year=2003
1059
  }*/
1060
1061
/*Computes (_ssd+_lambda*_rate)/(1<<OC_BIT_SCALE) with rounding, avoiding
   overflow for large lambda values.
  The whole-number parts of _ssd and _rate (above OC_BIT_SCALE bits) are
   combined directly, and only the low OC_BIT_SCALE bits of each are combined
   at full precision, rounded, and shifted down.
  The entire expansion is parenthesized so that the macro can be used safely
   as an operand of a larger expression (e.g., multiplied or subtracted).
  Each argument is evaluated more than once, so arguments must not have side
   effects.*/
#define OC_MODE_RD_COST(_ssd,_rate,_lambda) \
 (((_ssd)>>OC_BIT_SCALE)+((_rate)>>OC_BIT_SCALE)*(_lambda) \
 +(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \
 +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE))
1067
1068
51.4k
/*Builds the per-quantizer R-D estimation tables in _enc->mode_rd.
  For every active quantizer index, color plane, and quantization type, the
   two rows of the static mode table (SATD- or SAD-based, depending on the
   speed level) that bracket the quantizer's log_plq value are located, and
   the rate and rmse entries of every bin are linearly interpolated (or
   extrapolated) between them.
  Results are clamped to the range of a signed 16-bit value.
  _enc: The encoder context.*/
static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){
#if !defined(OC_COLLECT_METRICS)
  const
#endif
  oc_mode_rd (*oc_mode_rd_table)[3][2][OC_COMP_BINS]=
   _enc->sp_level<OC_SP_LEVEL_NOSATD?OC_MODE_RD_SATD:OC_MODE_RD_SAD;
  int qii;
#if defined(OC_COLLECT_METRICS)
  oc_enc_mode_metrics_load(_enc);
#endif
  for(qii=0;qii<_enc->state.nqis;qii++){
    int qi;
    int pli;
    int qti;
    qi=_enc->state.qis[qii];
    for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
      const oc_mode_rd *row0;
      const oc_mode_rd *row1;
      int               log_plq;
      int               logq0;
      int               modeline;
      int               bin;
      int               dx;
      int               dq;
      log_plq=_enc->log_plq[qi][pli][qti];
      /*Find the pair of rows in the mode table that bracket this quantizer.
        If it falls outside the range the table covers, a pair on the edge is
         used for linear extrapolation.
        NOTE(review): the search can stop with modeline==OC_LOGQ_BINS-1, after
         which row modeline+1 is read below; confirm the tables actually have
         that row, or that log_plq can never be that small.*/
      modeline=0;
      while(modeline<OC_LOGQ_BINS-1&&
       OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq){
        modeline++;
      }
      /*Interpolate a row for this quantizer.*/
      logq0=OC_MODE_LOGQ[modeline][pli][qti];
      dx=logq0-log_plq;
      dq=logq0-OC_MODE_LOGQ[modeline+1][pli][qti];
      /*Guard the divisions below against identical bracketing rows.*/
      if(dq==0)dq=1;
      row0=oc_mode_rd_table[modeline][pli][qti];
      row1=oc_mode_rd_table[modeline+1][pli][qti];
      for(bin=0;bin<OC_COMP_BINS;bin++){
        int y0;
        int z0;
        int dy;
        int dz;
        y0=row0[bin].rate;
        z0=row0[bin].rmse;
        dy=row1[bin].rate-y0;
        dz=row1[bin].rmse-z0;
        _enc->mode_rd[qii][pli][qti][bin].rate=
         (ogg_int16_t)OC_CLAMPI(-32768,y0+(dy*dx+(dq>>1))/dq,32767);
        _enc->mode_rd[qii][pli][qti][bin].rmse=
         (ogg_int16_t)OC_CLAMPI(-32768,z0+(dz*dx+(dq>>1))/dq,32767);
      }
    }
  }
}
1118
1119
/*Estimate the R-D cost of the DCT coefficients given the SATD of a block after
1120
   prediction.*/
1121
static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd,
1122
45.5M
 int _qii,int _pli,int _qti,int _satd){
1123
45.5M
  unsigned rmse;
1124
45.5M
  int      shift;
1125
45.5M
  int      bin;
1126
45.5M
  int      dx;
1127
45.5M
  int      y0;
1128
45.5M
  int      z0;
1129
45.5M
  int      dy;
1130
45.5M
  int      dz;
1131
  /*SATD metrics for chroma planes vary much less than luma, so we scale them
1132
     by 4 to distribute them into the mode decision bins more evenly.*/
1133
45.5M
  _satd<<=_pli+1&2;
1134
45.5M
  shift=_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_SATD_SHIFT:OC_SAD_SHIFT;
1135
45.5M
  bin=OC_MINI(_satd>>shift,OC_COMP_BINS-2);
1136
45.5M
  dx=_satd-(bin<<shift);
1137
45.5M
  y0=_enc->mode_rd[_qii][_pli][_qti][bin].rate;
1138
45.5M
  z0=_enc->mode_rd[_qii][_pli][_qti][bin].rmse;
1139
45.5M
  dy=_enc->mode_rd[_qii][_pli][_qti][bin+1].rate-y0;
1140
45.5M
  dz=_enc->mode_rd[_qii][_pli][_qti][bin+1].rmse-z0;
1141
45.5M
  rmse=OC_MAXI(z0+(dz*dx>>shift),0);
1142
45.5M
  *_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
1143
45.5M
  return OC_MAXI(y0+(dy*dx>>shift),0);
1144
45.5M
}
1145
1146
/*The smallest value activity_avg is allowed to take.
  It has to be positive: otherwise flat regions could receive a zero weight,
   which confounds analysis.
  This particular minimum is chosen so that oc_mb_masking() does not need any
   divide-by-zero checks.*/
#define OC_ACTIVITY_AVG_MIN (1<<(OC_RD_SCALE_BITS))
1151
1152
/*Measures the spatial activity of the four luma blocks of macro block _mbi in
   the input frame.
  The activity of a block is 64*sum(c*c)-sum(c)**2 over its 64 pixels, i.e.,
   4096 times the pixel variance.
  Blocks below 8<<12 are treated as flat and capped at 5<<12.
  Other blocks are tested for a dominant edge direction using four directional
   gradient sums; for edge blocks the activity is compressed towards 5<<12 with
   an exponent of 0.7.
  The edge test reads one pixel beyond the block on every side.
  NOTE(review): this presumably relies on the frame buffer being padded;
   confirm against the buffer allocation.
  _enc:      The encoder context.
  _mbi:      The index of the macro block.
  _activity: Returns the activity of each of the four luma blocks.
  Return: The sum of all 256 luma pixel values in the macro block.*/
static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
 unsigned _activity[4]){
  const unsigned char *src;
  const ptrdiff_t     *frag_buf_offs;
  const ptrdiff_t     *mb_fragis;
  unsigned             luma_sum;
  int                  ystride;
  int                  bi;
  frag_buf_offs=_enc->state.frag_buf_offs;
  mb_fragis=_enc->state.sb_maps[_mbi>>2][_mbi&3];
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ystride=_enc->state.ref_ystride[0];
  luma_sum=0;
  for(bi=0;bi<4;bi++){
    const unsigned char *row;
    ptrdiff_t            frag_offs;
    unsigned             sum;
    unsigned             sum2;
    unsigned             act;
    int                  y;
    int                  x;
    frag_offs=frag_buf_offs[mb_fragis[bi]];
    /*TODO: This could be replaced with SATD^2, since SATD has to be computed
       anyway.*/
    sum=0;
    sum2=0;
    row=src+frag_offs;
    for(y=0;y<8;y++){
      for(x=0;x<8;x++){
        unsigned c;
        c=row[x];
        sum+=c;
        sum2+=c*c;
      }
      row+=ystride;
    }
    luma_sum+=sum;
    act=(sum2<<6)-sum*sum;
    if(act<(8<<12)){
      /*The region is flat.*/
      act=OC_MINI(act,5<<12);
    }
    else{
      unsigned e1;
      unsigned e2;
      unsigned e3;
      unsigned e4;
      /*Test for an edge.
        TODO: There are probably much simpler ways to do this (e.g., it could
         probably be combined with the SATD calculation).
        Alternatively, the block could be split around the mean and the
         reduction in variance in each half computed.
        For a Gaussian source the reduction should be
         (1-2/pi) ~= 0.36338022763241865692446494650994.
        Significantly more reduction is a good indication of a bi-level image.
        That would identify, in addition to straight edges, small text regions,
         which would otherwise be classified as "texture".*/
      e1=0;
      e2=0;
      e3=0;
      e4=0;
      /*Start one pixel to the left, so that columns x, x+1, and x+2 form the
         3x3 neighborhood centered on pixel x of the block.*/
      row=src+frag_offs-1;
      for(y=0;y<8;y++){
        const unsigned char *up;
        const unsigned char *dn;
        up=row-ystride;
        dn=row+ystride;
        for(x=0;x<8;x++){
          /*Horizontal gradient.*/
          e1+=abs(((row[x+2]-row[x])<<1)+up[x+2]-up[x]+dn[x+2]-dn[x]);
          /*Vertical gradient.*/
          e2+=abs(((dn[x+1]-up[x+1])<<1)+dn[x]-up[x]+dn[x+2]-up[x+2]);
          /*The two diagonal gradients.*/
          e3+=abs(((dn[x+2]-up[x])<<1)+dn[x+1]-row[x]+row[x+2]-up[x+1]);
          e4+=abs(((dn[x]-up[x+2])<<1)+dn[x+1]-row[x+2]+row[x]-up[x+1]);
        }
        row+=ystride;
      }
      /*If the largest component of the edge energy is more than 40% of the
         total, classify the block as an edge block.*/
      if(5*OC_MAXI(OC_MAXI(e1,e2),OC_MAXI(e3,e4))>2*(e1+e2+e3+e4)){
        /*act=act_th*(act/act_th)**0.7
             =exp(log(act_th)+0.7*(log(act)-log(act_th))).
          Here act_th=5.0 and 0x394A=oc_blog32_q10(5<<12).*/
        act=oc_bexp32_q10(0x394A+(7*(oc_blog32_q10(act)-0x394A+5)/10));
      }
    }
    _activity[bi]=act;
  }
  return luma_sum;
}
1238
1239
/*Cheaper alternative to oc_mb_activity() that derives the activity of each
   luma block from its intra SATD instead of from the pixels.
  The estimate is (11*satd>>8)*satd; values below 8<<12 are treated as flat
   and capped at 5<<12.
  No edge detection is performed.
  _enc:        The encoder context (unused).
  _mbi:        The index of the macro block (unused).
  _activity:   Returns the activity of each of the four luma blocks.
  _intra_satd: The intra SATD values of the macro block's fragments; only the
                first four entries are read.*/
static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
 unsigned _activity[4],const unsigned _intra_satd[12]){
  int bi;
  for(bi=0;bi<4;bi++){
    unsigned satd;
    unsigned act;
    satd=_intra_satd[bi];
    act=((11*satd)>>8)*satd;
    if(act<(8<<12)){
      /*The region is flat.*/
      act=OC_MINI(act,5<<12);
    }
    _activity[bi]=act;
  }
}
1252
1253
/*Compute the masking scales for the blocks in a macro block.
1254
  All masking is computed from the luma blocks.
1255
  We derive scaling factors for the chroma blocks from these, and use the same
1256
   ones for all chroma blocks, regardless of the subsampling.
1257
  It's possible for luma to be perfectly flat and yet have high chroma energy,
1258
   but this is unlikely in non-artificial images, and not a case that has been
1259
   addressed by any research to my knowledge.
1260
  The output of the masking process is two scale factors, which are fed into
1261
   the various R-D optimizations.
1262
  The first, rd_scale, is applied to D in the equation
1263
    D*rd_scale+lambda*R.
1264
  This is the form that must be used to properly combine scores from multiple
1265
   blocks, and can be interpreted as scaling distortions by their visibility.
1266
  The inverse, rd_iscale, is applied to lambda in the equation
1267
    D+rd_iscale*lambda*R.
1268
  This is equivalent to the first form within a single block, but much faster
1269
   to use when evaluating many possible distortions (e.g., during actual
1270
   quantization, where separate distortions are evaluated for every
1271
   coefficient).
1272
  The two macros OC_RD_SCALE(rd_scale,d) and OC_RD_ISCALE(rd_iscale,lambda) are
1273
   used to perform the multiplications with the proper re-scaling for the range
1274
   of the scaling factors.
1275
  Many researchers apply masking values directly to the quantizers used, and
1276
   not to the R-D cost.
1277
  Since we generally use MSE for D, rd_scale must use the square of their
1278
   values to generate an equivalent effect.*/
1279
static unsigned oc_mb_masking(unsigned _rd_scale[5],unsigned _rd_iscale[5],
1280
 const ogg_uint16_t _chroma_rd_scale[2],const unsigned _activity[4],
1281
2.71M
 unsigned _activity_avg,unsigned _luma,unsigned _luma_avg){
1282
2.71M
  unsigned activity_sum;
1283
2.71M
  unsigned la;
1284
2.71M
  unsigned lb;
1285
2.71M
  unsigned d;
1286
2.71M
  int      bi;
1287
2.71M
  int      bi_min;
1288
2.71M
  int      bi_min2;
1289
  /*The ratio lb/la is meant to approximate
1290
     ((((_luma-16)/219)*(255/128))**0.649**0.4**2), which is the
1291
     effective luminance masking from~\cite{LKW06} (including the self-masking
1292
     deflator).
1293
    The following actually turns out to be a pretty good approximation for
1294
     _luma>75 or so.
1295
    For smaller values luminance does not really follow Weber's Law anyway, and
1296
     this approximation gives a much less aggressive bitrate boost in this
1297
     region.
1298
    Though some researchers claim that contrast sensitivity actually decreases
1299
     for very low luminance values, in my experience excessive brightness on
1300
     LCDs or buggy color conversions (e.g., treating Y' as full-range instead
1301
     of the CCIR 601 range) make artifacts in such regions extremely visible.
1302
    We substitute _luma_avg for 128 to allow the strength of the masking to
1303
     vary with the actual average image luminance, within certain limits (the
1304
     caller has clamped _luma_avg to the range [90,160], inclusive).
1305
    @ARTICLE{LKW06,
1306
      author="Zhen Liu and Lina J. Karam and Andrew B. Watson",
1307
      title="{JPEG2000} Encoding With Perceptual Distortion Control",
1308
      journal="{IEEE} Transactions on Image Processing",
1309
      volume=15,
1310
      number=7,
1311
      pages="1763--1778",
1312
      month=Jul,
1313
      year=2006
1314
    }*/
1315
#if 0
1316
  la=_luma+4*_luma_avg;
1317
  lb=4*_luma+_luma_avg;
1318
#else
1319
  /*Disable luminance masking.*/
1320
2.71M
  la=lb=1;
1321
2.71M
#endif
1322
2.71M
  activity_sum=0;
1323
13.5M
  for(bi=0;bi<4;bi++){
1324
10.8M
    unsigned a;
1325
10.8M
    unsigned b;
1326
10.8M
    activity_sum+=_activity[bi];
1327
    /*Apply activity masking.*/
1328
10.8M
    a=_activity[bi]+4*_activity_avg;
1329
10.8M
    b=4*_activity[bi]+_activity_avg;
1330
10.8M
    d=OC_RD_SCALE(b,1);
1331
    /*And luminance masking.*/
1332
10.8M
    d=(a+(d>>1))/d;
1333
10.8M
    _rd_scale[bi]=(d*la+(lb>>1))/lb;
1334
    /*And now the inverse.*/
1335
10.8M
    d=OC_MAXI(OC_RD_ISCALE(a,1),1);
1336
10.8M
    d=(b+(d>>1))/d;
1337
10.8M
    _rd_iscale[bi]=(d*lb+(la>>1))/la;
1338
10.8M
  }
1339
  /*Now compute scaling factors for chroma blocks.
1340
    We start by finding the two smallest iscales from the luma blocks.*/
1341
2.71M
  bi_min=_rd_iscale[1]<_rd_iscale[0];
1342
2.71M
  bi_min2=1-bi_min;
1343
8.14M
  for(bi=2;bi<4;bi++){
1344
5.42M
    if(_rd_iscale[bi]<_rd_iscale[bi_min]){
1345
399k
      bi_min2=bi_min;
1346
399k
      bi_min=bi;
1347
399k
    }
1348
5.02M
    else if(_rd_iscale[bi]<_rd_iscale[bi_min2])bi_min2=bi;
1349
5.42M
  }
1350
  /*If the minimum iscale is less than 1.0, use the second smallest instead,
1351
     and force the value to at least 1.0 (inflating chroma is a waste).*/
1352
2.71M
  if(_rd_iscale[bi_min]<(1<<OC_RD_ISCALE_BITS))bi_min=bi_min2;
1353
2.71M
  d=OC_MINI(_rd_scale[bi_min],1<<OC_RD_SCALE_BITS);
1354
2.71M
  _rd_scale[4]=OC_RD_SCALE(d,_chroma_rd_scale[0]);
1355
2.71M
  d=OC_MAXI(_rd_iscale[bi_min],1<<OC_RD_ISCALE_BITS);
1356
2.71M
  _rd_iscale[4]=OC_RD_ISCALE(d,_chroma_rd_scale[1]);
1357
2.71M
  return activity_sum;
1358
2.71M
}
1359
1360
/*Computes the intra SATD of every fragment of macro block _mbi in the input
   frame.
  The four luma blocks fill _frag_satd[0...3]; the chroma blocks listed for
   the current pixel format in OC_MB_MAP_IDXS fill the entries from 4 up to
   (but not including) OC_MB_MAP_NIDXS for that format.
  _enc:       The encoder context.
  _mbi:       The index of the macro block.
  _frag_satd: Returns the intra SATD of each fragment.
  Return: The sum of the DC values reported by oc_enc_frag_intra_satd() for
           the four luma blocks.*/
static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
 unsigned _frag_satd[12]){
  const unsigned char   *src;
  const ptrdiff_t       *frag_buf_offs;
  const ptrdiff_t       *mb_fragis;
  const oc_mb_map_plane *mb_map;
  const unsigned char   *map_idxs;
  unsigned               luma_sum;
  int                    map_nidxs;
  int                    ystride;
  int                    dc;
  int                    i;
  frag_buf_offs=_enc->state.frag_buf_offs;
  mb_fragis=_enc->state.sb_maps[_mbi>>2][_mbi&3];
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  /*Luma blocks: accumulate the DC of each one.*/
  ystride=_enc->state.ref_ystride[0];
  luma_sum=0;
  for(i=0;i<4;i++){
    ptrdiff_t frag_offs;
    frag_offs=frag_buf_offs[mb_fragis[i]];
    _frag_satd[i]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
    luma_sum+=dc;
  }
  /*Chroma blocks: the DC is computed but not used.*/
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
  ystride=_enc->state.ref_ystride[1];
  for(i=4;i<map_nidxs;i++){
    ptrdiff_t frag_offs;
    int       mapi;
    /*Each map index packs the plane in the upper bits and the block within
       the plane in the low two bits.*/
    mapi=map_idxs[i];
    frag_offs=frag_buf_offs[mb_map[mapi>>2][mapi&3]];
    _frag_satd[i]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
  }
  return luma_sum;
}
1403
1404
/*Select luma block-level quantizers for a MB in an INTRA frame.*/
1405
static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
1406
2.49M
 const oc_qii_state *_qs,unsigned _mbi,const unsigned _rd_scale[4]){
1407
2.49M
  const unsigned char *src;
1408
2.49M
  const ptrdiff_t     *frag_buf_offs;
1409
2.49M
  const oc_sb_map     *sb_maps;
1410
2.49M
  oc_fragment         *frags;
1411
2.49M
  ptrdiff_t            frag_offs;
1412
2.49M
  ptrdiff_t            fragi;
1413
2.49M
  oc_qii_state         qs[4][3];
1414
2.49M
  unsigned             cost[4][3];
1415
2.49M
  unsigned             ssd[4][3];
1416
2.49M
  unsigned             rate[4][3];
1417
2.49M
  int                  prev[3][3];
1418
2.49M
  unsigned             satd;
1419
2.49M
  int                  dc;
1420
2.49M
  unsigned             best_cost;
1421
2.49M
  unsigned             best_ssd;
1422
2.49M
  unsigned             best_rate;
1423
2.49M
  int                  best_qii;
1424
2.49M
  int                  qii;
1425
2.49M
  int                  lambda;
1426
2.49M
  int                  ystride;
1427
2.49M
  int                  nqis;
1428
2.49M
  int                  bi;
1429
2.49M
  frag_buf_offs=_enc->state.frag_buf_offs;
1430
2.49M
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1431
2.49M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1432
2.49M
  ystride=_enc->state.ref_ystride[0];
1433
2.49M
  fragi=sb_maps[_mbi>>2][_mbi&3][0];
1434
2.49M
  frag_offs=frag_buf_offs[fragi];
1435
2.49M
  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1436
2.49M
    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1437
2.49M
  }
1438
0
  else{
1439
0
    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1440
0
  }
1441
2.49M
  nqis=_enc->state.nqis;
1442
2.49M
  lambda=_enc->lambda;
1443
6.65M
  for(qii=0;qii<nqis;qii++){
1444
4.16M
    oc_qii_state_advance(qs[0]+qii,_qs,qii);
1445
4.16M
    rate[0][qii]=oc_dct_cost2(_enc,ssd[0]+qii,qii,0,0,satd)
1446
4.16M
     +(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE);
1447
4.16M
    ssd[0][qii]=OC_RD_SCALE(ssd[0][qii],_rd_scale[0]);
1448
4.16M
    cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda);
1449
4.16M
  }
1450
9.96M
  for(bi=1;bi<4;bi++){
1451
7.47M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1452
7.47M
    frag_offs=frag_buf_offs[fragi];
1453
7.47M
    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1454
7.47M
      satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1455
7.47M
    }
1456
0
    else{
1457
0
      satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1458
0
    }
1459
19.9M
    for(qii=0;qii<nqis;qii++){
1460
12.4M
      oc_qii_state qt[3];
1461
12.4M
      unsigned     cur_ssd;
1462
12.4M
      unsigned     cur_rate;
1463
12.4M
      int          best_qij;
1464
12.4M
      int          qij;
1465
12.4M
      oc_qii_state_advance(qt+0,qs[bi-1]+0,qii);
1466
12.4M
      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,0,satd);
1467
12.4M
      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
1468
12.4M
      best_ssd=ssd[bi-1][0]+cur_ssd;
1469
12.4M
      best_rate=rate[bi-1][0]+cur_rate
1470
12.4M
       +(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE);
1471
12.4M
      best_cost=OC_MODE_RD_COST(best_ssd,best_rate,lambda);
1472
12.4M
      best_qij=0;
1473
27.2M
      for(qij=1;qij<nqis;qij++){
1474
14.7M
        unsigned chain_ssd;
1475
14.7M
        unsigned chain_rate;
1476
14.7M
        unsigned chain_cost;
1477
14.7M
        oc_qii_state_advance(qt+qij,qs[bi-1]+qij,qii);
1478
14.7M
        chain_ssd=ssd[bi-1][qij]+cur_ssd;
1479
14.7M
        chain_rate=rate[bi-1][qij]+cur_rate
1480
14.7M
         +(qt[qij].bits-qs[bi-1][qij].bits<<OC_BIT_SCALE);
1481
14.7M
        chain_cost=OC_MODE_RD_COST(chain_ssd,chain_rate,lambda);
1482
14.7M
        if(chain_cost<best_cost){
1483
6.29M
          best_cost=chain_cost;
1484
6.29M
          best_ssd=chain_ssd;
1485
6.29M
          best_rate=chain_rate;
1486
6.29M
          best_qij=qij;
1487
6.29M
        }
1488
14.7M
      }
1489
12.4M
      *(qs[bi]+qii)=*(qt+best_qij);
1490
12.4M
      cost[bi][qii]=best_cost;
1491
12.4M
      ssd[bi][qii]=best_ssd;
1492
12.4M
      rate[bi][qii]=best_rate;
1493
12.4M
      prev[bi-1][qii]=best_qij;
1494
12.4M
    }
1495
7.47M
  }
1496
2.49M
  best_qii=0;
1497
2.49M
  best_cost=cost[3][0];
1498
4.16M
  for(qii=1;qii<nqis;qii++){
1499
1.66M
    if(cost[3][qii]<best_cost){
1500
567k
      best_cost=cost[3][qii];
1501
567k
      best_qii=qii;
1502
567k
    }
1503
1.66M
  }
1504
2.49M
  frags=_enc->state.frags;
1505
9.96M
  for(bi=3;;){
1506
9.96M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1507
9.96M
    frags[fragi].qii=best_qii;
1508
9.96M
    if(bi--<=0)break;
1509
7.47M
    best_qii=prev[bi][best_qii];
1510
7.47M
  }
1511
2.49M
  return best_cost;
1512
2.49M
}
1513
1514
/*Select a block-level quantizer for a single chroma block in an INTRA frame.*/
1515
static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
1516
9.28M
 const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi,unsigned _rd_scale){
1517
9.28M
  const unsigned char *src;
1518
9.28M
  oc_fragment         *frags;
1519
9.28M
  ptrdiff_t            frag_offs;
1520
9.28M
  oc_qii_state         qt[3];
1521
9.28M
  unsigned             cost[3];
1522
9.28M
  unsigned             satd;
1523
9.28M
  int                  dc;
1524
9.28M
  unsigned             best_cost;
1525
9.28M
  int                  best_qii;
1526
9.28M
  int                  qii;
1527
9.28M
  int                  lambda;
1528
9.28M
  int                  ystride;
1529
9.28M
  int                  nqis;
1530
9.28M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1531
9.28M
  ystride=_enc->state.ref_ystride[_pli];
1532
9.28M
  frag_offs=_enc->state.frag_buf_offs[_fragi];
1533
9.28M
  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1534
9.28M
    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1535
9.28M
  }
1536
0
  else{
1537
0
    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1538
0
  }
1539
  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
1540
     worth spending the bits to change the AC quantizer.
1541
    TODO: This may be worth revisiting when we separate out DC and AC
1542
     predictions from SATD.*/
1543
#if 0
1544
  nqis=_enc->state.nqis;
1545
#else
1546
9.28M
  nqis=1;
1547
9.28M
#endif
1548
9.28M
  lambda=_enc->lambda;
1549
9.28M
  best_qii=0;
1550
18.5M
  for(qii=0;qii<nqis;qii++){
1551
9.28M
    unsigned cur_rate;
1552
9.28M
    unsigned cur_ssd;
1553
9.28M
    oc_qii_state_advance(qt+qii,_qs,qii);
1554
9.28M
    cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,_pli,0,satd)
1555
9.28M
     +(qt[qii].bits-_qs->bits<<OC_BIT_SCALE);
1556
9.28M
    cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
1557
9.28M
    cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda);
1558
9.28M
  }
1559
9.28M
  best_cost=cost[0];
1560
9.28M
  for(qii=1;qii<nqis;qii++){
1561
0
    if(cost[qii]<best_cost){
1562
0
      best_cost=cost[qii];
1563
0
      best_qii=qii;
1564
0
    }
1565
0
  }
1566
9.28M
  frags=_enc->state.frags;
1567
9.28M
  frags[_fragi].qii=best_qii;
1568
9.28M
  return best_cost;
1569
9.28M
}
1570
1571
static void oc_enc_mb_transform_quantize_intra_luma(oc_enc_ctx *_enc,
1572
 oc_enc_pipeline_state *_pipe,unsigned _mbi,
1573
2.49M
 const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
1574
  /*Worst case token stack usage for 4 fragments.*/
1575
2.49M
  oc_token_checkpoint  stack[64*4];
1576
2.49M
  oc_token_checkpoint *stackptr;
1577
2.49M
  const oc_sb_map     *sb_maps;
1578
2.49M
  oc_fragment         *frags;
1579
2.49M
  ptrdiff_t           *coded_fragis;
1580
2.49M
  ptrdiff_t            ncoded_fragis;
1581
2.49M
  ptrdiff_t            fragi;
1582
2.49M
  int                  bi;
1583
2.49M
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1584
2.49M
  frags=_enc->state.frags;
1585
2.49M
  coded_fragis=_pipe->coded_fragis[0];
1586
2.49M
  ncoded_fragis=_pipe->ncoded_fragis[0];
1587
2.49M
  stackptr=stack;
1588
12.4M
  for(bi=0;bi<4;bi++){
1589
9.96M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1590
9.96M
    frags[fragi].refi=OC_FRAME_SELF;
1591
9.96M
    frags[fragi].mb_mode=OC_MODE_INTRA;
1592
9.96M
    oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
1593
9.96M
     _rd_scale[bi],_rd_iscale[bi],NULL,NULL,&stackptr);
1594
9.96M
    coded_fragis[ncoded_fragis++]=fragi;
1595
9.96M
  }
1596
2.49M
  _pipe->ncoded_fragis[0]=ncoded_fragis;
1597
2.49M
}
1598
1599
static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
1600
354k
 oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
1601
354k
  const ogg_uint16_t *mcu_rd_scale;
1602
354k
  const ogg_uint16_t *mcu_rd_iscale;
1603
354k
  const oc_sb_map    *sb_maps;
1604
354k
  ptrdiff_t          *coded_fragis;
1605
354k
  ptrdiff_t           ncoded_fragis;
1606
354k
  ptrdiff_t           froffset;
1607
354k
  int                 sbi;
1608
354k
  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
1609
354k
  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
1610
354k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1611
354k
  coded_fragis=_pipe->coded_fragis[_pli];
1612
354k
  ncoded_fragis=_pipe->ncoded_fragis[_pli];
1613
354k
  froffset=_pipe->froffset[_pli];
1614
1.86M
  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
1615
    /*Worst case token stack usage for 1 fragment.*/
1616
1.51M
    oc_token_checkpoint stack[64];
1617
1.51M
    int                 quadi;
1618
1.51M
    int                 bi;
1619
30.2M
    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1620
24.1M
      ptrdiff_t fragi;
1621
24.1M
      fragi=sb_maps[sbi][quadi][bi];
1622
24.1M
      if(fragi>=0){
1623
9.28M
        oc_token_checkpoint *stackptr;
1624
9.28M
        unsigned             rd_scale;
1625
9.28M
        unsigned             rd_iscale;
1626
9.28M
        rd_scale=mcu_rd_scale[fragi-froffset];
1627
9.28M
        rd_iscale=mcu_rd_iscale[fragi-froffset];
1628
9.28M
        oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi,rd_scale);
1629
9.28M
        stackptr=stack;
1630
9.28M
        oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
1631
9.28M
         rd_scale,rd_iscale,NULL,NULL,&stackptr);
1632
9.28M
        coded_fragis[ncoded_fragis++]=fragi;
1633
9.28M
      }
1634
24.1M
    }
1635
1.51M
  }
1636
354k
  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
1637
354k
}
1638
1639
/*Analysis stage for an INTRA frame.*/
1640
23.4k
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
1641
23.4k
  ogg_int64_t             activity_sum;
1642
23.4k
  ogg_int64_t             luma_sum;
1643
23.4k
  unsigned                activity_avg;
1644
23.4k
  unsigned                luma_avg;
1645
23.4k
  const ogg_uint16_t     *chroma_rd_scale;
1646
23.4k
  ogg_uint16_t           *mcu_rd_scale;
1647
23.4k
  ogg_uint16_t           *mcu_rd_iscale;
1648
23.4k
  const unsigned char    *map_idxs;
1649
23.4k
  int                     nmap_idxs;
1650
23.4k
  oc_sb_flags            *sb_flags;
1651
23.4k
  signed char            *mb_modes;
1652
23.4k
  const oc_mb_map        *mb_maps;
1653
23.4k
  const oc_sb_map        *sb_maps;
1654
23.4k
  oc_fragment            *frags;
1655
23.4k
  unsigned                stripe_sby;
1656
23.4k
  unsigned                mcu_nvsbs;
1657
23.4k
  int                     notstart;
1658
23.4k
  int                     notdone;
1659
23.4k
  int                     refi;
1660
23.4k
  int                     pli;
1661
23.4k
  _enc->state.frame_type=OC_INTRA_FRAME;
1662
23.4k
  oc_enc_tokenize_start(_enc);
1663
23.4k
  oc_enc_pipeline_init(_enc,&_enc->pipe);
1664
23.4k
  oc_enc_mode_rd_init(_enc);
1665
23.4k
  activity_sum=luma_sum=0;
1666
23.4k
  activity_avg=_enc->activity_avg;
1667
23.4k
  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
1668
23.4k
  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTRA_FRAME][_enc->state.qis[0]];
1669
23.4k
  mcu_rd_scale=_enc->mcu_rd_scale;
1670
23.4k
  mcu_rd_iscale=_enc->mcu_rd_iscale;
1671
  /*Choose MVs and MB modes and quantize and code luma.
1672
    Must be done in Hilbert order.*/
1673
23.4k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1674
23.4k
  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1675
23.4k
  _enc->state.ncoded_fragis[0]=0;
1676
23.4k
  _enc->state.ncoded_fragis[1]=0;
1677
23.4k
  _enc->state.ncoded_fragis[2]=0;
1678
23.4k
  sb_flags=_enc->state.sb_flags;
1679
23.4k
  mb_modes=_enc->state.mb_modes;
1680
23.4k
  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
1681
23.4k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1682
23.4k
  frags=_enc->state.frags;
1683
23.4k
  notstart=0;
1684
23.4k
  notdone=1;
1685
23.4k
  mcu_nvsbs=_enc->mcu_nvsbs;
1686
200k
  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
1687
177k
    ptrdiff_t cfroffset;
1688
177k
    unsigned  sbi;
1689
177k
    unsigned  sbi_end;
1690
177k
    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
1691
177k
    sbi_end=_enc->pipe.sbi_end[0];
1692
177k
    cfroffset=_enc->pipe.froffset[1];
1693
1.34M
    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
1694
1.16M
      int quadi;
1695
      /*Mode addressing is through Y plane, always 4 MB per SB.*/
1696
5.82M
      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
1697
2.49M
        unsigned  activity[4];
1698
2.49M
        unsigned  rd_scale[5];
1699
2.49M
        unsigned  rd_iscale[5];
1700
2.49M
        unsigned  luma;
1701
2.49M
        unsigned  mbi;
1702
2.49M
        int       mapii;
1703
2.49M
        int       mapi;
1704
2.49M
        int       bi;
1705
2.49M
        ptrdiff_t fragi;
1706
2.49M
        mbi=sbi<<2|quadi;
1707
        /*Activity masking.*/
1708
2.49M
        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
1709
2.49M
          luma=oc_mb_activity(_enc,mbi,activity);
1710
2.49M
        }
1711
0
        else{
1712
0
          unsigned intra_satd[12];
1713
0
          luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
1714
0
          oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
1715
0
          for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
1716
0
        }
1717
2.49M
        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
1718
2.49M
         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
1719
2.49M
        luma_sum+=luma;
1720
        /*Motion estimation:
1721
          We do a basic 1MV search for all macroblocks, coded or not,
1722
           keyframe or not, unless we aren't using motion estimation at all.*/
1723
2.49M
        if(!_recode&&_enc->state.curframe_num>0&&
1724
9.40k
         _enc->sp_level<OC_SP_LEVEL_NOMC&&_enc->keyframe_frequency_force>1){
1725
400
          oc_mcenc_search(_enc,mbi);
1726
400
        }
1727
2.49M
        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
1728
2.49M
          oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale);
1729
2.49M
        }
1730
2.49M
        mb_modes[mbi]=OC_MODE_INTRA;
1731
2.49M
        oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe,
1732
2.49M
         mbi,rd_scale,rd_iscale);
1733
        /*Propagate final MB mode and MVs to the chroma blocks.*/
1734
11.7M
        for(mapii=4;mapii<nmap_idxs;mapii++){
1735
9.28M
          mapi=map_idxs[mapii];
1736
9.28M
          pli=mapi>>2;
1737
9.28M
          bi=mapi&3;
1738
9.28M
          fragi=mb_maps[mbi][pli][bi];
1739
9.28M
          frags[fragi].refi=OC_FRAME_SELF;
1740
9.28M
          frags[fragi].mb_mode=OC_MODE_INTRA;
1741
9.28M
        }
1742
        /*Save masking scale factors for chroma blocks.*/
1743
7.13M
        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
1744
4.64M
          mapi=map_idxs[mapii];
1745
4.64M
          bi=mapi&3;
1746
4.64M
          fragi=mb_maps[mbi][1][bi];
1747
4.64M
          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
1748
4.64M
          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
1749
4.64M
        }
1750
2.49M
      }
1751
1.16M
    }
1752
177k
    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
1753
    /*Code chroma planes.*/
1754
531k
    for(pli=1;pli<3;pli++){
1755
354k
      oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe,
1756
354k
       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
1757
354k
      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
1758
354k
    }
1759
177k
    notstart=1;
1760
177k
  }
1761
  /*Compute the average block activity and MB luma score for the frame.*/
1762
23.4k
  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
1763
23.4k
   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
1764
23.4k
   _enc->state.fplanes[0].nfrags));
1765
23.4k
  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
1766
  /*Finish filling in the reference frame borders.*/
1767
23.4k
  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
1768
93.8k
  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
1769
23.4k
  _enc->state.ntotal_coded_fragis=_enc->state.nfrags;
1770
23.4k
}
1771
1772
1773
1774
/*Cost information about a MB mode.*/
struct oc_mode_choice{
  /*The RD cost computed from ssd and rate+overhead.*/
  unsigned      cost;
  /*The accumulated distortion estimate.*/
  unsigned      ssd;
  /*The accumulated rate estimate for the blocks.*/
  unsigned      rate;
  /*The additional rate for signaling the mode (and any motion vectors).*/
  unsigned      overhead;
  /*The quantizer index chosen for each block.
    The analysis adds 4 to an entry when it predicts the block is skipped.*/
  unsigned char qii[12];
};
1782
1783
1784
1785
3.02M
/*Recompute the RD cost of a mode choice from its current ssd, rate, and
   overhead.
  _modec:  The mode choice to update; only its cost field is written.
  _lambda: The Lagrange multiplier passed to OC_MODE_RD_COST.*/
static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
  unsigned total_rate;
  /*The overhead bits count toward the rate term of the cost.*/
  total_rate=_modec->rate+_modec->overhead;
  _modec->cost=OC_MODE_RD_COST(_modec->ssd,total_rate,_lambda);
}
1789
1790
/*A set of skip SSD's to use to disable early skipping.
  Every entry is UINT_MAX, which is never below the threshold the mode
   analysis requires before it considers skipping a block.*/
static const unsigned OC_NOSKIP[12]={
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX
};
1796
1797
/*The estimated number of bits used by a coded chroma block to specify the AC
   quantizer.
  TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression);
   measurements suggest this is in the right ballpark, but it varies somewhat
   with lambda.
  The constant is reduced to OC_BIT_SCALE+1 fractional bits, then rounded to
   OC_BIT_SCALE fractional bits.*/
#define OC_CHROMA_QII_RATE (((0xCAE00D1DU>>(31-OC_BIT_SCALE))+1)>>1)
1803
1804
static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
1805
 oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1806
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
1807
2.30M
 const unsigned _rd_scale[4],int _qti){
1808
2.30M
  oc_fr_state  fr;
1809
2.30M
  oc_qii_state qs;
1810
2.30M
  unsigned     ssd;
1811
2.30M
  unsigned     rate;
1812
2.30M
  unsigned     satd;
1813
2.30M
  unsigned     best_ssd;
1814
2.30M
  unsigned     best_rate;
1815
2.30M
  int          best_fri;
1816
2.30M
  int          best_qii;
1817
2.30M
  int          lambda;
1818
2.30M
  int          nqis;
1819
2.30M
  int          nskipped;
1820
2.30M
  int          bi;
1821
2.30M
  lambda=_enc->lambda;
1822
2.30M
  nqis=_enc->state.nqis;
1823
  /*We could do a trellis optimization here, but we don't make final skip
1824
     decisions until after transform+quantization, so the result wouldn't be
1825
     optimal anyway.
1826
    Instead we just use a greedy approach; for most SATD values, the
1827
     differences between the qiis are large enough to drown out the cost to
1828
     code the flags, anyway.*/
1829
2.30M
  *&fr=*_fr;
1830
2.30M
  *&qs=*_qs;
1831
2.30M
  ssd=rate=nskipped=0;
1832
11.5M
  for(bi=0;bi<4;bi++){
1833
9.23M
    oc_fr_state  ft[2];
1834
9.23M
    oc_qii_state qt[3];
1835
9.23M
    unsigned     best_cost;
1836
9.23M
    unsigned     cur_cost;
1837
9.23M
    unsigned     cur_ssd;
1838
9.23M
    unsigned     cur_rate;
1839
9.23M
    unsigned     cur_overhead;
1840
9.23M
    int          qii;
1841
9.23M
    satd=_frag_satd[bi];
1842
9.23M
    *(ft+0)=*&fr;
1843
9.23M
    oc_fr_code_block(ft+0);
1844
9.23M
    cur_overhead=ft[0].bits-fr.bits;
1845
9.23M
    best_rate=oc_dct_cost2(_enc,&best_ssd,0,0,_qti,satd)
1846
9.23M
     +(cur_overhead<<OC_BIT_SCALE);
1847
9.23M
    if(nqis>1){
1848
4.12M
      oc_qii_state_advance(qt+0,&qs,0);
1849
4.12M
      best_rate+=qt[0].bits-qs.bits<<OC_BIT_SCALE;
1850
4.12M
    }
1851
9.23M
    best_ssd=OC_RD_SCALE(best_ssd,_rd_scale[bi]);
1852
9.23M
    best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1853
9.23M
    best_fri=0;
1854
9.23M
    best_qii=0;
1855
14.4M
    for(qii=1;qii<nqis;qii++){
1856
5.19M
      oc_qii_state_advance(qt+qii,&qs,qii);
1857
5.19M
      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,_qti,satd)
1858
5.19M
       +(cur_overhead+qt[qii].bits-qs.bits<<OC_BIT_SCALE);
1859
5.19M
      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
1860
5.19M
      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1861
5.19M
      if(cur_cost<best_cost){
1862
1.81M
        best_cost=cur_cost;
1863
1.81M
        best_ssd=cur_ssd;
1864
1.81M
        best_rate=cur_rate;
1865
1.81M
        best_qii=qii;
1866
1.81M
      }
1867
5.19M
    }
1868
9.23M
    if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)&&nskipped<3){
1869
3.29M
      *(ft+1)=*&fr;
1870
3.29M
      oc_fr_skip_block(ft+1);
1871
3.29M
      cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE;
1872
3.29M
      cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1873
3.29M
      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda);
1874
3.29M
      if(cur_cost<=best_cost){
1875
737k
        best_ssd=cur_ssd;
1876
737k
        best_rate=cur_overhead;
1877
737k
        best_fri=1;
1878
737k
        best_qii+=4;
1879
737k
      }
1880
3.29M
    }
1881
9.23M
    rate+=best_rate;
1882
9.23M
    ssd+=best_ssd;
1883
9.23M
    *&fr=*(ft+best_fri);
1884
9.23M
    if(best_fri==0)*&qs=*(qt+best_qii);
1885
737k
    else nskipped++;
1886
9.23M
    _modec->qii[bi]=best_qii;
1887
9.23M
  }
1888
2.30M
  _modec->ssd=ssd;
1889
2.30M
  _modec->rate=rate;
1890
2.30M
}
1891
1892
static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
1893
 oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1894
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
1895
2.30M
 unsigned _rd_scale,int _qti){
1896
2.30M
  unsigned ssd;
1897
2.30M
  unsigned rate;
1898
2.30M
  unsigned satd;
1899
2.30M
  unsigned best_ssd;
1900
2.30M
  unsigned best_rate;
1901
2.30M
  int      best_qii;
1902
2.30M
  unsigned cur_cost;
1903
2.30M
  unsigned cur_ssd;
1904
2.30M
  unsigned cur_rate;
1905
2.30M
  int      lambda;
1906
2.30M
  int      nblocks;
1907
2.30M
  int      nqis;
1908
2.30M
  int      pli;
1909
2.30M
  int      bi;
1910
2.30M
  int      qii;
1911
2.30M
  lambda=_enc->lambda;
1912
  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
1913
     worth spending the bits to change the AC quantizer.
1914
    TODO: This may be worth revisiting when we separate out DC and AC
1915
     predictions from SATD.*/
1916
#if 0
1917
  nqis=_enc->state.nqis;
1918
#else
1919
2.30M
  nqis=1;
1920
2.30M
#endif
1921
2.30M
  ssd=_modec->ssd;
1922
2.30M
  rate=_modec->rate;
1923
  /*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded
1924
     order, we assume a constant overhead for coded block and qii flags.*/
1925
2.30M
  nblocks=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1926
2.30M
  nblocks=(nblocks-4>>1)+4;
1927
2.30M
  bi=4;
1928
6.92M
  for(pli=1;pli<3;pli++){
1929
9.78M
    for(;bi<nblocks;bi++){
1930
5.17M
      unsigned best_cost;
1931
5.17M
      satd=_frag_satd[bi];
1932
5.17M
      best_rate=oc_dct_cost2(_enc,&best_ssd,0,pli,_qti,satd)
1933
5.17M
       +OC_CHROMA_QII_RATE;
1934
5.17M
      best_ssd=OC_RD_SCALE(best_ssd,_rd_scale);
1935
5.17M
      best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1936
5.17M
      best_qii=0;
1937
5.17M
      for(qii=1;qii<nqis;qii++){
1938
0
        cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,pli,_qti,satd)
1939
0
         +OC_CHROMA_QII_RATE;
1940
0
        cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
1941
0
        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1942
0
        if(cur_cost<best_cost){
1943
0
          best_cost=cur_cost;
1944
0
          best_ssd=cur_ssd;
1945
0
          best_rate=cur_rate;
1946
0
          best_qii=qii;
1947
0
        }
1948
0
      }
1949
5.17M
      if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)){
1950
2.46M
        cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1951
2.46M
        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
1952
2.46M
        if(cur_cost<=best_cost){
1953
826k
          best_ssd=cur_ssd;
1954
826k
          best_rate=0;
1955
826k
          best_qii+=4;
1956
826k
        }
1957
2.46M
      }
1958
5.17M
      rate+=best_rate;
1959
5.17M
      ssd+=best_ssd;
1960
5.17M
      _modec->qii[bi]=best_qii;
1961
5.17M
    }
1962
4.61M
    nblocks=(nblocks-4<<1)+4;
1963
4.61M
  }
1964
2.30M
  _modec->ssd=ssd;
1965
2.30M
  _modec->rate=rate;
1966
2.30M
}
1967
1968
/*Compute the distortion of leaving each block of a macroblock uncoded, i.e.,
   the SSD between the input frame and the previous reference frame at the
   same position.
  The result for each block is stored both in _ssd[] and in the pipeline's
   per-plane skip_ssd array.
  _enc:      The encoder context.
  _pipe:     The pipeline state whose skip_ssd arrays are filled in.
  _mbi:      The macroblock index.
  _rd_scale: The scale factors: one per luma block, then one for chroma.
  _ssd:      Receives the scaled SSDs: luma in [0..3], chroma from [4] on in
              map index order.*/
static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
 unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){
  const unsigned char   *cur;
  const unsigned char   *prev;
  const oc_fragment     *frags;
  const ptrdiff_t       *frag_buf_offs;
  const ptrdiff_t       *luma_fragis;
  const oc_mb_map_plane *mb_map;
  const unsigned char   *map_idxs;
  oc_mv                 *mvs;
  ptrdiff_t              fragi;
  ptrdiff_t              offs;
  unsigned               skip_ssd;
  int                    stride;
  int                    idx_end;
  int                    mapii;
  int                    borderi;
  int                    pli;
  int                    bi;
  cur=_enc->state.ref_frame_data[OC_FRAME_IO];
  prev=_enc->state.ref_frame_data[OC_FRAME_PREV];
  stride=_enc->state.ref_ystride[0];
  frags=_enc->state.frags;
  frag_buf_offs=_enc->state.frag_buf_offs;
  luma_fragis=_enc->state.sb_maps[_mbi>>2][_mbi&3];
  mvs=_enc->mb_info[_mbi].block_mv;
  for(bi=0;bi<4;bi++){
    fragi=luma_fragis[bi];
    borderi=frags[fragi].borderi;
    offs=frag_buf_offs[fragi];
    /*Fragments with a border index use the masked SSD.*/
    if(borderi>=0){
      skip_ssd=oc_enc_frag_border_ssd(_enc,
       cur+offs,prev+offs,stride,_enc->state.borders[borderi].mask);
    }
    else{
      skip_ssd=oc_enc_frag_ssd(_enc,cur+offs,prev+offs,stride);
    }
    /*Scale to match DCT domain and RD.*/
    skip_ssd=OC_RD_SKIP_SCALE(skip_ssd,_rd_scale[bi]);
    /*Motion is a special case; if there is more than a full-pixel motion
       against the prior frame, penalize skipping.
      TODO: The factor of two here is a kludge, but it tested out better than a
       hard limit.*/
    if(mvs[bi]!=0)skip_ssd*=2;
    _ssd[bi]=skip_ssd;
    _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=skip_ssd;
  }
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
  idx_end=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  /*Stop at the end of the first chroma plane's entries for the first pass.*/
  idx_end=((idx_end-4)>>1)+4;
  mapii=4;
  mvs=_enc->mb_info[_mbi].unref_mv;
  for(pli=1;pli<3;pli++){
    stride=_enc->state.ref_ystride[pli];
    while(mapii<idx_end){
      fragi=mb_map[pli][map_idxs[mapii]&3];
      borderi=frags[fragi].borderi;
      offs=frag_buf_offs[fragi];
      if(borderi>=0){
        skip_ssd=oc_enc_frag_border_ssd(_enc,
         cur+offs,prev+offs,stride,_enc->state.borders[borderi].mask);
      }
      else{
        skip_ssd=oc_enc_frag_ssd(_enc,cur+offs,prev+offs,stride);
      }
      /*Scale to match DCT domain and RD.*/
      skip_ssd=OC_RD_SKIP_SCALE(skip_ssd,_rd_scale[4]);
      /*Motion is a special case; if there is more than a full-pixel motion
         against the prior frame, penalize skipping.
        TODO: The factor of two here is a kludge, but it tested out better than
         a hard limit*/
      if(mvs[OC_FRAME_PREV]!=0)skip_ssd*=2;
      _ssd[mapii]=skip_ssd;
      _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=skip_ssd;
      mapii++;
    }
    /*Extend the limit to cover the second chroma plane's entries.*/
    idx_end=((idx_end-4)<<1)+4;
  }
}
2048
2049
2050
/*Estimate the cost of coding a macroblock in INTRA mode.
  _enc:       The encoder context.
  _modec:     Receives the ssd, rate, overhead, cost, and per-block qii.
  _mbi:       The macroblock index (not used by this function).
  _fr:        The coded block flag coding state at the start of the macroblock.
  _qs:        The quantizer index coding state at the start of the macroblock.
  _frag_satd: The complexity measure of each block.
  _skip_ssd:  The distortion of each block if it is skipped.
  _rd_scale:  The scale factors: one per luma block, then one for chroma.*/
static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
 unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
 const unsigned _rd_scale[5]){
  unsigned mode_bits;
  /*The luma pass initializes the totals; the chroma pass adds to them.
    Both use quantizer type 0.*/
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,
   _frag_satd,_skip_ssd,_rd_scale,0);
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
   _frag_satd,_skip_ssd,_rd_scale[4],0);
  mode_bits=
   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE;
  _modec->overhead=mode_bits;
  oc_mode_set_cost(_modec,_enc->lambda);
}
2061
2062
static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2063
 unsigned _mbi,int _mb_mode,oc_mv _mv,
2064
 const oc_fr_state *_fr,const oc_qii_state *_qs,
2065
1.60M
 const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2066
1.60M
  unsigned               frag_satd[12];
2067
1.60M
  const unsigned char   *src;
2068
1.60M
  const unsigned char   *ref;
2069
1.60M
  int                    ystride;
2070
1.60M
  const ptrdiff_t       *frag_buf_offs;
2071
1.60M
  const ptrdiff_t       *sb_map;
2072
1.60M
  const oc_mb_map_plane *mb_map;
2073
1.60M
  const unsigned char   *map_idxs;
2074
1.60M
  int                    map_nidxs;
2075
1.60M
  int                    mapii;
2076
1.60M
  int                    mapi;
2077
1.60M
  int                    mv_offs[2];
2078
1.60M
  int                    pli;
2079
1.60M
  int                    bi;
2080
1.60M
  ptrdiff_t              fragi;
2081
1.60M
  ptrdiff_t              frag_offs;
2082
1.60M
  int                    dc;
2083
1.60M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
2084
1.60M
  ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
2085
1.60M
  ystride=_enc->state.ref_ystride[0];
2086
1.60M
  frag_buf_offs=_enc->state.frag_buf_offs;
2087
1.60M
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
2088
1.60M
  _modec->rate=_modec->ssd=0;
2089
1.60M
  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv)>1){
2090
1.68M
    for(bi=0;bi<4;bi++){
2091
1.34M
      fragi=sb_map[bi];
2092
1.34M
      frag_offs=frag_buf_offs[fragi];
2093
1.34M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2094
1.34M
        frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2095
1.34M
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2096
1.34M
        frag_satd[bi]+=abs(dc);
2097
1.34M
      }
2098
0
      else{
2099
0
        frag_satd[bi]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
2100
0
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
2101
0
      }
2102
1.34M
    }
2103
337k
  }
2104
1.26M
  else{
2105
6.34M
    for(bi=0;bi<4;bi++){
2106
5.07M
      fragi=sb_map[bi];
2107
5.07M
      frag_offs=frag_buf_offs[fragi];
2108
5.07M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2109
5.07M
        frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2110
5.07M
         ref+frag_offs+mv_offs[0],ystride);
2111
5.07M
        frag_satd[bi]+=abs(dc);
2112
5.07M
      }
2113
0
      else{
2114
0
        frag_satd[bi]=oc_enc_frag_sad(_enc,src+frag_offs,
2115
0
         ref+frag_offs+mv_offs[0],ystride);
2116
0
      }
2117
5.07M
    }
2118
1.26M
  }
2119
1.60M
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2120
1.60M
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2121
1.60M
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2122
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
2123
1.60M
  ystride=_enc->state.ref_ystride[1];
2124
1.60M
  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,_mv)>1){
2125
1.89M
    for(mapii=4;mapii<map_nidxs;mapii++){
2126
1.28M
      mapi=map_idxs[mapii];
2127
1.28M
      pli=mapi>>2;
2128
1.28M
      bi=mapi&3;
2129
1.28M
      fragi=mb_map[pli][bi];
2130
1.28M
      frag_offs=frag_buf_offs[fragi];
2131
1.28M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2132
1.28M
        frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2133
1.28M
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2134
1.28M
        frag_satd[mapii]+=abs(dc);
2135
1.28M
      }
2136
0
      else{
2137
0
        frag_satd[mapii]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
2138
0
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
2139
0
      }
2140
1.28M
    }
2141
605k
  }
2142
1.00M
  else{
2143
3.30M
    for(mapii=4;mapii<map_nidxs;mapii++){
2144
2.30M
      mapi=map_idxs[mapii];
2145
2.30M
      pli=mapi>>2;
2146
2.30M
      bi=mapi&3;
2147
2.30M
      fragi=mb_map[pli][bi];
2148
2.30M
      frag_offs=frag_buf_offs[fragi];
2149
2.30M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2150
2.30M
        frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2151
2.30M
         ref+frag_offs+mv_offs[0],ystride);
2152
2.30M
        frag_satd[mapii]+=abs(dc);
2153
2.30M
      }
2154
0
      else{
2155
0
        frag_satd[mapii]=oc_enc_frag_sad(_enc,src+frag_offs,
2156
0
         ref+frag_offs+mv_offs[0],ystride);
2157
0
      }
2158
2.30M
    }
2159
1.00M
  }
2160
1.60M
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1);
2161
1.60M
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2162
1.60M
   frag_satd,_skip_ssd,_rd_scale[4],1);
2163
1.60M
  _modec->overhead=
2164
1.60M
   oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE;
2165
1.60M
  oc_mode_set_cost(_modec,_enc->lambda);
2166
1.60M
}
2167
2168
/*Estimate the cost of coding a macroblock in an INTER mode that uses a zero
   motion vector.
  This is oc_cost_inter() with the motion vector fixed at 0; all other
   arguments are passed through unchanged.*/
static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
 unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs,
 const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,0,
   _fr,_qs,_skip_ssd,_rd_scale);
}
2173
2174
static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2175
 unsigned _mbi,int _mb_mode,oc_mv _mv,
2176
 const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12],
2177
720k
 const unsigned _rd_scale[5]){
2178
720k
  int bits0;
2179
720k
  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd,_rd_scale);
2180
720k
  bits0=OC_MV_BITS[0][OC_MV_X(_mv)+31]+OC_MV_BITS[0][OC_MV_Y(_mv)+31];
2181
720k
  _modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12)
2182
720k
   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
2183
720k
  oc_mode_set_cost(_modec,_enc->lambda);
2184
720k
  return bits0;
2185
720k
}
2186
2187
/*A mapping from oc_mb_map (raster) ordering to oc_sb_map (Hilbert) ordering.
  Each row is a permutation of the four block positions.*/
static const unsigned char OC_MB_PHASE[4][4]={
  {0,1,3,2},
  {0,3,1,2},
  {0,3,1,2},
  {2,3,1,0}
};
2191
2192
/*Estimates the cost of coding macro block _mbi in OC_MODE_INTER_MV_FOUR, with
   one motion vector per luma block.
  _mv holds the four luma block MVs in oc_mb_map (raster) order.
  As a side effect, each _mv[bi] is stored into _enc->state.frag_mvs for the
   corresponding luma fragment.
  On return _modec holds the rate, ssd, qii choices, overhead and cost for
   this mode.*/
static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
 unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs,
 const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
  unsigned               frag_satd[12];
  oc_mv                  lbmvs[4];
  oc_mv                  cbmvs[4];
  int                    mv_offs[2];
  const unsigned char   *src;
  const unsigned char   *ref;
  const unsigned char   *phase;
  const unsigned char   *map_idxs;
  const ptrdiff_t       *frag_buf_offs;
  const oc_mb_map_plane *mb_map;
  oc_mv                 *frag_mvs;
  int                    ystride;
  int                    map_nidxs;
  int                    nqis;
  int                    mapii;
  int                    bi;
  int                    bits0;
  int                    bits1;
  int                    dc;
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
  ystride=_enc->state.ref_ystride[0];
  frag_buf_offs=_enc->state.frag_buf_offs;
  frag_mvs=_enc->state.frag_mvs;
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
  /*The row of the raster->Hilbert table that applies to this macro block.*/
  phase=OC_MB_PHASE[_mbi&3];
  _modec->ssd=0;
  _modec->rate=0;
  /*Measure each luma block against the previous frame using its own MV.*/
  for(bi=0;bi<4;bi++){
    const unsigned char *blk_src;
    const unsigned char *blk_ref;
    ptrdiff_t            fragi;
    unsigned             satd;
    fragi=mb_map[0][bi];
    /*Record the block MV as the current one for now; the caller replaces it
       if 4MV mode is not ultimately chosen.*/
    frag_mvs[fragi]=_mv[bi];
    blk_src=src+frag_buf_offs[fragi];
    blk_ref=ref+frag_buf_offs[fragi];
    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv[bi])<=1){
      satd=oc_enc_frag_satd(_enc,&dc,blk_src,blk_ref+mv_offs[0],ystride);
    }
    else{
      satd=oc_enc_frag_satd2(_enc,&dc,blk_src,
       blk_ref+mv_offs[0],blk_ref+mv_offs[1],ystride);
    }
    /*The luma metrics are stored in Hilbert order.*/
    frag_satd[phase[bi]]=satd+abs(dc);
  }
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
   _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1);
  /*Blocks whose chosen qii is not a real quantizer index (>=nqis) are being
     skipped: they get a (0,0) MV and contribute no MV bits.*/
  bits0=0;
  bits1=0;
  nqis=_enc->state.nqis;
  for(bi=0;bi<4;bi++){
    if(_modec->qii[phase[bi]]<nqis){
      lbmvs[bi]=_mv[bi];
      bits0+=OC_MV_BITS[0][OC_MV_X(_mv[bi])+31]
       +OC_MV_BITS[0][OC_MV_Y(_mv[bi])+31];
      bits1+=12;
    }
    else lbmvs[bi]=0;
  }
  /*Derive the chroma MVs from the (possibly zeroed) luma MVs.*/
  (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,lbmvs);
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
  ystride=_enc->state.ref_ystride[1];
  for(mapii=4;mapii<map_nidxs;mapii++){
    const unsigned char *blk_src;
    const unsigned char *blk_ref;
    ptrdiff_t            fragi;
    unsigned             satd;
    int                  mapi;
    int                  pli;
    mapi=map_idxs[mapii];
    pli=mapi>>2;
    bi=mapi&3;
    fragi=mb_map[pli][bi];
    blk_src=src+frag_buf_offs[fragi];
    blk_ref=ref+frag_buf_offs[fragi];
    /*TODO: We could save half these calls by re-using the results for the Cb
       and Cr planes; is it worth it?*/
    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,cbmvs[bi])<=1){
      satd=oc_enc_frag_satd(_enc,&dc,blk_src,blk_ref+mv_offs[0],ystride);
    }
    else{
      satd=oc_enc_frag_satd2(_enc,&dc,blk_src,
       blk_ref+mv_offs[0],blk_ref+mv_offs[1],ystride);
    }
    frag_satd[mapii]=satd+abs(dc);
  }
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
   frag_satd,_skip_ssd,_rd_scale[4],1);
  /*Mode flag cost plus the growth in the smaller of the two running MV bit
     totals; the whole sum is scaled, which the parentheses make explicit.*/
  _modec->overhead=
   (oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR)
   +OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1)
   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1]))<<OC_BIT_SCALE;
  oc_mode_set_cost(_modec,_enc->lambda);
}
2287
2288
27.9k
/*Performs mode decision and coding for the current frame as an inter frame.
  For every valid macro block this estimates the cost of the candidate coding
   modes, keeps the cheapest, and transforms/quantizes the luma blocks; chroma
   is then coded for each stripe of super block rows.
  _enc:            The encoder context.
  _allow_keyframe: If non-zero, the cost of intra-coding every macro block is
                    also accumulated so it can be compared against the inter
                    cost at the end.
  _recode:         If non-zero, the motion search and the reset of the
                    per-MB unref_mv/refined state are skipped, so the values
                    already stored in _enc->mb_info are re-used.
  Return: 1 if _allow_keyframe is set and the estimated inter bits exceed the
           estimated intra bits (the coded MB count and compacted fragment
           list are not stored on this path), or 0 otherwise.*/
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
2289
27.9k
  oc_set_chroma_mvs_func  set_chroma_mvs;
2290
27.9k
  oc_qii_state            intra_luma_qs;
2291
27.9k
  oc_mv                   last_mv;
2292
27.9k
  oc_mv                   prior_mv;
2293
27.9k
  ogg_int64_t             interbits;
2294
27.9k
  ogg_int64_t             intrabits;
2295
27.9k
  ogg_int64_t             activity_sum;
2296
27.9k
  ogg_int64_t             luma_sum;
2297
27.9k
  unsigned                activity_avg;
2298
27.9k
  unsigned                luma_avg;
2299
27.9k
  const ogg_uint16_t     *chroma_rd_scale;
2300
27.9k
  ogg_uint16_t           *mcu_rd_scale;
2301
27.9k
  ogg_uint16_t           *mcu_rd_iscale;
2302
27.9k
  const unsigned char    *map_idxs;
2303
27.9k
  int                     nmap_idxs;
2304
27.9k
  unsigned               *coded_mbis;
2305
27.9k
  unsigned               *uncoded_mbis;
2306
27.9k
  size_t                  ncoded_mbis;
2307
27.9k
  size_t                  nuncoded_mbis;
2308
27.9k
  oc_sb_flags            *sb_flags;
2309
27.9k
  signed char            *mb_modes;
2310
27.9k
  const oc_sb_map        *sb_maps;
2311
27.9k
  const oc_mb_map        *mb_maps;
2312
27.9k
  oc_mb_enc_info         *embs;
2313
27.9k
  oc_fragment            *frags;
2314
27.9k
  oc_mv                  *frag_mvs;
2315
27.9k
  unsigned                stripe_sby;
2316
27.9k
  unsigned                mcu_nvsbs;
2317
27.9k
  int                     notstart;
2318
27.9k
  int                     notdone;
2319
27.9k
  unsigned                sbi;
2320
27.9k
  unsigned                sbi_end;
2321
27.9k
  int                     refi;
2322
27.9k
  int                     pli;
2323
27.9k
  int                     sp_level;
2324
27.9k
  sp_level=_enc->sp_level;
2325
27.9k
  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
2326
27.9k
  _enc->state.frame_type=OC_INTER_FRAME;
2327
27.9k
  oc_mode_scheme_chooser_reset(&_enc->chooser);
2328
27.9k
  oc_enc_tokenize_start(_enc);
2329
27.9k
  oc_enc_pipeline_init(_enc,&_enc->pipe);
2330
27.9k
  oc_enc_mode_rd_init(_enc);
2331
27.9k
  if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
2332
27.9k
  _enc->mv_bits[0]=_enc->mv_bits[1]=0;
2333
27.9k
  interbits=intrabits=0;
2334
27.9k
  activity_sum=luma_sum=0;
2335
27.9k
  activity_avg=_enc->activity_avg;
2336
27.9k
  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
2337
27.9k
  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTER_FRAME][_enc->state.qis[0]];
2338
27.9k
  mcu_rd_scale=_enc->mcu_rd_scale;
2339
27.9k
  mcu_rd_iscale=_enc->mcu_rd_iscale;
2340
27.9k
  last_mv=prior_mv=0;
2341
  /*Choose MVs and MB modes and quantize and code luma.
2342
    Must be done in Hilbert order.*/
2343
27.9k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2344
27.9k
  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2345
27.9k
  coded_mbis=_enc->coded_mbis;
2346
27.9k
  /*Uncoded MB indices share the coded_mbis buffer: they are written downward
     starting just below coded_mbis+nmbs (see the store through
     uncoded_mbis-++nuncoded_mbis further down).*/
  uncoded_mbis=coded_mbis+_enc->state.nmbs;
2347
27.9k
  ncoded_mbis=0;
2348
27.9k
  nuncoded_mbis=0;
2349
27.9k
  _enc->state.ncoded_fragis[0]=0;
2350
27.9k
  _enc->state.ncoded_fragis[1]=0;
2351
27.9k
  _enc->state.ncoded_fragis[2]=0;
2352
27.9k
  sb_flags=_enc->state.sb_flags;
2353
27.9k
  mb_modes=_enc->state.mb_modes;
2354
27.9k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
2355
27.9k
  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
2356
27.9k
  embs=_enc->mb_info;
2357
27.9k
  frags=_enc->state.frags;
2358
27.9k
  frag_mvs=_enc->state.frag_mvs;
2359
27.9k
  notstart=0;
2360
27.9k
  notdone=1;
2361
27.9k
  mcu_nvsbs=_enc->mcu_nvsbs;
2362
57.6k
  /*Walk the frame in stripes of mcu_nvsbs super block rows; the loop ends
     once oc_enc_pipeline_set_stripe() returns 0.*/
  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
2363
29.7k
    ptrdiff_t cfroffset;
2364
29.7k
    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
2365
29.7k
    sbi_end=_enc->pipe.sbi_end[0];
2366
29.7k
    cfroffset=_enc->pipe.froffset[1];
2367
97.8k
    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
2368
68.1k
      int quadi;
2369
      /*Mode addressing is through Y plane, always 4 MB per SB.*/
2370
340k
      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
2371
221k
        oc_mode_choice modes[8];
2372
221k
        unsigned       activity[4];
2373
221k
        unsigned       rd_scale[5];
2374
221k
        unsigned       rd_iscale[5];
2375
221k
        unsigned       skip_ssd[12];
2376
221k
        unsigned       intra_satd[12];
2377
221k
        unsigned       luma;
2378
221k
        int            mb_mv_bits_0;
2379
221k
        int            mb_gmv_bits_0;
2380
221k
        int            inter_mv_pref;
2381
221k
        int            mb_mode;
2382
221k
        int            refi;
2383
221k
        int            mv;
2384
221k
        unsigned       mbi;
2385
221k
        int            mapii;
2386
221k
        int            mapi;
2387
221k
        int            bi;
2388
221k
        ptrdiff_t      fragi;
2389
221k
        /*The MB index is the SB index times four plus the quadrant.*/
        mbi=sbi<<2|quadi;
2390
221k
        luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
2391
        /*Activity masking.*/
2392
221k
        if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
2393
221k
          oc_mb_activity(_enc,mbi,activity);
2394
221k
        }
2395
0
        else oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
2396
221k
        luma_sum+=luma;
2397
221k
        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
2398
221k
         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
2399
        /*Motion estimation:
2400
          We always do a basic 1MV search for all macroblocks, coded or not,
2401
           keyframe or not.*/
2402
221k
        if(!_recode&&sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
2403
221k
        mv=0;
2404
        /*Find the block choice with the lowest estimated coding cost.
2405
          If a Cb or Cr block is coded but no Y' block from a macro block then
2406
           the mode MUST be OC_MODE_INTER_NOMV.
2407
          This is the default state to which the mode data structure is
2408
           initialised in encoder and decoder at the start of each frame.*/
2409
        /*Block coding cost is estimated from correlated SATD metrics.*/
2410
        /*At this point, all blocks that are in frame are still marked coded.*/
2411
221k
        if(!_recode){
2412
168k
          embs[mbi].unref_mv[OC_FRAME_GOLD]=
2413
168k
           embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
2414
168k
          embs[mbi].unref_mv[OC_FRAME_PREV]=
2415
168k
           embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2416
168k
          embs[mbi].refined=0;
2417
168k
        }
2418
        /*Estimate the cost of coding this MB in a keyframe.*/
2419
221k
        if(_allow_keyframe){
2420
221k
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2421
221k
           _enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
2422
221k
          intrabits+=modes[OC_MODE_INTRA].rate;
2423
1.10M
          for(bi=0;bi<4;bi++){
2424
886k
            oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
2425
886k
             modes[OC_MODE_INTRA].qii[bi]);
2426
886k
          }
2427
221k
        }
2428
        /*Estimate the cost in a delta frame for various modes.*/
2429
221k
        oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd);
2430
221k
        if(sp_level<OC_SP_LEVEL_NOMC){
2431
221k
          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
2432
221k
           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2433
221k
           skip_ssd,rd_scale);
2434
221k
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2435
221k
           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
2436
221k
          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
2437
221k
           OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
2438
221k
           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2439
221k
          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
2440
221k
           OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2441
221k
           skip_ssd,rd_scale);
2442
221k
          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
2443
221k
           OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2444
221k
           skip_ssd,rd_scale);
2445
221k
          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
2446
221k
           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2447
221k
           skip_ssd,rd_scale);
2448
221k
          mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
2449
221k
           OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
2450
221k
           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2451
          /*The explicit MV modes (2,6,7) have not yet gone through halfpel
2452
             refinement.
2453
            We choose the explicit MV mode that's already furthest ahead on
2454
             R-D cost and refine only that one.
2455
            We have to be careful to remember which ones we've refined so that
2456
             we don't refine it again if we re-encode this frame.*/
2457
221k
          inter_mv_pref=_enc->lambda*3;
2458
221k
          if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
2459
221k
            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
2460
221k
             embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2461
221k
             skip_ssd,rd_scale);
2462
221k
          }
2463
0
          else{
2464
0
            modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX;
2465
0
          }
2466
221k
          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
2467
51.8k
           modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
2468
36.8k
            /*The bits of refined record which refinements have already been
               run for this MB: 0x80 for 4MV, 0x40 for the golden frame MV,
               and 0x04 for the previous frame MV.*/
            if(!(embs[mbi].refined&0x80)){
2469
27.1k
              oc_mcenc_refine4mv(_enc,mbi);
2470
27.1k
              embs[mbi].refined|=0x80;
2471
27.1k
            }
2472
36.8k
            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
2473
36.8k
             embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2474
36.8k
             skip_ssd,rd_scale);
2475
36.8k
          }
2476
184k
          else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
2477
184k
           modes[OC_MODE_INTER_MV].cost){
2478
55.6k
            if(!(embs[mbi].refined&0x40)){
2479
47.6k
              oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
2480
47.6k
              embs[mbi].refined|=0x40;
2481
47.6k
            }
2482
55.6k
            mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
2483
55.6k
             OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
2484
55.6k
             _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2485
55.6k
          }
2486
221k
          if(!(embs[mbi].refined&0x04)){
2487
168k
            oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
2488
168k
            embs[mbi].refined|=0x04;
2489
168k
          }
2490
221k
          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
2491
221k
           OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
2492
221k
           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2493
          /*Finally, pick the mode with the cheapest estimated R-D cost.*/
2494
221k
          mb_mode=OC_MODE_INTER_NOMV;
2495
221k
          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
2496
172k
            mb_mode=OC_MODE_INTRA;
2497
172k
          }
2498
221k
          if(modes[OC_MODE_INTER_MV_LAST].cost<modes[mb_mode].cost){
2499
24.6k
            mb_mode=OC_MODE_INTER_MV_LAST;
2500
24.6k
          }
2501
221k
          if(modes[OC_MODE_INTER_MV_LAST2].cost<modes[mb_mode].cost){
2502
6.27k
            mb_mode=OC_MODE_INTER_MV_LAST2;
2503
6.27k
          }
2504
221k
          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
2505
4.35k
            mb_mode=OC_MODE_GOLDEN_NOMV;
2506
4.35k
          }
2507
221k
          if(modes[OC_MODE_GOLDEN_MV].cost<modes[mb_mode].cost){
2508
22.1k
            mb_mode=OC_MODE_GOLDEN_MV;
2509
22.1k
          }
2510
221k
          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[mb_mode].cost){
2511
11.1k
            mb_mode=OC_MODE_INTER_MV_FOUR;
2512
11.1k
          }
2513
          /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
2514
221k
          if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
2515
20.2k
            inter_mv_pref=0;
2516
20.2k
          }
2517
221k
          if(modes[OC_MODE_INTER_MV].cost<modes[mb_mode].cost+inter_mv_pref){
2518
18.0k
            mb_mode=OC_MODE_INTER_MV;
2519
18.0k
          }
2520
221k
        }
2521
0
        else{
2522
0
          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
2523
0
           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2524
0
           skip_ssd,rd_scale);
2525
0
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2526
0
           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
2527
0
          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
2528
0
           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2529
0
           skip_ssd,rd_scale);
2530
0
          mb_mode=OC_MODE_INTER_NOMV;
2531
0
          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
2532
0
            mb_mode=OC_MODE_INTRA;
2533
0
          }
2534
0
          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
2535
0
            mb_mode=OC_MODE_GOLDEN_NOMV;
2536
0
          }
2537
0
          mb_mv_bits_0=mb_gmv_bits_0=0;
2538
0
        }
2539
221k
        mb_modes[mbi]=mb_mode;
2540
        /*Propagate the MVs to the luma blocks.*/
2541
221k
        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
2542
214k
          switch(mb_mode){
2543
18.0k
            case OC_MODE_INTER_MV:{
2544
18.0k
              mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2545
18.0k
            }break;
2546
14.0k
            case OC_MODE_INTER_MV_LAST:mv=last_mv;break;
2547
3.36k
            case OC_MODE_INTER_MV_LAST2:mv=prior_mv;break;
2548
14.0k
            case OC_MODE_GOLDEN_MV:{
2549
14.0k
              mv=embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
2550
14.0k
            }break;
2551
214k
          }
2552
1.07M
          for(bi=0;bi<4;bi++){
2553
856k
            fragi=mb_maps[mbi][0][bi];
2554
856k
            frag_mvs[fragi]=mv;
2555
856k
          }
2556
214k
        }
2557
1.10M
        for(bi=0;bi<4;bi++){
2558
886k
          fragi=sb_maps[mbi>>2][mbi&3][bi];
2559
886k
          frags[fragi].qii=modes[mb_mode].qii[bi];
2560
886k
        }
2561
221k
        if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi,
2562
221k
         modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){
2563
180k
          int orig_mb_mode;
2564
180k
          orig_mb_mode=mb_mode;
2565
180k
          /*Re-read the mode from mb_modes[]: the INTER_MV case below handles
             orig_mb_mode==OC_MODE_INTER_MV_FOUR, so the stored mode can differ
             from the one chosen above.
            NOTE(review): presumably it is changed inside
             oc_enc_mb_transform_quantize_inter_luma(); confirm there.*/
          mb_mode=mb_modes[mbi];
2566
180k
          refi=OC_FRAME_FOR_MODE(mb_mode);
2567
180k
          switch(mb_mode){
2568
9.02k
            case OC_MODE_INTER_MV:{
2569
9.02k
              prior_mv=last_mv;
2570
              /*If we're backing out from 4MV, find the MV we're actually
2571
                 using.*/
2572
9.02k
              if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
2573
428
                for(bi=0;;bi++){
2574
428
                  fragi=mb_maps[mbi][0][bi];
2575
428
                  if(frags[fragi].coded){
2576
161
                    mv=last_mv=frag_mvs[fragi];
2577
161
                    break;
2578
161
                  }
2579
428
                }
2580
161
                mb_mv_bits_0=OC_MV_BITS[0][OC_MV_X(mv)+31]
2581
161
                 +OC_MV_BITS[0][OC_MV_Y(mv)+31];
2582
161
              }
2583
              /*Otherwise we used the original analysis MV.*/
2584
8.86k
              else last_mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2585
9.02k
              _enc->mv_bits[0]+=mb_mv_bits_0;
2586
9.02k
              _enc->mv_bits[1]+=12;
2587
9.02k
            }break;
2588
2.82k
            case OC_MODE_INTER_MV_LAST2:{
2589
2.82k
              oc_mv tmp_mv;
2590
2.82k
              tmp_mv=prior_mv;
2591
2.82k
              prior_mv=last_mv;
2592
2.82k
              last_mv=tmp_mv;
2593
2.82k
            }break;
2594
7.30k
            case OC_MODE_GOLDEN_MV:{
2595
7.30k
              _enc->mv_bits[0]+=mb_gmv_bits_0;
2596
7.30k
              _enc->mv_bits[1]+=12;
2597
7.30k
            }break;
2598
5.46k
            case OC_MODE_INTER_MV_FOUR:{
2599
5.46k
              oc_mv lbmvs[4];
2600
5.46k
              oc_mv cbmvs[4];
2601
5.46k
              prior_mv=last_mv;
2602
27.3k
              for(bi=0;bi<4;bi++){
2603
21.8k
                fragi=mb_maps[mbi][0][bi];
2604
21.8k
                if(frags[fragi].coded){
2605
20.5k
                  lbmvs[bi]=last_mv=frag_mvs[fragi];
2606
20.5k
                  _enc->mv_bits[0]+=OC_MV_BITS[0][OC_MV_X(last_mv)+31]
2607
20.5k
                   +OC_MV_BITS[0][OC_MV_Y(last_mv)+31];
2608
20.5k
                  _enc->mv_bits[1]+=12;
2609
20.5k
                }
2610
                /*Replace the block MVs for not-coded blocks with (0,0).*/
2611
1.27k
                else lbmvs[bi]=0;
2612
21.8k
              }
2613
5.46k
              (*set_chroma_mvs)(cbmvs,lbmvs);
2614
17.2k
              for(mapii=4;mapii<nmap_idxs;mapii++){
2615
11.7k
                mapi=map_idxs[mapii];
2616
11.7k
                pli=mapi>>2;
2617
11.7k
                bi=mapi&3;
2618
11.7k
                fragi=mb_maps[mbi][pli][bi];
2619
11.7k
                frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
2620
11.7k
                frags[fragi].refi=refi;
2621
11.7k
                frags[fragi].mb_mode=mb_mode;
2622
11.7k
                frag_mvs[fragi]=cbmvs[bi];
2623
11.7k
              }
2624
5.46k
            }break;
2625
180k
          }
2626
180k
          coded_mbis[ncoded_mbis++]=mbi;
2627
180k
          oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
2628
180k
          interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
2629
180k
        }
2630
40.6k
        else{
2631
40.6k
          /*Nothing was coded: record the MB in the downward-growing uncoded
             list and use OC_MODE_INTER_NOMV with a zero MV for the chroma
             propagation below.*/
          *(uncoded_mbis-++nuncoded_mbis)=mbi;
2632
40.6k
          mb_mode=OC_MODE_INTER_NOMV;
2633
40.6k
          refi=OC_FRAME_PREV;
2634
40.6k
          mv=0;
2635
40.6k
        }
2636
        /*Propagate final MB mode and MVs to the chroma blocks.
2637
          This has already been done for 4MV mode, since it requires individual
2638
           block motion vectors.*/
2639
221k
        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
2640
700k
          for(mapii=4;mapii<nmap_idxs;mapii++){
2641
484k
            mapi=map_idxs[mapii];
2642
484k
            pli=mapi>>2;
2643
484k
            bi=mapi&3;
2644
484k
            fragi=mb_maps[mbi][pli][bi];
2645
            /*If we switched from 4MV mode to INTER_MV mode, then the qii
2646
               values won't have been chosen with the right MV, but it's
2647
               probably not worth re-estimating them.*/
2648
484k
            frags[fragi].qii=modes[mb_mode].qii[mapii];
2649
484k
            frags[fragi].refi=refi;
2650
484k
            frags[fragi].mb_mode=mb_mode;
2651
484k
            frag_mvs[fragi]=mv;
2652
484k
          }
2653
216k
        }
2654
        /*Save masking scale factors for chroma blocks.*/
2655
469k
        /*Only the first half of the chroma map entries is visited and the
           fragment is looked up in plane 1; the scale factors are stored at
           an index relative to cfroffset (_enc->pipe.froffset[1]).*/
        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
2656
248k
          mapi=map_idxs[mapii];
2657
248k
          bi=mapi&3;
2658
248k
          fragi=mb_maps[mbi][1][bi];
2659
248k
          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
2660
248k
          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
2661
248k
        }
2662
221k
      }
2663
68.1k
      oc_fr_state_flush_sb(_enc->pipe.fr+0);
2664
68.1k
      sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full;
2665
68.1k
      sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial;
2666
68.1k
    }
2667
29.7k
    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
2668
    /*Code chroma planes.*/
2669
89.1k
    for(pli=1;pli<3;pli++){
2670
59.4k
      oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe,
2671
59.4k
       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
2672
59.4k
      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
2673
59.4k
    }
2674
29.7k
    notstart=1;
2675
29.7k
  }
2676
  /*Update the average block activity and MB luma score for the frame.
2677
    We could use a Bessel follower here, but fast reaction is probably almost
2678
     always best.*/
2679
27.9k
  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
2680
27.9k
   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
2681
27.9k
   _enc->state.fplanes[0].nfrags));
2682
27.9k
  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
2683
  /*Finish filling in the reference frame borders.*/
2684
27.9k
  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
2685
111k
  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
2686
  /*Finish adding flagging overhead costs to inter bit counts to determine if
2687
     we should have coded a key frame instead.*/
2688
27.9k
  if(_allow_keyframe){
2689
    /*Technically the chroma plane counts are over-estimations, because they
2690
       don't account for continuing runs from the luma planes, but the
2691
       inaccuracy is small.
2692
      We don't need to add the luma plane coding flag costs, because they are
2693
       already included in the MB rate estimates.*/
2694
83.8k
    for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE;
2695
27.9k
    /*Inter coding is estimated to cost more than intra coding: return before
       the coded MB count and fragment list below are stored.*/
    if(interbits>intrabits)return 1;
2696
27.9k
  }
2697
12.9k
  _enc->ncoded_mbis=ncoded_mbis;
2698
  /*Compact the coded fragment list.*/
2699
12.9k
  {
2700
12.9k
    ptrdiff_t ncoded_fragis;
2701
12.9k
    ncoded_fragis=_enc->state.ncoded_fragis[0];
2702
38.8k
    for(pli=1;pli<3;pli++){
2703
25.9k
      memmove(_enc->state.coded_fragis+ncoded_fragis,
2704
25.9k
       _enc->state.coded_fragis+_enc->state.fplanes[pli].froffset,
2705
25.9k
       _enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis));
2706
25.9k
      ncoded_fragis+=_enc->state.ncoded_fragis[pli];
2707
25.9k
    }
2708
12.9k
    _enc->state.ntotal_coded_fragis=ncoded_fragis;
2709
12.9k
  }
2710
12.9k
  return 0;
2711
27.9k
}