Coverage Report

Created: 2025-11-16 07:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/theora/lib/analyze.c
Line
Count
Source
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009,2025           *
9
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function: mode selection code
14
15
 ********************************************************************/
16
#include <limits.h>
17
#include <string.h>
18
#include "encint.h"
19
#include "modedec.h"
20
#if defined(OC_COLLECT_METRICS)
21
# include "collect.c"
22
#endif
23
24
25
26
typedef struct oc_rd_metric          oc_rd_metric;
27
typedef struct oc_mode_choice        oc_mode_choice;
28
29
30
31
/*There are 8 possible schemes used to encode macro block modes.
32
  Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
33
  The same set of Huffman codes is used for each of these 7 schemes, but the
34
   mode assigned to each codeword varies.
35
  Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream,
36
   while schemes 1-6 have a fixed mapping.
37
  Scheme 7 just encodes each mode directly in 3 bits.*/
38
39
/*The mode orderings for the various mode coding schemes.
40
  Scheme 0 uses a custom alphabet, which is not stored in this table.
41
  This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
42
   decoder.*/
43
static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
44
  /*Last MV dominates.*/
45
  /*L P M N I G GM 4*/
46
  {3,4,2,0,1,5,6,7},
47
  /*L P N M I G GM 4*/
48
  {2,4,3,0,1,5,6,7},
49
  /*L M P N I G GM 4*/
50
  {3,4,1,0,2,5,6,7},
51
  /*L M N P I G GM 4*/
52
  {2,4,1,0,3,5,6,7},
53
  /*No MV dominates.*/
54
  /*N L P M I G GM 4*/
55
  {0,4,3,1,2,5,6,7},
56
  /*N G L P M I GM 4*/
57
  {0,5,4,2,3,1,6,7},
58
  /*Default ordering.*/
59
  /*N I M L P G GM 4*/
60
  {0,1,2,3,4,5,6,7}
61
};
62
63
64
65
/*Initialize the mode scheme chooser.
66
  This need only be called once per encoder.*/
67
3.32k
void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
68
3.32k
  int si;
69
3.32k
  _chooser->mode_ranks[0]=_chooser->scheme0_ranks;
70
26.5k
  for(si=1;si<8;si++)_chooser->mode_ranks[si]=OC_MODE_RANKS[si-1];
71
3.32k
}
72
73
/*Reset the mode scheme chooser.
74
  This needs to be called once for each frame, including the first.*/
75
29.1k
static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
76
29.1k
  int si;
77
29.1k
  memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
78
  /*Scheme 0 starts with 24 bits to store the mode list in.*/
79
29.1k
  _chooser->scheme_bits[0]=24;
80
29.1k
  memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
81
262k
  for(si=0;si<8;si++){
82
    /*Scheme 7 should always start first, and scheme 0 should always start
83
       last.*/
84
232k
    _chooser->scheme_list[si]=7-si;
85
232k
    _chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
86
232k
  }
87
29.1k
}
88
89
/*Return the cost of coding _mb_mode in the specified scheme.*/
90
static int oc_mode_scheme_chooser_scheme_mb_cost(
91
9.80M
 const oc_mode_scheme_chooser *_chooser,int _scheme,int _mb_mode){
92
9.80M
  int codebook;
93
9.80M
  int ri;
94
9.80M
  codebook=_scheme+1>>3;
95
  /*For any scheme except 0, we can just use the bit cost of the mode's rank
96
     in that scheme.*/
97
9.80M
  ri=_chooser->mode_ranks[_scheme][_mb_mode];
98
9.80M
  if(_scheme==0){
99
1.73M
    int mc;
100
    /*For scheme 0, incrementing the mode count could potentially change the
101
       mode's rank.
102
      Find the index where the mode would be moved to in the optimal list,
103
       and use its bit cost instead of the one for the mode's current
104
       position in the list.*/
105
    /*We don't actually reorder the list; this is for computing opportunity
106
       cost, not an update.*/
107
1.73M
    mc=_chooser->mode_counts[_mb_mode];
108
4.89M
    while(ri>0&&mc>=_chooser->mode_counts[_chooser->scheme0_list[ri-1]])ri--;
109
1.73M
  }
110
9.80M
  return OC_MODE_BITS[codebook][ri];
111
9.80M
}
112
113
/*This is the real purpose of this data structure: not actually selecting a
114
   mode scheme, but estimating the cost of coding a given mode given all the
115
   modes selected so far.
116
  This is done via opportunity cost: the cost is defined as the number of bits
117
   required to encode all the modes selected so far including the current one
118
   using the best possible scheme, minus the number of bits required to encode
119
   all the modes selected so far not including the current one using the best
120
   possible scheme.
121
  The computational expense of doing this probably makes it overkill.
122
  Just be happy we take a greedy approach instead of trying to solve the
123
   global mode-selection problem (which is NP-hard).
124
  _mb_mode: The mode to determine the cost of.
125
  Return: The number of bits required to code this mode.*/
126
static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
127
3.16M
 int _mb_mode){
128
3.16M
  int scheme0;
129
3.16M
  int scheme1;
130
3.16M
  int best_bits;
131
3.16M
  int mode_bits;
132
3.16M
  int si;
133
3.16M
  int scheme0_bits;
134
3.16M
  int scheme1_bits;
135
3.16M
  scheme0=_chooser->scheme_list[0];
136
3.16M
  scheme1=_chooser->scheme_list[1];
137
3.16M
  scheme0_bits=_chooser->scheme_bits[scheme0];
138
3.16M
  scheme1_bits=_chooser->scheme_bits[scheme1];
139
3.16M
  mode_bits=oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme0,_mb_mode);
140
  /*Typical case: If the difference between the best scheme and the next best
141
     is greater than 6 bits, then adding just one mode cannot change which
142
     scheme we use.*/
143
3.16M
  if(scheme1_bits-scheme0_bits>6)return mode_bits;
144
  /*Otherwise, check to see if adding this mode selects a different scheme as
145
     the best.*/
146
1.30M
  si=1;
147
1.30M
  best_bits=scheme0_bits+mode_bits;
148
6.64M
  do{
149
6.64M
    int cur_bits;
150
6.64M
    cur_bits=scheme1_bits+
151
6.64M
     oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme1,_mb_mode);
152
6.64M
    if(cur_bits<best_bits)best_bits=cur_bits;
153
6.64M
    if(++si>=8)break;
154
6.64M
    scheme1=_chooser->scheme_list[si];
155
6.64M
    scheme1_bits=_chooser->scheme_bits[scheme1];
156
6.64M
  }
157
6.64M
  while(scheme1_bits-scheme0_bits<=6);
158
1.30M
  return best_bits-scheme0_bits;
159
3.16M
}
160
161
/*Incrementally update the mode counts and per-scheme bit counts and re-order
162
   the scheme lists once a mode has been selected.
163
  _mb_mode: The mode that was chosen.*/
164
static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
165
249k
 int _mb_mode){
166
249k
  int ri;
167
249k
  int si;
168
249k
  _chooser->mode_counts[_mb_mode]++;
169
  /*Re-order the scheme0 mode list if necessary.*/
170
325k
  for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){
171
109k
    int pmode;
172
109k
    pmode=_chooser->scheme0_list[ri-1];
173
109k
    if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break;
174
    /*Reorder the mode ranking.*/
175
76.6k
    _chooser->scheme0_ranks[pmode]++;
176
76.6k
    _chooser->scheme0_list[ri]=pmode;
177
76.6k
  }
178
249k
  _chooser->scheme0_ranks[_mb_mode]=ri;
179
249k
  _chooser->scheme0_list[ri]=_mb_mode;
180
  /*Now add the bit cost for the mode to each scheme.*/
181
2.24M
  for(si=0;si<8;si++){
182
1.99M
    _chooser->scheme_bits[si]+=
183
1.99M
     OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]];
184
1.99M
  }
185
  /*Finally, re-order the list of schemes.*/
186
1.99M
  for(si=1;si<8;si++){
187
1.74M
    int sj;
188
1.74M
    int scheme0;
189
1.74M
    int bits0;
190
1.74M
    sj=si;
191
1.74M
    scheme0=_chooser->scheme_list[si];
192
1.74M
    bits0=_chooser->scheme_bits[scheme0];
193
1.92M
    do{
194
1.92M
      int scheme1;
195
1.92M
      scheme1=_chooser->scheme_list[sj-1];
196
1.92M
      if(bits0>=_chooser->scheme_bits[scheme1])break;
197
191k
      _chooser->scheme_list[sj]=scheme1;
198
191k
    }
199
1.74M
    while(--sj>0);
200
1.74M
    _chooser->scheme_list[sj]=scheme0;
201
1.74M
  }
202
249k
}
203
204
205
206
/*The number of bits required to encode a super block run.
207
  _run_count: The desired run count; must be positive and less than 4130.*/
208
169M
static int oc_sb_run_bits(int _run_count){
209
169M
  int i;
210
620M
  for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
211
169M
  return OC_SB_RUN_CODE_NBITS[i];
212
169M
}
213
214
/*The number of bits required to encode a block run.
215
  _run_count: The desired run count; must be positive and less than 30.*/
216
20.2M
static int oc_block_run_bits(int _run_count){
217
20.2M
  return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
218
20.2M
}
219
220
221
222
147k
static void oc_fr_state_init(oc_fr_state *_fr){
223
147k
  _fr->bits=0;
224
147k
  _fr->sb_partial_count=0;
225
147k
  _fr->sb_full_count=0;
226
147k
  _fr->b_coded_count_prev=0;
227
147k
  _fr->b_coded_count=0;
228
147k
  _fr->b_count=0;
229
147k
  _fr->sb_prefer_partial=0;
230
147k
  _fr->sb_bits=0;
231
147k
  _fr->sb_partial=-1;
232
147k
  _fr->sb_full=-1;
233
147k
  _fr->b_coded_prev=-1;
234
147k
  _fr->b_coded=-1;
235
147k
}
236
237
238
static int oc_fr_state_sb_cost(const oc_fr_state *_fr,
239
10.5M
 int _sb_partial,int _sb_full){
240
10.5M
  int bits;
241
10.5M
  int sb_partial_count;
242
10.5M
  int sb_full_count;
243
10.5M
  bits=0;
244
10.5M
  sb_partial_count=_fr->sb_partial_count;
245
  /*Extend the sb_partial run, or start a new one.*/
246
10.5M
  if(_fr->sb_partial==_sb_partial){
247
2.54M
    if(sb_partial_count>=4129){
248
0
      bits++;
249
0
      sb_partial_count=0;
250
0
    }
251
2.54M
    else bits-=oc_sb_run_bits(sb_partial_count);
252
2.54M
  }
253
8.05M
  else sb_partial_count=0;
254
10.5M
  bits+=oc_sb_run_bits(++sb_partial_count);
255
10.5M
  if(!_sb_partial){
256
    /*Extend the sb_full run, or start a new one.*/
257
3.15M
    sb_full_count=_fr->sb_full_count;
258
3.15M
    if(_fr->sb_full==_sb_full){
259
1.41M
      if(sb_full_count>=4129){
260
0
        bits++;
261
0
        sb_full_count=0;
262
0
      }
263
1.41M
      else bits-=oc_sb_run_bits(sb_full_count);
264
1.41M
    }
265
1.74M
    else sb_full_count=0;
266
3.15M
    bits+=oc_sb_run_bits(++sb_full_count);
267
3.15M
  }
268
10.5M
  return bits;
269
10.5M
}
270
271
static void oc_fr_state_advance_sb(oc_fr_state *_fr,
272
208k
 int _sb_partial,int _sb_full){
273
208k
  int sb_partial_count;
274
208k
  int sb_full_count;
275
208k
  sb_partial_count=_fr->sb_partial_count;
276
208k
  if(_fr->sb_partial!=_sb_partial||sb_partial_count>=4129)sb_partial_count=0;
277
208k
  sb_partial_count++;
278
208k
  if(!_sb_partial){
279
148k
    sb_full_count=_fr->sb_full_count;
280
148k
    if(_fr->sb_full!=_sb_full||sb_full_count>=4129)sb_full_count=0;
281
148k
    sb_full_count++;
282
148k
    _fr->sb_full_count=sb_full_count;
283
148k
    _fr->sb_full=_sb_full;
284
    /*Roll back the partial block state.*/
285
148k
    _fr->b_coded=_fr->b_coded_prev;
286
148k
    _fr->b_coded_count=_fr->b_coded_count_prev;
287
148k
  }
288
60.2k
  else{
289
    /*Commit back the partial block state.*/
290
60.2k
    _fr->b_coded_prev=_fr->b_coded;
291
60.2k
    _fr->b_coded_count_prev=_fr->b_coded_count;
292
60.2k
  }
293
208k
  _fr->sb_partial_count=sb_partial_count;
294
208k
  _fr->sb_partial=_sb_partial;
295
208k
  _fr->b_count=0;
296
208k
  _fr->sb_prefer_partial=0;
297
208k
  _fr->sb_bits=0;
298
208k
}
299
300
/*Commit the state of the current super block and advance to the next.*/
301
208k
static void oc_fr_state_flush_sb(oc_fr_state *_fr){
302
208k
  int sb_partial;
303
208k
  int sb_full;
304
208k
  int b_coded_count;
305
208k
  int b_count;
306
208k
  b_count=_fr->b_count;
307
208k
  b_coded_count=_fr->b_coded_count;
308
208k
  sb_full=_fr->b_coded;
309
208k
  sb_partial=b_coded_count<b_count;
310
208k
  if(!sb_partial){
311
    /*If the super block is fully coded/uncoded...*/
312
149k
    if(_fr->sb_prefer_partial){
313
      /*So far coding this super block as partial was cheaper anyway.*/
314
2.15k
      if(b_coded_count>15||_fr->b_coded_prev<0){
315
1.38k
        int sb_bits;
316
        /*If the block run is too long, this will limit how far it can be
317
           extended into the next partial super block.
318
          If we need to extend it farther, we don't want to have to roll all
319
           the way back here (since there could be many full SBs between now
320
           and then), so we disallow this.
321
          Similarly, if this is the start of a stripe, we don't know how the
322
           length of the outstanding block run from the previous stripe.*/
323
1.38k
        sb_bits=oc_fr_state_sb_cost(_fr,sb_partial,sb_full);
324
1.38k
        _fr->bits+=sb_bits-_fr->sb_bits;
325
1.38k
        _fr->sb_bits=sb_bits;
326
1.38k
      }
327
765
      else sb_partial=1;
328
2.15k
    }
329
149k
  }
330
208k
  oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
331
208k
}
332
333
24.9M
static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){
334
24.9M
  ptrdiff_t bits;
335
24.9M
  int       sb_bits;
336
24.9M
  int       b_coded_count;
337
24.9M
  int       b_count;
338
24.9M
  int       sb_prefer_partial;
339
24.9M
  sb_bits=_fr->sb_bits;
340
24.9M
  bits=_fr->bits-sb_bits;
341
24.9M
  b_count=_fr->b_count;
342
24.9M
  b_coded_count=_fr->b_coded_count;
343
24.9M
  sb_prefer_partial=_fr->sb_prefer_partial;
344
24.9M
  if(b_coded_count>=b_count){
345
18.3M
    int sb_partial_bits;
346
    /*This super block is currently fully coded/uncoded.*/
347
18.3M
    if(b_count<=0){
348
      /*This is the first block in this SB.*/
349
2.24M
      b_count=1;
350
      /*Check to see whether it's cheaper to code it partially or fully.*/
351
2.24M
      if(_fr->b_coded==_b_coded){
352
529k
        sb_partial_bits=-oc_block_run_bits(b_coded_count);
353
529k
        sb_partial_bits+=oc_block_run_bits(++b_coded_count);
354
529k
      }
355
1.71M
      else{
356
1.71M
        b_coded_count=1;
357
1.71M
        sb_partial_bits=2;
358
1.71M
      }
359
2.24M
      sb_partial_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
360
2.24M
      sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
361
2.24M
      sb_prefer_partial=sb_partial_bits<sb_bits;
362
2.24M
      sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
363
2.24M
    }
364
16.1M
    else if(_fr->b_coded==_b_coded){
365
10.3M
      b_coded_count++;
366
10.3M
      if(++b_count<16){
367
9.95M
        if(sb_prefer_partial){
368
          /*Check to see if it's cheaper to code it fully.*/
369
900k
          sb_partial_bits=sb_bits;
370
900k
          sb_partial_bits+=oc_block_run_bits(b_coded_count);
371
900k
          if(b_coded_count>0){
372
900k
            sb_partial_bits-=oc_block_run_bits(b_coded_count-1);
373
900k
          }
374
900k
          sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
375
900k
          sb_prefer_partial=sb_partial_bits<sb_bits;
376
900k
          sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
377
900k
        }
378
        /*There's no need to check the converse (whether it's cheaper to code
379
           this SB partially if we were coding it fully), since the cost to
380
           code a SB partially can only increase as we add more blocks, whereas
381
           the cost to code it fully stays constant.*/
382
9.95M
      }
383
401k
      else{
384
        /*If we get to the end and this SB is still full, then force it to be
385
           coded full.
386
          Otherwise we might not be able to extend the block run far enough
387
           into the next partial SB.*/
388
401k
        if(sb_prefer_partial){
389
16.1k
          sb_prefer_partial=0;
390
16.1k
          sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
391
16.1k
        }
392
401k
      }
393
10.3M
    }
394
5.77M
    else{
395
      /*This SB was full, but now must be made partial.*/
396
5.77M
      if(!sb_prefer_partial){
397
5.19M
        sb_bits=oc_block_run_bits(b_coded_count);
398
5.19M
        if(b_coded_count>b_count){
399
1.42M
          sb_bits-=oc_block_run_bits(b_coded_count-b_count);
400
1.42M
        }
401
5.19M
        sb_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
402
5.19M
      }
403
5.77M
      b_count++;
404
5.77M
      b_coded_count=1;
405
5.77M
      sb_prefer_partial=1;
406
5.77M
      sb_bits+=2;
407
5.77M
    }
408
18.3M
  }
409
6.61M
  else{
410
6.61M
    b_count++;
411
6.61M
    if(_fr->b_coded==_b_coded)sb_bits-=oc_block_run_bits(b_coded_count);
412
2.39M
    else b_coded_count=0;
413
6.61M
    sb_bits+=oc_block_run_bits(++b_coded_count);
414
6.61M
  }
415
24.9M
  _fr->bits=bits+sb_bits;
416
24.9M
  _fr->b_coded_count=b_coded_count;
417
24.9M
  _fr->b_coded=_b_coded;
418
24.9M
  _fr->b_count=b_count;
419
24.9M
  _fr->sb_prefer_partial=sb_prefer_partial;
420
24.9M
  _fr->sb_bits=sb_bits;
421
24.9M
}
422
423
9.44M
static void oc_fr_skip_block(oc_fr_state *_fr){
424
9.44M
  oc_fr_state_advance_block(_fr,0);
425
9.44M
}
426
427
15.5M
static void oc_fr_code_block(oc_fr_state *_fr){
428
15.5M
  oc_fr_state_advance_block(_fr,1);
429
15.5M
}
430
431
1.50M
static int oc_fr_cost1(const oc_fr_state *_fr){
432
1.50M
  oc_fr_state tmp;
433
1.50M
  ptrdiff_t   bits;
434
1.50M
  *&tmp=*_fr;
435
1.50M
  oc_fr_skip_block(&tmp);
436
1.50M
  bits=tmp.bits;
437
1.50M
  *&tmp=*_fr;
438
1.50M
  oc_fr_code_block(&tmp);
439
1.50M
  return (int)(tmp.bits-bits);
440
1.50M
}
441
442
255k
static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
443
255k
  oc_fr_state tmp;
444
255k
  *&tmp=*_pre;
445
255k
  oc_fr_skip_block(&tmp);
446
255k
  oc_fr_skip_block(&tmp);
447
255k
  oc_fr_skip_block(&tmp);
448
255k
  oc_fr_skip_block(&tmp);
449
255k
  return (int)(_post->bits-tmp.bits);
450
255k
}
451
452
453
454
176k
static void oc_qii_state_init(oc_qii_state *_qs){
455
176k
  _qs->bits=0;
456
176k
  _qs->qi01_count=0;
457
176k
  _qs->qi01=-1;
458
176k
  _qs->qi12_count=0;
459
176k
  _qs->qi12=-1;
460
176k
}
461
462
463
static void oc_qii_state_advance(oc_qii_state *_qd,
464
67.5M
 const oc_qii_state *_qs,int _qii){
465
67.5M
  ptrdiff_t bits;
466
67.5M
  int       qi01;
467
67.5M
  int       qi01_count;
468
67.5M
  int       qi12;
469
67.5M
  int       qi12_count;
470
67.5M
  bits=_qs->bits;
471
67.5M
  qi01=_qii+1>>1;
472
67.5M
  qi01_count=_qs->qi01_count;
473
67.5M
  if(qi01==_qs->qi01){
474
39.6M
    if(qi01_count>=4129){
475
3.23k
      bits++;
476
3.23k
      qi01_count=0;
477
3.23k
    }
478
39.6M
    else bits-=oc_sb_run_bits(qi01_count);
479
39.6M
  }
480
27.9M
  else qi01_count=0;
481
67.5M
  qi01_count++;
482
67.5M
  bits+=oc_sb_run_bits(qi01_count);
483
67.5M
  qi12_count=_qs->qi12_count;
484
67.5M
  if(_qii){
485
29.4M
    qi12=_qii>>1;
486
29.4M
    if(qi12==_qs->qi12){
487
15.5M
      if(qi12_count>=4129){
488
15.1k
        bits++;
489
15.1k
        qi12_count=0;
490
15.1k
      }
491
15.5M
      else bits-=oc_sb_run_bits(qi12_count);
492
15.5M
    }
493
13.9M
    else qi12_count=0;
494
29.4M
    qi12_count++;
495
29.4M
    bits+=oc_sb_run_bits(qi12_count);
496
29.4M
  }
497
38.1M
  else qi12=_qs->qi12;
498
67.5M
  _qd->bits=bits;
499
67.5M
  _qd->qi01=qi01;
500
67.5M
  _qd->qi01_count=qi01_count;
501
67.5M
  _qd->qi12=qi12;
502
67.5M
  _qd->qi12_count=qi12_count;
503
67.5M
}
504
505
506
507
49.0k
static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
508
49.0k
  ptrdiff_t *coded_fragis;
509
49.0k
  unsigned   mcu_nvsbs;
510
49.0k
  ptrdiff_t  mcu_nfrags;
511
49.0k
  int        flimit;
512
49.0k
  int        hdec;
513
49.0k
  int        vdec;
514
49.0k
  int        pli;
515
49.0k
  int        nqis;
516
49.0k
  int        qii;
517
49.0k
  int        qi0;
518
49.0k
  int        qti;
519
  /*Initialize the per-plane coded block flag trackers.
520
    These are used for bit-estimation purposes only; the real flag bits span
521
     all three planes, so we can't compute them in parallel.*/
522
196k
  for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli);
523
196k
  for(pli=0;pli<3;pli++)oc_qii_state_init(_pipe->qs+pli);
524
  /*Set up the per-plane skip SSD storage pointers.*/
525
49.0k
  mcu_nvsbs=_enc->mcu_nvsbs;
526
49.0k
  mcu_nfrags=mcu_nvsbs*_enc->state.fplanes[0].nhsbs*16;
527
49.0k
  hdec=!(_enc->state.info.pixel_fmt&1);
528
49.0k
  vdec=!(_enc->state.info.pixel_fmt&2);
529
49.0k
  _pipe->skip_ssd[0]=_enc->mcu_skip_ssd;
530
49.0k
  _pipe->skip_ssd[1]=_pipe->skip_ssd[0]+mcu_nfrags;
531
49.0k
  _pipe->skip_ssd[2]=_pipe->skip_ssd[1]+(mcu_nfrags>>hdec+vdec);
532
  /*Set up per-plane pointers to the coded and uncoded fragments lists.
533
    Unlike the decoder, each planes' coded and uncoded fragment list is kept
534
     separate during the analysis stage; we only make the coded list for all
535
     three planes contiguous right before the final packet is output
536
     (destroying the uncoded lists, which are no longer needed).*/
537
49.0k
  coded_fragis=_enc->state.coded_fragis;
538
196k
  for(pli=0;pli<3;pli++){
539
147k
    _pipe->coded_fragis[pli]=coded_fragis;
540
147k
    coded_fragis+=_enc->state.fplanes[pli].nfrags;
541
147k
    _pipe->uncoded_fragis[pli]=coded_fragis;
542
147k
  }
543
49.0k
  memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
544
49.0k
  memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
545
  /*Set up condensed quantizer tables.*/
546
49.0k
  qi0=_enc->state.qis[0];
547
49.0k
  nqis=_enc->state.nqis;
548
196k
  for(pli=0;pli<3;pli++){
549
403k
    for(qii=0;qii<nqis;qii++){
550
256k
      int qi;
551
256k
      qi=_enc->state.qis[qii];
552
769k
      for(qti=0;qti<2;qti++){
553
        /*Set the DC coefficient in the dequantization table.*/
554
513k
        _enc->state.dequant_tables[qi][pli][qti][0]=
555
513k
         _enc->dequant_dc[qi0][pli][qti];
556
513k
        _enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
557
        /*Copy over the quantization table.*/
558
513k
        memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti],
559
513k
         _enc->opt_data.enquant_table_size);
560
513k
      }
561
256k
    }
562
147k
  }
563
  /*Fix up the DC coefficients in the quantization tables.*/
564
49.0k
  oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis);
565
  /*Initialize the tokenization state.*/
566
196k
  for(pli=0;pli<3;pli++){
567
147k
    _pipe->ndct_tokens1[pli]=0;
568
147k
    _pipe->eob_run1[pli]=0;
569
147k
  }
570
  /*Initialize the bounding value array for the loop filter.*/
571
49.0k
  flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]];
572
49.0k
  _pipe->loop_filter=flimit!=0;
573
49.0k
  if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit);
574
  /*Clear the temporary DCT scratch space.*/
575
49.0k
  memset(_pipe->dct_data,0,sizeof(_pipe->dct_data));
576
49.0k
}
577
578
/*Sets the current MCU stripe to super block row _sby.
579
  Return: A non-zero value if this was the last MCU.*/
580
static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
581
241k
 oc_enc_pipeline_state *_pipe,int _sby){
582
241k
  const oc_fragment_plane *fplane;
583
241k
  unsigned                 mcu_nvsbs;
584
241k
  int                      sby_end;
585
241k
  int                      notdone;
586
241k
  int                      vdec;
587
241k
  int                      pli;
588
241k
  mcu_nvsbs=_enc->mcu_nvsbs;
589
241k
  sby_end=_enc->state.fplanes[0].nvsbs;
590
241k
  notdone=_sby+mcu_nvsbs<sby_end;
591
241k
  if(notdone)sby_end=_sby+mcu_nvsbs;
592
241k
  vdec=0;
593
967k
  for(pli=0;pli<3;pli++){
594
725k
    fplane=_enc->state.fplanes+pli;
595
725k
    _pipe->sbi0[pli]=fplane->sboffset+(_sby>>vdec)*fplane->nhsbs;
596
725k
    _pipe->fragy0[pli]=_sby<<2-vdec;
597
725k
    _pipe->froffset[pli]=fplane->froffset
598
725k
     +_pipe->fragy0[pli]*(ptrdiff_t)fplane->nhfrags;
599
725k
    if(notdone){
600
578k
      _pipe->sbi_end[pli]=fplane->sboffset+(sby_end>>vdec)*fplane->nhsbs;
601
578k
      _pipe->fragy_end[pli]=sby_end<<2-vdec;
602
578k
    }
603
147k
    else{
604
147k
      _pipe->sbi_end[pli]=fplane->sboffset+fplane->nsbs;
605
147k
      _pipe->fragy_end[pli]=fplane->nvfrags;
606
147k
    }
607
725k
    vdec=!(_enc->state.info.pixel_fmt&2);
608
725k
  }
609
241k
  return notdone;
610
241k
}
611
612
static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
613
725k
 oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
614
  /*Copy over all the uncoded fragments from this plane and advance the uncoded
615
     fragment list.*/
616
725k
  if(_pipe->nuncoded_fragis[_pli]>0){
617
48.7k
    _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
618
48.7k
    oc_frag_copy_list(&_enc->state,
619
48.7k
     _enc->state.ref_frame_data[OC_FRAME_SELF],
620
48.7k
     _enc->state.ref_frame_data[OC_FRAME_PREV],
621
48.7k
     _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
622
48.7k
     _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
623
48.7k
    _pipe->nuncoded_fragis[_pli]=0;
624
48.7k
  }
625
  /*Perform DC prediction.*/
626
725k
  oc_enc_pred_dc_frag_rows(_enc,_pli,
627
725k
   _pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
628
  /*Finish DC tokenization.*/
629
725k
  oc_enc_tokenize_dc_frag_list(_enc,_pli,
630
725k
   _pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
631
725k
   _pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
632
725k
  _pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
633
725k
  _pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
634
  /*And advance the coded fragment list.*/
635
725k
  _enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
636
725k
  _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
637
725k
  _pipe->ncoded_fragis[_pli]=0;
638
  /*Apply the loop filter if necessary.*/
639
725k
  if(_pipe->loop_filter){
640
462k
    oc_state_loop_filter_frag_rows(&_enc->state,
641
462k
     _pipe->bounding_values,OC_FRAME_SELF,_pli,
642
462k
     _pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
643
462k
  }
644
262k
  else _sdelay=_edelay=0;
645
  /*To fill borders, we have an additional two pixel delay, since a fragment
646
     in the next row could filter its top edge, using two pixels from a
647
     fragment in this row.
648
    But there's no reason to delay a full fragment between the two.*/
649
725k
  oc_state_borders_fill_rows(&_enc->state,
650
725k
   _enc->state.ref_frame_idx[OC_FRAME_SELF],_pli,
651
725k
   (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
652
725k
   (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
653
725k
}
654
655
656
657
/*Cost information about the coded blocks in a MB.*/
658
struct oc_rd_metric{
659
  int uncoded_ac_ssd;
660
  int coded_ac_ssd;
661
  int ac_bits;
662
  int dc_flag;
663
};
664
665
666
667
static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
668
 oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
669
 unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
670
22.2M
 oc_fr_state *_fr,oc_token_checkpoint **_stack){
671
22.2M
  ogg_int16_t            *data;
672
22.2M
  ogg_int16_t            *dct;
673
22.2M
  ogg_int16_t            *idct;
674
22.2M
  oc_qii_state            qs;
675
22.2M
  const ogg_uint16_t     *dequant;
676
22.2M
  ogg_uint16_t            dequant_dc;
677
22.2M
  ptrdiff_t               frag_offs;
678
22.2M
  int                     ystride;
679
22.2M
  const unsigned char    *src;
680
22.2M
  const unsigned char    *ref;
681
22.2M
  unsigned char          *dst;
682
22.2M
  int                     nonzero;
683
22.2M
  unsigned                uncoded_ssd;
684
22.2M
  unsigned                coded_ssd;
685
22.2M
  oc_token_checkpoint    *checkpoint;
686
22.2M
  oc_fragment            *frags;
687
22.2M
  int                     mb_mode;
688
22.2M
  int                     refi;
689
22.2M
  int                     mv_offs[2];
690
22.2M
  int                     nmv_offs;
691
22.2M
  int                     ac_bits;
692
22.2M
  int                     borderi;
693
22.2M
  int                     nqis;
694
22.2M
  int                     qti;
695
22.2M
  int                     qii;
696
22.2M
  int                     dc;
697
22.2M
  nqis=_enc->state.nqis;
698
22.2M
  frags=_enc->state.frags;
699
22.2M
  frag_offs=_enc->state.frag_buf_offs[_fragi];
700
22.2M
  ystride=_enc->state.ref_ystride[_pli];
701
22.2M
  src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
702
22.2M
  borderi=frags[_fragi].borderi;
703
22.2M
  qii=frags[_fragi].qii;
704
22.2M
  data=_enc->pipe.dct_data;
705
22.2M
  dct=data+64;
706
22.2M
  idct=data+128;
707
22.2M
  if(qii&~3){
708
424k
#if !defined(OC_COLLECT_METRICS)
709
424k
    if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
710
      /*Enable early skip detection.*/
711
424k
      frags[_fragi].coded=0;
712
424k
      frags[_fragi].refi=OC_FRAME_NONE;
713
424k
      oc_fr_skip_block(_fr);
714
424k
      return 0;
715
424k
    }
716
0
#endif
717
    /*Try and code this block anyway.*/
718
0
    qii&=3;
719
0
  }
720
21.8M
  refi=frags[_fragi].refi;
721
21.8M
  mb_mode=frags[_fragi].mb_mode;
722
21.8M
  ref=_enc->state.ref_frame_data[refi]+frag_offs;
723
21.8M
  dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs;
724
  /*Motion compensation:*/
725
21.8M
  switch(mb_mode){
726
21.4M
    case OC_MODE_INTRA:{
727
21.4M
      nmv_offs=0;
728
21.4M
      oc_enc_frag_sub_128(_enc,data,src,ystride);
729
21.4M
    }break;
730
15.8k
    case OC_MODE_GOLDEN_NOMV:
731
115k
    case OC_MODE_INTER_NOMV:{
732
115k
      nmv_offs=1;
733
115k
      mv_offs[0]=0;
734
115k
      oc_enc_frag_sub(_enc,data,src,ref,ystride);
735
115k
    }break;
736
279k
    default:{
737
279k
      const oc_mv *frag_mvs;
738
279k
      frag_mvs=_enc->state.frag_mvs;
739
279k
      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,
740
279k
       _pli,frag_mvs[_fragi]);
741
279k
      if(nmv_offs>1){
742
218k
        oc_enc_frag_copy2(_enc,dst,
743
218k
         ref+mv_offs[0],ref+mv_offs[1],ystride);
744
218k
        oc_enc_frag_sub(_enc,data,src,dst,ystride);
745
218k
      }
746
60.3k
      else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride);
747
279k
    }break;
748
21.8M
  }
749
#if defined(OC_COLLECT_METRICS)
750
  {
751
    unsigned sad;
752
    unsigned satd;
753
    switch(nmv_offs){
754
      case 0:{
755
        sad=oc_enc_frag_intra_sad(_enc,src,ystride);
756
        satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);
757
      }break;
758
      case 1:{
759
        sad=oc_enc_frag_sad_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
760
        satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride);
761
        satd+=abs(dc);
762
      }break;
763
      default:{
764
        sad=oc_enc_frag_sad_thresh(_enc,src,dst,ystride,UINT_MAX);
765
        satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride);
766
        satd+=abs(dc);
767
      }break;
768
    }
769
    _enc->frag_sad[_fragi]=sad;
770
    _enc->frag_satd[_fragi]=satd;
771
  }
772
#endif
773
  /*Transform:*/
774
21.8M
  oc_enc_fdct8x8(_enc,dct,data);
775
  /*Quantize:*/
776
21.8M
  qti=mb_mode!=OC_MODE_INTRA;
777
21.8M
  dequant=_enc->dequant[_pli][qii][qti];
778
21.8M
  nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]);
779
21.8M
  dc=data[0];
780
  /*Tokenize.*/
781
21.8M
  checkpoint=*_stack;
782
21.8M
  if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
783
21.8M
    ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct,
784
21.8M
     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
785
21.8M
  }
786
0
  else{
787
0
    ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct,
788
0
     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
789
0
  }
790
  /*Reconstruct.
791
    TODO: nonzero may need to be adjusted after tokenization.*/
792
21.8M
  dequant_dc=dequant[0];
793
21.8M
  if(nonzero==0){
794
18.5M
    ogg_int16_t p;
795
18.5M
    int         ci;
796
18.5M
    int         qi01;
797
18.5M
    int         qi12;
798
    /*We round this dequant product (and not any of the others) because there's
799
       no iDCT rounding.*/
800
18.5M
    p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5);
801
    /*LOOP VECTORIZES.*/
802
1.20G
    for(ci=0;ci<64;ci++)data[ci]=p;
803
    /*We didn't code any AC coefficients, so don't change the quantizer.*/
804
18.5M
    qi01=_pipe->qs[_pli].qi01;
805
18.5M
    qi12=_pipe->qs[_pli].qi12;
806
18.5M
    if(qi01>0)qii=1+qi12;
807
16.2M
    else if(qi01>=0)qii=0;
808
18.5M
  }
809
3.34M
  else{
810
3.34M
    idct[0]=dc*dequant_dc;
811
    /*Note: This clears idct[] back to zero for the next block.*/
812
3.34M
    oc_idct8x8(&_enc->state,data,idct,nonzero+1);
813
3.34M
  }
814
21.8M
  frags[_fragi].qii=qii;
815
21.8M
  if(nqis>1){
816
7.95M
    oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
817
7.95M
    ac_bits+=qs.bits-_pipe->qs[_pli].bits;
818
7.95M
  }
819
21.8M
  if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
820
394k
  else{
821
394k
    oc_enc_frag_recon_inter(_enc,dst,
822
394k
     nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
823
394k
  }
824
  /*If _fr is NULL, then this is an INTRA frame, and we can't skip blocks.*/
825
21.8M
#if !defined(OC_COLLECT_METRICS)
826
21.8M
  if(_fr!=NULL)
827
1.50M
#endif
828
1.50M
  {
829
    /*In retrospect, should we have skipped this block?*/
830
1.50M
    if(borderi<0){
831
912k
      coded_ssd=oc_enc_frag_ssd(_enc,src,dst,ystride);
832
912k
    }
833
595k
    else{
834
595k
      coded_ssd=oc_enc_frag_border_ssd(_enc,src,dst,ystride,
835
595k
       _enc->state.borders[borderi].mask);
836
595k
    }
837
    /*Scale to match DCT domain.*/
838
1.50M
    coded_ssd<<=4;
839
#if defined(OC_COLLECT_METRICS)
840
    _enc->frag_ssd[_fragi]=coded_ssd;
841
  }
842
  if(_fr!=NULL){
843
#endif
844
1.50M
    coded_ssd=OC_RD_SCALE(coded_ssd,_rd_scale);
845
1.50M
    uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]];
846
1.50M
    if(uncoded_ssd<UINT_MAX&&
847
     /*Don't allow luma blocks to be skipped in 4MV mode when VP3 compatibility
848
        is enabled.*/
849
1.50M
     (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
850
1.50M
      int overhead_bits;
851
1.50M
      overhead_bits=oc_fr_cost1(_fr);
852
      /*Although the fragment coding overhead determination is accurate, it is
853
         greedy, using very coarse-grained local information.
854
        Allowing it to mildly discourage coding turns out to be beneficial, but
855
         it's not clear that allowing it to encourage coding through negative
856
         coding overhead deltas is useful.
857
        For that reason, we disallow negative coding overheads.*/
858
1.50M
      if(overhead_bits<0)overhead_bits=0;
859
1.50M
      if(uncoded_ssd<=coded_ssd+(overhead_bits+ac_bits)*_enc->lambda){
860
        /*Hm, not worth it; roll back.*/
861
130k
        oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
862
130k
        *_stack=checkpoint;
863
130k
        frags[_fragi].coded=0;
864
130k
        frags[_fragi].refi=OC_FRAME_NONE;
865
130k
        oc_fr_skip_block(_fr);
866
130k
        return 0;
867
130k
      }
868
1.50M
    }
869
0
    else _mo->dc_flag=1;
870
1.37M
    _mo->uncoded_ac_ssd+=uncoded_ssd;
871
1.37M
    _mo->coded_ac_ssd+=coded_ssd;
872
1.37M
    _mo->ac_bits+=ac_bits;
873
1.37M
    oc_fr_code_block(_fr);
874
1.37M
  }
875
  /*GCC 4.4.4 generates a warning here because it can't tell that
876
     the init code in the nqis check above will run anytime this
877
     line runs.*/
878
21.7M
  if(nqis>1)*(_pipe->qs+_pli)=*&qs;
879
21.7M
  frags[_fragi].dc=dc;
880
21.7M
  frags[_fragi].coded=1;
881
21.7M
  return 1;
882
21.8M
}
883
884
static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc,
885
 oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead,
886
305k
 const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
887
  /*Worst case token stack usage for 4 fragments.*/
888
305k
  oc_token_checkpoint  stack[64*4];
889
305k
  oc_token_checkpoint *stackptr;
890
305k
  const oc_sb_map     *sb_maps;
891
305k
  signed char         *mb_modes;
892
305k
  oc_fragment         *frags;
893
305k
  ptrdiff_t           *coded_fragis;
894
305k
  ptrdiff_t            ncoded_fragis;
895
305k
  ptrdiff_t           *uncoded_fragis;
896
305k
  ptrdiff_t            nuncoded_fragis;
897
305k
  oc_rd_metric         mo;
898
305k
  oc_fr_state          fr_checkpoint;
899
305k
  oc_qii_state         qs_checkpoint;
900
305k
  int                  mb_mode;
901
305k
  int                  refi;
902
305k
  int                  ncoded;
903
305k
  ptrdiff_t            fragi;
904
305k
  int                  bi;
905
305k
  *&fr_checkpoint=*(_pipe->fr+0);
906
305k
  *&qs_checkpoint=*(_pipe->qs+0);
907
305k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
908
305k
  mb_modes=_enc->state.mb_modes;
909
305k
  frags=_enc->state.frags;
910
305k
  coded_fragis=_pipe->coded_fragis[0];
911
305k
  ncoded_fragis=_pipe->ncoded_fragis[0];
912
305k
  uncoded_fragis=_pipe->uncoded_fragis[0];
913
305k
  nuncoded_fragis=_pipe->nuncoded_fragis[0];
914
305k
  mb_mode=mb_modes[_mbi];
915
305k
  refi=OC_FRAME_FOR_MODE(mb_mode);
916
305k
  ncoded=0;
917
305k
  stackptr=stack;
918
305k
  memset(&mo,0,sizeof(mo));
919
1.52M
  for(bi=0;bi<4;bi++){
920
1.22M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
921
1.22M
    frags[fragi].refi=refi;
922
1.22M
    frags[fragi].mb_mode=mb_mode;
923
1.22M
    if(oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
924
1.22M
     _rd_scale[bi],_rd_iscale[bi],&mo,_pipe->fr+0,&stackptr)){
925
922k
      coded_fragis[ncoded_fragis++]=fragi;
926
922k
      ncoded++;
927
922k
    }
928
298k
    else *(uncoded_fragis-++nuncoded_fragis)=fragi;
929
1.22M
  }
930
305k
  if(ncoded>0&&!mo.dc_flag){
931
255k
    int cost;
932
    /*Some individual blocks were worth coding.
933
      See if that's still true when accounting for mode and MV overhead.*/
934
255k
    cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
935
255k
     +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead);
936
255k
    if(mo.uncoded_ac_ssd<=cost){
937
      /*Taking macroblock overhead into account, it is not worth coding this
938
         MB.*/
939
6.35k
      oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
940
6.35k
      *(_pipe->fr+0)=*&fr_checkpoint;
941
6.35k
      *(_pipe->qs+0)=*&qs_checkpoint;
942
31.7k
      for(bi=0;bi<4;bi++){
943
25.4k
        fragi=sb_maps[_mbi>>2][_mbi&3][bi];
944
25.4k
        if(frags[fragi].coded){
945
9.44k
          *(uncoded_fragis-++nuncoded_fragis)=fragi;
946
9.44k
          frags[fragi].coded=0;
947
9.44k
          frags[fragi].refi=OC_FRAME_NONE;
948
9.44k
        }
949
25.4k
        oc_fr_skip_block(_pipe->fr+0);
950
25.4k
      }
951
6.35k
      ncoded_fragis-=ncoded;
952
6.35k
      ncoded=0;
953
6.35k
    }
954
255k
  }
955
  /*If no luma blocks coded, the mode is forced.*/
956
305k
  if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
957
  /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
958
     with a single coded block.
959
    This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
960
     skipped blocks, while a 1MV does not.*/
961
249k
  else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
962
163
    mb_modes[_mbi]=OC_MODE_INTER_MV;
963
163
  }
964
305k
  _pipe->ncoded_fragis[0]=ncoded_fragis;
965
305k
  _pipe->nuncoded_fragis[0]=nuncoded_fragis;
966
305k
  return ncoded;
967
305k
}
968
969
static void oc_enc_sb_transform_quantize_inter_chroma(oc_enc_ctx *_enc,
970
62.5k
 oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
971
62.5k
  const ogg_uint16_t *mcu_rd_scale;
972
62.5k
  const ogg_uint16_t *mcu_rd_iscale;
973
62.5k
  const oc_sb_map    *sb_maps;
974
62.5k
  oc_sb_flags        *sb_flags;
975
62.5k
  oc_fr_state        *fr;
976
62.5k
  ptrdiff_t          *coded_fragis;
977
62.5k
  ptrdiff_t           ncoded_fragis;
978
62.5k
  ptrdiff_t          *uncoded_fragis;
979
62.5k
  ptrdiff_t           nuncoded_fragis;
980
62.5k
  ptrdiff_t           froffset;
981
62.5k
  int                 sbi;
982
62.5k
  fr=_pipe->fr+_pli;
983
62.5k
  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
984
62.5k
  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
985
62.5k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
986
62.5k
  sb_flags=_enc->state.sb_flags;
987
62.5k
  coded_fragis=_pipe->coded_fragis[_pli];
988
62.5k
  ncoded_fragis=_pipe->ncoded_fragis[_pli];
989
62.5k
  uncoded_fragis=_pipe->uncoded_fragis[_pli];
990
62.5k
  nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
991
62.5k
  froffset=_pipe->froffset[_pli];
992
173k
  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
993
    /*Worst case token stack usage for 1 fragment.*/
994
110k
    oc_token_checkpoint stack[64];
995
110k
    oc_rd_metric        mo;
996
110k
    int                 quadi;
997
110k
    int                 bi;
998
110k
    memset(&mo,0,sizeof(mo));
999
2.20M
    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1000
1.76M
      ptrdiff_t fragi;
1001
1.76M
      fragi=sb_maps[sbi][quadi][bi];
1002
1.76M
      if(fragi>=0){
1003
711k
        oc_token_checkpoint *stackptr;
1004
711k
        unsigned             rd_scale;
1005
711k
        unsigned             rd_iscale;
1006
711k
        rd_scale=mcu_rd_scale[fragi-froffset];
1007
711k
        rd_iscale=mcu_rd_iscale[fragi-froffset];
1008
711k
        stackptr=stack;
1009
711k
        if(oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
1010
711k
         rd_scale,rd_iscale,&mo,fr,&stackptr)){
1011
455k
          coded_fragis[ncoded_fragis++]=fragi;
1012
455k
        }
1013
256k
        else *(uncoded_fragis-++nuncoded_fragis)=fragi;
1014
711k
      }
1015
1.76M
    }
1016
110k
    oc_fr_state_flush_sb(fr);
1017
110k
    sb_flags[sbi].coded_fully=fr->sb_full;
1018
110k
    sb_flags[sbi].coded_partially=fr->sb_partial;
1019
110k
  }
1020
62.5k
  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
1021
62.5k
  _pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
1022
62.5k
}
1023
1024
/*Mode decision is done by exhaustively examining all potential choices.
1025
  Obviously, doing the motion compensation, fDCT, tokenization, and then
1026
   counting the bits each token uses is computationally expensive.
1027
  Theora's EOB runs can also split the cost of these tokens across multiple
1028
   fragments, and naturally we don't know what the optimal choice of Huffman
1029
   codes will be until we know all the tokens we're going to encode in all the
1030
   fragments.
1031
  So we use a simple approach to estimating the bit cost and distortion of each
1032
   mode based upon the SATD value of the residual before coding.
1033
  The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
1034
   the process (modified somewhat from that of the paper) is very simple.
1035
  We build a non-linear regression of the mappings from
1036
   (pre-transform+quantization) SATD to (post-transform+quantization) bits and
1037
   SSD for each qi.
1038
  A separate set of mappings is kept for each quantization type and color
1039
   plane.
1040
  The mappings are constructed by partitioning the SATD values into a small
1041
   number of bins (currently 24) and using a linear regression in each bin
1042
   (as opposed to the 0th-order regression used by Kim).
1043
  The bit counts and SSD measurements are obtained by examining actual encoded
1044
   frames, with appropriate lambda values and optimal Huffman codes selected.
1045
  EOB bits are assigned to the fragment that started the EOB run (as opposed to
1046
   dividing them among all the blocks in the run; the latter approach seems
1047
   more theoretically correct, but Monty's testing showed a small improvement
1048
   with the former, though that may have been merely statistical noise).
1049
1050
  @ARTICLE{Kim03,
1051
    author="Hyun Mun Kim",
1052
    title="Adaptive Rate Control Using Nonlinear Regression",
1053
    journal="IEEE Transactions on Circuits and Systems for Video Technology",
1054
    volume=13,
1055
    number=5,
1056
    pages="432--439",
1057
    month=May,
1058
    year=2003
1059
  }*/
1060
1061
/*Computes (_ssd+_lambda*_rate)/(1<<OC_BIT_SCALE) with rounding, avoiding
1062
   overflow for large lambda values.*/
1063
#define OC_MODE_RD_COST(_ssd,_rate,_lambda) \
1064
87.9M
 ((_ssd)>>OC_BIT_SCALE)+((_rate)>>OC_BIT_SCALE)*(_lambda) \
1065
87.9M
 +(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \
1066
87.9M
 +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE)
1067
1068
49.0k
static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){
1069
49.0k
#if !defined(OC_COLLECT_METRICS)
1070
49.0k
  const
1071
49.0k
#endif
1072
49.0k
  oc_mode_rd (*oc_mode_rd_table)[3][2][OC_COMP_BINS]=
1073
49.0k
   _enc->sp_level<OC_SP_LEVEL_NOSATD?OC_MODE_RD_SATD:OC_MODE_RD_SAD;
1074
49.0k
  int qii;
1075
#if defined(OC_COLLECT_METRICS)
1076
  oc_enc_mode_metrics_load(_enc);
1077
#endif
1078
134k
  for(qii=0;qii<_enc->state.nqis;qii++){
1079
85.5k
    int qi;
1080
85.5k
    int pli;
1081
85.5k
    qi=_enc->state.qis[qii];
1082
342k
    for(pli=0;pli<3;pli++){
1083
256k
      int qti;
1084
769k
      for(qti=0;qti<2;qti++){
1085
513k
        int log_plq;
1086
513k
        int modeline;
1087
513k
        int bin;
1088
513k
        int dx;
1089
513k
        int dq;
1090
513k
        log_plq=_enc->log_plq[qi][pli][qti];
1091
        /*Find the pair of rows in the mode table that bracket this quantizer.
1092
          If it falls outside the range the table covers, then we just use a
1093
           pair on the edge for linear extrapolation.*/
1094
2.38M
        for(modeline=0;modeline<OC_LOGQ_BINS-1&&
1095
2.38M
         OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
1096
        /*Interpolate a row for this quantizer.*/
1097
513k
        dx=OC_MODE_LOGQ[modeline][pli][qti]-log_plq;
1098
513k
        dq=OC_MODE_LOGQ[modeline][pli][qti]-OC_MODE_LOGQ[modeline+1][pli][qti];
1099
513k
        if(dq==0)dq=1;
1100
12.8M
        for(bin=0;bin<OC_COMP_BINS;bin++){
1101
12.3M
          int y0;
1102
12.3M
          int z0;
1103
12.3M
          int dy;
1104
12.3M
          int dz;
1105
12.3M
          y0=oc_mode_rd_table[modeline][pli][qti][bin].rate;
1106
12.3M
          z0=oc_mode_rd_table[modeline][pli][qti][bin].rmse;
1107
12.3M
          dy=oc_mode_rd_table[modeline+1][pli][qti][bin].rate-y0;
1108
12.3M
          dz=oc_mode_rd_table[modeline+1][pli][qti][bin].rmse-z0;
1109
12.3M
          _enc->mode_rd[qii][pli][qti][bin].rate=
1110
12.3M
           (ogg_int16_t)OC_CLAMPI(-32768,y0+(dy*dx+(dq>>1))/dq,32767);
1111
12.3M
          _enc->mode_rd[qii][pli][qti][bin].rmse=
1112
12.3M
           (ogg_int16_t)OC_CLAMPI(-32768,z0+(dz*dx+(dq>>1))/dq,32767);
1113
12.3M
        }
1114
513k
      }
1115
256k
    }
1116
85.5k
  }
1117
49.0k
}
1118
1119
/*Estimate the R-D cost of the DCT coefficients given the SATD of a block after
1120
   prediction.*/
1121
static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd,
1122
56.4M
 int _qii,int _pli,int _qti,int _satd){
1123
56.4M
  unsigned rmse;
1124
56.4M
  int      shift;
1125
56.4M
  int      bin;
1126
56.4M
  int      dx;
1127
56.4M
  int      y0;
1128
56.4M
  int      z0;
1129
56.4M
  int      dy;
1130
56.4M
  int      dz;
1131
  /*SATD metrics for chroma planes vary much less than luma, so we scale them
1132
     by 4 to distribute them into the mode decision bins more evenly.*/
1133
56.4M
  _satd<<=_pli+1&2;
1134
56.4M
  shift=_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_SATD_SHIFT:OC_SAD_SHIFT;
1135
56.4M
  bin=OC_MINI(_satd>>shift,OC_COMP_BINS-2);
1136
56.4M
  dx=_satd-(bin<<shift);
1137
56.4M
  y0=_enc->mode_rd[_qii][_pli][_qti][bin].rate;
1138
56.4M
  z0=_enc->mode_rd[_qii][_pli][_qti][bin].rmse;
1139
56.4M
  dy=_enc->mode_rd[_qii][_pli][_qti][bin+1].rate-y0;
1140
56.4M
  dz=_enc->mode_rd[_qii][_pli][_qti][bin+1].rmse-z0;
1141
56.4M
  rmse=OC_MAXI(z0+(dz*dx>>shift),0);
1142
56.4M
  *_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
1143
56.4M
  return OC_MAXI(y0+(dy*dx>>shift),0);
1144
56.4M
}
1145
1146
/*activity_avg must be positive, or flat regions could get a zero weight, which
1147
   confounds analysis.
1148
  We set the minimum to this value so that it also avoids the need for divide
1149
   by zero checks in oc_mb_masking().*/
1150
# define OC_ACTIVITY_AVG_MIN (1<<OC_RD_SCALE_BITS)
1151
1152
static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
1153
2.98M
 unsigned _activity[4]){
1154
2.98M
  const unsigned char *src;
1155
2.98M
  const ptrdiff_t     *frag_buf_offs;
1156
2.98M
  const ptrdiff_t     *sb_map;
1157
2.98M
  unsigned             luma;
1158
2.98M
  int                  ystride;
1159
2.98M
  ptrdiff_t            frag_offs;
1160
2.98M
  ptrdiff_t            fragi;
1161
2.98M
  int                  bi;
1162
2.98M
  frag_buf_offs=_enc->state.frag_buf_offs;
1163
2.98M
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1164
2.98M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1165
2.98M
  ystride=_enc->state.ref_ystride[0];
1166
2.98M
  luma=0;
1167
14.9M
  for(bi=0;bi<4;bi++){
1168
11.9M
    const unsigned char *s;
1169
11.9M
    unsigned             x;
1170
11.9M
    unsigned             x2;
1171
11.9M
    unsigned             act;
1172
11.9M
    int                  i;
1173
11.9M
    int                  j;
1174
11.9M
    fragi=sb_map[bi];
1175
11.9M
    frag_offs=frag_buf_offs[fragi];
1176
    /*TODO: This could be replaced with SATD^2, since we already have to
1177
       compute SATD.*/
1178
11.9M
    x=x2=0;
1179
11.9M
    s=src+frag_offs;
1180
107M
    for(i=0;i<8;i++){
1181
861M
      for(j=0;j<8;j++){
1182
765M
        unsigned c;
1183
765M
        c=s[j];
1184
765M
        x+=c;
1185
765M
        x2+=c*c;
1186
765M
      }
1187
95.6M
      s+=ystride;
1188
95.6M
    }
1189
11.9M
    luma+=x;
1190
11.9M
    act=(x2<<6)-x*x;
1191
11.9M
    if(act<8<<12){
1192
      /*The region is flat.*/
1193
9.40M
      act=OC_MINI(act,5<<12);
1194
9.40M
    }
1195
2.55M
    else{
1196
2.55M
      unsigned e1;
1197
2.55M
      unsigned e2;
1198
2.55M
      unsigned e3;
1199
2.55M
      unsigned e4;
1200
      /*Test for an edge.
1201
        TODO: There are probably much simpler ways to do this (e.g., it could
1202
         probably be combined with the SATD calculation).
1203
        Alternatively, we could split the block around the mean and compute the
1204
         reduction in variance in each half.
1205
        For a Gaussian source the reduction should be
1206
         (1-2/pi) ~= 0.36338022763241865692446494650994.
1207
        Significantly more reduction is a good indication of a bi-level image.
1208
        This has the advantage of identifying, in addition to straight edges,
1209
         small text regions, which would otherwise be classified as "texture".*/
1210
2.55M
      e1=e2=e3=e4=0;
1211
2.55M
      s=src+frag_offs-1;
1212
22.9M
      for(i=0;i<8;i++){
1213
183M
        for(j=0;j<8;j++){
1214
163M
          e1+=abs((s[j+2]-s[j]<<1)+(s-ystride)[j+2]-(s-ystride)[j]
1215
163M
           +(s+ystride)[j+2]-(s+ystride)[j]);
1216
163M
          e2+=abs(((s+ystride)[j+1]-(s-ystride)[j+1]<<1)
1217
163M
           +(s+ystride)[j]-(s-ystride)[j]+(s+ystride)[j+2]-(s-ystride)[j+2]);
1218
163M
          e3+=abs(((s+ystride)[j+2]-(s-ystride)[j]<<1)
1219
163M
           +(s+ystride)[j+1]-s[j]+s[j+2]-(s-ystride)[j+1]);
1220
163M
          e4+=abs(((s+ystride)[j]-(s-ystride)[j+2]<<1)
1221
163M
           +(s+ystride)[j+1]-s[j+2]+s[j]-(s-ystride)[j+1]);
1222
163M
        }
1223
20.4M
        s+=ystride;
1224
20.4M
      }
1225
      /*If the largest component of the edge energy is at least 40% of the
1226
         total, then classify the block as an edge block.*/
1227
2.55M
      if(5*OC_MAXI(OC_MAXI(e1,e2),OC_MAXI(e3,e4))>2*(e1+e2+e3+e4)){
1228
         /*act=act_th*(act/act_th)**0.7
1229
              =exp(log(act_th)+0.7*(log(act)-log(act_th))).
1230
           Here act_th=5.0 and 0x394A=oc_blog32_q10(5<<12).*/
1231
35.5k
         act=oc_bexp32_q10(0x394A+(7*(oc_blog32_q10(act)-0x394A+5)/10));
1232
35.5k
      }
1233
2.55M
    }
1234
11.9M
    _activity[bi]=act;
1235
11.9M
  }
1236
2.98M
  return luma;
1237
2.98M
}
1238
1239
static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
1240
0
 unsigned _activity[4],const unsigned _intra_satd[12]){
1241
0
  int bi;
1242
0
  for(bi=0;bi<4;bi++){
1243
0
    unsigned act;
1244
0
    act=(11*_intra_satd[bi]>>8)*_intra_satd[bi];
1245
0
    if(act<8<<12){
1246
      /*The region is flat.*/
1247
0
      act=OC_MINI(act,5<<12);
1248
0
    }
1249
0
    _activity[bi]=act;
1250
0
  }
1251
0
}
1252
1253
/*Compute the masking scales for the blocks in a macro block.
1254
  All masking is computed from the luma blocks.
1255
  We derive scaling factors for the chroma blocks from these, and use the same
1256
   ones for all chroma blocks, regardless of the subsampling.
1257
  It's possible for luma to be perfectly flat and yet have high chroma energy,
1258
   but this is unlikely in non-artificial images, and not a case that has been
1259
   addressed by any research to my knowledge.
1260
  The output of the masking process is two scale factors, which are fed into
1261
   the various R-D optimizations.
1262
  The first, rd_scale, is applied to D in the equation
1263
    D*rd_scale+lambda*R.
1264
  This is the form that must be used to properly combine scores from multiple
1265
   blocks, and can be interpreted as scaling distortions by their visibility.
1266
  The inverse, rd_iscale, is applied to lambda in the equation
1267
    D+rd_iscale*lambda*R.
1268
  This is equivalent to the first form within a single block, but much faster
1269
   to use when evaluating many possible distortions (e.g., during actual
1270
   quantization, where separate distortions are evaluated for every
1271
   coefficient).
1272
  The two macros OC_RD_SCALE(rd_scale,d) and OC_RD_ISCALE(rd_iscale,lambda) are
1273
   used to perform the multiplications with the proper re-scaling for the range
1274
   of the scaling factors.
1275
  Many researchers apply masking values directly to the quantizers used, and
1276
   not to the R-D cost.
1277
  Since we generally use MSE for D, rd_scale must use the square of their
1278
   values to generate an equivalent effect.*/
1279
static unsigned oc_mb_masking(unsigned _rd_scale[5],unsigned _rd_iscale[5],
1280
 const ogg_uint16_t _chroma_rd_scale[2],const unsigned _activity[4],
1281
2.98M
 unsigned _activity_avg,unsigned _luma,unsigned _luma_avg){
1282
2.98M
  unsigned activity_sum;
1283
2.98M
  unsigned la;
1284
2.98M
  unsigned lb;
1285
2.98M
  unsigned d;
1286
2.98M
  int      bi;
1287
2.98M
  int      bi_min;
1288
2.98M
  int      bi_min2;
1289
  /*The ratio lb/la is meant to approximate
1290
     ((((_luma-16)/219)*(255/128))**0.649**0.4**2), which is the
1291
     effective luminance masking from~\cite{LKW06} (including the self-masking
1292
     deflator).
1293
    The following actually turns out to be a pretty good approximation for
1294
     _luma>75 or so.
1295
    For smaller values luminance does not really follow Weber's Law anyway, and
1296
     this approximation gives a much less aggressive bitrate boost in this
1297
     region.
1298
    Though some researchers claim that contrast sensitivity actually decreases
1299
     for very low luminance values, in my experience excessive brightness on
1300
     LCDs or buggy color conversions (e.g., treating Y' as full-range instead
1301
     of the CCIR 601 range) make artifacts in such regions extremely visible.
1302
    We substitute _luma_avg for 128 to allow the strength of the masking to
1303
     vary with the actual average image luminance, within certain limits (the
1304
     caller has clamped _luma_avg to the range [90,160], inclusive).
1305
    @ARTICLE{LKW06,
1306
      author="Zhen Liu and Lina J. Karam and Andrew B. Watson",
1307
      title="{JPEG2000} Encoding With Perceptual Distortion Control",
1308
      journal="{IEEE} Transactions on Image Processing",
1309
      volume=15,
1310
      number=7,
1311
      pages="1763--1778",
1312
      month=Jul,
1313
      year=2006
1314
    }*/
1315
#if 0
1316
  la=_luma+4*_luma_avg;
1317
  lb=4*_luma+_luma_avg;
1318
#else
1319
  /*Disable luminance masking.*/
1320
2.98M
  la=lb=1;
1321
2.98M
#endif
1322
2.98M
  activity_sum=0;
1323
14.9M
  for(bi=0;bi<4;bi++){
1324
11.9M
    unsigned a;
1325
11.9M
    unsigned b;
1326
11.9M
    activity_sum+=_activity[bi];
1327
    /*Apply activity masking.*/
1328
11.9M
    a=_activity[bi]+4*_activity_avg;
1329
11.9M
    b=4*_activity[bi]+_activity_avg;
1330
11.9M
    d=OC_RD_SCALE(b,1);
1331
    /*And luminance masking.*/
1332
11.9M
    d=(a+(d>>1))/d;
1333
11.9M
    _rd_scale[bi]=(d*la+(lb>>1))/lb;
1334
    /*And now the inverse.*/
1335
11.9M
    d=OC_MAXI(OC_RD_ISCALE(a,1),1);
1336
11.9M
    d=(b+(d>>1))/d;
1337
11.9M
    _rd_iscale[bi]=(d*lb+(la>>1))/la;
1338
11.9M
  }
1339
  /*Now compute scaling factors for chroma blocks.
1340
    We start by finding the two smallest iscales from the luma blocks.*/
1341
2.98M
  bi_min=_rd_iscale[1]<_rd_iscale[0];
1342
2.98M
  bi_min2=1-bi_min;
1343
8.96M
  for(bi=2;bi<4;bi++){
1344
5.97M
    if(_rd_iscale[bi]<_rd_iscale[bi_min]){
1345
467k
      bi_min2=bi_min;
1346
467k
      bi_min=bi;
1347
467k
    }
1348
5.51M
    else if(_rd_iscale[bi]<_rd_iscale[bi_min2])bi_min2=bi;
1349
5.97M
  }
1350
  /*If the minimum iscale is less than 1.0, use the second smallest instead,
1351
     and force the value to at least 1.0 (inflating chroma is a waste).*/
1352
2.98M
  if(_rd_iscale[bi_min]<(1<<OC_RD_ISCALE_BITS))bi_min=bi_min2;
1353
2.98M
  d=OC_MINI(_rd_scale[bi_min],1<<OC_RD_SCALE_BITS);
1354
2.98M
  _rd_scale[4]=OC_RD_SCALE(d,_chroma_rd_scale[0]);
1355
2.98M
  d=OC_MAXI(_rd_iscale[bi_min],1<<OC_RD_ISCALE_BITS);
1356
2.98M
  _rd_iscale[4]=OC_RD_ISCALE(d,_chroma_rd_scale[1]);
1357
2.98M
  return activity_sum;
1358
2.98M
}
1359
1360
static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
1361
305k
 unsigned _frag_satd[12]){
1362
305k
  const unsigned char   *src;
1363
305k
  const ptrdiff_t       *frag_buf_offs;
1364
305k
  const ptrdiff_t       *sb_map;
1365
305k
  const oc_mb_map_plane *mb_map;
1366
305k
  const unsigned char   *map_idxs;
1367
305k
  int                    map_nidxs;
1368
305k
  int                    mapii;
1369
305k
  int                    mapi;
1370
305k
  int                    ystride;
1371
305k
  int                    pli;
1372
305k
  int                    bi;
1373
305k
  ptrdiff_t              fragi;
1374
305k
  ptrdiff_t              frag_offs;
1375
305k
  unsigned               luma;
1376
305k
  int                    dc;
1377
305k
  frag_buf_offs=_enc->state.frag_buf_offs;
1378
305k
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1379
305k
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1380
305k
  ystride=_enc->state.ref_ystride[0];
1381
305k
  luma=0;
1382
1.52M
  for(bi=0;bi<4;bi++){
1383
1.22M
    fragi=sb_map[bi];
1384
1.22M
    frag_offs=frag_buf_offs[fragi];
1385
1.22M
    _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1386
1.22M
    luma+=dc;
1387
1.22M
  }
1388
305k
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
1389
305k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1390
305k
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1391
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
1392
305k
  ystride=_enc->state.ref_ystride[1];
1393
1.01M
  for(mapii=4;mapii<map_nidxs;mapii++){
1394
711k
    mapi=map_idxs[mapii];
1395
711k
    pli=mapi>>2;
1396
711k
    bi=mapi&3;
1397
711k
    fragi=mb_map[pli][bi];
1398
711k
    frag_offs=frag_buf_offs[fragi];
1399
711k
    _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1400
711k
  }
1401
305k
  return luma;
1402
305k
}
1403
1404
/*Select luma block-level quantizers for a MB in an INTRA frame.*/
1405
static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
1406
2.68M
 const oc_qii_state *_qs,unsigned _mbi,const unsigned _rd_scale[4]){
1407
2.68M
  const unsigned char *src;
1408
2.68M
  const ptrdiff_t     *frag_buf_offs;
1409
2.68M
  const oc_sb_map     *sb_maps;
1410
2.68M
  oc_fragment         *frags;
1411
2.68M
  ptrdiff_t            frag_offs;
1412
2.68M
  ptrdiff_t            fragi;
1413
2.68M
  oc_qii_state         qs[4][3];
1414
2.68M
  unsigned             cost[4][3];
1415
2.68M
  unsigned             ssd[4][3];
1416
2.68M
  unsigned             rate[4][3];
1417
2.68M
  int                  prev[3][3];
1418
2.68M
  unsigned             satd;
1419
2.68M
  int                  dc;
1420
2.68M
  unsigned             best_cost;
1421
2.68M
  unsigned             best_ssd;
1422
2.68M
  unsigned             best_rate;
1423
2.68M
  int                  best_qii;
1424
2.68M
  int                  qii;
1425
2.68M
  int                  lambda;
1426
2.68M
  int                  ystride;
1427
2.68M
  int                  nqis;
1428
2.68M
  int                  bi;
1429
2.68M
  frag_buf_offs=_enc->state.frag_buf_offs;
1430
2.68M
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1431
2.68M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1432
2.68M
  ystride=_enc->state.ref_ystride[0];
1433
2.68M
  fragi=sb_maps[_mbi>>2][_mbi&3][0];
1434
2.68M
  frag_offs=frag_buf_offs[fragi];
1435
2.68M
  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1436
2.68M
    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1437
2.68M
  }
1438
0
  else{
1439
0
    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1440
0
  }
1441
2.68M
  nqis=_enc->state.nqis;
1442
2.68M
  lambda=_enc->lambda;
1443
7.18M
  for(qii=0;qii<nqis;qii++){
1444
4.49M
    oc_qii_state_advance(qs[0]+qii,_qs,qii);
1445
4.49M
    rate[0][qii]=oc_dct_cost2(_enc,ssd[0]+qii,qii,0,0,satd)
1446
4.49M
     +(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE);
1447
4.49M
    ssd[0][qii]=OC_RD_SCALE(ssd[0][qii],_rd_scale[0]);
1448
4.49M
    cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda);
1449
4.49M
  }
1450
10.7M
  for(bi=1;bi<4;bi++){
1451
8.05M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1452
8.05M
    frag_offs=frag_buf_offs[fragi];
1453
8.05M
    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1454
8.05M
      satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1455
8.05M
    }
1456
0
    else{
1457
0
      satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1458
0
    }
1459
21.5M
    for(qii=0;qii<nqis;qii++){
1460
13.4M
      oc_qii_state qt[3];
1461
13.4M
      unsigned     cur_ssd;
1462
13.4M
      unsigned     cur_rate;
1463
13.4M
      int          best_qij;
1464
13.4M
      int          qij;
1465
13.4M
      oc_qii_state_advance(qt+0,qs[bi-1]+0,qii);
1466
13.4M
      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,0,satd);
1467
13.4M
      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
1468
13.4M
      best_ssd=ssd[bi-1][0]+cur_ssd;
1469
13.4M
      best_rate=rate[bi-1][0]+cur_rate
1470
13.4M
       +(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE);
1471
13.4M
      best_cost=OC_MODE_RD_COST(best_ssd,best_rate,lambda);
1472
13.4M
      best_qij=0;
1473
29.4M
      for(qij=1;qij<nqis;qij++){
1474
15.9M
        unsigned chain_ssd;
1475
15.9M
        unsigned chain_rate;
1476
15.9M
        unsigned chain_cost;
1477
15.9M
        oc_qii_state_advance(qt+qij,qs[bi-1]+qij,qii);
1478
15.9M
        chain_ssd=ssd[bi-1][qij]+cur_ssd;
1479
15.9M
        chain_rate=rate[bi-1][qij]+cur_rate
1480
15.9M
         +(qt[qij].bits-qs[bi-1][qij].bits<<OC_BIT_SCALE);
1481
15.9M
        chain_cost=OC_MODE_RD_COST(chain_ssd,chain_rate,lambda);
1482
15.9M
        if(chain_cost<best_cost){
1483
6.38M
          best_cost=chain_cost;
1484
6.38M
          best_ssd=chain_ssd;
1485
6.38M
          best_rate=chain_rate;
1486
6.38M
          best_qij=qij;
1487
6.38M
        }
1488
15.9M
      }
1489
13.4M
      *(qs[bi]+qii)=*(qt+best_qij);
1490
13.4M
      cost[bi][qii]=best_cost;
1491
13.4M
      ssd[bi][qii]=best_ssd;
1492
13.4M
      rate[bi][qii]=best_rate;
1493
13.4M
      prev[bi-1][qii]=best_qij;
1494
13.4M
    }
1495
8.05M
  }
1496
2.68M
  best_qii=0;
1497
2.68M
  best_cost=cost[3][0];
1498
4.49M
  for(qii=1;qii<nqis;qii++){
1499
1.81M
    if(cost[3][qii]<best_cost){
1500
558k
      best_cost=cost[3][qii];
1501
558k
      best_qii=qii;
1502
558k
    }
1503
1.81M
  }
1504
2.68M
  frags=_enc->state.frags;
1505
10.7M
  for(bi=3;;){
1506
10.7M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1507
10.7M
    frags[fragi].qii=best_qii;
1508
10.7M
    if(bi--<=0)break;
1509
8.05M
    best_qii=prev[bi][best_qii];
1510
8.05M
  }
1511
2.68M
  return best_cost;
1512
2.68M
}
1513
1514
/*Select a block-level quantizer for a single chroma block in an INTRA frame.*/
1515
static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
1516
9.60M
 const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi,unsigned _rd_scale){
1517
9.60M
  const unsigned char *src;
1518
9.60M
  oc_fragment         *frags;
1519
9.60M
  ptrdiff_t            frag_offs;
1520
9.60M
  oc_qii_state         qt[3];
1521
9.60M
  unsigned             cost[3];
1522
9.60M
  unsigned             satd;
1523
9.60M
  int                  dc;
1524
9.60M
  unsigned             best_cost;
1525
9.60M
  int                  best_qii;
1526
9.60M
  int                  qii;
1527
9.60M
  int                  lambda;
1528
9.60M
  int                  ystride;
1529
9.60M
  int                  nqis;
1530
9.60M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1531
9.60M
  ystride=_enc->state.ref_ystride[_pli];
1532
9.60M
  frag_offs=_enc->state.frag_buf_offs[_fragi];
1533
9.60M
  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1534
9.60M
    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1535
9.60M
  }
1536
0
  else{
1537
0
    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1538
0
  }
1539
  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
1540
     worth spending the bits to change the AC quantizer.
1541
    TODO: This may be worth revisiting when we separate out DC and AC
1542
     predictions from SATD.*/
1543
#if 0
1544
  nqis=_enc->state.nqis;
1545
#else
1546
9.60M
  nqis=1;
1547
9.60M
#endif
1548
9.60M
  lambda=_enc->lambda;
1549
9.60M
  best_qii=0;
1550
19.2M
  for(qii=0;qii<nqis;qii++){
1551
9.60M
    unsigned cur_rate;
1552
9.60M
    unsigned cur_ssd;
1553
9.60M
    oc_qii_state_advance(qt+qii,_qs,qii);
1554
9.60M
    cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,_pli,0,satd)
1555
9.60M
     +(qt[qii].bits-_qs->bits<<OC_BIT_SCALE);
1556
9.60M
    cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
1557
9.60M
    cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda);
1558
9.60M
  }
1559
9.60M
  best_cost=cost[0];
1560
9.60M
  for(qii=1;qii<nqis;qii++){
1561
0
    if(cost[qii]<best_cost){
1562
0
      best_cost=cost[qii];
1563
0
      best_qii=qii;
1564
0
    }
1565
0
  }
1566
9.60M
  frags=_enc->state.frags;
1567
9.60M
  frags[_fragi].qii=best_qii;
1568
9.60M
  return best_cost;
1569
9.60M
}
1570
1571
static void oc_enc_mb_transform_quantize_intra_luma(oc_enc_ctx *_enc,
1572
 oc_enc_pipeline_state *_pipe,unsigned _mbi,
1573
2.68M
 const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
1574
  /*Worst case token stack usage for 4 fragments.*/
1575
2.68M
  oc_token_checkpoint  stack[64*4];
1576
2.68M
  oc_token_checkpoint *stackptr;
1577
2.68M
  const oc_sb_map     *sb_maps;
1578
2.68M
  oc_fragment         *frags;
1579
2.68M
  ptrdiff_t           *coded_fragis;
1580
2.68M
  ptrdiff_t            ncoded_fragis;
1581
2.68M
  ptrdiff_t            fragi;
1582
2.68M
  int                  bi;
1583
2.68M
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1584
2.68M
  frags=_enc->state.frags;
1585
2.68M
  coded_fragis=_pipe->coded_fragis[0];
1586
2.68M
  ncoded_fragis=_pipe->ncoded_fragis[0];
1587
2.68M
  stackptr=stack;
1588
13.4M
  for(bi=0;bi<4;bi++){
1589
10.7M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1590
10.7M
    frags[fragi].refi=OC_FRAME_SELF;
1591
10.7M
    frags[fragi].mb_mode=OC_MODE_INTRA;
1592
10.7M
    oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
1593
10.7M
     _rd_scale[bi],_rd_iscale[bi],NULL,NULL,&stackptr);
1594
10.7M
    coded_fragis[ncoded_fragis++]=fragi;
1595
10.7M
  }
1596
2.68M
  _pipe->ncoded_fragis[0]=ncoded_fragis;
1597
2.68M
}
1598
1599
static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
1600
421k
 oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
1601
421k
  const ogg_uint16_t *mcu_rd_scale;
1602
421k
  const ogg_uint16_t *mcu_rd_iscale;
1603
421k
  const oc_sb_map    *sb_maps;
1604
421k
  ptrdiff_t          *coded_fragis;
1605
421k
  ptrdiff_t           ncoded_fragis;
1606
421k
  ptrdiff_t           froffset;
1607
421k
  int                 sbi;
1608
421k
  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
1609
421k
  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
1610
421k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1611
421k
  coded_fragis=_pipe->coded_fragis[_pli];
1612
421k
  ncoded_fragis=_pipe->ncoded_fragis[_pli];
1613
421k
  froffset=_pipe->froffset[_pli];
1614
1.99M
  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
1615
    /*Worst case token stack usage for 1 fragment.*/
1616
1.57M
    oc_token_checkpoint stack[64];
1617
1.57M
    int                 quadi;
1618
1.57M
    int                 bi;
1619
31.5M
    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1620
25.2M
      ptrdiff_t fragi;
1621
25.2M
      fragi=sb_maps[sbi][quadi][bi];
1622
25.2M
      if(fragi>=0){
1623
9.60M
        oc_token_checkpoint *stackptr;
1624
9.60M
        unsigned             rd_scale;
1625
9.60M
        unsigned             rd_iscale;
1626
9.60M
        rd_scale=mcu_rd_scale[fragi-froffset];
1627
9.60M
        rd_iscale=mcu_rd_iscale[fragi-froffset];
1628
9.60M
        oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi,rd_scale);
1629
9.60M
        stackptr=stack;
1630
9.60M
        oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
1631
9.60M
         rd_scale,rd_iscale,NULL,NULL,&stackptr);
1632
9.60M
        coded_fragis[ncoded_fragis++]=fragi;
1633
9.60M
      }
1634
25.2M
    }
1635
1.57M
  }
1636
421k
  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
1637
421k
}
1638
1639
/*Analysis stage for an INTRA frame.*/
1640
19.9k
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
1641
19.9k
  ogg_int64_t             activity_sum;
1642
19.9k
  ogg_int64_t             luma_sum;
1643
19.9k
  unsigned                activity_avg;
1644
19.9k
  unsigned                luma_avg;
1645
19.9k
  const ogg_uint16_t     *chroma_rd_scale;
1646
19.9k
  ogg_uint16_t           *mcu_rd_scale;
1647
19.9k
  ogg_uint16_t           *mcu_rd_iscale;
1648
19.9k
  const unsigned char    *map_idxs;
1649
19.9k
  int                     nmap_idxs;
1650
19.9k
  oc_sb_flags            *sb_flags;
1651
19.9k
  signed char            *mb_modes;
1652
19.9k
  const oc_mb_map        *mb_maps;
1653
19.9k
  const oc_sb_map        *sb_maps;
1654
19.9k
  oc_fragment            *frags;
1655
19.9k
  unsigned                stripe_sby;
1656
19.9k
  unsigned                mcu_nvsbs;
1657
19.9k
  int                     notstart;
1658
19.9k
  int                     notdone;
1659
19.9k
  int                     refi;
1660
19.9k
  int                     pli;
1661
19.9k
  _enc->state.frame_type=OC_INTRA_FRAME;
1662
19.9k
  oc_enc_tokenize_start(_enc);
1663
19.9k
  oc_enc_pipeline_init(_enc,&_enc->pipe);
1664
19.9k
  oc_enc_mode_rd_init(_enc);
1665
19.9k
  activity_sum=luma_sum=0;
1666
19.9k
  activity_avg=_enc->activity_avg;
1667
19.9k
  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
1668
19.9k
  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTRA_FRAME][_enc->state.qis[0]];
1669
19.9k
  mcu_rd_scale=_enc->mcu_rd_scale;
1670
19.9k
  mcu_rd_iscale=_enc->mcu_rd_iscale;
1671
  /*Choose MVs and MB modes and quantize and code luma.
1672
    Must be done in Hilbert order.*/
1673
19.9k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1674
19.9k
  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1675
19.9k
  _enc->state.ncoded_fragis[0]=0;
1676
19.9k
  _enc->state.ncoded_fragis[1]=0;
1677
19.9k
  _enc->state.ncoded_fragis[2]=0;
1678
19.9k
  sb_flags=_enc->state.sb_flags;
1679
19.9k
  mb_modes=_enc->state.mb_modes;
1680
19.9k
  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
1681
19.9k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1682
19.9k
  frags=_enc->state.frags;
1683
19.9k
  notstart=0;
1684
19.9k
  notdone=1;
1685
19.9k
  mcu_nvsbs=_enc->mcu_nvsbs;
1686
230k
  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
1687
210k
    ptrdiff_t cfroffset;
1688
210k
    unsigned  sbi;
1689
210k
    unsigned  sbi_end;
1690
210k
    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
1691
210k
    sbi_end=_enc->pipe.sbi_end[0];
1692
210k
    cfroffset=_enc->pipe.froffset[1];
1693
1.45M
    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
1694
1.24M
      int quadi;
1695
      /*Mode addressing is through Y plane, always 4 MB per SB.*/
1696
6.22M
      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
1697
2.68M
        unsigned  activity[4];
1698
2.68M
        unsigned  rd_scale[5];
1699
2.68M
        unsigned  rd_iscale[5];
1700
2.68M
        unsigned  luma;
1701
2.68M
        unsigned  mbi;
1702
2.68M
        int       mapii;
1703
2.68M
        int       mapi;
1704
2.68M
        int       bi;
1705
2.68M
        ptrdiff_t fragi;
1706
2.68M
        mbi=sbi<<2|quadi;
1707
        /*Activity masking.*/
1708
2.68M
        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
1709
2.68M
          luma=oc_mb_activity(_enc,mbi,activity);
1710
2.68M
        }
1711
0
        else{
1712
0
          unsigned intra_satd[12];
1713
0
          luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
1714
0
          oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
1715
0
          for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
1716
0
        }
1717
2.68M
        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
1718
2.68M
         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
1719
2.68M
        luma_sum+=luma;
1720
        /*Motion estimation:
1721
          We do a basic 1MV search for all macroblocks, coded or not,
1722
           keyframe or not, unless we aren't using motion estimation at all.*/
1723
2.68M
        if(!_recode&&_enc->state.curframe_num>0&&
1724
4.74k
         _enc->sp_level<OC_SP_LEVEL_NOMC&&_enc->keyframe_frequency_force>1){
1725
353
          oc_mcenc_search(_enc,mbi);
1726
353
        }
1727
2.68M
        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
1728
2.68M
          oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale);
1729
2.68M
        }
1730
2.68M
        mb_modes[mbi]=OC_MODE_INTRA;
1731
2.68M
        oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe,
1732
2.68M
         mbi,rd_scale,rd_iscale);
1733
        /*Propagate final MB mode and MVs to the chroma blocks.*/
1734
12.2M
        for(mapii=4;mapii<nmap_idxs;mapii++){
1735
9.60M
          mapi=map_idxs[mapii];
1736
9.60M
          pli=mapi>>2;
1737
9.60M
          bi=mapi&3;
1738
9.60M
          fragi=mb_maps[mbi][pli][bi];
1739
9.60M
          frags[fragi].refi=OC_FRAME_SELF;
1740
9.60M
          frags[fragi].mb_mode=OC_MODE_INTRA;
1741
9.60M
        }
1742
        /*Save masking scale factors for chroma blocks.*/
1743
7.48M
        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
1744
4.80M
          mapi=map_idxs[mapii];
1745
4.80M
          bi=mapi&3;
1746
4.80M
          fragi=mb_maps[mbi][1][bi];
1747
4.80M
          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
1748
4.80M
          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
1749
4.80M
        }
1750
2.68M
      }
1751
1.24M
    }
1752
210k
    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
1753
    /*Code chroma planes.*/
1754
631k
    for(pli=1;pli<3;pli++){
1755
421k
      oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe,
1756
421k
       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
1757
421k
      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
1758
421k
    }
1759
210k
    notstart=1;
1760
210k
  }
1761
  /*Compute the average block activity and MB luma score for the frame.*/
1762
19.9k
  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
1763
19.9k
   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
1764
19.9k
   _enc->state.fplanes[0].nfrags));
1765
19.9k
  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
1766
  /*Finish filling in the reference frame borders.*/
1767
19.9k
  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
1768
79.9k
  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
1769
19.9k
  _enc->state.ntotal_coded_fragis=_enc->state.nfrags;
1770
19.9k
}
1771
1772
1773
1774
/*Cost information about a MB mode.*/
1775
struct oc_mode_choice{
1776
  unsigned      cost;
1777
  unsigned      ssd;
1778
  unsigned      rate;
1779
  unsigned      overhead;
1780
  unsigned char qii[12];
1781
};
1782
1783
1784
1785
4.14M
static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
1786
4.14M
  _modec->cost=OC_MODE_RD_COST(_modec->ssd,
1787
4.14M
   _modec->rate+_modec->overhead,_lambda);
1788
4.14M
}
1789
1790
/*A set of skip SSD's to use to disable early skipping.*/
1791
static const unsigned OC_NOSKIP[12]={
1792
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
1793
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
1794
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX
1795
};
1796
1797
/*The estimated number of bits used by a coded chroma block to specify the AC
1798
   quantizer.
1799
  TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression);
1800
   measurements suggest this is in the right ballpark, but it varies somewhat
1801
   with lambda.*/
1802
7.38M
#define OC_CHROMA_QII_RATE ((0xCAE00D1DU>>31-OC_BIT_SCALE)+1>>1)
1803
1804
static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
1805
 oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1806
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
1807
3.16M
 const unsigned _rd_scale[4],int _qti){
1808
3.16M
  oc_fr_state  fr;
1809
3.16M
  oc_qii_state qs;
1810
3.16M
  unsigned     ssd;
1811
3.16M
  unsigned     rate;
1812
3.16M
  unsigned     satd;
1813
3.16M
  unsigned     best_ssd;
1814
3.16M
  unsigned     best_rate;
1815
3.16M
  int          best_fri;
1816
3.16M
  int          best_qii;
1817
3.16M
  int          lambda;
1818
3.16M
  int          nqis;
1819
3.16M
  int          nskipped;
1820
3.16M
  int          bi;
1821
3.16M
  lambda=_enc->lambda;
1822
3.16M
  nqis=_enc->state.nqis;
1823
  /*We could do a trellis optimization here, but we don't make final skip
1824
     decisions until after transform+quantization, so the result wouldn't be
1825
     optimal anyway.
1826
    Instead we just use a greedy approach; for most SATD values, the
1827
     differences between the qiis are large enough to drown out the cost to
1828
     code the flags, anyway.*/
1829
3.16M
  *&fr=*_fr;
1830
3.16M
  *&qs=*_qs;
1831
3.16M
  ssd=rate=nskipped=0;
1832
15.8M
  for(bi=0;bi<4;bi++){
1833
12.6M
    oc_fr_state  ft[2];
1834
12.6M
    oc_qii_state qt[3];
1835
12.6M
    unsigned     best_cost;
1836
12.6M
    unsigned     cur_cost;
1837
12.6M
    unsigned     cur_ssd;
1838
12.6M
    unsigned     cur_rate;
1839
12.6M
    unsigned     cur_overhead;
1840
12.6M
    int          qii;
1841
12.6M
    satd=_frag_satd[bi];
1842
12.6M
    *(ft+0)=*&fr;
1843
12.6M
    oc_fr_code_block(ft+0);
1844
12.6M
    cur_overhead=ft[0].bits-fr.bits;
1845
12.6M
    best_rate=oc_dct_cost2(_enc,&best_ssd,0,0,_qti,satd)
1846
12.6M
     +(cur_overhead<<OC_BIT_SCALE);
1847
12.6M
    if(nqis>1){
1848
6.03M
      oc_qii_state_advance(qt+0,&qs,0);
1849
6.03M
      best_rate+=qt[0].bits-qs.bits<<OC_BIT_SCALE;
1850
6.03M
    }
1851
12.6M
    best_ssd=OC_RD_SCALE(best_ssd,_rd_scale[bi]);
1852
12.6M
    best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1853
12.6M
    best_fri=0;
1854
12.6M
    best_qii=0;
1855
21.4M
    for(qii=1;qii<nqis;qii++){
1856
8.76M
      oc_qii_state_advance(qt+qii,&qs,qii);
1857
8.76M
      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,_qti,satd)
1858
8.76M
       +(cur_overhead+qt[qii].bits-qs.bits<<OC_BIT_SCALE);
1859
8.76M
      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
1860
8.76M
      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1861
8.76M
      if(cur_cost<best_cost){
1862
2.60M
        best_cost=cur_cost;
1863
2.60M
        best_ssd=cur_ssd;
1864
2.60M
        best_rate=cur_rate;
1865
2.60M
        best_qii=qii;
1866
2.60M
      }
1867
8.76M
    }
1868
12.6M
    if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)&&nskipped<3){
1869
6.33M
      *(ft+1)=*&fr;
1870
6.33M
      oc_fr_skip_block(ft+1);
1871
6.33M
      cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE;
1872
6.33M
      cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1873
6.33M
      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda);
1874
6.33M
      if(cur_cost<=best_cost){
1875
2.13M
        best_ssd=cur_ssd;
1876
2.13M
        best_rate=cur_overhead;
1877
2.13M
        best_fri=1;
1878
2.13M
        best_qii+=4;
1879
2.13M
      }
1880
6.33M
    }
1881
12.6M
    rate+=best_rate;
1882
12.6M
    ssd+=best_ssd;
1883
12.6M
    *&fr=*(ft+best_fri);
1884
12.6M
    if(best_fri==0)*&qs=*(qt+best_qii);
1885
2.13M
    else nskipped++;
1886
12.6M
    _modec->qii[bi]=best_qii;
1887
12.6M
  }
1888
3.16M
  _modec->ssd=ssd;
1889
3.16M
  _modec->rate=rate;
1890
3.16M
}
1891
1892
static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
1893
 oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1894
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
1895
3.16M
 unsigned _rd_scale,int _qti){
1896
3.16M
  unsigned ssd;
1897
3.16M
  unsigned rate;
1898
3.16M
  unsigned satd;
1899
3.16M
  unsigned best_ssd;
1900
3.16M
  unsigned best_rate;
1901
3.16M
  int      best_qii;
1902
3.16M
  unsigned cur_cost;
1903
3.16M
  unsigned cur_ssd;
1904
3.16M
  unsigned cur_rate;
1905
3.16M
  int      lambda;
1906
3.16M
  int      nblocks;
1907
3.16M
  int      nqis;
1908
3.16M
  int      pli;
1909
3.16M
  int      bi;
1910
3.16M
  int      qii;
1911
3.16M
  lambda=_enc->lambda;
1912
  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
1913
     worth spending the bits to change the AC quantizer.
1914
    TODO: This may be worth revisiting when we separate out DC and AC
1915
     predictions from SATD.*/
1916
#if 0
1917
  nqis=_enc->state.nqis;
1918
#else
1919
3.16M
  nqis=1;
1920
3.16M
#endif
1921
3.16M
  ssd=_modec->ssd;
1922
3.16M
  rate=_modec->rate;
1923
  /*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded
1924
     order, we assume a constant overhead for coded block and qii flags.*/
1925
3.16M
  nblocks=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1926
3.16M
  nblocks=(nblocks-4>>1)+4;
1927
3.16M
  bi=4;
1928
9.49M
  for(pli=1;pli<3;pli++){
1929
13.7M
    for(;bi<nblocks;bi++){
1930
7.38M
      unsigned best_cost;
1931
7.38M
      satd=_frag_satd[bi];
1932
7.38M
      best_rate=oc_dct_cost2(_enc,&best_ssd,0,pli,_qti,satd)
1933
7.38M
       +OC_CHROMA_QII_RATE;
1934
7.38M
      best_ssd=OC_RD_SCALE(best_ssd,_rd_scale);
1935
7.38M
      best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1936
7.38M
      best_qii=0;
1937
7.38M
      for(qii=1;qii<nqis;qii++){
1938
0
        cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,pli,_qti,satd)
1939
0
         +OC_CHROMA_QII_RATE;
1940
0
        cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
1941
0
        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1942
0
        if(cur_cost<best_cost){
1943
0
          best_cost=cur_cost;
1944
0
          best_ssd=cur_ssd;
1945
0
          best_rate=cur_rate;
1946
0
          best_qii=qii;
1947
0
        }
1948
0
      }
1949
7.38M
      if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)){
1950
5.08M
        cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1951
5.08M
        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
1952
5.08M
        if(cur_cost<=best_cost){
1953
2.42M
          best_ssd=cur_ssd;
1954
2.42M
          best_rate=0;
1955
2.42M
          best_qii+=4;
1956
2.42M
        }
1957
5.08M
      }
1958
7.38M
      rate+=best_rate;
1959
7.38M
      ssd+=best_ssd;
1960
7.38M
      _modec->qii[bi]=best_qii;
1961
7.38M
    }
1962
6.32M
    nblocks=(nblocks-4<<1)+4;
1963
6.32M
  }
1964
3.16M
  _modec->ssd=ssd;
1965
3.16M
  _modec->rate=rate;
1966
3.16M
}
1967
1968
static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
1969
305k
 unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){
1970
305k
  const unsigned char   *src;
1971
305k
  const unsigned char   *ref;
1972
305k
  int                    ystride;
1973
305k
  const oc_fragment     *frags;
1974
305k
  const ptrdiff_t       *frag_buf_offs;
1975
305k
  const ptrdiff_t       *sb_map;
1976
305k
  const oc_mb_map_plane *mb_map;
1977
305k
  const unsigned char   *map_idxs;
1978
305k
  oc_mv                 *mvs;
1979
305k
  int                    map_nidxs;
1980
305k
  unsigned               uncoded_ssd;
1981
305k
  int                    mapii;
1982
305k
  int                    mapi;
1983
305k
  int                    pli;
1984
305k
  int                    bi;
1985
305k
  ptrdiff_t              fragi;
1986
305k
  ptrdiff_t              frag_offs;
1987
305k
  int                    borderi;
1988
305k
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1989
305k
  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
1990
305k
  ystride=_enc->state.ref_ystride[0];
1991
305k
  frags=_enc->state.frags;
1992
305k
  frag_buf_offs=_enc->state.frag_buf_offs;
1993
305k
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1994
305k
  mvs=_enc->mb_info[_mbi].block_mv;
1995
1.52M
  for(bi=0;bi<4;bi++){
1996
1.22M
    fragi=sb_map[bi];
1997
1.22M
    borderi=frags[fragi].borderi;
1998
1.22M
    frag_offs=frag_buf_offs[fragi];
1999
1.22M
    if(borderi<0){
2000
674k
      uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
2001
674k
    }
2002
546k
    else{
2003
546k
      uncoded_ssd=oc_enc_frag_border_ssd(_enc,
2004
546k
       src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
2005
546k
    }
2006
    /*Scale to match DCT domain and RD.*/
2007
1.22M
    uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[bi]);
2008
    /*Motion is a special case; if there is more than a full-pixel motion
2009
       against the prior frame, penalize skipping.
2010
      TODO: The factor of two here is a kludge, but it tested out better than a
2011
       hard limit.*/
2012
1.22M
    if(mvs[bi]!=0)uncoded_ssd*=2;
2013
1.22M
    _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd;
2014
1.22M
  }
2015
305k
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2016
305k
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2017
305k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2018
305k
  map_nidxs=(map_nidxs-4>>1)+4;
2019
305k
  mapii=4;
2020
305k
  mvs=_enc->mb_info[_mbi].unref_mv;
2021
915k
  for(pli=1;pli<3;pli++){
2022
610k
    ystride=_enc->state.ref_ystride[pli];
2023
1.32M
    for(;mapii<map_nidxs;mapii++){
2024
711k
      mapi=map_idxs[mapii];
2025
711k
      bi=mapi&3;
2026
711k
      fragi=mb_map[pli][bi];
2027
711k
      borderi=frags[fragi].borderi;
2028
711k
      frag_offs=frag_buf_offs[fragi];
2029
711k
      if(borderi<0){
2030
398k
        uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
2031
398k
      }
2032
313k
      else{
2033
313k
        uncoded_ssd=oc_enc_frag_border_ssd(_enc,
2034
313k
         src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
2035
313k
      }
2036
      /*Scale to match DCT domain and RD.*/
2037
711k
      uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[4]);
2038
      /*Motion is a special case; if there is more than a full-pixel motion
2039
         against the prior frame, penalize skipping.
2040
        TODO: The factor of two here is a kludge, but it tested out better than
2041
         a hard limit*/
2042
711k
      if(mvs[OC_FRAME_PREV]!=0)uncoded_ssd*=2;
2043
711k
      _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd;
2044
711k
    }
2045
610k
    map_nidxs=(map_nidxs-4<<1)+4;
2046
610k
  }
2047
305k
}
2048
2049
2050
static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2051
 unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
2052
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
2053
610k
 const unsigned _rd_scale[5]){
2054
610k
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,_rd_scale,0);
2055
610k
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2056
610k
   _frag_satd,_skip_ssd,_rd_scale[4],0);
2057
610k
  _modec->overhead=
2058
610k
   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE;
2059
610k
  oc_mode_set_cost(_modec,_enc->lambda);
2060
610k
}
2061
2062
static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2063
 unsigned _mbi,int _mb_mode,oc_mv _mv,
2064
 const oc_fr_state *_fr,const oc_qii_state *_qs,
2065
2.20M
 const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2066
2.20M
  unsigned               frag_satd[12];
2067
2.20M
  const unsigned char   *src;
2068
2.20M
  const unsigned char   *ref;
2069
2.20M
  int                    ystride;
2070
2.20M
  const ptrdiff_t       *frag_buf_offs;
2071
2.20M
  const ptrdiff_t       *sb_map;
2072
2.20M
  const oc_mb_map_plane *mb_map;
2073
2.20M
  const unsigned char   *map_idxs;
2074
2.20M
  int                    map_nidxs;
2075
2.20M
  int                    mapii;
2076
2.20M
  int                    mapi;
2077
2.20M
  int                    mv_offs[2];
2078
2.20M
  int                    pli;
2079
2.20M
  int                    bi;
2080
2.20M
  ptrdiff_t              fragi;
2081
2.20M
  ptrdiff_t              frag_offs;
2082
2.20M
  int                    dc;
2083
2.20M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
2084
2.20M
  ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
2085
2.20M
  ystride=_enc->state.ref_ystride[0];
2086
2.20M
  frag_buf_offs=_enc->state.frag_buf_offs;
2087
2.20M
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
2088
2.20M
  _modec->rate=_modec->ssd=0;
2089
2.20M
  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv)>1){
2090
2.02M
    for(bi=0;bi<4;bi++){
2091
1.61M
      fragi=sb_map[bi];
2092
1.61M
      frag_offs=frag_buf_offs[fragi];
2093
1.61M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2094
1.61M
        frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2095
1.61M
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2096
1.61M
        frag_satd[bi]+=abs(dc);
2097
1.61M
      }
2098
0
      else{
2099
0
        frag_satd[bi]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
2100
0
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
2101
0
      }
2102
1.61M
    }
2103
404k
  }
2104
1.79M
  else{
2105
8.97M
    for(bi=0;bi<4;bi++){
2106
7.18M
      fragi=sb_map[bi];
2107
7.18M
      frag_offs=frag_buf_offs[fragi];
2108
7.18M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2109
7.18M
        frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2110
7.18M
         ref+frag_offs+mv_offs[0],ystride);
2111
7.18M
        frag_satd[bi]+=abs(dc);
2112
7.18M
      }
2113
0
      else{
2114
0
        frag_satd[bi]=oc_enc_frag_sad(_enc,src+frag_offs,
2115
0
         ref+frag_offs+mv_offs[0],ystride);
2116
0
      }
2117
7.18M
    }
2118
1.79M
  }
2119
2.20M
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2120
2.20M
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2121
2.20M
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2122
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
2123
2.20M
  ystride=_enc->state.ref_ystride[1];
2124
2.20M
  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,_mv)>1){
2125
2.49M
    for(mapii=4;mapii<map_nidxs;mapii++){
2126
1.71M
      mapi=map_idxs[mapii];
2127
1.71M
      pli=mapi>>2;
2128
1.71M
      bi=mapi&3;
2129
1.71M
      fragi=mb_map[pli][bi];
2130
1.71M
      frag_offs=frag_buf_offs[fragi];
2131
1.71M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2132
1.71M
        frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2133
1.71M
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2134
1.71M
        frag_satd[mapii]+=abs(dc);
2135
1.71M
      }
2136
0
      else{
2137
0
        frag_satd[mapii]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
2138
0
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
2139
0
      }
2140
1.71M
    }
2141
779k
  }
2142
1.42M
  else{
2143
4.83M
    for(mapii=4;mapii<map_nidxs;mapii++){
2144
3.41M
      mapi=map_idxs[mapii];
2145
3.41M
      pli=mapi>>2;
2146
3.41M
      bi=mapi&3;
2147
3.41M
      fragi=mb_map[pli][bi];
2148
3.41M
      frag_offs=frag_buf_offs[fragi];
2149
3.41M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2150
3.41M
        frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2151
3.41M
         ref+frag_offs+mv_offs[0],ystride);
2152
3.41M
        frag_satd[mapii]+=abs(dc);
2153
3.41M
      }
2154
0
      else{
2155
0
        frag_satd[mapii]=oc_enc_frag_sad(_enc,src+frag_offs,
2156
0
         ref+frag_offs+mv_offs[0],ystride);
2157
0
      }
2158
3.41M
    }
2159
1.42M
  }
2160
2.20M
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1);
2161
2.20M
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2162
2.20M
   frag_satd,_skip_ssd,_rd_scale[4],1);
2163
2.20M
  _modec->overhead=
2164
2.20M
   oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE;
2165
2.20M
  oc_mode_set_cost(_modec,_enc->lambda);
2166
2.20M
}
2167
2168
static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2169
 unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs,
2170
610k
 const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2171
610k
  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,0,_fr,_qs,_skip_ssd,_rd_scale);
2172
610k
}
2173
2174
static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2175
 unsigned _mbi,int _mb_mode,oc_mv _mv,
2176
 const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12],
2177
979k
 const unsigned _rd_scale[5]){
2178
979k
  int bits0;
2179
979k
  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd,_rd_scale);
2180
979k
  bits0=OC_MV_BITS[0][OC_MV_X(_mv)+31]+OC_MV_BITS[0][OC_MV_Y(_mv)+31];
2181
979k
  _modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12)
2182
979k
   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
2183
979k
  oc_mode_set_cost(_modec,_enc->lambda);
2184
979k
  return bits0;
2185
979k
}
2186
2187
/*A mapping from oc_mb_map (raster) ordering to oc_sb_map (Hilbert) ordering.*/
2188
static const unsigned char OC_MB_PHASE[4][4]={
2189
  {0,1,3,2},{0,3,1,2},{0,3,1,2},{2,3,1,0}
2190
};
2191
2192
static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2193
 unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs,
2194
353k
 const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2195
353k
  unsigned               frag_satd[12];
2196
353k
  oc_mv                  lbmvs[4];
2197
353k
  oc_mv                  cbmvs[4];
2198
353k
  const unsigned char   *src;
2199
353k
  const unsigned char   *ref;
2200
353k
  int                    ystride;
2201
353k
  const ptrdiff_t       *frag_buf_offs;
2202
353k
  oc_mv                 *frag_mvs;
2203
353k
  const oc_mb_map_plane *mb_map;
2204
353k
  const unsigned char   *map_idxs;
2205
353k
  int                    map_nidxs;
2206
353k
  int                    nqis;
2207
353k
  int                    mapii;
2208
353k
  int                    mapi;
2209
353k
  int                    mv_offs[2];
2210
353k
  int                    pli;
2211
353k
  int                    bi;
2212
353k
  ptrdiff_t              fragi;
2213
353k
  ptrdiff_t              frag_offs;
2214
353k
  int                    bits0;
2215
353k
  int                    bits1;
2216
353k
  unsigned               satd;
2217
353k
  int                    dc;
2218
353k
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
2219
353k
  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
2220
353k
  ystride=_enc->state.ref_ystride[0];
2221
353k
  frag_buf_offs=_enc->state.frag_buf_offs;
2222
353k
  frag_mvs=_enc->state.frag_mvs;
2223
353k
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2224
353k
  _modec->rate=_modec->ssd=0;
2225
1.76M
  for(bi=0;bi<4;bi++){
2226
1.41M
    fragi=mb_map[0][bi];
2227
    /*Save the block MVs as the current ones while we're here; we'll replace
2228
       them if we don't ultimately choose 4MV mode.*/
2229
1.41M
    frag_mvs[fragi]=_mv[bi];
2230
1.41M
    frag_offs=frag_buf_offs[fragi];
2231
1.41M
    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv[bi])>1){
2232
168k
      satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2233
168k
       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2234
168k
    }
2235
1.24M
    else{
2236
1.24M
      satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2237
1.24M
       ref+frag_offs+mv_offs[0],ystride);
2238
1.24M
    }
2239
1.41M
    frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+abs(dc);
2240
1.41M
  }
2241
353k
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
2242
353k
   _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1);
2243
  /*Figure out which blocks are being skipped and give them (0,0) MVs.*/
2244
353k
  bits0=0;
2245
353k
  bits1=0;
2246
353k
  nqis=_enc->state.nqis;
2247
1.76M
  for(bi=0;bi<4;bi++){
2248
1.41M
    if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis)lbmvs[bi]=0;
2249
1.17M
    else{
2250
1.17M
      lbmvs[bi]=_mv[bi];
2251
1.17M
      bits0+=OC_MV_BITS[0][OC_MV_X(_mv[bi])+31]
2252
1.17M
       +OC_MV_BITS[0][OC_MV_Y(_mv[bi])+31];
2253
1.17M
      bits1+=12;
2254
1.17M
    }
2255
1.41M
  }
2256
353k
  (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,lbmvs);
2257
353k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2258
353k
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2259
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
2260
353k
  ystride=_enc->state.ref_ystride[1];
2261
1.18M
  for(mapii=4;mapii<map_nidxs;mapii++){
2262
833k
    mapi=map_idxs[mapii];
2263
833k
    pli=mapi>>2;
2264
833k
    bi=mapi&3;
2265
833k
    fragi=mb_map[pli][bi];
2266
833k
    frag_offs=frag_buf_offs[fragi];
2267
    /*TODO: We could save half these calls by re-using the results for the Cb
2268
       and Cr planes; is it worth it?*/
2269
833k
    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,cbmvs[bi])>1){
2270
526k
      satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2271
526k
       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2272
526k
    }
2273
307k
    else{
2274
307k
      satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2275
307k
       ref+frag_offs+mv_offs[0],ystride);
2276
307k
    }
2277
833k
    frag_satd[mapii]=satd+abs(dc);
2278
833k
  }
2279
353k
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2280
353k
   frag_satd,_skip_ssd,_rd_scale[4],1);
2281
353k
  _modec->overhead=
2282
353k
   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR)
2283
353k
   +OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1)
2284
353k
   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
2285
353k
  oc_mode_set_cost(_modec,_enc->lambda);
2286
353k
}
2287
2288
29.1k
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
2289
29.1k
  oc_set_chroma_mvs_func  set_chroma_mvs;
2290
29.1k
  oc_qii_state            intra_luma_qs;
2291
29.1k
  oc_mv                   last_mv;
2292
29.1k
  oc_mv                   prior_mv;
2293
29.1k
  ogg_int64_t             interbits;
2294
29.1k
  ogg_int64_t             intrabits;
2295
29.1k
  ogg_int64_t             activity_sum;
2296
29.1k
  ogg_int64_t             luma_sum;
2297
29.1k
  unsigned                activity_avg;
2298
29.1k
  unsigned                luma_avg;
2299
29.1k
  const ogg_uint16_t     *chroma_rd_scale;
2300
29.1k
  ogg_uint16_t           *mcu_rd_scale;
2301
29.1k
  ogg_uint16_t           *mcu_rd_iscale;
2302
29.1k
  const unsigned char    *map_idxs;
2303
29.1k
  int                     nmap_idxs;
2304
29.1k
  unsigned               *coded_mbis;
2305
29.1k
  unsigned               *uncoded_mbis;
2306
29.1k
  size_t                  ncoded_mbis;
2307
29.1k
  size_t                  nuncoded_mbis;
2308
29.1k
  oc_sb_flags            *sb_flags;
2309
29.1k
  signed char            *mb_modes;
2310
29.1k
  const oc_sb_map        *sb_maps;
2311
29.1k
  const oc_mb_map        *mb_maps;
2312
29.1k
  oc_mb_enc_info         *embs;
2313
29.1k
  oc_fragment            *frags;
2314
29.1k
  oc_mv                  *frag_mvs;
2315
29.1k
  unsigned                stripe_sby;
2316
29.1k
  unsigned                mcu_nvsbs;
2317
29.1k
  int                     notstart;
2318
29.1k
  int                     notdone;
2319
29.1k
  unsigned                sbi;
2320
29.1k
  unsigned                sbi_end;
2321
29.1k
  int                     refi;
2322
29.1k
  int                     pli;
2323
29.1k
  int                     sp_level;
2324
29.1k
  sp_level=_enc->sp_level;
2325
29.1k
  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
2326
29.1k
  _enc->state.frame_type=OC_INTER_FRAME;
2327
29.1k
  oc_mode_scheme_chooser_reset(&_enc->chooser);
2328
29.1k
  oc_enc_tokenize_start(_enc);
2329
29.1k
  oc_enc_pipeline_init(_enc,&_enc->pipe);
2330
29.1k
  oc_enc_mode_rd_init(_enc);
2331
29.1k
  if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
2332
29.1k
  _enc->mv_bits[0]=_enc->mv_bits[1]=0;
2333
29.1k
  interbits=intrabits=0;
2334
29.1k
  activity_sum=luma_sum=0;
2335
29.1k
  activity_avg=_enc->activity_avg;
2336
29.1k
  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
2337
29.1k
  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTER_FRAME][_enc->state.qis[0]];
2338
29.1k
  mcu_rd_scale=_enc->mcu_rd_scale;
2339
29.1k
  mcu_rd_iscale=_enc->mcu_rd_iscale;
2340
29.1k
  last_mv=prior_mv=0;
2341
  /*Choose MVs and MB modes and quantize and code luma.
2342
    Must be done in Hilbert order.*/
2343
29.1k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2344
29.1k
  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2345
29.1k
  coded_mbis=_enc->coded_mbis;
2346
29.1k
  uncoded_mbis=coded_mbis+_enc->state.nmbs;
2347
29.1k
  ncoded_mbis=0;
2348
29.1k
  nuncoded_mbis=0;
2349
29.1k
  _enc->state.ncoded_fragis[0]=0;
2350
29.1k
  _enc->state.ncoded_fragis[1]=0;
2351
29.1k
  _enc->state.ncoded_fragis[2]=0;
2352
29.1k
  sb_flags=_enc->state.sb_flags;
2353
29.1k
  mb_modes=_enc->state.mb_modes;
2354
29.1k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
2355
29.1k
  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
2356
29.1k
  embs=_enc->mb_info;
2357
29.1k
  frags=_enc->state.frags;
2358
29.1k
  frag_mvs=_enc->state.frag_mvs;
2359
29.1k
  notstart=0;
2360
29.1k
  notdone=1;
2361
29.1k
  mcu_nvsbs=_enc->mcu_nvsbs;
2362
60.3k
  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
2363
31.2k
    ptrdiff_t cfroffset;
2364
31.2k
    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
2365
31.2k
    sbi_end=_enc->pipe.sbi_end[0];
2366
31.2k
    cfroffset=_enc->pipe.froffset[1];
2367
129k
    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
2368
98.3k
      int quadi;
2369
      /*Mode addressing is through Y plane, always 4 MB per SB.*/
2370
491k
      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
2371
305k
        oc_mode_choice modes[8];
2372
305k
        unsigned       activity[4];
2373
305k
        unsigned       rd_scale[5];
2374
305k
        unsigned       rd_iscale[5];
2375
305k
        unsigned       skip_ssd[12];
2376
305k
        unsigned       intra_satd[12];
2377
305k
        unsigned       luma;
2378
305k
        int            mb_mv_bits_0;
2379
305k
        int            mb_gmv_bits_0;
2380
305k
        int            inter_mv_pref;
2381
305k
        int            mb_mode;
2382
305k
        int            refi;
2383
305k
        int            mv;
2384
305k
        unsigned       mbi;
2385
305k
        int            mapii;
2386
305k
        int            mapi;
2387
305k
        int            bi;
2388
305k
        ptrdiff_t      fragi;
2389
305k
        mbi=sbi<<2|quadi;
2390
305k
        luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
2391
        /*Activity masking.*/
2392
305k
        if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
2393
305k
          oc_mb_activity(_enc,mbi,activity);
2394
305k
        }
2395
0
        else oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
2396
305k
        luma_sum+=luma;
2397
305k
        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
2398
305k
         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
2399
        /*Motion estimation:
2400
          We always do a basic 1MV search for all macroblocks, coded or not,
2401
           keyframe or not.*/
2402
305k
        if(!_recode&&sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
2403
305k
        mv=0;
2404
        /*Find the block choice with the lowest estimated coding cost.
2405
          If a Cb or Cr block is coded but no Y' block from a macro block then
2406
           the mode MUST be OC_MODE_INTER_NOMV.
2407
          This is the default state to which the mode data structure is
2408
           initialised in encoder and decoder at the start of each frame.*/
2409
        /*Block coding cost is estimated from correlated SATD metrics.*/
2410
        /*At this point, all blocks that are in frame are still marked coded.*/
2411
305k
        if(!_recode){
2412
226k
          embs[mbi].unref_mv[OC_FRAME_GOLD]=
2413
226k
           embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
2414
226k
          embs[mbi].unref_mv[OC_FRAME_PREV]=
2415
226k
           embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2416
226k
          embs[mbi].refined=0;
2417
226k
        }
2418
        /*Estimate the cost of coding this MB in a keyframe.*/
2419
305k
        if(_allow_keyframe){
2420
305k
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2421
305k
           _enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
2422
305k
          intrabits+=modes[OC_MODE_INTRA].rate;
2423
1.52M
          for(bi=0;bi<4;bi++){
2424
1.22M
            oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
2425
1.22M
             modes[OC_MODE_INTRA].qii[bi]);
2426
1.22M
          }
2427
305k
        }
2428
        /*Estimate the cost in a delta frame for various modes.*/
2429
305k
        oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd);
2430
305k
        if(sp_level<OC_SP_LEVEL_NOMC){
2431
305k
          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
2432
305k
           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2433
305k
           skip_ssd,rd_scale);
2434
305k
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2435
305k
           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
2436
305k
          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
2437
305k
           OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
2438
305k
           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2439
305k
          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
2440
305k
           OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2441
305k
           skip_ssd,rd_scale);
2442
305k
          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
2443
305k
           OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2444
305k
           skip_ssd,rd_scale);
2445
305k
          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
2446
305k
           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2447
305k
           skip_ssd,rd_scale);
2448
305k
          mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
2449
305k
           OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
2450
305k
           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2451
          /*The explicit MV modes (2,6,7) have not yet gone through halfpel
2452
             refinement.
2453
            We choose the explicit MV mode that's already furthest ahead on
2454
             R-D cost and refine only that one.
2455
            We have to be careful to remember which ones we've refined so that
2456
             we don't refine it again if we re-encode this frame.*/
2457
305k
          inter_mv_pref=_enc->lambda*3;
2458
305k
          if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
2459
305k
            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
2460
305k
             embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2461
305k
             skip_ssd,rd_scale);
2462
305k
          }
2463
0
          else{
2464
0
            modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX;
2465
0
          }
2466
305k
          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
2467
64.6k
           modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
2468
48.6k
            if(!(embs[mbi].refined&0x80)){
2469
35.9k
              oc_mcenc_refine4mv(_enc,mbi);
2470
35.9k
              embs[mbi].refined|=0x80;
2471
35.9k
            }
2472
48.6k
            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
2473
48.6k
             embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2474
48.6k
             skip_ssd,rd_scale);
2475
48.6k
          }
2476
256k
          else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
2477
256k
           modes[OC_MODE_INTER_MV].cost){
2478
63.8k
            if(!(embs[mbi].refined&0x40)){
2479
53.5k
              oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
2480
53.5k
              embs[mbi].refined|=0x40;
2481
53.5k
            }
2482
63.8k
            mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
2483
63.8k
             OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
2484
63.8k
             _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2485
63.8k
          }
2486
305k
          if(!(embs[mbi].refined&0x04)){
2487
226k
            oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
2488
226k
            embs[mbi].refined|=0x04;
2489
226k
          }
2490
305k
          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
2491
305k
           OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
2492
305k
           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2493
          /*Finally, pick the mode with the cheapest estimated R-D cost.*/
2494
305k
          mb_mode=OC_MODE_INTER_NOMV;
2495
305k
          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
2496
248k
            mb_mode=OC_MODE_INTRA;
2497
248k
          }
2498
305k
          if(modes[OC_MODE_INTER_MV_LAST].cost<modes[mb_mode].cost){
2499
28.7k
            mb_mode=OC_MODE_INTER_MV_LAST;
2500
28.7k
          }
2501
305k
          if(modes[OC_MODE_INTER_MV_LAST2].cost<modes[mb_mode].cost){
2502
5.84k
            mb_mode=OC_MODE_INTER_MV_LAST2;
2503
5.84k
          }
2504
305k
          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
2505
6.97k
            mb_mode=OC_MODE_GOLDEN_NOMV;
2506
6.97k
          }
2507
305k
          if(modes[OC_MODE_GOLDEN_MV].cost<modes[mb_mode].cost){
2508
20.7k
            mb_mode=OC_MODE_GOLDEN_MV;
2509
20.7k
          }
2510
305k
          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[mb_mode].cost){
2511
12.9k
            mb_mode=OC_MODE_INTER_MV_FOUR;
2512
12.9k
          }
2513
          /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
2514
305k
          if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
2515
24.0k
            inter_mv_pref=0;
2516
24.0k
          }
2517
305k
          if(modes[OC_MODE_INTER_MV].cost<modes[mb_mode].cost+inter_mv_pref){
2518
22.3k
            mb_mode=OC_MODE_INTER_MV;
2519
22.3k
          }
2520
305k
        }
2521
0
        else{
2522
0
          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
2523
0
           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2524
0
           skip_ssd,rd_scale);
2525
0
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2526
0
           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
2527
0
          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
2528
0
           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2529
0
           skip_ssd,rd_scale);
2530
0
          mb_mode=OC_MODE_INTER_NOMV;
2531
0
          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
2532
0
            mb_mode=OC_MODE_INTRA;
2533
0
          }
2534
0
          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
2535
0
            mb_mode=OC_MODE_GOLDEN_NOMV;
2536
0
          }
2537
0
          mb_mv_bits_0=mb_gmv_bits_0=0;
2538
0
        }
2539
305k
        mb_modes[mbi]=mb_mode;
2540
        /*Propagate the MVs to the luma blocks.*/
2541
305k
        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
2542
298k
          switch(mb_mode){
2543
22.3k
            case OC_MODE_INTER_MV:{
2544
22.3k
              mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2545
22.3k
            }break;
2546
18.1k
            case OC_MODE_INTER_MV_LAST:mv=last_mv;break;
2547
3.21k
            case OC_MODE_INTER_MV_LAST2:mv=prior_mv;break;
2548
11.7k
            case OC_MODE_GOLDEN_MV:{
2549
11.7k
              mv=embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
2550
11.7k
            }break;
2551
298k
          }
2552
1.49M
          for(bi=0;bi<4;bi++){
2553
1.19M
            fragi=mb_maps[mbi][0][bi];
2554
1.19M
            frag_mvs[fragi]=mv;
2555
1.19M
          }
2556
298k
        }
2557
1.52M
        for(bi=0;bi<4;bi++){
2558
1.22M
          fragi=sb_maps[mbi>>2][mbi&3][bi];
2559
1.22M
          frags[fragi].qii=modes[mb_mode].qii[bi];
2560
1.22M
        }
2561
305k
        if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi,
2562
305k
         modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){
2563
249k
          int orig_mb_mode;
2564
249k
          orig_mb_mode=mb_mode;
2565
249k
          mb_mode=mb_modes[mbi];
2566
249k
          refi=OC_FRAME_FOR_MODE(mb_mode);
2567
249k
          switch(mb_mode){
2568
12.1k
            case OC_MODE_INTER_MV:{
2569
12.1k
              prior_mv=last_mv;
2570
              /*If we're backing out from 4MV, find the MV we're actually
2571
                 using.*/
2572
12.1k
              if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
2573
551
                for(bi=0;;bi++){
2574
551
                  fragi=mb_maps[mbi][0][bi];
2575
551
                  if(frags[fragi].coded){
2576
163
                    mv=last_mv=frag_mvs[fragi];
2577
163
                    break;
2578
163
                  }
2579
551
                }
2580
163
                mb_mv_bits_0=OC_MV_BITS[0][OC_MV_X(mv)+31]
2581
163
                 +OC_MV_BITS[0][OC_MV_Y(mv)+31];
2582
163
              }
2583
              /*Otherwise we used the original analysis MV.*/
2584
11.9k
              else last_mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2585
12.1k
              _enc->mv_bits[0]+=mb_mv_bits_0;
2586
12.1k
              _enc->mv_bits[1]+=12;
2587
12.1k
            }break;
2588
2.77k
            case OC_MODE_INTER_MV_LAST2:{
2589
2.77k
              oc_mv tmp_mv;
2590
2.77k
              tmp_mv=prior_mv;
2591
2.77k
              prior_mv=last_mv;
2592
2.77k
              last_mv=tmp_mv;
2593
2.77k
            }break;
2594
7.05k
            case OC_MODE_GOLDEN_MV:{
2595
7.05k
              _enc->mv_bits[0]+=mb_gmv_bits_0;
2596
7.05k
              _enc->mv_bits[1]+=12;
2597
7.05k
            }break;
2598
5.57k
            case OC_MODE_INTER_MV_FOUR:{
2599
5.57k
              oc_mv lbmvs[4];
2600
5.57k
              oc_mv cbmvs[4];
2601
5.57k
              prior_mv=last_mv;
2602
27.8k
              for(bi=0;bi<4;bi++){
2603
22.3k
                fragi=mb_maps[mbi][0][bi];
2604
22.3k
                if(frags[fragi].coded){
2605
21.1k
                  lbmvs[bi]=last_mv=frag_mvs[fragi];
2606
21.1k
                  _enc->mv_bits[0]+=OC_MV_BITS[0][OC_MV_X(last_mv)+31]
2607
21.1k
                   +OC_MV_BITS[0][OC_MV_Y(last_mv)+31];
2608
21.1k
                  _enc->mv_bits[1]+=12;
2609
21.1k
                }
2610
                /*Replace the block MVs for not-coded blocks with (0,0).*/
2611
1.14k
                else lbmvs[bi]=0;
2612
22.3k
              }
2613
5.57k
              (*set_chroma_mvs)(cbmvs,lbmvs);
2614
19.2k
              for(mapii=4;mapii<nmap_idxs;mapii++){
2615
13.6k
                mapi=map_idxs[mapii];
2616
13.6k
                pli=mapi>>2;
2617
13.6k
                bi=mapi&3;
2618
13.6k
                fragi=mb_maps[mbi][pli][bi];
2619
13.6k
                frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
2620
13.6k
                frags[fragi].refi=refi;
2621
13.6k
                frags[fragi].mb_mode=mb_mode;
2622
13.6k
                frag_mvs[fragi]=cbmvs[bi];
2623
13.6k
              }
2624
5.57k
            }break;
2625
249k
          }
2626
249k
          coded_mbis[ncoded_mbis++]=mbi;
2627
249k
          oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
2628
249k
          interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
2629
249k
        }
2630
55.9k
        else{
2631
55.9k
          *(uncoded_mbis-++nuncoded_mbis)=mbi;
2632
55.9k
          mb_mode=OC_MODE_INTER_NOMV;
2633
55.9k
          refi=OC_FRAME_PREV;
2634
55.9k
          mv=0;
2635
55.9k
        }
2636
        /*Propagate final MB mode and MVs to the chroma blocks.
2637
          This has already been done for 4MV mode, since it requires individual
2638
           block motion vectors.*/
2639
305k
        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
2640
997k
          for(mapii=4;mapii<nmap_idxs;mapii++){
2641
697k
            mapi=map_idxs[mapii];
2642
697k
            pli=mapi>>2;
2643
697k
            bi=mapi&3;
2644
697k
            fragi=mb_maps[mbi][pli][bi];
2645
            /*If we switched from 4MV mode to INTER_MV mode, then the qii
2646
               values won't have been chosen with the right MV, but it's
2647
               probably not worth re-estimating them.*/
2648
697k
            frags[fragi].qii=modes[mb_mode].qii[mapii];
2649
697k
            frags[fragi].refi=refi;
2650
697k
            frags[fragi].mb_mode=mb_mode;
2651
697k
            frag_mvs[fragi]=mv;
2652
697k
          }
2653
299k
        }
2654
        /*Save masking scale factors for chroma blocks.*/
2655
661k
        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
2656
355k
          mapi=map_idxs[mapii];
2657
355k
          bi=mapi&3;
2658
355k
          fragi=mb_maps[mbi][1][bi];
2659
355k
          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
2660
355k
          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
2661
355k
        }
2662
305k
      }
2663
98.3k
      oc_fr_state_flush_sb(_enc->pipe.fr+0);
2664
98.3k
      sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full;
2665
98.3k
      sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial;
2666
98.3k
    }
2667
31.2k
    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
2668
    /*Code chroma planes.*/
2669
93.7k
    for(pli=1;pli<3;pli++){
2670
62.5k
      oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe,
2671
62.5k
       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
2672
62.5k
      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
2673
62.5k
    }
2674
31.2k
    notstart=1;
2675
31.2k
  }
2676
  /*Update the average block activity and MB luma score for the frame.
2677
    We could use a Bessel follower here, but fast reaction is probably almost
2678
     always best.*/
2679
29.1k
  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
2680
29.1k
   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
2681
29.1k
   _enc->state.fplanes[0].nfrags));
2682
29.1k
  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
2683
  /*Finish filling in the reference frame borders.*/
2684
29.1k
  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
2685
116k
  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
2686
  /*Finish adding flagging overhead costs to inter bit counts to determine if
2687
     we should have coded a key frame instead.*/
2688
29.1k
  if(_allow_keyframe){
2689
    /*Technically the chroma plane counts are over-estimations, because they
2690
       don't account for continuing runs from the luma planes, but the
2691
       inaccuracy is small.
2692
      We don't need to add the luma plane coding flag costs, because they are
2693
       already included in the MB rate estimates.*/
2694
87.3k
    for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE;
2695
29.1k
    if(interbits>intrabits)return 1;
2696
29.1k
  }
2697
17.2k
  _enc->ncoded_mbis=ncoded_mbis;
2698
  /*Compact the coded fragment list.*/
2699
17.2k
  {
2700
17.2k
    ptrdiff_t ncoded_fragis;
2701
17.2k
    ncoded_fragis=_enc->state.ncoded_fragis[0];
2702
51.8k
    for(pli=1;pli<3;pli++){
2703
34.5k
      memmove(_enc->state.coded_fragis+ncoded_fragis,
2704
34.5k
       _enc->state.coded_fragis+_enc->state.fplanes[pli].froffset,
2705
34.5k
       _enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis));
2706
34.5k
      ncoded_fragis+=_enc->state.ncoded_fragis[pli];
2707
34.5k
    }
2708
17.2k
    _enc->state.ntotal_coded_fragis=ncoded_fragis;
2709
17.2k
  }
2710
17.2k
  return 0;
2711
29.1k
}