Coverage Report

Created: 2025-08-28 07:12

/src/theora/lib/analyze.c
Line
Count
Source (jump to first uncovered line)
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009,2025           *
9
 * by the Xiph.Org Foundation https://www.xiph.org/                 *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function: mode selection code
14
15
 ********************************************************************/
16
#include <limits.h>
17
#include <string.h>
18
#include "encint.h"
19
#include "modedec.h"
20
#if defined(OC_COLLECT_METRICS)
21
# include "collect.c"
22
#endif
23
24
25
26
typedef struct oc_rd_metric          oc_rd_metric;
27
typedef struct oc_mode_choice        oc_mode_choice;
28
29
30
31
/*There are 8 possible schemes used to encode macro block modes.
32
  Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
33
  The same set of Huffman codes is used for each of these 7 schemes, but the
34
   mode assigned to each codeword varies.
35
  Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream,
36
   while schemes 1-6 have a fixed mapping.
37
  Scheme 7 just encodes each mode directly in 3 bits.*/
38
39
/*The mode orderings for the various mode coding schemes.
40
  Scheme 0 uses a custom alphabet, which is not stored in this table.
41
  This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
42
   decoder.*/
43
static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
44
  /*Last MV dominates.*/
45
  /*L P M N I G GM 4*/
46
  {3,4,2,0,1,5,6,7},
47
  /*L P N M I G GM 4*/
48
  {2,4,3,0,1,5,6,7},
49
  /*L M P N I G GM 4*/
50
  {3,4,1,0,2,5,6,7},
51
  /*L M N P I G GM 4*/
52
  {2,4,1,0,3,5,6,7},
53
  /*No MV dominates.*/
54
  /*N L P M I G GM 4*/
55
  {0,4,3,1,2,5,6,7},
56
  /*N G L P M I GM 4*/
57
  {0,5,4,2,3,1,6,7},
58
  /*Default ordering.*/
59
  /*N I M L P G GM 4*/
60
  {0,1,2,3,4,5,6,7}
61
};
62
63
64
65
/*Initialize the mode scheme chooser.
66
  This need only be called once per encoder.*/
67
3.41k
void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
68
3.41k
  int si;
69
3.41k
  _chooser->mode_ranks[0]=_chooser->scheme0_ranks;
70
27.3k
  for(si=1;si<8;si++)_chooser->mode_ranks[si]=OC_MODE_RANKS[si-1];
71
3.41k
}
72
73
/*Reset the mode scheme chooser.
74
  This needs to be called once for each frame, including the first.*/
75
30.7k
static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
76
30.7k
  int si;
77
30.7k
  memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
78
  /*Scheme 0 starts with 24 bits to store the mode list in.*/
79
30.7k
  _chooser->scheme_bits[0]=24;
80
30.7k
  memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
81
276k
  for(si=0;si<8;si++){
82
    /*Scheme 7 should always start first, and scheme 0 should always start
83
       last.*/
84
246k
    _chooser->scheme_list[si]=7-si;
85
246k
    _chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
86
246k
  }
87
30.7k
}
88
89
/*Return the cost of coding _mb_mode in the specified scheme.*/
90
static int oc_mode_scheme_chooser_scheme_mb_cost(
91
10.6M
 const oc_mode_scheme_chooser *_chooser,int _scheme,int _mb_mode){
92
10.6M
  int codebook;
93
10.6M
  int ri;
94
10.6M
  codebook=_scheme+1>>3;
95
  /*For any scheme except 0, we can just use the bit cost of the mode's rank
96
     in that scheme.*/
97
10.6M
  ri=_chooser->mode_ranks[_scheme][_mb_mode];
98
10.6M
  if(_scheme==0){
99
1.66M
    int mc;
100
    /*For scheme 0, incrementing the mode count could potentially change the
101
       mode's rank.
102
      Find the index where the mode would be moved to in the optimal list,
103
       and use its bit cost instead of the one for the mode's current
104
       position in the list.*/
105
    /*We don't actually reorder the list; this is for computing opportunity
106
       cost, not an update.*/
107
1.66M
    mc=_chooser->mode_counts[_mb_mode];
108
4.69M
    while(ri>0&&mc>=_chooser->mode_counts[_chooser->scheme0_list[ri-1]])ri--;
109
1.66M
  }
110
10.6M
  return OC_MODE_BITS[codebook][ri];
111
10.6M
}
112
113
/*This is the real purpose of this data structure: not actually selecting a
114
   mode scheme, but estimating the cost of coding a given mode given all the
115
   modes selected so far.
116
  This is done via opportunity cost: the cost is defined as the number of bits
117
   required to encode all the modes selected so far including the current one
118
   using the best possible scheme, minus the number of bits required to encode
119
   all the modes selected so far not including the current one using the best
120
   possible scheme.
121
  The computational expense of doing this probably makes it overkill.
122
  Just be happy we take a greedy approach instead of trying to solve the
123
   global mode-selection problem (which is NP-hard).
124
  _mb_mode: The mode to determine the cost of.
125
  Return: The number of bits required to code this mode.*/
126
static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
127
3.27M
 int _mb_mode){
128
3.27M
  int scheme0;
129
3.27M
  int scheme1;
130
3.27M
  int best_bits;
131
3.27M
  int mode_bits;
132
3.27M
  int si;
133
3.27M
  int scheme0_bits;
134
3.27M
  int scheme1_bits;
135
3.27M
  scheme0=_chooser->scheme_list[0];
136
3.27M
  scheme1=_chooser->scheme_list[1];
137
3.27M
  scheme0_bits=_chooser->scheme_bits[scheme0];
138
3.27M
  scheme1_bits=_chooser->scheme_bits[scheme1];
139
3.27M
  mode_bits=oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme0,_mb_mode);
140
  /*Typical case: If the difference between the best scheme and the next best
141
     is greater than 6 bits, then adding just one mode cannot change which
142
     scheme we use.*/
143
3.27M
  if(scheme1_bits-scheme0_bits>6)return mode_bits;
144
  /*Otherwise, check to see if adding this mode selects a different scheme as
145
     the best.*/
146
1.46M
  si=1;
147
1.46M
  best_bits=scheme0_bits+mode_bits;
148
7.38M
  do{
149
7.38M
    int cur_bits;
150
7.38M
    cur_bits=scheme1_bits+
151
7.38M
     oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme1,_mb_mode);
152
7.38M
    if(cur_bits<best_bits)best_bits=cur_bits;
153
7.38M
    if(++si>=8)break;
154
7.38M
    scheme1=_chooser->scheme_list[si];
155
7.38M
    scheme1_bits=_chooser->scheme_bits[scheme1];
156
7.38M
  }
157
7.38M
  while(scheme1_bits-scheme0_bits<=6);
158
0
  return best_bits-scheme0_bits;
159
3.27M
}
160
161
/*Incrementally update the mode counts and per-scheme bit counts and re-order
162
   the scheme lists once a mode has been selected.
163
  _mb_mode: The mode that was chosen.*/
164
static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
165
257k
 int _mb_mode){
166
257k
  int ri;
167
257k
  int si;
168
257k
  _chooser->mode_counts[_mb_mode]++;
169
  /*Re-order the scheme0 mode list if necessary.*/
170
339k
  for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){
171
116k
    int pmode;
172
116k
    pmode=_chooser->scheme0_list[ri-1];
173
116k
    if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break;
174
    /*Reorder the mode ranking.*/
175
81.9k
    _chooser->scheme0_ranks[pmode]++;
176
81.9k
    _chooser->scheme0_list[ri]=pmode;
177
81.9k
  }
178
257k
  _chooser->scheme0_ranks[_mb_mode]=ri;
179
257k
  _chooser->scheme0_list[ri]=_mb_mode;
180
  /*Now add the bit cost for the mode to each scheme.*/
181
2.31M
  for(si=0;si<8;si++){
182
2.05M
    _chooser->scheme_bits[si]+=
183
2.05M
     OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]];
184
2.05M
  }
185
  /*Finally, re-order the list of schemes.*/
186
2.05M
  for(si=1;si<8;si++){
187
1.79M
    int sj;
188
1.79M
    int scheme0;
189
1.79M
    int bits0;
190
1.79M
    sj=si;
191
1.79M
    scheme0=_chooser->scheme_list[si];
192
1.79M
    bits0=_chooser->scheme_bits[scheme0];
193
1.98M
    do{
194
1.98M
      int scheme1;
195
1.98M
      scheme1=_chooser->scheme_list[sj-1];
196
1.98M
      if(bits0>=_chooser->scheme_bits[scheme1])break;
197
202k
      _chooser->scheme_list[sj]=scheme1;
198
202k
    }
199
1.79M
    while(--sj>0);
200
0
    _chooser->scheme_list[sj]=scheme0;
201
1.79M
  }
202
257k
}
203
204
205
206
/*The number of bits required to encode a super block run.
207
  _run_count: The desired run count; must be positive and less than 4130.*/
208
165M
static int oc_sb_run_bits(int _run_count){
209
165M
  int i;
210
611M
  for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
211
165M
  return OC_SB_RUN_CODE_NBITS[i];
212
165M
}
213
214
/*The number of bits required to encode a block run.
215
  _run_count: The desired run count; must be positive and less than 30.*/
216
21.1M
static int oc_block_run_bits(int _run_count){
217
21.1M
  return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
218
21.1M
}
219
220
221
222
153k
/*Reset a coded block flag tracker to its empty state.
  Every count and bit total is cleared, and every "last flag" field is set to
   -1.*/
static void oc_fr_state_init(oc_fr_state *_fr){
  /*No bits have been spent so far.*/
  _fr->bits=0;
  _fr->sb_bits=0;
  /*No run is in progress: all run lengths are zero, and the flag values of
     the previous runs are set to -1.*/
  _fr->sb_partial=-1;
  _fr->sb_partial_count=0;
  _fr->sb_full=-1;
  _fr->sb_full_count=0;
  _fr->b_coded_prev=-1;
  _fr->b_coded_count_prev=0;
  _fr->b_coded=-1;
  _fr->b_coded_count=0;
  /*The current super block has no blocks yet.*/
  _fr->b_count=0;
  _fr->sb_prefer_partial=0;
}
236
237
238
static int oc_fr_state_sb_cost(const oc_fr_state *_fr,
239
10.8M
 int _sb_partial,int _sb_full){
240
10.8M
  int bits;
241
10.8M
  int sb_partial_count;
242
10.8M
  int sb_full_count;
243
10.8M
  bits=0;
244
10.8M
  sb_partial_count=_fr->sb_partial_count;
245
  /*Extend the sb_partial run, or start a new one.*/
246
10.8M
  if(_fr->sb_partial==_sb_partial){
247
2.56M
    if(sb_partial_count>=4129){
248
0
      bits++;
249
0
      sb_partial_count=0;
250
0
    }
251
2.56M
    else bits-=oc_sb_run_bits(sb_partial_count);
252
2.56M
  }
253
8.23M
  else sb_partial_count=0;
254
10.8M
  bits+=oc_sb_run_bits(++sb_partial_count);
255
10.8M
  if(!_sb_partial){
256
    /*Extend the sb_full run, or start a new one.*/
257
3.26M
    sb_full_count=_fr->sb_full_count;
258
3.26M
    if(_fr->sb_full==_sb_full){
259
1.47M
      if(sb_full_count>=4129){
260
0
        bits++;
261
0
        sb_full_count=0;
262
0
      }
263
1.47M
      else bits-=oc_sb_run_bits(sb_full_count);
264
1.47M
    }
265
1.78M
    else sb_full_count=0;
266
3.26M
    bits+=oc_sb_run_bits(++sb_full_count);
267
3.26M
  }
268
10.8M
  return bits;
269
10.8M
}
270
271
/*Commit a super block with the given flags to the tracker and reset the
   per-super-block fields for the next one.
  The bit totals in _fr->bits are not touched here.
  _sb_partial: The partially-coded flag of the super block being committed.
  _sb_full:    The fully-coded flag of the super block being committed; it is
                only recorded when _sb_partial is zero.*/
static void oc_fr_state_advance_sb(oc_fr_state *_fr,
 int _sb_partial,int _sb_full){
  int run;
  /*Extend the sb_partial run, restarting it when the flag changes or the run
     has already reached 4129.*/
  run=_fr->sb_partial_count;
  if(_fr->sb_partial==_sb_partial&&run<4129)run++;
  else run=1;
  _fr->sb_partial_count=run;
  _fr->sb_partial=_sb_partial;
  if(_sb_partial){
    /*Commit back the partial block state.*/
    _fr->b_coded_prev=_fr->b_coded;
    _fr->b_coded_count_prev=_fr->b_coded_count;
  }
  else{
    /*Extend the sb_full run under the same rules.*/
    run=_fr->sb_full_count;
    if(_fr->sb_full==_sb_full&&run<4129)run++;
    else run=1;
    _fr->sb_full_count=run;
    _fr->sb_full=_sb_full;
    /*Roll back the partial block state.*/
    _fr->b_coded=_fr->b_coded_prev;
    _fr->b_coded_count=_fr->b_coded_count_prev;
  }
  /*Start the next super block with a clean slate.*/
  _fr->b_count=0;
  _fr->sb_prefer_partial=0;
  _fr->sb_bits=0;
}
299
300
/*Commit the state of the current super block and advance to the next.*/
301
213k
static void oc_fr_state_flush_sb(oc_fr_state *_fr){
302
213k
  int sb_partial;
303
213k
  int sb_full;
304
213k
  int b_coded_count;
305
213k
  int b_count;
306
213k
  b_count=_fr->b_count;
307
213k
  b_coded_count=_fr->b_coded_count;
308
213k
  sb_full=_fr->b_coded;
309
213k
  sb_partial=b_coded_count<b_count;
310
213k
  if(!sb_partial){
311
    /*If the super block is fully coded/uncoded...*/
312
149k
    if(_fr->sb_prefer_partial){
313
      /*So far coding this super block as partial was cheaper anyway.*/
314
2.08k
      if(b_coded_count>15||_fr->b_coded_prev<0){
315
1.37k
        int sb_bits;
316
        /*If the block run is too long, this will limit how far it can be
317
           extended into the next partial super block.
318
          If we need to extend it farther, we don't want to have to roll all
319
           the way back here (since there could be many full SBs between now
320
           and then), so we disallow this.
321
          Similarly, if this is the start of a stripe, we don't know how the
322
           length of the outstanding block run from the previous stripe.*/
323
1.37k
        sb_bits=oc_fr_state_sb_cost(_fr,sb_partial,sb_full);
324
1.37k
        _fr->bits+=sb_bits-_fr->sb_bits;
325
1.37k
        _fr->sb_bits=sb_bits;
326
1.37k
      }
327
717
      else sb_partial=1;
328
2.08k
    }
329
149k
  }
330
213k
  oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
331
213k
}
332
333
25.8M
/*Append one block with the given coded flag to the current super block and
   update the estimated flag cost.
  The tracker keeps, for the super block in progress, the cheaper of coding it
   "partial" (with per-block run lengths) or "full" (with a single flag), for
   as long as all of its blocks share the same flag.
  _b_coded: The coded flag of the new block.*/
static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){
  ptrdiff_t base_bits;
  int       cur_bits;
  int       alt_bits;
  int       run;
  int       nblocks;
  int       partial;
  int       same_flag;
  /*Separate the bits charged to the current super block from the rest.*/
  cur_bits=_fr->sb_bits;
  base_bits=_fr->bits-cur_bits;
  nblocks=_fr->b_count;
  run=_fr->b_coded_count;
  partial=_fr->sb_prefer_partial;
  same_flag=_fr->b_coded==_b_coded;
  if(run<nblocks){
    /*This super block is already partial: just extend the block run, or
       start a new one.*/
    if(same_flag)cur_bits-=oc_block_run_bits(run);
    else run=0;
    run++;
    nblocks++;
    cur_bits+=oc_block_run_bits(run);
  }
  else if(nblocks<=0){
    /*This is the first block in this SB.
      Check to see whether it's cheaper to code it partially or fully.*/
    if(same_flag){
      /*Continuing the outstanding run only costs the difference in length.*/
      alt_bits=oc_block_run_bits(run+1)-oc_block_run_bits(run);
      run++;
    }
    else{
      alt_bits=2;
      run=1;
    }
    nblocks=1;
    alt_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
    cur_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
    partial=alt_bits<cur_bits;
    if(partial)cur_bits=alt_bits;
  }
  else if(same_flag){
    /*This super block is still fully coded/uncoded.*/
    run++;
    nblocks++;
    if(partial){
      if(nblocks<16){
        /*Check to see if it's cheaper to code it fully.*/
        alt_bits=cur_bits+oc_block_run_bits(run);
        if(run>0)alt_bits-=oc_block_run_bits(run-1);
        cur_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
        partial=alt_bits<cur_bits;
        if(partial)cur_bits=alt_bits;
      }
      else{
        /*If we get to the end and this SB is still full, then force it to be
           coded full.
          Otherwise we might not be able to extend the block run far enough
           into the next partial SB.*/
        partial=0;
        cur_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
      }
    }
    /*There's no need to check the converse (whether it's cheaper to code
       this SB partially if we were coding it fully), since the cost to
       code a SB partially can only increase as we add more blocks, whereas
       the cost to code it fully stays constant.*/
  }
  else{
    /*This SB was full, but now must be made partial.*/
    if(!partial){
      /*Price the blocks seen so far as a block run, discounting the part of
         that run that was outstanding before this super block.*/
      cur_bits=oc_block_run_bits(run);
      if(run>nblocks)cur_bits-=oc_block_run_bits(run-nblocks);
      cur_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
    }
    /*The new block starts a run of length one.*/
    cur_bits+=2;
    nblocks++;
    run=1;
    partial=1;
  }
  _fr->bits=base_bits+cur_bits;
  _fr->sb_bits=cur_bits;
  _fr->b_coded=_b_coded;
  _fr->b_coded_count=run;
  _fr->b_count=nblocks;
  _fr->sb_prefer_partial=partial;
}
422
423
9.82M
/*Advance the tracker by one block whose coded flag is 0 (not coded).*/
static void oc_fr_skip_block(oc_fr_state *_fr){
  int b_coded;
  b_coded=0;
  oc_fr_state_advance_block(_fr,b_coded);
}
426
427
16.0M
/*Advance the tracker by one block whose coded flag is 1 (coded).*/
static void oc_fr_code_block(oc_fr_state *_fr){
  int b_coded;
  b_coded=1;
  oc_fr_state_advance_block(_fr,b_coded);
}
430
431
1.54M
static int oc_fr_cost1(const oc_fr_state *_fr){
432
1.54M
  oc_fr_state tmp;
433
1.54M
  ptrdiff_t   bits;
434
1.54M
  *&tmp=*_fr;
435
1.54M
  oc_fr_skip_block(&tmp);
436
1.54M
  bits=tmp.bits;
437
1.54M
  *&tmp=*_fr;
438
1.54M
  oc_fr_code_block(&tmp);
439
1.54M
  return (int)(tmp.bits-bits);
440
1.54M
}
441
442
263k
static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
443
263k
  oc_fr_state tmp;
444
263k
  *&tmp=*_pre;
445
263k
  oc_fr_skip_block(&tmp);
446
263k
  oc_fr_skip_block(&tmp);
447
263k
  oc_fr_skip_block(&tmp);
448
263k
  oc_fr_skip_block(&tmp);
449
263k
  return (int)(_post->bits-tmp.bits);
450
263k
}
451
452
453
454
183k
/*Reset a quantizer index flag tracker to its empty state: no bits spent, both
   run lengths zero, and both "last flag" fields set to -1.*/
static void oc_qii_state_init(oc_qii_state *_qs){
  _qs->bits=0;
  /*Neither flag run has started.*/
  _qs->qi01=-1;
  _qs->qi01_count=0;
  _qs->qi12=-1;
  _qs->qi12_count=0;
}
461
462
463
/*Compute the tracker state that results from appending one block with the
   given quantizer index.
  Two flag runs are tracked: the first flag is (_qii+1)>>1, and the second
   flag is _qii>>1, which is only appended when _qii is non-zero.
  (For _qii in 0...2 that makes the first flag "_qii is non-zero" and the
   second "_qii is 2".)
  _qd:  The state to write the result to.
        Everything needed from _qs is read before _qd is written.
  _qs:  The state to start from.
  _qii: The quantizer index of the new block.*/
static void oc_qii_state_advance(oc_qii_state *_qd,
 const oc_qii_state *_qs,int _qii){
  ptrdiff_t total;
  int       flag01;
  int       run01;
  int       flag12;
  int       run12;
  total=_qs->bits;
  /*Extend the first flag run, or start a new one.*/
  flag01=(_qii+1)>>1;
  run01=_qs->qi01_count;
  if(flag01!=_qs->qi01)run01=0;
  else if(run01<4129)total-=oc_sb_run_bits(run01);
  else{
    /*A run that has already reached 4129 is not extended: charge one more
       bit and start over.*/
    total++;
    run01=0;
  }
  run01++;
  total+=oc_sb_run_bits(run01);
  /*The second flag run only changes when _qii is non-zero.*/
  flag12=_qs->qi12;
  run12=_qs->qi12_count;
  if(_qii){
    int next12;
    next12=_qii>>1;
    if(next12!=flag12)run12=0;
    else if(run12<4129)total-=oc_sb_run_bits(run12);
    else{
      total++;
      run12=0;
    }
    run12++;
    total+=oc_sb_run_bits(run12);
    flag12=next12;
  }
  _qd->bits=total;
  _qd->qi01=flag01;
  _qd->qi01_count=run01;
  _qd->qi12=flag12;
  _qd->qi12_count=run12;
}
504
505
506
507
51.0k
/*Initialize the analysis pipeline state.
  This resets the per-plane flag trackers, fragment lists and tokenization
   state, sets up the skip SSD storage pointers and the condensed quantizer
   tables, prepares the loop filter, and clears the DCT scratch space.*/
static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
  ptrdiff_t *fragis;
  ptrdiff_t  luma_nfrags;
  unsigned   mcu_nvsbs;
  int        chroma_shift;
  int        lf_limit;
  int        base_qi;
  int        nqis;
  int        pli;
  int        qii;
  int        qti;
  fragis=_enc->state.coded_fragis;
  for(pli=0;pli<3;pli++){
    /*Initialize the per-plane coded block flag trackers.
      These are used for bit-estimation purposes only; the real flag bits span
       all three planes, so we can't compute them in parallel.*/
    oc_fr_state_init(_pipe->fr+pli);
    oc_qii_state_init(_pipe->qs+pli);
    /*Set up per-plane pointers to the coded and uncoded fragments lists.
      Unlike the decoder, each plane's coded and uncoded fragment list is kept
       separate during the analysis stage; we only make the coded list for all
       three planes contiguous right before the final packet is output
       (destroying the uncoded lists, which are no longer needed).
      The coded list starts at the beginning of the plane's share of the
       array, and the uncoded list pointer starts at its end.*/
    _pipe->coded_fragis[pli]=fragis;
    fragis+=_enc->state.fplanes[pli].nfrags;
    _pipe->uncoded_fragis[pli]=fragis;
  }
  memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
  memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
  /*Set up the per-plane skip SSD storage pointers.
    Plane 0 gets 16 entries for every super block in an MCU, and plane 1 gets
     that amount scaled down by one bit for each decimated chroma direction.*/
  mcu_nvsbs=_enc->mcu_nvsbs;
  luma_nfrags=mcu_nvsbs*_enc->state.fplanes[0].nhsbs*16;
  chroma_shift=!(_enc->state.info.pixel_fmt&1)+!(_enc->state.info.pixel_fmt&2);
  _pipe->skip_ssd[0]=_enc->mcu_skip_ssd;
  _pipe->skip_ssd[1]=_pipe->skip_ssd[0]+luma_nfrags;
  _pipe->skip_ssd[2]=_pipe->skip_ssd[1]+(luma_nfrags>>chroma_shift);
  /*Set up condensed quantizer tables.*/
  base_qi=_enc->state.qis[0];
  nqis=_enc->state.nqis;
  for(pli=0;pli<3;pli++){
    for(qii=0;qii<nqis;qii++){
      int qi;
      qi=_enc->state.qis[qii];
      for(qti=0;qti<2;qti++){
        /*Set the DC coefficient in the dequantization table.
          It always comes from the first quantizer index, whichever index the
           rest of the table belongs to.*/
        _enc->state.dequant_tables[qi][pli][qti][0]=
         _enc->dequant_dc[base_qi][pli][qti];
        _enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
        /*Copy over the quantization table.*/
        memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti],
         _enc->opt_data.enquant_table_size);
      }
    }
  }
  /*Fix up the DC coefficients in the quantization tables.*/
  oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis);
  /*Initialize the tokenization state.*/
  for(pli=0;pli<3;pli++){
    _pipe->ndct_tokens1[pli]=0;
    _pipe->eob_run1[pli]=0;
  }
  /*Initialize the bounding value array for the loop filter.
    The filter is disabled entirely when its limit is zero.*/
  lf_limit=_enc->state.loop_filter_limits[_enc->state.qis[0]];
  _pipe->loop_filter=lf_limit!=0;
  if(lf_limit!=0){
    oc_loop_filter_init(&_enc->state,_pipe->bounding_values,lf_limit);
  }
  /*Clear the temporary DCT scratch space.*/
  memset(_pipe->dct_data,0,sizeof(_pipe->dct_data));
}
577
578
/*Sets the current MCU stripe to super block row _sby.
579
  Return: A non-zero value if this was the last MCU.*/
580
static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
581
219k
 oc_enc_pipeline_state *_pipe,int _sby){
582
219k
  const oc_fragment_plane *fplane;
583
219k
  unsigned                 mcu_nvsbs;
584
219k
  int                      sby_end;
585
219k
  int                      notdone;
586
219k
  int                      vdec;
587
219k
  int                      pli;
588
219k
  mcu_nvsbs=_enc->mcu_nvsbs;
589
219k
  sby_end=_enc->state.fplanes[0].nvsbs;
590
219k
  notdone=_sby+mcu_nvsbs<sby_end;
591
219k
  if(notdone)sby_end=_sby+mcu_nvsbs;
592
219k
  vdec=0;
593
879k
  for(pli=0;pli<3;pli++){
594
659k
    fplane=_enc->state.fplanes+pli;
595
659k
    _pipe->sbi0[pli]=fplane->sboffset+(_sby>>vdec)*fplane->nhsbs;
596
659k
    _pipe->fragy0[pli]=_sby<<2-vdec;
597
659k
    _pipe->froffset[pli]=fplane->froffset
598
659k
     +_pipe->fragy0[pli]*(ptrdiff_t)fplane->nhfrags;
599
659k
    if(notdone){
600
506k
      _pipe->sbi_end[pli]=fplane->sboffset+(sby_end>>vdec)*fplane->nhsbs;
601
506k
      _pipe->fragy_end[pli]=sby_end<<2-vdec;
602
506k
    }
603
153k
    else{
604
153k
      _pipe->sbi_end[pli]=fplane->sboffset+fplane->nsbs;
605
153k
      _pipe->fragy_end[pli]=fplane->nvfrags;
606
153k
    }
607
659k
    vdec=!(_enc->state.info.pixel_fmt&2);
608
659k
  }
609
219k
  return notdone;
610
219k
}
611
612
static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
613
659k
 oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
614
  /*Copy over all the uncoded fragments from this plane and advance the uncoded
615
     fragment list.*/
616
659k
  if(_pipe->nuncoded_fragis[_pli]>0){
617
54.2k
    _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
618
54.2k
    oc_frag_copy_list(&_enc->state,
619
54.2k
     _enc->state.ref_frame_data[OC_FRAME_SELF],
620
54.2k
     _enc->state.ref_frame_data[OC_FRAME_PREV],
621
54.2k
     _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
622
54.2k
     _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
623
54.2k
    _pipe->nuncoded_fragis[_pli]=0;
624
54.2k
  }
625
  /*Perform DC prediction.*/
626
659k
  oc_enc_pred_dc_frag_rows(_enc,_pli,
627
659k
   _pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
628
  /*Finish DC tokenization.*/
629
659k
  oc_enc_tokenize_dc_frag_list(_enc,_pli,
630
659k
   _pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
631
659k
   _pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
632
659k
  _pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
633
659k
  _pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
634
  /*And advance the coded fragment list.*/
635
659k
  _enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
636
659k
  _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
637
659k
  _pipe->ncoded_fragis[_pli]=0;
638
  /*Apply the loop filter if necessary.*/
639
659k
  if(_pipe->loop_filter){
640
445k
    oc_state_loop_filter_frag_rows(&_enc->state,
641
445k
     _pipe->bounding_values,OC_FRAME_SELF,_pli,
642
445k
     _pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
643
445k
  }
644
214k
  else _sdelay=_edelay=0;
645
  /*To fill borders, we have an additional two pixel delay, since a fragment
646
     in the next row could filter its top edge, using two pixels from a
647
     fragment in this row.
648
    But there's no reason to delay a full fragment between the two.*/
649
659k
  oc_state_borders_fill_rows(&_enc->state,
650
659k
   _enc->state.ref_frame_idx[OC_FRAME_SELF],_pli,
651
659k
   (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
652
659k
   (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
653
659k
}
654
655
656
657
/*Cost information about the coded blocks in a MB.
  NOTE(review): the field descriptions below are inferred from the field names
   alone; confirm them against the code that fills this structure in.*/
struct oc_rd_metric{
  /*Presumably the AC SSD when the blocks are left uncoded.*/
  int uncoded_ac_ssd;
  /*Presumably the AC SSD when the blocks are coded.*/
  int coded_ac_ssd;
  /*Presumably the number of bits spent on AC coefficients.*/
  int ac_bits;
  /*A flag related to the DC coefficients; exact meaning not visible here.*/
  int dc_flag;
};
664
665
666
667
static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
668
 oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
669
 unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
670
23.2M
 oc_fr_state *_fr,oc_token_checkpoint **_stack){
671
23.2M
  ogg_int16_t            *data;
672
23.2M
  ogg_int16_t            *dct;
673
23.2M
  ogg_int16_t            *idct;
674
23.2M
  oc_qii_state            qs;
675
23.2M
  const ogg_uint16_t     *dequant;
676
23.2M
  ogg_uint16_t            dequant_dc;
677
23.2M
  ptrdiff_t               frag_offs;
678
23.2M
  int                     ystride;
679
23.2M
  const unsigned char    *src;
680
23.2M
  const unsigned char    *ref;
681
23.2M
  unsigned char          *dst;
682
23.2M
  int                     nonzero;
683
23.2M
  unsigned                uncoded_ssd;
684
23.2M
  unsigned                coded_ssd;
685
23.2M
  oc_token_checkpoint    *checkpoint;
686
23.2M
  oc_fragment            *frags;
687
23.2M
  int                     mb_mode;
688
23.2M
  int                     refi;
689
23.2M
  int                     mv_offs[2];
690
23.2M
  int                     nmv_offs;
691
23.2M
  int                     ac_bits;
692
23.2M
  int                     borderi;
693
23.2M
  int                     nqis;
694
23.2M
  int                     qti;
695
23.2M
  int                     qii;
696
23.2M
  int                     dc;
697
23.2M
  nqis=_enc->state.nqis;
698
23.2M
  frags=_enc->state.frags;
699
23.2M
  frag_offs=_enc->state.frag_buf_offs[_fragi];
700
23.2M
  ystride=_enc->state.ref_ystride[_pli];
701
23.2M
  src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
702
23.2M
  borderi=frags[_fragi].borderi;
703
23.2M
  qii=frags[_fragi].qii;
704
23.2M
  data=_enc->pipe.dct_data;
705
23.2M
  dct=data+64;
706
23.2M
  idct=data+128;
707
23.2M
  if(qii&~3){
708
455k
#if !defined(OC_COLLECT_METRICS)
709
455k
    if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
710
      /*Enable early skip detection.*/
711
455k
      frags[_fragi].coded=0;
712
455k
      frags[_fragi].refi=OC_FRAME_NONE;
713
455k
      oc_fr_skip_block(_fr);
714
455k
      return 0;
715
455k
    }
716
0
#endif
717
    /*Try and code this block anyway.*/
718
0
    qii&=3;
719
0
  }
720
22.7M
  refi=frags[_fragi].refi;
721
22.7M
  mb_mode=frags[_fragi].mb_mode;
722
22.7M
  ref=_enc->state.ref_frame_data[refi]+frag_offs;
723
22.7M
  dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs;
724
  /*Motion compensation:*/
725
22.7M
  switch(mb_mode){
726
22.3M
    case OC_MODE_INTRA:{
727
22.3M
      nmv_offs=0;
728
22.3M
      oc_enc_frag_sub_128(_enc,data,src,ystride);
729
22.3M
    }break;
730
17.1k
    case OC_MODE_GOLDEN_NOMV:
731
129k
    case OC_MODE_INTER_NOMV:{
732
129k
      nmv_offs=1;
733
129k
      mv_offs[0]=0;
734
129k
      oc_enc_frag_sub(_enc,data,src,ref,ystride);
735
129k
    }break;
736
305k
    default:{
737
305k
      const oc_mv *frag_mvs;
738
305k
      frag_mvs=_enc->state.frag_mvs;
739
305k
      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,
740
305k
       _pli,frag_mvs[_fragi]);
741
305k
      if(nmv_offs>1){
742
240k
        oc_enc_frag_copy2(_enc,dst,
743
240k
         ref+mv_offs[0],ref+mv_offs[1],ystride);
744
240k
        oc_enc_frag_sub(_enc,data,src,dst,ystride);
745
240k
      }
746
65.1k
      else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride);
747
305k
    }break;
748
22.7M
  }
749
#if defined(OC_COLLECT_METRICS)
750
  {
751
    unsigned sad;
752
    unsigned satd;
753
    switch(nmv_offs){
754
      case 0:{
755
        sad=oc_enc_frag_intra_sad(_enc,src,ystride);
756
        satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);
757
      }break;
758
      case 1:{
759
        sad=oc_enc_frag_sad_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
760
        satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride);
761
        satd+=abs(dc);
762
      }break;
763
      default:{
764
        sad=oc_enc_frag_sad_thresh(_enc,src,dst,ystride,UINT_MAX);
765
        satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride);
766
        satd+=abs(dc);
767
      }break;
768
    }
769
    _enc->frag_sad[_fragi]=sad;
770
    _enc->frag_satd[_fragi]=satd;
771
  }
772
#endif
773
  /*Transform:*/
774
22.7M
  oc_enc_fdct8x8(_enc,dct,data);
775
  /*Quantize:*/
776
22.7M
  qti=mb_mode!=OC_MODE_INTRA;
777
22.7M
  dequant=_enc->dequant[_pli][qii][qti];
778
22.7M
  nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]);
779
22.7M
  dc=data[0];
780
  /*Tokenize.*/
781
22.7M
  checkpoint=*_stack;
782
22.7M
  if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
783
22.7M
    ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct,
784
22.7M
     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
785
22.7M
  }
786
0
  else{
787
0
    ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct,
788
0
     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
789
0
  }
790
  /*Reconstruct.
791
    TODO: nonzero may need to be adjusted after tokenization.*/
792
22.7M
  dequant_dc=dequant[0];
793
22.7M
  if(nonzero==0){
794
19.3M
    ogg_int16_t p;
795
19.3M
    int         ci;
796
19.3M
    int         qi01;
797
19.3M
    int         qi12;
798
    /*We round this dequant product (and not any of the others) because there's
799
       no iDCT rounding.*/
800
19.3M
    p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5);
801
    /*LOOP VECTORIZES.*/
802
1.25G
    for(ci=0;ci<64;ci++)data[ci]=p;
803
    /*We didn't code any AC coefficients, so don't change the quantizer.*/
804
19.3M
    qi01=_pipe->qs[_pli].qi01;
805
19.3M
    qi12=_pipe->qs[_pli].qi12;
806
19.3M
    if(qi01>0)qii=1+qi12;
807
17.2M
    else if(qi01>=0)qii=0;
808
19.3M
  }
809
3.40M
  else{
810
3.40M
    idct[0]=dc*dequant_dc;
811
    /*Note: This clears idct[] back to zero for the next block.*/
812
3.40M
    oc_idct8x8(&_enc->state,data,idct,nonzero+1);
813
3.40M
  }
814
22.7M
  frags[_fragi].qii=qii;
815
22.7M
  if(nqis>1){
816
8.31M
    oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
817
8.31M
    ac_bits+=qs.bits-_pipe->qs[_pli].bits;
818
8.31M
  }
819
22.7M
  if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
820
434k
  else{
821
434k
    oc_enc_frag_recon_inter(_enc,dst,
822
434k
     nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
823
434k
  }
824
  /*If _fr is NULL, then this is an INTRA frame, and we can't skip blocks.*/
825
22.7M
#if !defined(OC_COLLECT_METRICS)
826
22.7M
  if(_fr!=NULL)
827
1.54M
#endif
828
1.54M
  {
829
    /*In retrospect, should we have skipped this block?*/
830
1.54M
    if(borderi<0){
831
937k
      coded_ssd=oc_enc_frag_ssd(_enc,src,dst,ystride);
832
937k
    }
833
602k
    else{
834
602k
      coded_ssd=oc_enc_frag_border_ssd(_enc,src,dst,ystride,
835
602k
       _enc->state.borders[borderi].mask);
836
602k
    }
837
    /*Scale to match DCT domain.*/
838
1.54M
    coded_ssd<<=4;
839
#if defined(OC_COLLECT_METRICS)
840
    _enc->frag_ssd[_fragi]=coded_ssd;
841
  }
842
  if(_fr!=NULL){
843
#endif
844
1.54M
    coded_ssd=OC_RD_SCALE(coded_ssd,_rd_scale);
845
1.54M
    uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]];
846
1.54M
    if(uncoded_ssd<UINT_MAX&&
847
     /*Don't allow luma blocks to be skipped in 4MV mode when VP3 compatibility
848
        is enabled.*/
849
1.54M
     (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
850
1.54M
      int overhead_bits;
851
1.54M
      overhead_bits=oc_fr_cost1(_fr);
852
      /*Although the fragment coding overhead determination is accurate, it is
853
         greedy, using very coarse-grained local information.
854
        Allowing it to mildly discourage coding turns out to be beneficial, but
855
         it's not clear that allowing it to encourage coding through negative
856
         coding overhead deltas is useful.
857
        For that reason, we disallow negative coding overheads.*/
858
1.54M
      if(overhead_bits<0)overhead_bits=0;
859
1.54M
      if(uncoded_ssd<=coded_ssd+(overhead_bits+ac_bits)*_enc->lambda){
860
        /*Hm, not worth it; roll back.*/
861
143k
        oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
862
143k
        *_stack=checkpoint;
863
143k
        frags[_fragi].coded=0;
864
143k
        frags[_fragi].refi=OC_FRAME_NONE;
865
143k
        oc_fr_skip_block(_fr);
866
143k
        return 0;
867
143k
      }
868
1.54M
    }
869
0
    else _mo->dc_flag=1;
870
1.39M
    _mo->uncoded_ac_ssd+=uncoded_ssd;
871
1.39M
    _mo->coded_ac_ssd+=coded_ssd;
872
1.39M
    _mo->ac_bits+=ac_bits;
873
1.39M
    oc_fr_code_block(_fr);
874
1.39M
  }
875
  /*GCC 4.4.4 generates a warning here because it can't tell that
876
     the init code in the nqis check above will run anytime this
877
     line runs.*/
878
22.6M
  if(nqis>1)*(_pipe->qs+_pli)=*&qs;
879
22.6M
  frags[_fragi].dc=dc;
880
22.6M
  frags[_fragi].coded=1;
881
22.6M
  return 1;
882
22.7M
}
883
884
static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc,
885
 oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead,
886
315k
 const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
887
  /*Worst case token stack usage for 4 fragments.*/
888
315k
  oc_token_checkpoint  stack[64*4];
889
315k
  oc_token_checkpoint *stackptr;
890
315k
  const oc_sb_map     *sb_maps;
891
315k
  signed char         *mb_modes;
892
315k
  oc_fragment         *frags;
893
315k
  ptrdiff_t           *coded_fragis;
894
315k
  ptrdiff_t            ncoded_fragis;
895
315k
  ptrdiff_t           *uncoded_fragis;
896
315k
  ptrdiff_t            nuncoded_fragis;
897
315k
  oc_rd_metric         mo;
898
315k
  oc_fr_state          fr_checkpoint;
899
315k
  oc_qii_state         qs_checkpoint;
900
315k
  int                  mb_mode;
901
315k
  int                  refi;
902
315k
  int                  ncoded;
903
315k
  ptrdiff_t            fragi;
904
315k
  int                  bi;
905
315k
  *&fr_checkpoint=*(_pipe->fr+0);
906
315k
  *&qs_checkpoint=*(_pipe->qs+0);
907
315k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
908
315k
  mb_modes=_enc->state.mb_modes;
909
315k
  frags=_enc->state.frags;
910
315k
  coded_fragis=_pipe->coded_fragis[0];
911
315k
  ncoded_fragis=_pipe->ncoded_fragis[0];
912
315k
  uncoded_fragis=_pipe->uncoded_fragis[0];
913
315k
  nuncoded_fragis=_pipe->nuncoded_fragis[0];
914
315k
  mb_mode=mb_modes[_mbi];
915
315k
  refi=OC_FRAME_FOR_MODE(mb_mode);
916
315k
  ncoded=0;
917
315k
  stackptr=stack;
918
315k
  memset(&mo,0,sizeof(mo));
919
1.57M
  for(bi=0;bi<4;bi++){
920
1.26M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
921
1.26M
    frags[fragi].refi=refi;
922
1.26M
    frags[fragi].mb_mode=mb_mode;
923
1.26M
    if(oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
924
1.26M
     _rd_scale[bi],_rd_iscale[bi],&mo,_pipe->fr+0,&stackptr)){
925
942k
      coded_fragis[ncoded_fragis++]=fragi;
926
942k
      ncoded++;
927
942k
    }
928
318k
    else *(uncoded_fragis-++nuncoded_fragis)=fragi;
929
1.26M
  }
930
315k
  if(ncoded>0&&!mo.dc_flag){
931
263k
    int cost;
932
    /*Some individual blocks were worth coding.
933
      See if that's still true when accounting for mode and MV overhead.*/
934
263k
    cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
935
263k
     +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead);
936
263k
    if(mo.uncoded_ac_ssd<=cost){
937
      /*Taking macroblock overhead into account, it is not worth coding this
938
         MB.*/
939
6.69k
      oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
940
6.69k
      *(_pipe->fr+0)=*&fr_checkpoint;
941
6.69k
      *(_pipe->qs+0)=*&qs_checkpoint;
942
33.4k
      for(bi=0;bi<4;bi++){
943
26.7k
        fragi=sb_maps[_mbi>>2][_mbi&3][bi];
944
26.7k
        if(frags[fragi].coded){
945
10.1k
          *(uncoded_fragis-++nuncoded_fragis)=fragi;
946
10.1k
          frags[fragi].coded=0;
947
10.1k
          frags[fragi].refi=OC_FRAME_NONE;
948
10.1k
        }
949
26.7k
        oc_fr_skip_block(_pipe->fr+0);
950
26.7k
      }
951
6.69k
      ncoded_fragis-=ncoded;
952
6.69k
      ncoded=0;
953
6.69k
    }
954
263k
  }
955
  /*If no luma blocks coded, the mode is forced.*/
956
315k
  if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
957
  /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
958
     with a single coded block.
959
    This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
960
     skipped blocks, while a 1MV does not.*/
961
257k
  else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
962
167
    mb_modes[_mbi]=OC_MODE_INTER_MV;
963
167
  }
964
315k
  _pipe->ncoded_fragis[0]=ncoded_fragis;
965
315k
  _pipe->nuncoded_fragis[0]=nuncoded_fragis;
966
315k
  return ncoded;
967
315k
}
968
969
static void oc_enc_sb_transform_quantize_inter_chroma(oc_enc_ctx *_enc,
970
66.3k
 oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
971
66.3k
  const ogg_uint16_t *mcu_rd_scale;
972
66.3k
  const ogg_uint16_t *mcu_rd_iscale;
973
66.3k
  const oc_sb_map    *sb_maps;
974
66.3k
  oc_sb_flags        *sb_flags;
975
66.3k
  oc_fr_state        *fr;
976
66.3k
  ptrdiff_t          *coded_fragis;
977
66.3k
  ptrdiff_t           ncoded_fragis;
978
66.3k
  ptrdiff_t          *uncoded_fragis;
979
66.3k
  ptrdiff_t           nuncoded_fragis;
980
66.3k
  ptrdiff_t           froffset;
981
66.3k
  int                 sbi;
982
66.3k
  fr=_pipe->fr+_pli;
983
66.3k
  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
984
66.3k
  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
985
66.3k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
986
66.3k
  sb_flags=_enc->state.sb_flags;
987
66.3k
  coded_fragis=_pipe->coded_fragis[_pli];
988
66.3k
  ncoded_fragis=_pipe->ncoded_fragis[_pli];
989
66.3k
  uncoded_fragis=_pipe->uncoded_fragis[_pli];
990
66.3k
  nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
991
66.3k
  froffset=_pipe->froffset[_pli];
992
179k
  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
993
    /*Worst case token stack usage for 1 fragment.*/
994
112k
    oc_token_checkpoint stack[64];
995
112k
    oc_rd_metric        mo;
996
112k
    int                 quadi;
997
112k
    int                 bi;
998
112k
    memset(&mo,0,sizeof(mo));
999
2.25M
    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1000
1.80M
      ptrdiff_t fragi;
1001
1.80M
      fragi=sb_maps[sbi][quadi][bi];
1002
1.80M
      if(fragi>=0){
1003
734k
        oc_token_checkpoint *stackptr;
1004
734k
        unsigned             rd_scale;
1005
734k
        unsigned             rd_iscale;
1006
734k
        rd_scale=mcu_rd_scale[fragi-froffset];
1007
734k
        rd_iscale=mcu_rd_iscale[fragi-froffset];
1008
734k
        stackptr=stack;
1009
734k
        if(oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
1010
734k
         rd_scale,rd_iscale,&mo,fr,&stackptr)){
1011
453k
          coded_fragis[ncoded_fragis++]=fragi;
1012
453k
        }
1013
280k
        else *(uncoded_fragis-++nuncoded_fragis)=fragi;
1014
734k
      }
1015
1.80M
    }
1016
112k
    oc_fr_state_flush_sb(fr);
1017
112k
    sb_flags[sbi].coded_fully=fr->sb_full;
1018
112k
    sb_flags[sbi].coded_partially=fr->sb_partial;
1019
112k
  }
1020
66.3k
  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
1021
66.3k
  _pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
1022
66.3k
}
1023
1024
/*Mode decision is done by exhaustively examining all potential choices.
1025
  Obviously, doing the motion compensation, fDCT, tokenization, and then
1026
   counting the bits each token uses is computationally expensive.
1027
  Theora's EOB runs can also split the cost of these tokens across multiple
1028
   fragments, and naturally we don't know what the optimal choice of Huffman
1029
   codes will be until we know all the tokens we're going to encode in all the
1030
   fragments.
1031
  So we use a simple approach to estimating the bit cost and distortion of each
1032
   mode based upon the SATD value of the residual before coding.
1033
  The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
1034
   the process (modified somewhat from that of the paper) is very simple.
1035
  We build a non-linear regression of the mappings from
1036
   (pre-transform+quantization) SATD to (post-transform+quantization) bits and
1037
   SSD for each qi.
1038
  A separate set of mappings is kept for each quantization type and color
1039
   plane.
1040
  The mappings are constructed by partitioning the SATD values into a small
1041
   number of bins (currently 24) and using a linear regression in each bin
1042
   (as opposed to the 0th-order regression used by Kim).
1043
  The bit counts and SSD measurements are obtained by examining actual encoded
1044
   frames, with appropriate lambda values and optimal Huffman codes selected.
1045
  EOB bits are assigned to the fragment that started the EOB run (as opposed to
1046
   dividing them among all the blocks in the run; the latter approach seems
1047
   more theoretically correct, but Monty's testing showed a small improvement
1048
   with the former, though that may have been merely statistical noise).
1049
1050
  @ARTICLE{Kim03,
1051
    author="Hyun Mun Kim",
1052
    title="Adaptive Rate Control Using Nonlinear Regression",
1053
    journal="IEEE Transactions on Circuits and Systems for Video Technology",
1054
    volume=13,
1055
    number=5,
1056
    pages="432--439",
1057
    month=May,
1058
    year=2003
1059
  }*/
1060
1061
/*Computes (_ssd+_lambda*_rate)/(1<<OC_BIT_SCALE) with rounding, avoiding
   overflow for large lambda values.
  The whole-unit parts of _ssd and _rate are combined first, and the
   fractional parts (the low OC_BIT_SCALE bits of each) are combined, rounded
   and scaled down separately before being added in.
  The entire expansion is wrapped in parentheses so the macro behaves as a
   single operand when it is embedded in a larger expression.
  Each argument is evaluated more than once, so none of them may have side
   effects.*/
#define OC_MODE_RD_COST(_ssd,_rate,_lambda) \
 (((_ssd)>>OC_BIT_SCALE)+((_rate)>>OC_BIT_SCALE)*(_lambda) \
 +(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \
 +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE))
1067
1068
51.0k
/*Builds the per-quantizer rate/RMSE estimation tables in _enc->mode_rd.
  For every active quantizer index, color plane and quantization type, the two
   rows of the pre-computed mode table (SATD- or SAD-based, depending on the
   speed level) whose log-quantizer values bracket the current log quantizer
   are located, and a row is linearly interpolated between them for each bin.
  Results are clamped to the signed 16-bit range before being stored.
  _enc: The encoder context to initialize.*/
static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){
#if !defined(OC_COLLECT_METRICS)
  const
#endif
  oc_mode_rd (*oc_mode_rd_table)[3][2][OC_COMP_BINS]=
   _enc->sp_level<OC_SP_LEVEL_NOSATD?OC_MODE_RD_SATD:OC_MODE_RD_SAD;
  int qii;
#if defined(OC_COLLECT_METRICS)
  oc_enc_mode_metrics_load(_enc);
#endif
  for(qii=0;qii<_enc->state.nqis;qii++){
    int qi;
    int pli;
    qi=_enc->state.qis[qii];
    for(pli=0;pli<3;pli++){
      int qti;
      for(qti=0;qti<2;qti++){
        int log_plq;
        int modeline;
        int bin;
        int dx;
        int dq;
        log_plq=_enc->log_plq[qi][pli][qti];
        /*Find the pair of rows in the mode table that bracket this quantizer.
          If it falls outside the range the table covers, then we just use a
           pair on the edge for linear extrapolation.
          Rows modeline and modeline+1 are both read below, so the search must
           stop at OC_LOGQ_BINS-2 to keep the second row inside the table
           (assuming the tables have OC_LOGQ_BINS rows; their declarations
           live in another file -- TODO confirm).*/
        for(modeline=0;modeline<OC_LOGQ_BINS-2&&
         OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
        /*Interpolate a row for this quantizer.*/
        dx=OC_MODE_LOGQ[modeline][pli][qti]-log_plq;
        dq=OC_MODE_LOGQ[modeline][pli][qti]-OC_MODE_LOGQ[modeline+1][pli][qti];
        /*Guard the divisions below against two identical table rows.*/
        if(dq==0)dq=1;
        for(bin=0;bin<OC_COMP_BINS;bin++){
          int y0;
          int z0;
          int dy;
          int dz;
          y0=oc_mode_rd_table[modeline][pli][qti][bin].rate;
          z0=oc_mode_rd_table[modeline][pli][qti][bin].rmse;
          dy=oc_mode_rd_table[modeline+1][pli][qti][bin].rate-y0;
          dz=oc_mode_rd_table[modeline+1][pli][qti][bin].rmse-z0;
          _enc->mode_rd[qii][pli][qti][bin].rate=
           (ogg_int16_t)OC_CLAMPI(-32768,y0+(dy*dx+(dq>>1))/dq,32767);
          _enc->mode_rd[qii][pli][qti][bin].rmse=
           (ogg_int16_t)OC_CLAMPI(-32768,z0+(dz*dx+(dq>>1))/dq,32767);
        }
      }
    }
  }
}
1118
1119
/*Estimate the R-D cost of the DCT coefficients given the SATD of a block after
1120
   prediction.*/
1121
static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd,
1122
57.1M
 int _qii,int _pli,int _qti,int _satd){
1123
57.1M
  unsigned rmse;
1124
57.1M
  int      shift;
1125
57.1M
  int      bin;
1126
57.1M
  int      dx;
1127
57.1M
  int      y0;
1128
57.1M
  int      z0;
1129
57.1M
  int      dy;
1130
57.1M
  int      dz;
1131
  /*SATD metrics for chroma planes vary much less than luma, so we scale them
1132
     by 4 to distribute them into the mode decision bins more evenly.*/
1133
57.1M
  _satd<<=_pli+1&2;
1134
57.1M
  shift=_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_SATD_SHIFT:OC_SAD_SHIFT;
1135
57.1M
  bin=OC_MINI(_satd>>shift,OC_COMP_BINS-2);
1136
57.1M
  dx=_satd-(bin<<shift);
1137
57.1M
  y0=_enc->mode_rd[_qii][_pli][_qti][bin].rate;
1138
57.1M
  z0=_enc->mode_rd[_qii][_pli][_qti][bin].rmse;
1139
57.1M
  dy=_enc->mode_rd[_qii][_pli][_qti][bin+1].rate-y0;
1140
57.1M
  dz=_enc->mode_rd[_qii][_pli][_qti][bin+1].rmse-z0;
1141
57.1M
  rmse=OC_MAXI(z0+(dz*dx>>shift),0);
1142
57.1M
  *_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
1143
57.1M
  return OC_MAXI(y0+(dy*dx>>shift),0);
1144
57.1M
}
1145
1146
/*activity_avg must be positive, or flat regions could get a zero weight, which
1147
   confounds analysis.
1148
  We set the minimum to this value so that it also avoids the need for divide
1149
   by zero checks in oc_mb_masking().*/
1150
/*The smallest activity average permitted: 1 shifted left by
   OC_RD_SCALE_BITS.*/
# define OC_ACTIVITY_AVG_MIN (1<<OC_RD_SCALE_BITS)
1151
1152
/*Measures the activity of each of the four luma blocks of a macro block.
  The base measure for a block is 64*sum(c*c)-sum(c)**2 over its 64 input
   pixels.
  Values below 8<<12 are treated as flat and capped at 5<<12.
  Other blocks are tested for a dominant edge direction using four 3x3
   directional filters; if one direction accounts for more than 40% of the
   total filter response, the activity is compressed towards 5<<12.
  The edge filters read one pixel beyond the block on every side, so the
   input frame must be readable there.
  _enc:      The encoder context.
  _mbi:      The index of the macro block to measure.
  _activity: Returns the activity of each of the four luma blocks.
  Return: The sum of all pixel values in the four luma blocks.*/
static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
 unsigned _activity[4]){
  const unsigned char *src;
  const ptrdiff_t     *frag_buf_offs;
  const ptrdiff_t     *sb_map;
  unsigned             luma;
  int                  ystride;
  ptrdiff_t            frag_offs;
  ptrdiff_t            fragi;
  int                  bi;
  frag_buf_offs=_enc->state.frag_buf_offs;
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ystride=_enc->state.ref_ystride[0];
  luma=0;
  for(bi=0;bi<4;bi++){
    const unsigned char *s;
    unsigned             x;
    unsigned             x2;
    unsigned             act;
    int                  i;
    int                  j;
    fragi=sb_map[bi];
    frag_offs=frag_buf_offs[fragi];
    /*TODO: This could be replaced with SATD^2, since we already have to
       compute SATD.*/
    x=x2=0;
    s=src+frag_offs;
    for(i=0;i<8;i++){
      for(j=0;j<8;j++){
        unsigned c;
        c=s[j];
        x+=c;
        x2+=c*c;
      }
      s+=ystride;
    }
    luma+=x;
    act=(x2<<6)-x*x;
    if(act<8<<12){
      /*The region is flat.*/
      act=OC_MINI(act,5<<12);
    }
    else{
      unsigned e1;
      unsigned e2;
      unsigned e3;
      unsigned e4;
      /*Test for an edge.
        TODO: There are probably much simpler ways to do this (e.g., it could
         probably be combined with the SATD calculation).
        Alternatively, we could split the block around the mean and compute the
         reduction in variance in each half.
        For a Gaussian source the reduction should be
         (1-2/pi) ~= 0.36338022763241865692446494650994.
        Significantly more reduction is a good indication of a bi-level image.
        This has the advantage of identifying, in addition to straight edges,
         small text regions, which would otherwise be classified as "texture".*/
      e1=e2=e3=e4=0;
      /*Start one pixel to the left so that s[j+1] is the center pixel.*/
      s=src+frag_offs-1;
      for(i=0;i<8;i++){
        const unsigned char *up;
        const unsigned char *dn;
        up=s-ystride;
        dn=s+ystride;
        for(j=0;j<8;j++){
          /*The center taps are doubled with a multiplication rather than a
             left shift: the differences may be negative, and left-shifting a
             negative signed value is undefined behavior in C.*/
          e1+=abs((s[j+2]-s[j])*2+up[j+2]-up[j]+dn[j+2]-dn[j]);
          e2+=abs((dn[j+1]-up[j+1])*2+dn[j]-up[j]+dn[j+2]-up[j+2]);
          e3+=abs((dn[j+2]-up[j])*2+dn[j+1]-s[j]+s[j+2]-up[j+1]);
          e4+=abs((dn[j]-up[j+2])*2+dn[j+1]-s[j+2]+s[j]-up[j+1]);
        }
        s+=ystride;
      }
      /*If the largest component of the edge energy is at least 40% of the
         total, then classify the block as an edge block.*/
      if(5*OC_MAXI(OC_MAXI(e1,e2),OC_MAXI(e3,e4))>2*(e1+e2+e3+e4)){
         /*act=act_th*(act/act_th)**0.7
              =exp(log(act_th)+0.7*(log(act)-log(act_th))).
           Here act_th=5.0 and 0x394A=oc_blog32_q10(5<<12).*/
         act=oc_bexp32_q10(0x394A+(7*(oc_blog32_q10(act)-0x394A+5)/10));
      }
    }
    _activity[bi]=act;
  }
  return luma;
}
1238
1239
/*Derives the activity of the four luma blocks of a macro block from their
   intra SATD values instead of measuring the pixels.
  Each activity is (11*satd>>8)*satd; values below 8<<12 are treated as flat
   and capped at 5<<12.
  _enc:        The encoder context (not used here).
  _mbi:        The macro block index (not used here).
  _activity:   Returns the activity of each of the four luma blocks.
  _intra_satd: The intra SATD values of the macro block; only the first four
                entries are read.*/
static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
 unsigned _activity[4],const unsigned _intra_satd[12]){
  int bi;
  for(bi=0;bi<4;bi++){
    unsigned satd;
    unsigned act;
    satd=_intra_satd[bi];
    act=(11*satd>>8)*satd;
    /*The region is flat.*/
    if(act<8<<12)act=OC_MINI(act,5<<12);
    _activity[bi]=act;
  }
}
1252
1253
/*Compute the masking scales for the blocks in a macro block.
1254
  All masking is computed from the luma blocks.
1255
  We derive scaling factors for the chroma blocks from these, and use the same
1256
   ones for all chroma blocks, regardless of the subsampling.
1257
  It's possible for luma to be perfectly flat and yet have high chroma energy,
1258
   but this is unlikely in non-artificial images, and not a case that has been
1259
   addressed by any research to my knowledge.
1260
  The output of the masking process is two scale factors, which are fed into
1261
   the various R-D optimizations.
1262
  The first, rd_scale, is applied to D in the equation
1263
    D*rd_scale+lambda*R.
1264
  This is the form that must be used to properly combine scores from multiple
1265
   blocks, and can be interpreted as scaling distortions by their visibility.
1266
  The inverse, rd_iscale, is applied to lambda in the equation
1267
    D+rd_iscale*lambda*R.
1268
  This is equivalent to the first form within a single block, but much faster
1269
   to use when evaluating many possible distortions (e.g., during actual
1270
   quantization, where separate distortions are evaluated for every
1271
   coefficient).
1272
  The two macros OC_RD_SCALE(rd_scale,d) and OC_RD_ISCALE(rd_iscale,lambda) are
1273
   used to perform the multiplications with the proper re-scaling for the range
1274
   of the scaling factors.
1275
  Many researchers apply masking values directly to the quantizers used, and
1276
   not to the R-D cost.
1277
  Since we generally use MSE for D, rd_scale must use the square of their
1278
   values to generate an equivalent effect.*/
1279
static unsigned oc_mb_masking(unsigned _rd_scale[5],unsigned _rd_iscale[5],
1280
 const ogg_uint16_t _chroma_rd_scale[2],const unsigned _activity[4],
1281
3.06M
 unsigned _activity_avg,unsigned _luma,unsigned _luma_avg){
1282
3.06M
  unsigned activity_sum;
1283
3.06M
  unsigned la;
1284
3.06M
  unsigned lb;
1285
3.06M
  unsigned d;
1286
3.06M
  int      bi;
1287
3.06M
  int      bi_min;
1288
3.06M
  int      bi_min2;
1289
  /*The ratio lb/la is meant to approximate
1290
     ((((_luma-16)/219)*(255/128))**0.649**0.4**2), which is the
1291
     effective luminance masking from~\cite{LKW06} (including the self-masking
1292
     deflator).
1293
    The following actually turns out to be a pretty good approximation for
1294
     _luma>75 or so.
1295
    For smaller values luminance does not really follow Weber's Law anyway, and
1296
     this approximation gives a much less aggressive bitrate boost in this
1297
     region.
1298
    Though some researchers claim that contrast sensitivity actually decreases
1299
     for very low luminance values, in my experience excessive brightness on
1300
     LCDs or buggy color conversions (e.g., treating Y' as full-range instead
1301
     of the CCIR 601 range) make artifacts in such regions extremely visible.
1302
    We substitute _luma_avg for 128 to allow the strength of the masking to
1303
     vary with the actual average image luminance, within certain limits (the
1304
     caller has clamped _luma_avg to the range [90,160], inclusive).
1305
    @ARTICLE{LKW06,
1306
      author="Zhen Liu and Lina J. Karam and Andrew B. Watson",
1307
      title="{JPEG2000} Encoding With Perceptual Distortion Control",
1308
      journal="{IEEE} Transactions on Image Processing",
1309
      volume=15,
1310
      number=7,
1311
      pages="1763--1778",
1312
      month=Jul,
1313
      year=2006
1314
    }*/
1315
#if 0
1316
  la=_luma+4*_luma_avg;
1317
  lb=4*_luma+_luma_avg;
1318
#else
1319
  /*Disable luminance masking.*/
1320
3.06M
  la=lb=1;
1321
3.06M
#endif
1322
3.06M
  activity_sum=0;
1323
15.3M
  for(bi=0;bi<4;bi++){
1324
12.2M
    unsigned a;
1325
12.2M
    unsigned b;
1326
12.2M
    activity_sum+=_activity[bi];
1327
    /*Apply activity masking.*/
1328
12.2M
    a=_activity[bi]+4*_activity_avg;
1329
12.2M
    b=4*_activity[bi]+_activity_avg;
1330
12.2M
    d=OC_RD_SCALE(b,1);
1331
    /*And luminance masking.*/
1332
12.2M
    d=(a+(d>>1))/d;
1333
12.2M
    _rd_scale[bi]=(d*la+(lb>>1))/lb;
1334
    /*And now the inverse.*/
1335
12.2M
    d=OC_MAXI(OC_RD_ISCALE(a,1),1);
1336
12.2M
    d=(b+(d>>1))/d;
1337
12.2M
    _rd_iscale[bi]=(d*lb+(la>>1))/la;
1338
12.2M
  }
1339
  /*Now compute scaling factors for chroma blocks.
1340
    We start by finding the two smallest iscales from the luma blocks.*/
1341
3.06M
  bi_min=_rd_iscale[1]<_rd_iscale[0];
1342
3.06M
  bi_min2=1-bi_min;
1343
9.18M
  for(bi=2;bi<4;bi++){
1344
6.12M
    if(_rd_iscale[bi]<_rd_iscale[bi_min]){
1345
475k
      bi_min2=bi_min;
1346
475k
      bi_min=bi;
1347
475k
    }
1348
5.64M
    else if(_rd_iscale[bi]<_rd_iscale[bi_min2])bi_min2=bi;
1349
6.12M
  }
1350
  /*If the minimum iscale is less than 1.0, use the second smallest instead,
1351
     and force the value to at least 1.0 (inflating chroma is a waste).*/
1352
3.06M
  if(_rd_iscale[bi_min]<(1<<OC_RD_ISCALE_BITS))bi_min=bi_min2;
1353
3.06M
  d=OC_MINI(_rd_scale[bi_min],1<<OC_RD_SCALE_BITS);
1354
3.06M
  _rd_scale[4]=OC_RD_SCALE(d,_chroma_rd_scale[0]);
1355
3.06M
  d=OC_MAXI(_rd_iscale[bi_min],1<<OC_RD_ISCALE_BITS);
1356
3.06M
  _rd_iscale[4]=OC_RD_ISCALE(d,_chroma_rd_scale[1]);
1357
3.06M
  return activity_sum;
1358
3.06M
}
1359
1360
/*Computes the intra SATD of every block in a macro block.
  The four luma blocks are measured first and stored in entries 0-3; the
   chroma blocks follow, stored at the position they occupy in the map index
   list for the current pixel format.
  _enc:       The encoder context.
  _mbi:       The index of the macro block to measure.
  _frag_satd: Returns the SATD of each block.
  Return: The sum of the DC values reported for the four luma blocks.*/
static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
 unsigned _frag_satd[12]){
  const unsigned char   *src;
  const ptrdiff_t       *frag_buf_offs;
  const ptrdiff_t       *luma_fragis;
  const oc_mb_map_plane *mb_map;
  const unsigned char   *map_idxs;
  int                    map_nidxs;
  int                    mapii;
  int                    ystride;
  int                    fmt;
  int                    bi;
  unsigned               luma;
  int                    dc;
  frag_buf_offs=_enc->state.frag_buf_offs;
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  /*Luma: accumulate the DC of each block along the way.*/
  luma_fragis=_enc->state.sb_maps[_mbi>>2][_mbi&3];
  ystride=_enc->state.ref_ystride[0];
  luma=0;
  for(bi=0;bi<4;bi++){
    _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,
     src+frag_buf_offs[luma_fragis[bi]],ystride);
    luma+=dc;
  }
  /*Chroma: the DC values are computed but not used.*/
  fmt=_enc->state.info.pixel_fmt;
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
  map_idxs=OC_MB_MAP_IDXS[fmt];
  map_nidxs=OC_MB_MAP_NIDXS[fmt];
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
  ystride=_enc->state.ref_ystride[1];
  for(mapii=4;mapii<map_nidxs;mapii++){
    ptrdiff_t fragi;
    int       mapi;
    /*Each map index packs the plane in its upper bits and the block within
       the plane in its low two bits.*/
    mapi=map_idxs[mapii];
    fragi=mb_map[mapi>>2][mapi&3];
    _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,
     src+frag_buf_offs[fragi],ystride);
  }
  return luma;
}
1403
1404
/*Select luma block-level quantizers for a MB in an INTRA frame.*/
1405
static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
1406
2.74M
 const oc_qii_state *_qs,unsigned _mbi,const unsigned _rd_scale[4]){
1407
2.74M
  const unsigned char *src;
1408
2.74M
  const ptrdiff_t     *frag_buf_offs;
1409
2.74M
  const oc_sb_map     *sb_maps;
1410
2.74M
  oc_fragment         *frags;
1411
2.74M
  ptrdiff_t            frag_offs;
1412
2.74M
  ptrdiff_t            fragi;
1413
2.74M
  oc_qii_state         qs[4][3];
1414
2.74M
  unsigned             cost[4][3];
1415
2.74M
  unsigned             ssd[4][3];
1416
2.74M
  unsigned             rate[4][3];
1417
2.74M
  int                  prev[3][3];
1418
2.74M
  unsigned             satd;
1419
2.74M
  int                  dc;
1420
2.74M
  unsigned             best_cost;
1421
2.74M
  unsigned             best_ssd;
1422
2.74M
  unsigned             best_rate;
1423
2.74M
  int                  best_qii;
1424
2.74M
  int                  qii;
1425
2.74M
  int                  lambda;
1426
2.74M
  int                  ystride;
1427
2.74M
  int                  nqis;
1428
2.74M
  int                  bi;
1429
2.74M
  frag_buf_offs=_enc->state.frag_buf_offs;
1430
2.74M
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1431
2.74M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1432
2.74M
  ystride=_enc->state.ref_ystride[0];
1433
2.74M
  fragi=sb_maps[_mbi>>2][_mbi&3][0];
1434
2.74M
  frag_offs=frag_buf_offs[fragi];
1435
2.74M
  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1436
2.74M
    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1437
2.74M
  }
1438
0
  else{
1439
0
    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1440
0
  }
1441
2.74M
  nqis=_enc->state.nqis;
1442
2.74M
  lambda=_enc->lambda;
1443
7.15M
  for(qii=0;qii<nqis;qii++){
1444
4.41M
    oc_qii_state_advance(qs[0]+qii,_qs,qii);
1445
4.41M
    rate[0][qii]=oc_dct_cost2(_enc,ssd[0]+qii,qii,0,0,satd)
1446
4.41M
     +(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE);
1447
4.41M
    ssd[0][qii]=OC_RD_SCALE(ssd[0][qii],_rd_scale[0]);
1448
4.41M
    cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda);
1449
4.41M
  }
1450
10.9M
  for(bi=1;bi<4;bi++){
1451
8.23M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1452
8.23M
    frag_offs=frag_buf_offs[fragi];
1453
8.23M
    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1454
8.23M
      satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1455
8.23M
    }
1456
0
    else{
1457
0
      satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1458
0
    }
1459
21.4M
    for(qii=0;qii<nqis;qii++){
1460
13.2M
      oc_qii_state qt[3];
1461
13.2M
      unsigned     cur_ssd;
1462
13.2M
      unsigned     cur_rate;
1463
13.2M
      int          best_qij;
1464
13.2M
      int          qij;
1465
13.2M
      oc_qii_state_advance(qt+0,qs[bi-1]+0,qii);
1466
13.2M
      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,0,satd);
1467
13.2M
      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
1468
13.2M
      best_ssd=ssd[bi-1][0]+cur_ssd;
1469
13.2M
      best_rate=rate[bi-1][0]+cur_rate
1470
13.2M
       +(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE);
1471
13.2M
      best_cost=OC_MODE_RD_COST(best_ssd,best_rate,lambda);
1472
13.2M
      best_qij=0;
1473
27.5M
      for(qij=1;qij<nqis;qij++){
1474
14.3M
        unsigned chain_ssd;
1475
14.3M
        unsigned chain_rate;
1476
14.3M
        unsigned chain_cost;
1477
14.3M
        oc_qii_state_advance(qt+qij,qs[bi-1]+qij,qii);
1478
14.3M
        chain_ssd=ssd[bi-1][qij]+cur_ssd;
1479
14.3M
        chain_rate=rate[bi-1][qij]+cur_rate
1480
14.3M
         +(qt[qij].bits-qs[bi-1][qij].bits<<OC_BIT_SCALE);
1481
14.3M
        chain_cost=OC_MODE_RD_COST(chain_ssd,chain_rate,lambda);
1482
14.3M
        if(chain_cost<best_cost){
1483
5.83M
          best_cost=chain_cost;
1484
5.83M
          best_ssd=chain_ssd;
1485
5.83M
          best_rate=chain_rate;
1486
5.83M
          best_qij=qij;
1487
5.83M
        }
1488
14.3M
      }
1489
13.2M
      *(qs[bi]+qii)=*(qt+best_qij);
1490
13.2M
      cost[bi][qii]=best_cost;
1491
13.2M
      ssd[bi][qii]=best_ssd;
1492
13.2M
      rate[bi][qii]=best_rate;
1493
13.2M
      prev[bi-1][qii]=best_qij;
1494
13.2M
    }
1495
8.23M
  }
1496
2.74M
  best_qii=0;
1497
2.74M
  best_cost=cost[3][0];
1498
4.41M
  for(qii=1;qii<nqis;qii++){
1499
1.66M
    if(cost[3][qii]<best_cost){
1500
583k
      best_cost=cost[3][qii];
1501
583k
      best_qii=qii;
1502
583k
    }
1503
1.66M
  }
1504
2.74M
  frags=_enc->state.frags;
1505
10.9M
  for(bi=3;;){
1506
10.9M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1507
10.9M
    frags[fragi].qii=best_qii;
1508
10.9M
    if(bi--<=0)break;
1509
8.23M
    best_qii=prev[bi][best_qii];
1510
8.23M
  }
1511
2.74M
  return best_cost;
1512
2.74M
}
1513
1514
/*Select a block-level quantizer for a single chroma block in an INTRA frame.*/
1515
static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
1516
10.2M
 const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi,unsigned _rd_scale){
1517
10.2M
  const unsigned char *src;
1518
10.2M
  oc_fragment         *frags;
1519
10.2M
  ptrdiff_t            frag_offs;
1520
10.2M
  oc_qii_state         qt[3];
1521
10.2M
  unsigned             cost[3];
1522
10.2M
  unsigned             satd;
1523
10.2M
  int                  dc;
1524
10.2M
  unsigned             best_cost;
1525
10.2M
  int                  best_qii;
1526
10.2M
  int                  qii;
1527
10.2M
  int                  lambda;
1528
10.2M
  int                  ystride;
1529
10.2M
  int                  nqis;
1530
10.2M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1531
10.2M
  ystride=_enc->state.ref_ystride[_pli];
1532
10.2M
  frag_offs=_enc->state.frag_buf_offs[_fragi];
1533
10.2M
  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1534
10.2M
    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1535
10.2M
  }
1536
0
  else{
1537
0
    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1538
0
  }
1539
  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
1540
     worth spending the bits to change the AC quantizer.
1541
    TODO: This may be worth revisiting when we separate out DC and AC
1542
     predictions from SATD.*/
1543
#if 0
1544
  nqis=_enc->state.nqis;
1545
#else
1546
10.2M
  nqis=1;
1547
10.2M
#endif
1548
10.2M
  lambda=_enc->lambda;
1549
10.2M
  best_qii=0;
1550
20.4M
  for(qii=0;qii<nqis;qii++){
1551
10.2M
    unsigned cur_rate;
1552
10.2M
    unsigned cur_ssd;
1553
10.2M
    oc_qii_state_advance(qt+qii,_qs,qii);
1554
10.2M
    cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,_pli,0,satd)
1555
10.2M
     +(qt[qii].bits-_qs->bits<<OC_BIT_SCALE);
1556
10.2M
    cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
1557
10.2M
    cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda);
1558
10.2M
  }
1559
10.2M
  best_cost=cost[0];
1560
10.2M
  for(qii=1;qii<nqis;qii++){
1561
0
    if(cost[qii]<best_cost){
1562
0
      best_cost=cost[qii];
1563
0
      best_qii=qii;
1564
0
    }
1565
0
  }
1566
10.2M
  frags=_enc->state.frags;
1567
10.2M
  frags[_fragi].qii=best_qii;
1568
10.2M
  return best_cost;
1569
10.2M
}
1570
1571
static void oc_enc_mb_transform_quantize_intra_luma(oc_enc_ctx *_enc,
1572
 oc_enc_pipeline_state *_pipe,unsigned _mbi,
1573
2.74M
 const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
1574
  /*Worst case token stack usage for 4 fragments.*/
1575
2.74M
  oc_token_checkpoint  stack[64*4];
1576
2.74M
  oc_token_checkpoint *stackptr;
1577
2.74M
  const oc_sb_map     *sb_maps;
1578
2.74M
  oc_fragment         *frags;
1579
2.74M
  ptrdiff_t           *coded_fragis;
1580
2.74M
  ptrdiff_t            ncoded_fragis;
1581
2.74M
  ptrdiff_t            fragi;
1582
2.74M
  int                  bi;
1583
2.74M
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1584
2.74M
  frags=_enc->state.frags;
1585
2.74M
  coded_fragis=_pipe->coded_fragis[0];
1586
2.74M
  ncoded_fragis=_pipe->ncoded_fragis[0];
1587
2.74M
  stackptr=stack;
1588
13.7M
  for(bi=0;bi<4;bi++){
1589
10.9M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1590
10.9M
    frags[fragi].refi=OC_FRAME_SELF;
1591
10.9M
    frags[fragi].mb_mode=OC_MODE_INTRA;
1592
10.9M
    oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
1593
10.9M
     _rd_scale[bi],_rd_iscale[bi],NULL,NULL,&stackptr);
1594
10.9M
    coded_fragis[ncoded_fragis++]=fragi;
1595
10.9M
  }
1596
2.74M
  _pipe->ncoded_fragis[0]=ncoded_fragis;
1597
2.74M
}
1598
1599
static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
1600
373k
 oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
1601
373k
  const ogg_uint16_t *mcu_rd_scale;
1602
373k
  const ogg_uint16_t *mcu_rd_iscale;
1603
373k
  const oc_sb_map    *sb_maps;
1604
373k
  ptrdiff_t          *coded_fragis;
1605
373k
  ptrdiff_t           ncoded_fragis;
1606
373k
  ptrdiff_t           froffset;
1607
373k
  int                 sbi;
1608
373k
  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
1609
373k
  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
1610
373k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1611
373k
  coded_fragis=_pipe->coded_fragis[_pli];
1612
373k
  ncoded_fragis=_pipe->ncoded_fragis[_pli];
1613
373k
  froffset=_pipe->froffset[_pli];
1614
2.01M
  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
1615
    /*Worst case token stack usage for 1 fragment.*/
1616
1.63M
    oc_token_checkpoint stack[64];
1617
1.63M
    int                 quadi;
1618
1.63M
    int                 bi;
1619
32.7M
    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1620
26.2M
      ptrdiff_t fragi;
1621
26.2M
      fragi=sb_maps[sbi][quadi][bi];
1622
26.2M
      if(fragi>=0){
1623
10.2M
        oc_token_checkpoint *stackptr;
1624
10.2M
        unsigned             rd_scale;
1625
10.2M
        unsigned             rd_iscale;
1626
10.2M
        rd_scale=mcu_rd_scale[fragi-froffset];
1627
10.2M
        rd_iscale=mcu_rd_iscale[fragi-froffset];
1628
10.2M
        oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi,rd_scale);
1629
10.2M
        stackptr=stack;
1630
10.2M
        oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
1631
10.2M
         rd_scale,rd_iscale,NULL,NULL,&stackptr);
1632
10.2M
        coded_fragis[ncoded_fragis++]=fragi;
1633
10.2M
      }
1634
26.2M
    }
1635
1.63M
  }
1636
373k
  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
1637
373k
}
1638
1639
/*Analysis stage for an INTRA frame.*/
1640
20.2k
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
1641
20.2k
  ogg_int64_t             activity_sum;
1642
20.2k
  ogg_int64_t             luma_sum;
1643
20.2k
  unsigned                activity_avg;
1644
20.2k
  unsigned                luma_avg;
1645
20.2k
  const ogg_uint16_t     *chroma_rd_scale;
1646
20.2k
  ogg_uint16_t           *mcu_rd_scale;
1647
20.2k
  ogg_uint16_t           *mcu_rd_iscale;
1648
20.2k
  const unsigned char    *map_idxs;
1649
20.2k
  int                     nmap_idxs;
1650
20.2k
  oc_sb_flags            *sb_flags;
1651
20.2k
  signed char            *mb_modes;
1652
20.2k
  const oc_mb_map        *mb_maps;
1653
20.2k
  const oc_sb_map        *sb_maps;
1654
20.2k
  oc_fragment            *frags;
1655
20.2k
  unsigned                stripe_sby;
1656
20.2k
  unsigned                mcu_nvsbs;
1657
20.2k
  int                     notstart;
1658
20.2k
  int                     notdone;
1659
20.2k
  int                     refi;
1660
20.2k
  int                     pli;
1661
20.2k
  _enc->state.frame_type=OC_INTRA_FRAME;
1662
20.2k
  oc_enc_tokenize_start(_enc);
1663
20.2k
  oc_enc_pipeline_init(_enc,&_enc->pipe);
1664
20.2k
  oc_enc_mode_rd_init(_enc);
1665
20.2k
  activity_sum=luma_sum=0;
1666
20.2k
  activity_avg=_enc->activity_avg;
1667
20.2k
  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
1668
20.2k
  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTRA_FRAME][_enc->state.qis[0]];
1669
20.2k
  mcu_rd_scale=_enc->mcu_rd_scale;
1670
20.2k
  mcu_rd_iscale=_enc->mcu_rd_iscale;
1671
  /*Choose MVs and MB modes and quantize and code luma.
1672
    Must be done in Hilbert order.*/
1673
20.2k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1674
20.2k
  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1675
20.2k
  _enc->state.ncoded_fragis[0]=0;
1676
20.2k
  _enc->state.ncoded_fragis[1]=0;
1677
20.2k
  _enc->state.ncoded_fragis[2]=0;
1678
20.2k
  sb_flags=_enc->state.sb_flags;
1679
20.2k
  mb_modes=_enc->state.mb_modes;
1680
20.2k
  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
1681
20.2k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1682
20.2k
  frags=_enc->state.frags;
1683
20.2k
  notstart=0;
1684
20.2k
  notdone=1;
1685
20.2k
  mcu_nvsbs=_enc->mcu_nvsbs;
1686
206k
  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
1687
186k
    ptrdiff_t cfroffset;
1688
186k
    unsigned  sbi;
1689
186k
    unsigned  sbi_end;
1690
186k
    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
1691
186k
    sbi_end=_enc->pipe.sbi_end[0];
1692
186k
    cfroffset=_enc->pipe.froffset[1];
1693
1.46M
    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
1694
1.27M
      int quadi;
1695
      /*Mode addressing is through Y plane, always 4 MB per SB.*/
1696
6.38M
      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
1697
2.74M
        unsigned  activity[4];
1698
2.74M
        unsigned  rd_scale[5];
1699
2.74M
        unsigned  rd_iscale[5];
1700
2.74M
        unsigned  luma;
1701
2.74M
        unsigned  mbi;
1702
2.74M
        int       mapii;
1703
2.74M
        int       mapi;
1704
2.74M
        int       bi;
1705
2.74M
        ptrdiff_t fragi;
1706
2.74M
        mbi=sbi<<2|quadi;
1707
        /*Activity masking.*/
1708
2.74M
        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
1709
2.74M
          luma=oc_mb_activity(_enc,mbi,activity);
1710
2.74M
        }
1711
0
        else{
1712
0
          unsigned intra_satd[12];
1713
0
          luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
1714
0
          oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
1715
0
          for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
1716
0
        }
1717
2.74M
        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
1718
2.74M
         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
1719
2.74M
        luma_sum+=luma;
1720
        /*Motion estimation:
1721
          We do a basic 1MV search for all macroblocks, coded or not,
1722
           keyframe or not, unless we aren't using motion estimation at all.*/
1723
2.74M
        if(!_recode&&_enc->state.curframe_num>0&&
1724
2.74M
         _enc->sp_level<OC_SP_LEVEL_NOMC&&_enc->keyframe_frequency_force>1){
1725
609
          oc_mcenc_search(_enc,mbi);
1726
609
        }
1727
2.74M
        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
1728
2.74M
          oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale);
1729
2.74M
        }
1730
2.74M
        mb_modes[mbi]=OC_MODE_INTRA;
1731
2.74M
        oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe,
1732
2.74M
         mbi,rd_scale,rd_iscale);
1733
        /*Propagate final MB mode and MVs to the chroma blocks.*/
1734
12.9M
        for(mapii=4;mapii<nmap_idxs;mapii++){
1735
10.2M
          mapi=map_idxs[mapii];
1736
10.2M
          pli=mapi>>2;
1737
10.2M
          bi=mapi&3;
1738
10.2M
          fragi=mb_maps[mbi][pli][bi];
1739
10.2M
          frags[fragi].refi=OC_FRAME_SELF;
1740
10.2M
          frags[fragi].mb_mode=OC_MODE_INTRA;
1741
10.2M
        }
1742
        /*Save masking scale factors for chroma blocks.*/
1743
7.87M
        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
1744
5.12M
          mapi=map_idxs[mapii];
1745
5.12M
          bi=mapi&3;
1746
5.12M
          fragi=mb_maps[mbi][1][bi];
1747
5.12M
          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
1748
5.12M
          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
1749
5.12M
        }
1750
2.74M
      }
1751
1.27M
    }
1752
186k
    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
1753
    /*Code chroma planes.*/
1754
560k
    for(pli=1;pli<3;pli++){
1755
373k
      oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe,
1756
373k
       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
1757
373k
      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
1758
373k
    }
1759
186k
    notstart=1;
1760
186k
  }
1761
  /*Compute the average block activity and MB luma score for the frame.*/
1762
20.2k
  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
1763
20.2k
   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
1764
20.2k
   _enc->state.fplanes[0].nfrags));
1765
20.2k
  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
1766
  /*Finish filling in the reference frame borders.*/
1767
20.2k
  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
1768
81.0k
  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
1769
20.2k
  _enc->state.ntotal_coded_fragis=_enc->state.nfrags;
1770
20.2k
}
1771
1772
1773
1774
/*Cost information about a MB mode.*/
struct oc_mode_choice{
  /*The combined RD cost: OC_MODE_RD_COST() applied to ssd and rate+overhead.*/
  unsigned      cost;
  /*The distortion accumulated over the blocks of the macro block.*/
  unsigned      ssd;
  /*The rate accumulated over the blocks of the macro block, in units of
     1/(1<<OC_BIT_SCALE) bits.*/
  unsigned      rate;
  /*The cost of signaling the mode itself (and any MVs), in the same units as
     rate.*/
  unsigned      overhead;
  /*The quantizer index chosen for each block.
    The mode analysis adds 4 to an entry when it decides that skipping the
     block is at least as cheap as coding it.*/
  unsigned char qii[12];
};
1782
1783
1784
1785
4.28M
/*Recomputes _modec->cost from the distortion, rate and overhead currently
   stored in _modec, using _lambda as the rate multiplier.*/
static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
  unsigned total_rate;
  /*The overhead is simply additional rate.*/
  total_rate=_modec->rate+_modec->overhead;
  _modec->cost=OC_MODE_RD_COST(_modec->ssd,total_rate,_lambda);
}
1789
1790
/*A set of skip SSD's to use to disable early skipping.
  The mode analysis only considers skipping a block when its skip SSD is
   below UINT_MAX>>OC_BIT_SCALE+2, which UINT_MAX never is.*/
static const unsigned OC_NOSKIP[12]={
  /*Luma blocks.*/
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
  /*Chroma blocks (up to eight, depending on the pixel format).*/
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX
};
1796
1797
/*The estimated number of bits used by a coded chroma block to specify the AC
   quantizer.
  TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression);
   measurements suggest this is in the right ballpark, but it varies somewhat
   with lambda.
  0xCAE00D1D is approximately log2(3) with 31 fractional bits; the first shift
   reduces it to OC_BIT_SCALE+1 fractional bits' worth of precision relative
   to the final result, and the "+1 then >>1" halves it with rounding.*/
#define OC_CHROMA_QII_RATE ((((0xCAE00D1DU)>>(31-OC_BIT_SCALE))+1)>>1)
1803
1804
static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
1805
 oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1806
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
1807
3.27M
 const unsigned _rd_scale[4],int _qti){
1808
3.27M
  oc_fr_state  fr;
1809
3.27M
  oc_qii_state qs;
1810
3.27M
  unsigned     ssd;
1811
3.27M
  unsigned     rate;
1812
3.27M
  unsigned     satd;
1813
3.27M
  unsigned     best_ssd;
1814
3.27M
  unsigned     best_rate;
1815
3.27M
  int          best_fri;
1816
3.27M
  int          best_qii;
1817
3.27M
  int          lambda;
1818
3.27M
  int          nqis;
1819
3.27M
  int          nskipped;
1820
3.27M
  int          bi;
1821
3.27M
  lambda=_enc->lambda;
1822
3.27M
  nqis=_enc->state.nqis;
1823
  /*We could do a trellis optimization here, but we don't make final skip
1824
     decisions until after transform+quantization, so the result wouldn't be
1825
     optimal anyway.
1826
    Instead we just use a greedy approach; for most SATD values, the
1827
     differences between the qiis are large enough to drown out the cost to
1828
     code the flags, anyway.*/
1829
3.27M
  *&fr=*_fr;
1830
3.27M
  *&qs=*_qs;
1831
3.27M
  ssd=rate=nskipped=0;
1832
16.3M
  for(bi=0;bi<4;bi++){
1833
13.0M
    oc_fr_state  ft[2];
1834
13.0M
    oc_qii_state qt[3];
1835
13.0M
    unsigned     best_cost;
1836
13.0M
    unsigned     cur_cost;
1837
13.0M
    unsigned     cur_ssd;
1838
13.0M
    unsigned     cur_rate;
1839
13.0M
    unsigned     cur_overhead;
1840
13.0M
    int          qii;
1841
13.0M
    satd=_frag_satd[bi];
1842
13.0M
    *(ft+0)=*&fr;
1843
13.0M
    oc_fr_code_block(ft+0);
1844
13.0M
    cur_overhead=ft[0].bits-fr.bits;
1845
13.0M
    best_rate=oc_dct_cost2(_enc,&best_ssd,0,0,_qti,satd)
1846
13.0M
     +(cur_overhead<<OC_BIT_SCALE);
1847
13.0M
    if(nqis>1){
1848
5.99M
      oc_qii_state_advance(qt+0,&qs,0);
1849
5.99M
      best_rate+=qt[0].bits-qs.bits<<OC_BIT_SCALE;
1850
5.99M
    }
1851
13.0M
    best_ssd=OC_RD_SCALE(best_ssd,_rd_scale[bi]);
1852
13.0M
    best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1853
13.0M
    best_fri=0;
1854
13.0M
    best_qii=0;
1855
21.6M
    for(qii=1;qii<nqis;qii++){
1856
8.52M
      oc_qii_state_advance(qt+qii,&qs,qii);
1857
8.52M
      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,_qti,satd)
1858
8.52M
       +(cur_overhead+qt[qii].bits-qs.bits<<OC_BIT_SCALE);
1859
8.52M
      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
1860
8.52M
      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1861
8.52M
      if(cur_cost<best_cost){
1862
2.43M
        best_cost=cur_cost;
1863
2.43M
        best_ssd=cur_ssd;
1864
2.43M
        best_rate=cur_rate;
1865
2.43M
        best_qii=qii;
1866
2.43M
      }
1867
8.52M
    }
1868
13.0M
    if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)&&nskipped<3){
1869
6.60M
      *(ft+1)=*&fr;
1870
6.60M
      oc_fr_skip_block(ft+1);
1871
6.60M
      cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE;
1872
6.60M
      cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1873
6.60M
      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda);
1874
6.60M
      if(cur_cost<=best_cost){
1875
2.26M
        best_ssd=cur_ssd;
1876
2.26M
        best_rate=cur_overhead;
1877
2.26M
        best_fri=1;
1878
2.26M
        best_qii+=4;
1879
2.26M
      }
1880
6.60M
    }
1881
13.0M
    rate+=best_rate;
1882
13.0M
    ssd+=best_ssd;
1883
13.0M
    *&fr=*(ft+best_fri);
1884
13.0M
    if(best_fri==0)*&qs=*(qt+best_qii);
1885
2.26M
    else nskipped++;
1886
13.0M
    _modec->qii[bi]=best_qii;
1887
13.0M
  }
1888
3.27M
  _modec->ssd=ssd;
1889
3.27M
  _modec->rate=rate;
1890
3.27M
}
1891
1892
static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
1893
 oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1894
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
1895
3.27M
 unsigned _rd_scale,int _qti){
1896
3.27M
  unsigned ssd;
1897
3.27M
  unsigned rate;
1898
3.27M
  unsigned satd;
1899
3.27M
  unsigned best_ssd;
1900
3.27M
  unsigned best_rate;
1901
3.27M
  int      best_qii;
1902
3.27M
  unsigned cur_cost;
1903
3.27M
  unsigned cur_ssd;
1904
3.27M
  unsigned cur_rate;
1905
3.27M
  int      lambda;
1906
3.27M
  int      nblocks;
1907
3.27M
  int      nqis;
1908
3.27M
  int      pli;
1909
3.27M
  int      bi;
1910
3.27M
  int      qii;
1911
3.27M
  lambda=_enc->lambda;
1912
  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
1913
     worth spending the bits to change the AC quantizer.
1914
    TODO: This may be worth revisiting when we separate out DC and AC
1915
     predictions from SATD.*/
1916
#if 0
1917
  nqis=_enc->state.nqis;
1918
#else
1919
3.27M
  nqis=1;
1920
3.27M
#endif
1921
3.27M
  ssd=_modec->ssd;
1922
3.27M
  rate=_modec->rate;
1923
  /*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded
1924
     order, we assume a constant overhead for coded block and qii flags.*/
1925
3.27M
  nblocks=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1926
3.27M
  nblocks=(nblocks-4>>1)+4;
1927
3.27M
  bi=4;
1928
9.81M
  for(pli=1;pli<3;pli++){
1929
14.1M
    for(;bi<nblocks;bi++){
1930
7.63M
      unsigned best_cost;
1931
7.63M
      satd=_frag_satd[bi];
1932
7.63M
      best_rate=oc_dct_cost2(_enc,&best_ssd,0,pli,_qti,satd)
1933
7.63M
       +OC_CHROMA_QII_RATE;
1934
7.63M
      best_ssd=OC_RD_SCALE(best_ssd,_rd_scale);
1935
7.63M
      best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1936
7.63M
      best_qii=0;
1937
7.63M
      for(qii=1;qii<nqis;qii++){
1938
0
        cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,pli,_qti,satd)
1939
0
         +OC_CHROMA_QII_RATE;
1940
0
        cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
1941
0
        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1942
0
        if(cur_cost<best_cost){
1943
0
          best_cost=cur_cost;
1944
0
          best_ssd=cur_ssd;
1945
0
          best_rate=cur_rate;
1946
0
          best_qii=qii;
1947
0
        }
1948
0
      }
1949
7.63M
      if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)){
1950
5.31M
        cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1951
5.31M
        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
1952
5.31M
        if(cur_cost<=best_cost){
1953
2.58M
          best_ssd=cur_ssd;
1954
2.58M
          best_rate=0;
1955
2.58M
          best_qii+=4;
1956
2.58M
        }
1957
5.31M
      }
1958
7.63M
      rate+=best_rate;
1959
7.63M
      ssd+=best_ssd;
1960
7.63M
      _modec->qii[bi]=best_qii;
1961
7.63M
    }
1962
6.54M
    nblocks=(nblocks-4<<1)+4;
1963
6.54M
  }
1964
3.27M
  _modec->ssd=ssd;
1965
3.27M
  _modec->rate=rate;
1966
3.27M
}
1967
1968
/*Computes the SSD between fragment _fragi of the source frame (_src) and the
   co-located fragment of the frame _ref.
  Fragments with a border entry are measured with that border's mask.*/
static unsigned oc_skip_frag_ssd(oc_enc_ctx *_enc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ptrdiff_t _fragi){
  ptrdiff_t frag_offs;
  int       borderi;
  borderi=_enc->state.frags[_fragi].borderi;
  frag_offs=_enc->state.frag_buf_offs[_fragi];
  if(borderi<0){
    return oc_enc_frag_ssd(_enc,_src+frag_offs,_ref+frag_offs,_ystride);
  }
  return oc_enc_frag_border_ssd(_enc,
   _src+frag_offs,_ref+frag_offs,_ystride,_enc->state.borders[borderi].mask);
}

/*Computes the distortion of leaving each block of macro block _mbi uncoded,
   i.e., of copying it from the previous frame.
  _pipe:     Receives a copy of each value in its skip_ssd arrays, indexed
              relative to the plane's froffset.
  _rd_scale: The scale factors of the four luma blocks, followed by the one
              shared by the chroma blocks (entry 4).
  _ssd:      Receives the scaled skip distortion of each block, luma first.*/
static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
 unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){
  const unsigned char   *src;
  const unsigned char   *ref;
  const ptrdiff_t       *luma_fragis;
  const oc_mb_map_plane *mb_map;
  const unsigned char   *map_idxs;
  oc_mv                 *mvs;
  ptrdiff_t              fragi;
  unsigned               skip_ssd;
  int                    ystride;
  int                    idx_end;
  int                    mapii;
  int                    pli;
  int                    bi;
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
  /*Luma blocks.*/
  ystride=_enc->state.ref_ystride[0];
  luma_fragis=_enc->state.sb_maps[_mbi>>2][_mbi&3];
  mvs=_enc->mb_info[_mbi].block_mv;
  for(bi=0;bi<4;bi++){
    fragi=luma_fragis[bi];
    skip_ssd=oc_skip_frag_ssd(_enc,src,ref,ystride,fragi);
    /*Scale to match DCT domain and RD.*/
    skip_ssd=OC_RD_SKIP_SCALE(skip_ssd,_rd_scale[bi]);
    /*Motion is a special case; if there is more than a full-pixel motion
       against the prior frame, penalize skipping.
      TODO: The factor of two here is a kludge, but it tested out better than a
       hard limit.*/
    if(mvs[bi]!=0)skip_ssd*=2;
    _ssd[bi]=skip_ssd;
    _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=skip_ssd;
  }
  /*Chroma blocks.*/
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
  idx_end=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  /*The first half of the chroma entries is handled with pli==1.*/
  idx_end=((idx_end-4)>>1)+4;
  mvs=_enc->mb_info[_mbi].unref_mv;
  mapii=4;
  for(pli=1;pli<3;pli++){
    ystride=_enc->state.ref_ystride[pli];
    while(mapii<idx_end){
      bi=map_idxs[mapii]&3;
      fragi=mb_map[pli][bi];
      skip_ssd=oc_skip_frag_ssd(_enc,src,ref,ystride,fragi);
      /*Scale to match DCT domain and RD.*/
      skip_ssd=OC_RD_SKIP_SCALE(skip_ssd,_rd_scale[4]);
      /*Motion is a special case; if there is more than a full-pixel motion
         against the prior frame, penalize skipping.
        TODO: The factor of two here is a kludge, but it tested out better than
         a hard limit*/
      if(mvs[OC_FRAME_PREV]!=0)skip_ssd*=2;
      _ssd[mapii]=skip_ssd;
      _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=skip_ssd;
      mapii++;
    }
    /*Extend the range to cover the second half of the chroma entries.*/
    idx_end=((idx_end-4)<<1)+4;
  }
}
2048
2049
2050
/*Estimates the cost of coding a macro block in INTRA mode.
  _modec receives the accumulated distortion and rate of the luma and chroma
   blocks, the overhead of signaling OC_MODE_INTRA, and the resulting cost.
  _frag_satd and _skip_ssd hold the per-block complexity and skip distortion,
   and _rd_scale the four luma scale factors followed by the chroma one.
  _mbi is not used by this function.*/
static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
 unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
 const unsigned _rd_scale[5]){
  unsigned chroma_rd_scale;
  chroma_rd_scale=_rd_scale[4];
  /*Luma first: it initializes the totals that the chroma pass extends.
    Both passes are given quantizer type 0.*/
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,
   _frag_satd,_skip_ssd,_rd_scale,0);
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
   _frag_satd,_skip_ssd,chroma_rd_scale,0);
  _modec->overhead=
   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE;
  oc_mode_set_cost(_modec,_enc->lambda);
}
2061
2062
static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2063
 unsigned _mbi,int _mb_mode,oc_mv _mv,
2064
 const oc_fr_state *_fr,const oc_qii_state *_qs,
2065
2.27M
 const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2066
2.27M
  unsigned               frag_satd[12];
2067
2.27M
  const unsigned char   *src;
2068
2.27M
  const unsigned char   *ref;
2069
2.27M
  int                    ystride;
2070
2.27M
  const ptrdiff_t       *frag_buf_offs;
2071
2.27M
  const ptrdiff_t       *sb_map;
2072
2.27M
  const oc_mb_map_plane *mb_map;
2073
2.27M
  const unsigned char   *map_idxs;
2074
2.27M
  int                    map_nidxs;
2075
2.27M
  int                    mapii;
2076
2.27M
  int                    mapi;
2077
2.27M
  int                    mv_offs[2];
2078
2.27M
  int                    pli;
2079
2.27M
  int                    bi;
2080
2.27M
  ptrdiff_t              fragi;
2081
2.27M
  ptrdiff_t              frag_offs;
2082
2.27M
  int                    dc;
2083
2.27M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
2084
2.27M
  ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
2085
2.27M
  ystride=_enc->state.ref_ystride[0];
2086
2.27M
  frag_buf_offs=_enc->state.frag_buf_offs;
2087
2.27M
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
2088
2.27M
  _modec->rate=_modec->ssd=0;
2089
2.27M
  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv)>1){
2090
2.10M
    for(bi=0;bi<4;bi++){
2091
1.68M
      fragi=sb_map[bi];
2092
1.68M
      frag_offs=frag_buf_offs[fragi];
2093
1.68M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2094
1.68M
        frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2095
1.68M
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2096
1.68M
        frag_satd[bi]+=abs(dc);
2097
1.68M
      }
2098
0
      else{
2099
0
        frag_satd[bi]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
2100
0
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
2101
0
      }
2102
1.68M
    }
2103
420k
  }
2104
1.85M
  else{
2105
9.26M
    for(bi=0;bi<4;bi++){
2106
7.41M
      fragi=sb_map[bi];
2107
7.41M
      frag_offs=frag_buf_offs[fragi];
2108
7.41M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2109
7.41M
        frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2110
7.41M
         ref+frag_offs+mv_offs[0],ystride);
2111
7.41M
        frag_satd[bi]+=abs(dc);
2112
7.41M
      }
2113
0
      else{
2114
0
        frag_satd[bi]=oc_enc_frag_sad(_enc,src+frag_offs,
2115
0
         ref+frag_offs+mv_offs[0],ystride);
2116
0
      }
2117
7.41M
    }
2118
1.85M
  }
2119
2.27M
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2120
2.27M
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2121
2.27M
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2122
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
2123
2.27M
  ystride=_enc->state.ref_ystride[1];
2124
2.27M
  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,_mv)>1){
2125
2.58M
    for(mapii=4;mapii<map_nidxs;mapii++){
2126
1.77M
      mapi=map_idxs[mapii];
2127
1.77M
      pli=mapi>>2;
2128
1.77M
      bi=mapi&3;
2129
1.77M
      fragi=mb_map[pli][bi];
2130
1.77M
      frag_offs=frag_buf_offs[fragi];
2131
1.77M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2132
1.77M
        frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2133
1.77M
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2134
1.77M
        frag_satd[mapii]+=abs(dc);
2135
1.77M
      }
2136
0
      else{
2137
0
        frag_satd[mapii]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
2138
0
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
2139
0
      }
2140
1.77M
    }
2141
809k
  }
2142
1.46M
  else{
2143
4.98M
    for(mapii=4;mapii<map_nidxs;mapii++){
2144
3.52M
      mapi=map_idxs[mapii];
2145
3.52M
      pli=mapi>>2;
2146
3.52M
      bi=mapi&3;
2147
3.52M
      fragi=mb_map[pli][bi];
2148
3.52M
      frag_offs=frag_buf_offs[fragi];
2149
3.52M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2150
3.52M
        frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2151
3.52M
         ref+frag_offs+mv_offs[0],ystride);
2152
3.52M
        frag_satd[mapii]+=abs(dc);
2153
3.52M
      }
2154
0
      else{
2155
0
        frag_satd[mapii]=oc_enc_frag_sad(_enc,src+frag_offs,
2156
0
         ref+frag_offs+mv_offs[0],ystride);
2157
0
      }
2158
3.52M
    }
2159
1.46M
  }
2160
2.27M
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1);
2161
2.27M
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2162
2.27M
   frag_satd,_skip_ssd,_rd_scale[4],1);
2163
2.27M
  _modec->overhead=
2164
2.27M
   oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE;
2165
2.27M
  oc_mode_set_cost(_modec,_enc->lambda);
2166
2.27M
}
2167
2168
/*Estimates the cost of coding macro block _mbi in an inter mode that uses a
   zero motion vector.
  This is oc_cost_inter() with the MV fixed at 0.*/
static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
 unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs,
 const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
  oc_mv zero_mv;
  zero_mv=0;
  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,zero_mv,
   _fr,_qs,_skip_ssd,_rd_scale);
}
2173
2174
static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2175
 unsigned _mbi,int _mb_mode,oc_mv _mv,
2176
 const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12],
2177
1.01M
 const unsigned _rd_scale[5]){
2178
1.01M
  int bits0;
2179
1.01M
  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd,_rd_scale);
2180
1.01M
  bits0=OC_MV_BITS[0][OC_MV_X(_mv)+31]+OC_MV_BITS[0][OC_MV_Y(_mv)+31];
2181
1.01M
  _modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12)
2182
1.01M
   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
2183
1.01M
  oc_mode_set_cost(_modec,_enc->lambda);
2184
1.01M
  return bits0;
2185
1.01M
}
2186
2187
/*A mapping from oc_mb_map (raster) ordering to oc_sb_map (Hilbert) ordering.
  There is one row of four block indices for each of four cases.*/
static const unsigned char OC_MB_PHASE[4][4]={
  {0,1,3,2},
  {0,3,1,2},
  {0,3,1,2},
  {2,3,1,0}
};
2191
2192
/*Estimates the cost of coding macro block _mbi in OC_MODE_INTER_MV_FOUR, with
   one motion vector per luma block.
  _mv holds the four luma block MVs in oc_mb_map (raster) order.
  As a side effect, these MVs are written into the frame's fragment MV array
   for the four luma fragments.
  The rate, distortion, overhead and final cost are left in _modec.*/
static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
 unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs,
 const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
  oc_set_chroma_mvs_func  set_chroma_mvs;
  const oc_mb_map_plane  *mb_map;
  const unsigned char    *phase;
  const unsigned char    *map_idxs;
  const unsigned char    *src;
  const unsigned char    *ref;
  const ptrdiff_t        *frag_buf_offs;
  oc_mv                  *frag_mvs;
  oc_mv                   luma_mvs[4];
  oc_mv                   chroma_mvs[4];
  unsigned                frag_satd[12];
  unsigned                satd;
  ptrdiff_t               fragi;
  ptrdiff_t               frag_offs;
  int                     mv_offs[2];
  int                     map_nidxs;
  int                     ystride;
  int                     nqis;
  int                     table_bits;
  int                     fixed_bits;
  int                     mapii;
  int                     mapi;
  int                     pli;
  int                     bi;
  int                     dc;
  frag_buf_offs=_enc->state.frag_buf_offs;
  frag_mvs=_enc->state.frag_mvs;
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
  ystride=_enc->state.ref_ystride[0];
  /*Row of the raster->Hilbert table for this macro block's quadrant.*/
  phase=OC_MB_PHASE[_mbi&3];
  _modec->ssd=0;
  _modec->rate=0;
  /*Luma: measure each block against the reference displaced by its own MV.*/
  for(bi=0;bi<4;bi++){
    fragi=mb_map[0][bi];
    /*Save the block MVs as the current ones while we're here; we'll replace
       them if we don't ultimately choose 4MV mode.*/
    frag_mvs[fragi]=_mv[bi];
    frag_offs=frag_buf_offs[fragi];
    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv[bi])<=1){
      /*A single reference offset is enough for this MV.*/
      satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
       ref+frag_offs+mv_offs[0],ystride);
    }
    else{
      /*Two reference offsets are needed for this MV.*/
      satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
    }
    /*The SATD array is stored in Hilbert order, not raster order.*/
    frag_satd[phase[bi]]=satd+abs(dc);
  }
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
   _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1);
  /*Figure out which blocks are being skipped and give them (0,0) MVs.
    A qii value of nqis or more marks a block as skipped; only the remaining
     blocks contribute MV bits.*/
  nqis=_enc->state.nqis;
  table_bits=0;
  fixed_bits=0;
  for(bi=0;bi<4;bi++){
    if(_modec->qii[phase[bi]]<nqis){
      luma_mvs[bi]=_mv[bi];
      table_bits+=OC_MV_BITS[0][OC_MV_X(_mv[bi])+31]
       +OC_MV_BITS[0][OC_MV_Y(_mv[bi])+31];
      fixed_bits+=12;
    }
    else luma_mvs[bi]=0;
  }
  /*Derive the chroma MVs from the (possibly zeroed) luma MVs.*/
  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
  (*set_chroma_mvs)(chroma_mvs,luma_mvs);
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
  ystride=_enc->state.ref_ystride[1];
  for(mapii=4;mapii<map_nidxs;mapii++){
    mapi=map_idxs[mapii];
    pli=mapi>>2;
    bi=mapi&3;
    fragi=mb_map[pli][bi];
    frag_offs=frag_buf_offs[fragi];
    /*TODO: We could save half these calls by re-using the results for the Cb
       and Cr planes; is it worth it?*/
    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,chroma_mvs[bi])<=1){
      satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
       ref+frag_offs+mv_offs[0],ystride);
    }
    else{
      satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
    }
    frag_satd[mapii]=satd+abs(dc);
  }
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
   frag_satd,_skip_ssd,_rd_scale[4],1);
  /*Overhead is the mode flag cost plus the growth in the cheaper of the two
     running MV bit totals, scaled by OC_BIT_SCALE.*/
  _modec->overhead=
   (oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR)
   +OC_MINI(_enc->mv_bits[0]+table_bits,_enc->mv_bits[1]+fixed_bits)
   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1]))<<OC_BIT_SCALE;
  oc_mode_set_cost(_modec,_enc->lambda);
}
2287
2288
30.7k
/*Performs mode decision, quantization and tokenization for an inter frame.
  Super blocks are processed one MCU stripe at a time.
  For every valid macro block the candidate coding modes are costed, the
   cheapest is chosen, and its luma blocks are transformed and quantized;
   the chroma planes of each stripe are coded afterwards.
  _enc:            The encoder context.
  _allow_keyframe: If non-zero, the cost of coding every macro block as intra
                    in a keyframe is also accumulated, and the frame is
                    rejected when the inter estimate is larger.
  _recode:         If non-zero, the motion search is not re-run and the
                    per-macro block unrefined MVs and refinement flags are
                    kept as they are.
  Return: 1 if _allow_keyframe is set and the estimated inter bit count
           exceeds the estimated intra bit count, in which case the coded
           macro block count and the coded fragment list are NOT finalized;
           0 otherwise.*/
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
  oc_set_chroma_mvs_func  set_chroma_mvs;
  oc_qii_state            intra_luma_qs;
  oc_mv                   last_mv;
  oc_mv                   prior_mv;
  ogg_int64_t             interbits;
  ogg_int64_t             intrabits;
  ogg_int64_t             activity_sum;
  ogg_int64_t             luma_sum;
  unsigned                activity_avg;
  unsigned                luma_avg;
  const ogg_uint16_t     *chroma_rd_scale;
  ogg_uint16_t           *mcu_rd_scale;
  ogg_uint16_t           *mcu_rd_iscale;
  const unsigned char    *map_idxs;
  int                     nmap_idxs;
  unsigned               *coded_mbis;
  unsigned               *uncoded_mbis;
  size_t                  ncoded_mbis;
  size_t                  nuncoded_mbis;
  oc_sb_flags            *sb_flags;
  signed char            *mb_modes;
  const oc_sb_map        *sb_maps;
  const oc_mb_map        *mb_maps;
  oc_mb_enc_info         *embs;
  oc_fragment            *frags;
  oc_mv                  *frag_mvs;
  unsigned                stripe_sby;
  unsigned                mcu_nvsbs;
  int                     notstart;
  int                     notdone;
  unsigned                sbi;
  unsigned                sbi_end;
  int                     refi;
  int                     pli;
  int                     sp_level;
  sp_level=_enc->sp_level;
  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
  _enc->state.frame_type=OC_INTER_FRAME;
  oc_mode_scheme_chooser_reset(&_enc->chooser);
  oc_enc_tokenize_start(_enc);
  oc_enc_pipeline_init(_enc,&_enc->pipe);
  oc_enc_mode_rd_init(_enc);
  if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
  _enc->mv_bits[0]=_enc->mv_bits[1]=0;
  interbits=intrabits=0;
  activity_sum=luma_sum=0;
  activity_avg=_enc->activity_avg;
  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTER_FRAME][_enc->state.qis[0]];
  mcu_rd_scale=_enc->mcu_rd_scale;
  mcu_rd_iscale=_enc->mcu_rd_iscale;
  last_mv=prior_mv=0;
  /*Choose MVs and MB modes and quantize and code luma.
    Must be done in Hilbert order.*/
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  /*Coded MB indices are appended from the front of the buffer; uncoded ones
     are written backwards from its end (nmbs entries in).*/
  coded_mbis=_enc->coded_mbis;
  uncoded_mbis=coded_mbis+_enc->state.nmbs;
  ncoded_mbis=0;
  nuncoded_mbis=0;
  _enc->state.ncoded_fragis[0]=0;
  _enc->state.ncoded_fragis[1]=0;
  _enc->state.ncoded_fragis[2]=0;
  sb_flags=_enc->state.sb_flags;
  mb_modes=_enc->state.mb_modes;
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
  embs=_enc->mb_info;
  frags=_enc->state.frags;
  frag_mvs=_enc->state.frag_mvs;
  notstart=0;
  notdone=1;
  mcu_nvsbs=_enc->mcu_nvsbs;
  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
    ptrdiff_t cfroffset;
    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
    sbi_end=_enc->pipe.sbi_end[0];
    cfroffset=_enc->pipe.froffset[1];
    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
      int quadi;
      /*Mode addressing is through Y plane, always 4 MB per SB.*/
      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
        oc_mode_choice modes[8];
        unsigned       activity[4];
        unsigned       rd_scale[5];
        unsigned       rd_iscale[5];
        unsigned       skip_ssd[12];
        unsigned       intra_satd[12];
        unsigned       luma;
        int            mb_mv_bits_0;
        int            mb_gmv_bits_0;
        int            inter_mv_pref;
        int            mb_mode;
        /*Note: the reference frame index for this MB is kept in the
           function-scope refi, which is reassigned before its use after the
           stripe loop.*/
        oc_mv          mv;
        unsigned       mbi;
        int            mapii;
        int            mapi;
        int            bi;
        ptrdiff_t      fragi;
        mbi=sbi<<2|quadi;
        luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
        /*Activity masking.*/
        if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
          oc_mb_activity(_enc,mbi,activity);
        }
        else oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
        luma_sum+=luma;
        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
        /*Motion estimation:
          We always do a basic 1MV search for all macroblocks, coded or not,
           keyframe or not.*/
        if(!_recode&&sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
        mv=0;
        /*Find the block choice with the lowest estimated coding cost.
          If a Cb or Cr block is coded but no Y' block from a macro block then
           the mode MUST be OC_MODE_INTER_NOMV.
          This is the default state to which the mode data structure is
           initialised in encoder and decoder at the start of each frame.*/
        /*Block coding cost is estimated from correlated SATD metrics.*/
        /*At this point, all blocks that are in frame are still marked coded.*/
        if(!_recode){
          embs[mbi].unref_mv[OC_FRAME_GOLD]=
           embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
          embs[mbi].unref_mv[OC_FRAME_PREV]=
           embs[mbi].analysis_mv[0][OC_FRAME_PREV];
          embs[mbi].refined=0;
        }
        /*Estimate the cost of coding this MB in a keyframe.*/
        if(_allow_keyframe){
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
           _enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
          intrabits+=modes[OC_MODE_INTRA].rate;
          for(bi=0;bi<4;bi++){
            oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
             modes[OC_MODE_INTRA].qii[bi]);
          }
        }
        /*Estimate the cost in a delta frame for various modes.*/
        oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd);
        if(sp_level<OC_SP_LEVEL_NOMC){
          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
           skip_ssd,rd_scale);
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
           OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
           OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
           skip_ssd,rd_scale);
          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
           OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
           skip_ssd,rd_scale);
          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
           skip_ssd,rd_scale);
          mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
           OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
          /*The explicit MV modes (2,6,7) have not yet gone through halfpel
             refinement.
            We choose the explicit MV mode that's already furthest ahead on
             R-D cost and refine only that one.
            We have to be careful to remember which ones we've refined so that
             we don't refine it again if we re-encode this frame.
            The refined flags used below are 0x80 for 4MV, 0x40 for the golden
             frame 1MV and 0x04 for the previous frame 1MV.*/
          inter_mv_pref=_enc->lambda*3;
          if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
             embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
             skip_ssd,rd_scale);
          }
          else{
            /*4MV is not evaluated at this speed level; make sure it can never
               win a cost comparison.*/
            modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX;
          }
          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
           modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
            if(!(embs[mbi].refined&0x80)){
              oc_mcenc_refine4mv(_enc,mbi);
              embs[mbi].refined|=0x80;
            }
            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
             embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
             skip_ssd,rd_scale);
          }
          else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
           modes[OC_MODE_INTER_MV].cost){
            if(!(embs[mbi].refined&0x40)){
              oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
              embs[mbi].refined|=0x40;
            }
            mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
             OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
             _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
          }
          /*The previous frame 1MV is always refined (once) and re-costed.*/
          if(!(embs[mbi].refined&0x04)){
            oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
            embs[mbi].refined|=0x04;
          }
          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
           OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
          /*Finally, pick the mode with the cheapest estimated R-D cost.*/
          mb_mode=OC_MODE_INTER_NOMV;
          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
            mb_mode=OC_MODE_INTRA;
          }
          if(modes[OC_MODE_INTER_MV_LAST].cost<modes[mb_mode].cost){
            mb_mode=OC_MODE_INTER_MV_LAST;
          }
          if(modes[OC_MODE_INTER_MV_LAST2].cost<modes[mb_mode].cost){
            mb_mode=OC_MODE_INTER_MV_LAST2;
          }
          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
            mb_mode=OC_MODE_GOLDEN_NOMV;
          }
          if(modes[OC_MODE_GOLDEN_MV].cost<modes[mb_mode].cost){
            mb_mode=OC_MODE_GOLDEN_MV;
          }
          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[mb_mode].cost){
            mb_mode=OC_MODE_INTER_MV_FOUR;
          }
          /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
          if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
            inter_mv_pref=0;
          }
          if(modes[OC_MODE_INTER_MV].cost<modes[mb_mode].cost+inter_mv_pref){
            mb_mode=OC_MODE_INTER_MV;
          }
        }
        else{
          /*No motion compensation at this speed level: only the zero-MV
             modes and intra are candidates.*/
          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
           skip_ssd,rd_scale);
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
           skip_ssd,rd_scale);
          mb_mode=OC_MODE_INTER_NOMV;
          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
            mb_mode=OC_MODE_INTRA;
          }
          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
            mb_mode=OC_MODE_GOLDEN_NOMV;
          }
          mb_mv_bits_0=mb_gmv_bits_0=0;
        }
        mb_modes[mbi]=mb_mode;
        /*Propagate the MVs to the luma blocks.*/
        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
          /*Modes not listed here keep the zero MV assigned above.*/
          switch(mb_mode){
            case OC_MODE_INTER_MV:{
              mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
            }break;
            case OC_MODE_INTER_MV_LAST:mv=last_mv;break;
            case OC_MODE_INTER_MV_LAST2:mv=prior_mv;break;
            case OC_MODE_GOLDEN_MV:{
              mv=embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
            }break;
          }
          for(bi=0;bi<4;bi++){
            fragi=mb_maps[mbi][0][bi];
            frag_mvs[fragi]=mv;
          }
        }
        for(bi=0;bi<4;bi++){
          fragi=sb_maps[mbi>>2][mbi&3][bi];
          frags[fragi].qii=modes[mb_mode].qii[bi];
        }
        if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi,
         modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){
          int orig_mb_mode;
          /*The mode is re-read after quantization: the INTER_MV case below
             handles it having been changed from 4MV.*/
          orig_mb_mode=mb_mode;
          mb_mode=mb_modes[mbi];
          refi=OC_FRAME_FOR_MODE(mb_mode);
          switch(mb_mode){
            case OC_MODE_INTER_MV:{
              prior_mv=last_mv;
              /*If we're backing out from 4MV, find the MV we're actually
                 using.
                NOTE(review): this loop has no upper bound on bi; it relies
                 on at least one luma fragment being marked coded, which the
                 positive return value above presumably guarantees.*/
              if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
                for(bi=0;;bi++){
                  fragi=mb_maps[mbi][0][bi];
                  if(frags[fragi].coded){
                    mv=last_mv=frag_mvs[fragi];
                    break;
                  }
                }
                mb_mv_bits_0=OC_MV_BITS[0][OC_MV_X(mv)+31]
                 +OC_MV_BITS[0][OC_MV_Y(mv)+31];
              }
              /*Otherwise we used the original analysis MV.*/
              else last_mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
              _enc->mv_bits[0]+=mb_mv_bits_0;
              _enc->mv_bits[1]+=12;
            }break;
            case OC_MODE_INTER_MV_LAST2:{
              oc_mv tmp_mv;
              /*Swap the last and second-to-last MVs.*/
              tmp_mv=prior_mv;
              prior_mv=last_mv;
              last_mv=tmp_mv;
            }break;
            case OC_MODE_GOLDEN_MV:{
              _enc->mv_bits[0]+=mb_gmv_bits_0;
              _enc->mv_bits[1]+=12;
            }break;
            case OC_MODE_INTER_MV_FOUR:{
              oc_mv lbmvs[4];
              oc_mv cbmvs[4];
              prior_mv=last_mv;
              for(bi=0;bi<4;bi++){
                fragi=mb_maps[mbi][0][bi];
                if(frags[fragi].coded){
                  lbmvs[bi]=last_mv=frag_mvs[fragi];
                  _enc->mv_bits[0]+=OC_MV_BITS[0][OC_MV_X(last_mv)+31]
                   +OC_MV_BITS[0][OC_MV_Y(last_mv)+31];
                  _enc->mv_bits[1]+=12;
                }
                /*Replace the block MVs for not-coded blocks with (0,0).*/
                else lbmvs[bi]=0;
              }
              (*set_chroma_mvs)(cbmvs,lbmvs);
              for(mapii=4;mapii<nmap_idxs;mapii++){
                mapi=map_idxs[mapii];
                pli=mapi>>2;
                bi=mapi&3;
                fragi=mb_maps[mbi][pli][bi];
                frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
                frags[fragi].refi=refi;
                frags[fragi].mb_mode=mb_mode;
                frag_mvs[fragi]=cbmvs[bi];
              }
            }break;
          }
          coded_mbis[ncoded_mbis++]=mbi;
          oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
          interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
        }
        else{
          /*No luma block was coded: record the MB in the uncoded list and
             fall back to the default mode with a zero MV.*/
          *(uncoded_mbis-++nuncoded_mbis)=mbi;
          mb_mode=OC_MODE_INTER_NOMV;
          refi=OC_FRAME_PREV;
          mv=0;
        }
        /*Propagate final MB mode and MVs to the chroma blocks.
          This has already been done for 4MV mode, since it requires individual
           block motion vectors.*/
        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
          for(mapii=4;mapii<nmap_idxs;mapii++){
            mapi=map_idxs[mapii];
            pli=mapi>>2;
            bi=mapi&3;
            fragi=mb_maps[mbi][pli][bi];
            /*If we switched from 4MV mode to INTER_MV mode, then the qii
               values won't have been chosen with the right MV, but it's
               probably not worth re-estimating them.*/
            frags[fragi].qii=modes[mb_mode].qii[mapii];
            frags[fragi].refi=refi;
            frags[fragi].mb_mode=mb_mode;
            frag_mvs[fragi]=mv;
          }
        }
        /*Save masking scale factors for chroma blocks.
          Only the first half of the chroma map entries is visited, and the
           fragments are looked up in plane 1.*/
        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
          mapi=map_idxs[mapii];
          bi=mapi&3;
          fragi=mb_maps[mbi][1][bi];
          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
        }
      }
      oc_fr_state_flush_sb(_enc->pipe.fr+0);
      sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full;
      sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial;
    }
    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
    /*Code chroma planes.*/
    for(pli=1;pli<3;pli++){
      oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe,
       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
    }
    notstart=1;
  }
  /*Update the average block activity and MB luma score for the frame.
    We could use a Bessel follower here, but fast reaction is probably almost
     always best.*/
  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
   _enc->state.fplanes[0].nfrags));
  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
  /*Finish filling in the reference frame borders.*/
  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
  /*Finish adding flagging overhead costs to inter bit counts to determine if
     we should have coded a key frame instead.*/
  if(_allow_keyframe){
    /*Technically the chroma plane counts are over-estimations, because they
       don't account for continuing runs from the luma planes, but the
       inaccuracy is small.
      We don't need to add the luma plane coding flag costs, because they are
       already included in the MB rate estimates.*/
    for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE;
    if(interbits>intrabits)return 1;
  }
  _enc->ncoded_mbis=ncoded_mbis;
  /*Compact the coded fragment list.
    Each chroma plane's entries start at that plane's fragment offset; they
     are moved down so all coded fragments are contiguous.*/
  {
    ptrdiff_t ncoded_fragis;
    ncoded_fragis=_enc->state.ncoded_fragis[0];
    for(pli=1;pli<3;pli++){
      memmove(_enc->state.coded_fragis+ncoded_fragis,
       _enc->state.coded_fragis+_enc->state.fplanes[pli].froffset,
       _enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis));
      ncoded_fragis+=_enc->state.ncoded_fragis[pli];
    }
    _enc->state.ntotal_coded_fragis=ncoded_fragis;
  }
  return 0;
}