Coverage Report

Created: 2024-09-06 07:53

/src/theora/lib/analyze.c
Line
Count
Source
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function: mode selection code
14
  last mod: $Id$
15
16
 ********************************************************************/
17
#include <limits.h>
18
#include <string.h>
19
#include "encint.h"
20
#include "modedec.h"
21
#if defined(OC_COLLECT_METRICS)
22
# include "collect.c"
23
#endif
24
25
26
27
typedef struct oc_rd_metric          oc_rd_metric;
28
typedef struct oc_mode_choice        oc_mode_choice;
29
30
31
32
/*There are 8 possible schemes used to encode macro block modes.
33
  Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
34
  The same set of Huffman codes is used for each of these 7 schemes, but the
35
   mode assigned to each codeword varies.
36
  Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream,
37
   while schemes 1-6 have a fixed mapping.
38
  Scheme 7 just encodes each mode directly in 3 bits.*/
39
40
/*The mode orderings for the various mode coding schemes.
41
  Scheme 0 uses a custom alphabet, which is not stored in this table.
42
  This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
43
   decoder.*/
44
static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
45
  /*Last MV dominates.*/
46
  /*L P M N I G GM 4*/
47
  {3,4,2,0,1,5,6,7},
48
  /*L P N M I G GM 4*/
49
  {2,4,3,0,1,5,6,7},
50
  /*L M P N I G GM 4*/
51
  {3,4,1,0,2,5,6,7},
52
  /*L M N P I G GM 4*/
53
  {2,4,1,0,3,5,6,7},
54
  /*No MV dominates.*/
55
  /*N L P M I G GM 4*/
56
  {0,4,3,1,2,5,6,7},
57
  /*N G L P M I GM 4*/
58
  {0,5,4,2,3,1,6,7},
59
  /*Default ordering.*/
60
  /*N I M L P G GM 4*/
61
  {0,1,2,3,4,5,6,7}
62
};
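
To make the cost structure above concrete: schemes 1 through 6 all share one maximally-skewed code, so a mode's cost depends only on its rank in the chosen ordering, while scheme 7 always spends 3 bits. A minimal sketch, with the code lengths assumed from that description (the encoder's real table is OC_MODE_BITS, defined elsewhere):

    #include <stdio.h>

    /*Hypothetical stand-ins for the encoder's OC_MODE_BITS table: codebook 0
       is the maximally-skewed Huffman code shared by schemes 0-6, codebook 1
       is scheme 7's flat 3-bit code.*/
    static const int MODE_BITS[2][8]={
      {1,2,3,4,5,6,7,7},
      {3,3,3,3,3,3,3,3}
    };

    /*Bits to code a mode whose rank in the scheme's ordering is _rank.*/
    static int mode_cost(int _scheme,int _rank){
      return MODE_BITS[_scheme+1>>3][_rank];
    }

    int main(void){
      /*Rank 0 costs 1 bit under schemes 0-6 but 3 bits under scheme 7.*/
      printf("%d %d\n",mode_cost(1,0),mode_cost(7,0));
      return 0;
    }
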
63
64
65
66
/*Initialize the mode scheme chooser.
67
  This need only be called once per encoder.*/
68
3.44k
void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
69
3.44k
  int si;
70
3.44k
  _chooser->mode_ranks[0]=_chooser->scheme0_ranks;
71
27.5k
  for(si=1;si<8;si++)_chooser->mode_ranks[si]=OC_MODE_RANKS[si-1];
72
3.44k
}
73
74
/*Reset the mode scheme chooser.
75
  This needs to be called once for each frame, including the first.*/
76
39.4k
static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
77
39.4k
  int si;
78
39.4k
  memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
79
  /*Scheme 0 starts with 24 bits to store the mode list in.*/
80
39.4k
  _chooser->scheme_bits[0]=24;
81
39.4k
  memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
82
354k
  for(si=0;si<8;si++){
83
    /*Scheme 7 should always start first, and scheme 0 should always start
84
       last.*/
85
315k
    _chooser->scheme_list[si]=7-si;
86
315k
    _chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
87
315k
  }
88
39.4k
}
89
90
/*Return the cost of coding _mb_mode in the specified scheme.*/
91
static int oc_mode_scheme_chooser_scheme_mb_cost(
92
13.3M
 const oc_mode_scheme_chooser *_chooser,int _scheme,int _mb_mode){
93
13.3M
  int codebook;
94
13.3M
  int ri;
95
13.3M
  codebook=_scheme+1>>3;
96
  /*For any scheme except 0, we can just use the bit cost of the mode's rank
97
     in that scheme.*/
98
13.3M
  ri=_chooser->mode_ranks[_scheme][_mb_mode];
99
13.3M
  if(_scheme==0){
100
2.06M
    int mc;
101
    /*For scheme 0, incrementing the mode count could potentially change the
102
       mode's rank.
103
      Find the index where the mode would be moved to in the optimal list,
104
       and use its bit cost instead of the one for the mode's current
105
       position in the list.*/
106
    /*We don't actually reorder the list; this is for computing opportunity
107
       cost, not an update.*/
108
2.06M
    mc=_chooser->mode_counts[_mb_mode];
109
5.27M
    while(ri>0&&mc>=_chooser->mode_counts[_chooser->scheme0_list[ri-1]])ri--;
110
2.06M
  }
111
13.3M
  return OC_MODE_BITS[codebook][ri];
112
13.3M
}
113
114
/*This is the real purpose of this data structure: not actually selecting a
115
   mode scheme, but estimating the cost of coding a given mode, given all the
116
   modes selected so far.
117
  This is done via opportunity cost: the cost is defined as the number of bits
118
   required to encode all the modes selected so far including the current one
119
   using the best possible scheme, minus the number of bits required to encode
120
   all the modes selected so far not including the current one using the best
121
   possible scheme.
122
  The computational expense of doing this probably makes it overkill.
123
  Just be happy we take a greedy approach instead of trying to solve the
124
   global mode-selection problem (which is NP-hard).
125
  _mb_mode: The mode to determine the cost of.
126
  Return: The number of bits required to code this mode.*/
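
Restated as code, the definition above says: the price of a mode is the best achievable total over all schemes after appending it, minus the best achievable total before. A brute-force sketch of exactly that definition, assuming per-scheme bit totals have been materialized into arrays (the real function below never does this; it keeps the schemes sorted and bails out early):

    #include <limits.h>

    /*Opportunity cost of appending one mode, given the total bits each of
       the 8 schemes would need before (_pre) and after (_post) the addition:
       best total after, minus best total before.*/
    static int mode_opportunity_cost(const int _pre[8],const int _post[8]){
      int best_pre;
      int best_post;
      int si;
      best_pre=best_post=INT_MAX;
      for(si=0;si<8;si++){
        if(_pre[si]<best_pre)best_pre=_pre[si];
        if(_post[si]<best_post)best_post=_post[si];
      }
      return best_post-best_pre;
    }
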
127
static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
128
4.26M
 int _mb_mode){
129
4.26M
  int scheme0;
130
4.26M
  int scheme1;
131
4.26M
  int best_bits;
132
4.26M
  int mode_bits;
133
4.26M
  int si;
134
4.26M
  int scheme0_bits;
135
4.26M
  int scheme1_bits;
136
4.26M
  scheme0=_chooser->scheme_list[0];
137
4.26M
  scheme1=_chooser->scheme_list[1];
138
4.26M
  scheme0_bits=_chooser->scheme_bits[scheme0];
139
4.26M
  scheme1_bits=_chooser->scheme_bits[scheme1];
140
4.26M
  mode_bits=oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme0,_mb_mode);
141
  /*Typical case: If the difference between the best scheme and the next best
142
     is greater than 6 bits, then adding just one mode cannot change which
143
     scheme we use.*/
144
4.26M
  if(scheme1_bits-scheme0_bits>6)return mode_bits;
145
  /*Otherwise, check to see if adding this mode selects a different scheme as
146
     the best.*/
147
1.93M
  si=1;
148
1.93M
  best_bits=scheme0_bits+mode_bits;
149
9.06M
  do{
150
9.06M
    int cur_bits;
151
9.06M
    cur_bits=scheme1_bits+
152
9.06M
     oc_mode_scheme_chooser_scheme_mb_cost(_chooser,scheme1,_mb_mode);
153
9.06M
    if(cur_bits<best_bits)best_bits=cur_bits;
154
9.06M
    if(++si>=8)break;
155
9.06M
    scheme1=_chooser->scheme_list[si];
156
9.06M
    scheme1_bits=_chooser->scheme_bits[scheme1];
157
9.06M
  }
158
9.06M
  while(scheme1_bits-scheme0_bits<=6);
159
0
  return best_bits-scheme0_bits;
160
4.26M
}
161
162
/*Incrementally update the mode counts and per-scheme bit counts and re-order
163
   the scheme lists once a mode has been selected.
164
  _mb_mode: The mode that was chosen.*/
165
static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
166
346k
 int _mb_mode){
167
346k
  int ri;
168
346k
  int si;
169
346k
  _chooser->mode_counts[_mb_mode]++;
170
  /*Re-order the scheme0 mode list if necessary.*/
171
483k
  for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){
172
211k
    int pmode;
173
211k
    pmode=_chooser->scheme0_list[ri-1];
174
211k
    if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break;
175
    /*Reorder the mode ranking.*/
176
136k
    _chooser->scheme0_ranks[pmode]++;
177
136k
    _chooser->scheme0_list[ri]=pmode;
178
136k
  }
179
346k
  _chooser->scheme0_ranks[_mb_mode]=ri;
180
346k
  _chooser->scheme0_list[ri]=_mb_mode;
181
  /*Now add the bit cost for the mode to each scheme.*/
182
3.11M
  for(si=0;si<8;si++){
183
2.77M
    _chooser->scheme_bits[si]+=
184
2.77M
     OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]];
185
2.77M
  }
186
  /*Finally, re-order the list of schemes.*/
187
2.77M
  for(si=1;si<8;si++){
188
2.42M
    int sj;
189
2.42M
    int scheme0;
190
2.42M
    int bits0;
191
2.42M
    sj=si;
192
2.42M
    scheme0=_chooser->scheme_list[si];
193
2.42M
    bits0=_chooser->scheme_bits[scheme0];
194
2.70M
    do{
195
2.70M
      int scheme1;
196
2.70M
      scheme1=_chooser->scheme_list[sj-1];
197
2.70M
      if(bits0>=_chooser->scheme_bits[scheme1])break;
198
305k
      _chooser->scheme_list[sj]=scheme1;
199
305k
    }
200
2.42M
    while(--sj>0);
201
0
    _chooser->scheme_list[sj]=scheme0;
202
2.42M
  }
203
346k
}
204
205
206
207
/*The number of bits required to encode a super block run.
208
  _run_count: The desired run count; must be positive and less than 4130.*/
209
157M
static int oc_sb_run_bits(int _run_count){
210
157M
  int i;
211
552M
  for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
212
157M
  return OC_SB_RUN_CODE_NBITS[i];
213
157M
}
214
215
/*The number of bits required to encode a block run.
216
  _run_count: The desired run count; must be positive and less than 30.*/
217
27.2M
static int oc_block_run_bits(int _run_count){
218
27.2M
  return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
219
27.2M
}
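
For reference, both helpers above index the Theora bitstream's variable-length run codes: per the specification's tables, super block runs of 1, 2-3, 4-5, 6-9, 10-17, 18-33, and 34-4129 cost 1, 3, 4, 6, 8, 10, and 18 bits, and block runs of 1-2, 3-4, 5-6, 7-10, 11-14, and 15-30 cost 2, 3, 4, 6, 7, and 9 bits. A sketch of oc_sb_run_bits() with those assumed values inlined (the real OC_SB_RUN_VAL_MIN/OC_SB_RUN_CODE_NBITS arrays live in the encoder's shared tables):

    /*Bit cost of a super block run, per the spec's run-length coding table.
      Table values are assumed from the specification.*/
    static int sb_run_bits(int _run_count){
      static const int VAL_MIN[8]={1,2,4,6,10,18,34,4130};
      static const int NBITS[7]={1,3,4,6,8,10,18};
      int i;
      for(i=0;_run_count>=VAL_MIN[i+1];i++);
      return NBITS[i];
    }
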
220
221
222
223
186k
static void oc_fr_state_init(oc_fr_state *_fr){
224
186k
  _fr->bits=0;
225
186k
  _fr->sb_partial_count=0;
226
186k
  _fr->sb_full_count=0;
227
186k
  _fr->b_coded_count_prev=0;
228
186k
  _fr->b_coded_count=0;
229
186k
  _fr->b_count=0;
230
186k
  _fr->sb_prefer_partial=0;
231
186k
  _fr->sb_bits=0;
232
186k
  _fr->sb_partial=-1;
233
186k
  _fr->sb_full=-1;
234
186k
  _fr->b_coded_prev=-1;
235
186k
  _fr->b_coded=-1;
236
186k
}
237
238
239
static int oc_fr_state_sb_cost(const oc_fr_state *_fr,
240
14.7M
 int _sb_partial,int _sb_full){
241
14.7M
  int bits;
242
14.7M
  int sb_partial_count;
243
14.7M
  int sb_full_count;
244
14.7M
  bits=0;
245
14.7M
  sb_partial_count=_fr->sb_partial_count;
246
  /*Extend the sb_partial run, or start a new one.*/
247
14.7M
  if(_fr->sb_partial==_sb_partial){
248
3.26M
    if(sb_partial_count>=4129){
249
0
      bits++;
250
0
      sb_partial_count=0;
251
0
    }
252
3.26M
    else bits-=oc_sb_run_bits(sb_partial_count);
253
3.26M
  }
254
11.5M
  else sb_partial_count=0;
255
14.7M
  bits+=oc_sb_run_bits(++sb_partial_count);
256
14.7M
  if(!_sb_partial){
257
    /*Extend the sb_full run, or start a new one.*/
258
3.77M
    sb_full_count=_fr->sb_full_count;
259
3.77M
    if(_fr->sb_full==_sb_full){
260
1.60M
      if(sb_full_count>=4129){
261
0
        bits++;
262
0
        sb_full_count=0;
263
0
      }
264
1.60M
      else bits-=oc_sb_run_bits(sb_full_count);
265
1.60M
    }
266
2.16M
    else sb_full_count=0;
267
3.77M
    bits+=oc_sb_run_bits(++sb_full_count);
268
3.77M
  }
269
14.7M
  return bits;
270
14.7M
}
271
272
static void oc_fr_state_advance_sb(oc_fr_state *_fr,
273
255k
 int _sb_partial,int _sb_full){
274
255k
  int sb_partial_count;
275
255k
  int sb_full_count;
276
255k
  sb_partial_count=_fr->sb_partial_count;
277
255k
  if(_fr->sb_partial!=_sb_partial||sb_partial_count>=4129)sb_partial_count=0;
278
255k
  sb_partial_count++;
279
255k
  if(!_sb_partial){
280
179k
    sb_full_count=_fr->sb_full_count;
281
179k
    if(_fr->sb_full!=_sb_full||sb_full_count>=4129)sb_full_count=0;
282
179k
    sb_full_count++;
283
179k
    _fr->sb_full_count=sb_full_count;
284
179k
    _fr->sb_full=_sb_full;
285
    /*Roll back the partial block state.*/
286
179k
    _fr->b_coded=_fr->b_coded_prev;
287
179k
    _fr->b_coded_count=_fr->b_coded_count_prev;
288
179k
  }
289
75.6k
  else{
290
    /*Commit back the partial block state.*/
291
75.6k
    _fr->b_coded_prev=_fr->b_coded;
292
75.6k
    _fr->b_coded_count_prev=_fr->b_coded_count;
293
75.6k
  }
294
255k
  _fr->sb_partial_count=sb_partial_count;
295
255k
  _fr->sb_partial=_sb_partial;
296
255k
  _fr->b_count=0;
297
255k
  _fr->sb_prefer_partial=0;
298
255k
  _fr->sb_bits=0;
299
255k
}
300
301
/*Commit the state of the current super block and advance to the next.*/
302
255k
static void oc_fr_state_flush_sb(oc_fr_state *_fr){
303
255k
  int sb_partial;
304
255k
  int sb_full;
305
255k
  int b_coded_count;
306
255k
  int b_count;
307
255k
  b_count=_fr->b_count;
308
255k
  b_coded_count=_fr->b_coded_count;
309
255k
  sb_full=_fr->b_coded;
310
255k
  sb_partial=b_coded_count<b_count;
311
255k
  if(!sb_partial){
312
    /*If the super block is fully coded/uncoded...*/
313
180k
    if(_fr->sb_prefer_partial){
314
      /*So far coding this super block as partial was cheaper anyway.*/
315
1.96k
      if(b_coded_count>15||_fr->b_coded_prev<0){
316
1.38k
        int sb_bits;
317
        /*If the block run is too long, this will limit how far it can be
318
           extended into the next partial super block.
319
          If we need to extend it farther, we don't want to have to roll all
320
           the way back here (since there could be many full SBs between now
321
           and then), so we disallow this.
322
          Similarly, if this is the start of a stripe, we don't know the
323
           length of the outstanding block run from the previous stripe.*/
324
1.38k
        sb_bits=oc_fr_state_sb_cost(_fr,sb_partial,sb_full);
325
1.38k
        _fr->bits+=sb_bits-_fr->sb_bits;
326
1.38k
        _fr->sb_bits=sb_bits;
327
1.38k
      }
328
579
      else sb_partial=1;
329
1.96k
    }
330
180k
  }
331
255k
  oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
332
255k
}
333
334
34.9M
static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){
335
34.9M
  ptrdiff_t bits;
336
34.9M
  int       sb_bits;
337
34.9M
  int       b_coded_count;
338
34.9M
  int       b_count;
339
34.9M
  int       sb_prefer_partial;
340
34.9M
  sb_bits=_fr->sb_bits;
341
34.9M
  bits=_fr->bits-sb_bits;
342
34.9M
  b_count=_fr->b_count;
343
34.9M
  b_coded_count=_fr->b_coded_count;
344
34.9M
  sb_prefer_partial=_fr->sb_prefer_partial;
345
34.9M
  if(b_coded_count>=b_count){
346
26.1M
    int sb_partial_bits;
347
    /*This super block is currently fully coded/uncoded.*/
348
26.1M
    if(b_count<=0){
349
      /*This is the first block in this SB.*/
350
2.84M
      b_count=1;
351
      /*Check to see whether it's cheaper to code it partially or fully.*/
352
2.84M
      if(_fr->b_coded==_b_coded){
353
608k
        sb_partial_bits=-oc_block_run_bits(b_coded_count);
354
608k
        sb_partial_bits+=oc_block_run_bits(++b_coded_count);
355
608k
      }
356
2.23M
      else{
357
2.23M
        b_coded_count=1;
358
2.23M
        sb_partial_bits=2;
359
2.23M
      }
360
2.84M
      sb_partial_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
361
2.84M
      sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
362
2.84M
      sb_prefer_partial=sb_partial_bits<sb_bits;
363
2.84M
      sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
364
2.84M
    }
365
23.3M
    else if(_fr->b_coded==_b_coded){
366
14.5M
      b_coded_count++;
367
14.5M
      if(++b_count<16){
368
13.9M
        if(sb_prefer_partial){
369
          /*Check to see if it's cheaper to code it fully.*/
370
917k
          sb_partial_bits=sb_bits;
371
917k
          sb_partial_bits+=oc_block_run_bits(b_coded_count);
372
917k
          if(b_coded_count>0){
373
917k
            sb_partial_bits-=oc_block_run_bits(b_coded_count-1);
374
917k
          }
375
917k
          sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
376
917k
          sb_prefer_partial=sb_partial_bits<sb_bits;
377
917k
          sb_bits^=(sb_partial_bits^sb_bits)&-sb_prefer_partial;
378
917k
        }
379
        /*There's no need to check the converse (whether it's cheaper to code
380
           this SB partially if we were coding it fully), since the cost to
381
           code a SB partially can only increase as we add more blocks, whereas
382
           the cost to code it fully stays constant.*/
383
13.9M
      }
384
660k
      else{
385
        /*If we get to the end and this SB is still full, then force it to be
386
           coded full.
387
          Otherwise we might not be able to extend the block run far enough
388
           into the next partial SB.*/
389
660k
        if(sb_prefer_partial){
390
9.44k
          sb_prefer_partial=0;
391
9.44k
          sb_bits=oc_fr_state_sb_cost(_fr,0,_b_coded);
392
9.44k
        }
393
660k
      }
394
14.5M
    }
395
8.73M
    else{
396
      /*This SB was full, but now must be made partial.*/
397
8.73M
      if(!sb_prefer_partial){
398
8.16M
        sb_bits=oc_block_run_bits(b_coded_count);
399
8.16M
        if(b_coded_count>b_count){
400
1.79M
          sb_bits-=oc_block_run_bits(b_coded_count-b_count);
401
1.79M
        }
402
8.16M
        sb_bits+=oc_fr_state_sb_cost(_fr,1,_b_coded);
403
8.16M
      }
404
8.73M
      b_count++;
405
8.73M
      b_coded_count=1;
406
8.73M
      sb_prefer_partial=1;
407
8.73M
      sb_bits+=2;
408
8.73M
    }
409
26.1M
  }
410
8.74M
  else{
411
8.74M
    b_count++;
412
8.74M
    if(_fr->b_coded==_b_coded)sb_bits-=oc_block_run_bits(b_coded_count);
413
3.30M
    else b_coded_count=0;
414
8.74M
    sb_bits+=oc_block_run_bits(++b_coded_count);
415
8.74M
  }
416
34.9M
  _fr->bits=bits+sb_bits;
417
34.9M
  _fr->b_coded_count=b_coded_count;
418
34.9M
  _fr->b_coded=_b_coded;
419
34.9M
  _fr->b_count=b_count;
420
34.9M
  _fr->sb_prefer_partial=sb_prefer_partial;
421
34.9M
  _fr->sb_bits=sb_bits;
422
34.9M
}
423
424
13.7M
static void oc_fr_skip_block(oc_fr_state *_fr){
425
13.7M
  oc_fr_state_advance_block(_fr,0);
426
13.7M
}
427
428
21.2M
static void oc_fr_code_block(oc_fr_state *_fr){
429
21.2M
  oc_fr_state_advance_block(_fr,1);
430
21.2M
}
431
432
2.14M
static int oc_fr_cost1(const oc_fr_state *_fr){
433
2.14M
  oc_fr_state tmp;
434
2.14M
  ptrdiff_t   bits;
435
2.14M
  *&tmp=*_fr;
436
2.14M
  oc_fr_skip_block(&tmp);
437
2.14M
  bits=tmp.bits;
438
2.14M
  *&tmp=*_fr;
439
2.14M
  oc_fr_code_block(&tmp);
440
2.14M
  return (int)(tmp.bits-bits);
441
2.14M
}
442
443
353k
static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
444
353k
  oc_fr_state tmp;
445
353k
  *&tmp=*_pre;
446
353k
  oc_fr_skip_block(&tmp);
447
353k
  oc_fr_skip_block(&tmp);
448
353k
  oc_fr_skip_block(&tmp);
449
353k
  oc_fr_skip_block(&tmp);
450
353k
  return (int)(_post->bits-tmp.bits);
451
353k
}
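
Both helpers above price a decision by simulation: copy the small tracker struct (the `*&tmp=*_fr;` spelling is just a struct assignment), advance the copy through the hypothetical decisions, and difference the resulting bit totals, leaving the real state untouched. The same idiom in isolation, with a hypothetical stand-in tracker and a made-up cost model:

    /*The marginal cost of a decision is the difference in total bits between
       two simulated futures, computed on throwaway copies.*/
    typedef struct{long bits;}tracker;
    /*Stand-in for oc_fr_state_advance_block(); the costs here are made up.*/
    static void advance(tracker *_t,int _coded){_t->bits+=_coded?5:1;}
    static long marginal_bits(const tracker *_t){
      tracker skip=*_t;
      tracker code=*_t;
      advance(&skip,0);
      advance(&code,1);
      return code.bits-skip.bits;
    }
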
452
453
454
455
225k
static void oc_qii_state_init(oc_qii_state *_qs){
456
225k
  _qs->bits=0;
457
225k
  _qs->qi01_count=0;
458
225k
  _qs->qi01=-1;
459
225k
  _qs->qi12_count=0;
460
225k
  _qs->qi12=-1;
461
225k
}
462
463
464
static void oc_qii_state_advance(oc_qii_state *_qd,
465
62.3M
 const oc_qii_state *_qs,int _qii){
466
62.3M
  ptrdiff_t bits;
467
62.3M
  int       qi01;
468
62.3M
  int       qi01_count;
469
62.3M
  int       qi12;
470
62.3M
  int       qi12_count;
471
62.3M
  bits=_qs->bits;
472
62.3M
  qi01=_qii+1>>1;
473
62.3M
  qi01_count=_qs->qi01_count;
474
62.3M
  if(qi01==_qs->qi01){
475
35.2M
    if(qi01_count>=4129){
476
2.07k
      bits++;
477
2.07k
      qi01_count=0;
478
2.07k
    }
479
35.2M
    else bits-=oc_sb_run_bits(qi01_count);
480
35.2M
  }
481
27.1M
  else qi01_count=0;
482
62.3M
  qi01_count++;
483
62.3M
  bits+=oc_sb_run_bits(qi01_count);
484
62.3M
  qi12_count=_qs->qi12_count;
485
62.3M
  if(_qii){
486
23.4M
    qi12=_qii>>1;
487
23.4M
    if(qi12==_qs->qi12){
488
13.5M
      if(qi12_count>=4129){
489
1.02k
        bits++;
490
1.02k
        qi12_count=0;
491
1.02k
      }
492
13.5M
      else bits-=oc_sb_run_bits(qi12_count);
493
13.5M
    }
494
9.85M
    else qi12_count=0;
495
23.4M
    qi12_count++;
496
23.4M
    bits+=oc_sb_run_bits(qi12_count);
497
23.4M
  }
498
38.9M
  else qi12=_qs->qi12;
499
62.3M
  _qd->bits=bits;
500
62.3M
  _qd->qi01=qi01;
501
62.3M
  _qd->qi01_count=qi01_count;
502
62.3M
  _qd->qi12=qi12;
503
62.3M
  _qd->qi12_count=qi12_count;
504
62.3M
}
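
The two shifts above pack the block's quantizer choice qii (0, 1, or 2) into a pair of binary flags, each of which is run-length coded with the super block run code: qi01 marks use of a non-primary qi, and qi12, coded only when qi01 is set, selects between the two alternates. The mapping in isolation:

    /*Map a block's quantizer index _qii in {0,1,2} to the two coded flags,
       mirroring the shifts above.*/
    static void qii_to_flags(int _qii,int *_qi01,int *_qi12){
      *_qi01=_qii+1>>1;/*0->0, 1->1, 2->1.*/
      *_qi12=_qii>>1;  /*Only coded when *_qi01 is 1: 1->0, 2->1.*/
    }
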
505
506
507
508
62.0k
static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
509
62.0k
  ptrdiff_t *coded_fragis;
510
62.0k
  unsigned   mcu_nvsbs;
511
62.0k
  ptrdiff_t  mcu_nfrags;
512
62.0k
  int        flimit;
513
62.0k
  int        hdec;
514
62.0k
  int        vdec;
515
62.0k
  int        pli;
516
62.0k
  int        nqis;
517
62.0k
  int        qii;
518
62.0k
  int        qi0;
519
62.0k
  int        qti;
520
  /*Initialize the per-plane coded block flag trackers.
521
    These are used for bit-estimation purposes only; the real flag bits span
522
     all three planes, so we can't compute them in parallel.*/
523
248k
  for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli);
524
248k
  for(pli=0;pli<3;pli++)oc_qii_state_init(_pipe->qs+pli);
525
  /*Set up the per-plane skip SSD storage pointers.*/
526
62.0k
  mcu_nvsbs=_enc->mcu_nvsbs;
527
62.0k
  mcu_nfrags=mcu_nvsbs*_enc->state.fplanes[0].nhsbs*16;
528
62.0k
  hdec=!(_enc->state.info.pixel_fmt&1);
529
62.0k
  vdec=!(_enc->state.info.pixel_fmt&2);
530
62.0k
  _pipe->skip_ssd[0]=_enc->mcu_skip_ssd;
531
62.0k
  _pipe->skip_ssd[1]=_pipe->skip_ssd[0]+mcu_nfrags;
532
62.0k
  _pipe->skip_ssd[2]=_pipe->skip_ssd[1]+(mcu_nfrags>>hdec+vdec);
533
  /*Set up per-plane pointers to the coded and uncoded fragments lists.
534
    Unlike the decoder, each plane's coded and uncoded fragment list is kept
535
     separate during the analysis stage; we only make the coded list for all
536
     three planes contiguous right before the final packet is output
537
     (destroying the uncoded lists, which are no longer needed).*/
538
62.0k
  coded_fragis=_enc->state.coded_fragis;
539
248k
  for(pli=0;pli<3;pli++){
540
186k
    _pipe->coded_fragis[pli]=coded_fragis;
541
186k
    coded_fragis+=_enc->state.fplanes[pli].nfrags;
542
186k
    _pipe->uncoded_fragis[pli]=coded_fragis;
543
186k
  }
544
62.0k
  memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
545
62.0k
  memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
546
  /*Set up condensed quantizer tables.*/
547
62.0k
  qi0=_enc->state.qis[0];
548
62.0k
  nqis=_enc->state.nqis;
549
248k
  for(pli=0;pli<3;pli++){
550
493k
    for(qii=0;qii<nqis;qii++){
551
307k
      int qi;
552
307k
      qi=_enc->state.qis[qii];
553
922k
      for(qti=0;qti<2;qti++){
554
        /*Set the DC coefficient in the dequantization table.*/
555
614k
        _enc->state.dequant_tables[qi][pli][qti][0]=
556
614k
         _enc->dequant_dc[qi0][pli][qti];
557
614k
        _enc->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
558
        /*Copy over the quantization table.*/
559
614k
        memcpy(_enc->enquant[pli][qii][qti],_enc->enquant_tables[qi][pli][qti],
560
614k
         _enc->opt_data.enquant_table_size);
561
614k
      }
562
307k
    }
563
186k
  }
564
  /*Fix up the DC coefficients in the quantization tables.*/
565
62.0k
  oc_enc_enquant_table_fixup(_enc,_enc->enquant,nqis);
566
  /*Initialize the tokenization state.*/
567
248k
  for(pli=0;pli<3;pli++){
568
186k
    _pipe->ndct_tokens1[pli]=0;
569
186k
    _pipe->eob_run1[pli]=0;
570
186k
  }
571
  /*Initialize the bounding value array for the loop filter.*/
572
62.0k
  flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]];
573
62.0k
  _pipe->loop_filter=flimit!=0;
574
62.0k
  if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit);
575
  /*Clear the temporary DCT scratch space.*/
576
62.0k
  memset(_pipe->dct_data,0,sizeof(_pipe->dct_data));
577
62.0k
}
578
579
/*Sets the current MCU stripe to super block row _sby.
580
  Return: A non-zero value if this was not the last MCU.*/
581
static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
582
215k
 oc_enc_pipeline_state *_pipe,int _sby){
583
215k
  const oc_fragment_plane *fplane;
584
215k
  unsigned                 mcu_nvsbs;
585
215k
  int                      sby_end;
586
215k
  int                      notdone;
587
215k
  int                      vdec;
588
215k
  int                      pli;
589
215k
  mcu_nvsbs=_enc->mcu_nvsbs;
590
215k
  sby_end=_enc->state.fplanes[0].nvsbs;
591
215k
  notdone=_sby+mcu_nvsbs<sby_end;
592
215k
  if(notdone)sby_end=_sby+mcu_nvsbs;
593
215k
  vdec=0;
594
863k
  for(pli=0;pli<3;pli++){
595
647k
    fplane=_enc->state.fplanes+pli;
596
647k
    _pipe->sbi0[pli]=fplane->sboffset+(_sby>>vdec)*fplane->nhsbs;
597
647k
    _pipe->fragy0[pli]=_sby<<2-vdec;
598
647k
    _pipe->froffset[pli]=fplane->froffset
599
647k
     +_pipe->fragy0[pli]*(ptrdiff_t)fplane->nhfrags;
600
647k
    if(notdone){
601
461k
      _pipe->sbi_end[pli]=fplane->sboffset+(sby_end>>vdec)*fplane->nhsbs;
602
461k
      _pipe->fragy_end[pli]=sby_end<<2-vdec;
603
461k
    }
604
186k
    else{
605
186k
      _pipe->sbi_end[pli]=fplane->sboffset+fplane->nsbs;
606
186k
      _pipe->fragy_end[pli]=fplane->nvfrags;
607
186k
    }
608
647k
    vdec=!(_enc->state.info.pixel_fmt&2);
609
647k
  }
610
215k
  return notdone;
611
215k
}
612
613
static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
614
647k
 oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
615
  /*Copy over all the uncoded fragments from this plane and advance the uncoded
616
     fragment list.*/
617
647k
  if(_pipe->nuncoded_fragis[_pli]>0){
618
58.9k
    _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
619
58.9k
    oc_frag_copy_list(&_enc->state,
620
58.9k
     _enc->state.ref_frame_data[OC_FRAME_SELF],
621
58.9k
     _enc->state.ref_frame_data[OC_FRAME_PREV],
622
58.9k
     _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
623
58.9k
     _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
624
58.9k
    _pipe->nuncoded_fragis[_pli]=0;
625
58.9k
  }
626
  /*Perform DC prediction.*/
627
647k
  oc_enc_pred_dc_frag_rows(_enc,_pli,
628
647k
   _pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
629
  /*Finish DC tokenization.*/
630
647k
  oc_enc_tokenize_dc_frag_list(_enc,_pli,
631
647k
   _pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
632
647k
   _pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
633
647k
  _pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
634
647k
  _pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
635
  /*And advance the coded fragment list.*/
636
647k
  _enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
637
647k
  _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
638
647k
  _pipe->ncoded_fragis[_pli]=0;
639
  /*Apply the loop filter if necessary.*/
640
647k
  if(_pipe->loop_filter){
641
444k
    oc_state_loop_filter_frag_rows(&_enc->state,
642
444k
     _pipe->bounding_values,OC_FRAME_SELF,_pli,
643
444k
     _pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
644
444k
  }
645
202k
  else _sdelay=_edelay=0;
646
  /*To fill borders, we have an additional two-pixel delay, since a fragment
647
     in the next row could filter its top edge, using two pixels from a
648
     fragment in this row.
649
    But there's no reason to delay a full fragment between the two.*/
650
647k
  oc_state_borders_fill_rows(&_enc->state,
651
647k
   _enc->state.ref_frame_idx[OC_FRAME_SELF],_pli,
652
647k
   (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
653
647k
   (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
654
647k
}
655
656
657
658
/*Cost information about the coded blocks in a MB.*/
659
struct oc_rd_metric{
660
  int uncoded_ac_ssd;
661
  int coded_ac_ssd;
662
  int ac_bits;
663
  int dc_flag;
664
};
665
666
667
668
static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
669
 oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
670
 unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
671
25.1M
 oc_fr_state *_fr,oc_token_checkpoint **_stack){
672
25.1M
  ogg_int16_t            *data;
673
25.1M
  ogg_int16_t            *dct;
674
25.1M
  ogg_int16_t            *idct;
675
25.1M
  oc_qii_state            qs;
676
25.1M
  const ogg_uint16_t     *dequant;
677
25.1M
  ogg_uint16_t            dequant_dc;
678
25.1M
  ptrdiff_t               frag_offs;
679
25.1M
  int                     ystride;
680
25.1M
  const unsigned char    *src;
681
25.1M
  const unsigned char    *ref;
682
25.1M
  unsigned char          *dst;
683
25.1M
  int                     nonzero;
684
25.1M
  unsigned                uncoded_ssd;
685
25.1M
  unsigned                coded_ssd;
686
25.1M
  oc_token_checkpoint    *checkpoint;
687
25.1M
  oc_fragment            *frags;
688
25.1M
  int                     mb_mode;
689
25.1M
  int                     refi;
690
25.1M
  int                     mv_offs[2];
691
25.1M
  int                     nmv_offs;
692
25.1M
  int                     ac_bits;
693
25.1M
  int                     borderi;
694
25.1M
  int                     nqis;
695
25.1M
  int                     qti;
696
25.1M
  int                     qii;
697
25.1M
  int                     dc;
698
25.1M
  nqis=_enc->state.nqis;
699
25.1M
  frags=_enc->state.frags;
700
25.1M
  frag_offs=_enc->state.frag_buf_offs[_fragi];
701
25.1M
  ystride=_enc->state.ref_ystride[_pli];
702
25.1M
  src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
703
25.1M
  borderi=frags[_fragi].borderi;
704
25.1M
  qii=frags[_fragi].qii;
705
25.1M
  data=_enc->pipe.dct_data;
706
25.1M
  dct=data+64;
707
25.1M
  idct=data+128;
708
25.1M
  if(qii&~3){
709
507k
#if !defined(OC_COLLECT_METRICS)
710
507k
    if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
711
      /*Enable early skip detection.*/
712
507k
      frags[_fragi].coded=0;
713
507k
      frags[_fragi].refi=OC_FRAME_NONE;
714
507k
      oc_fr_skip_block(_fr);
715
507k
      return 0;
716
507k
    }
717
0
#endif
718
    /*Try and code this block anyway.*/
719
0
    qii&=3;
720
0
  }
721
24.6M
  refi=frags[_fragi].refi;
722
24.6M
  mb_mode=frags[_fragi].mb_mode;
723
24.6M
  ref=_enc->state.ref_frame_data[refi]+frag_offs;
724
24.6M
  dst=_enc->state.ref_frame_data[OC_FRAME_SELF]+frag_offs;
725
  /*Motion compensation:*/
726
24.6M
  switch(mb_mode){
727
23.7M
    case OC_MODE_INTRA:{
728
23.7M
      nmv_offs=0;
729
23.7M
      oc_enc_frag_sub_128(_enc,data,src,ystride);
730
23.7M
    }break;
731
55.1k
    case OC_MODE_GOLDEN_NOMV:
732
274k
    case OC_MODE_INTER_NOMV:{
733
274k
      nmv_offs=1;
734
274k
      mv_offs[0]=0;
735
274k
      oc_enc_frag_sub(_enc,data,src,ref,ystride);
736
274k
    }break;
737
560k
    default:{
738
560k
      const oc_mv *frag_mvs;
739
560k
      frag_mvs=_enc->state.frag_mvs;
740
560k
      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,
741
560k
       _pli,frag_mvs[_fragi]);
742
560k
      if(nmv_offs>1){
743
408k
        oc_enc_frag_copy2(_enc,dst,
744
408k
         ref+mv_offs[0],ref+mv_offs[1],ystride);
745
408k
        oc_enc_frag_sub(_enc,data,src,dst,ystride);
746
408k
      }
747
151k
      else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride);
748
560k
    }break;
749
24.6M
  }
750
#if defined(OC_COLLECT_METRICS)
751
  {
752
    unsigned sad;
753
    unsigned satd;
754
    switch(nmv_offs){
755
      case 0:{
756
        sad=oc_enc_frag_intra_sad(_enc,src,ystride);
757
        satd=oc_enc_frag_intra_satd(_enc,&dc,src,ystride);
758
      }break;
759
      case 1:{
760
        sad=oc_enc_frag_sad_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
761
        satd=oc_enc_frag_satd(_enc,&dc,src,ref+mv_offs[0],ystride);
762
        satd+=abs(dc);
763
      }break;
764
      default:{
765
        sad=oc_enc_frag_sad_thresh(_enc,src,dst,ystride,UINT_MAX);
766
        satd=oc_enc_frag_satd(_enc,&dc,src,dst,ystride);
767
        satd+=abs(dc);
768
      }break;
769
    }
770
    _enc->frag_sad[_fragi]=sad;
771
    _enc->frag_satd[_fragi]=satd;
772
  }
773
#endif
774
  /*Transform:*/
775
24.6M
  oc_enc_fdct8x8(_enc,dct,data);
776
  /*Quantize:*/
777
24.6M
  qti=mb_mode!=OC_MODE_INTRA;
778
24.6M
  dequant=_enc->dequant[_pli][qii][qti];
779
24.6M
  nonzero=oc_enc_quantize(_enc,data,dct,dequant,_enc->enquant[_pli][qii][qti]);
780
24.6M
  dc=data[0];
781
  /*Tokenize.*/
782
24.6M
  checkpoint=*_stack;
783
24.6M
  if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
784
24.6M
    ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,idct,data,dequant,dct,
785
24.6M
     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
786
24.6M
  }
787
0
  else{
788
0
    ac_bits=oc_enc_tokenize_ac_fast(_enc,_pli,_fragi,idct,data,dequant,dct,
789
0
     nonzero+1,_stack,OC_RD_ISCALE(_enc->lambda,_rd_iscale),qti?0:3);
790
0
  }
791
  /*Reconstruct.
792
    TODO: nonzero may need to be adjusted after tokenization.*/
793
24.6M
  dequant_dc=dequant[0];
794
24.6M
  if(nonzero==0){
795
19.5M
    ogg_int16_t p;
796
19.5M
    int         ci;
797
19.5M
    int         qi01;
798
19.5M
    int         qi12;
799
    /*We round this dequant product (and not any of the others) because there's
800
       no iDCT rounding.*/
801
19.5M
    p=(ogg_int16_t)(dc*(ogg_int32_t)dequant_dc+15>>5);
802
    /*LOOP VECTORIZES.*/
803
1.27G
    for(ci=0;ci<64;ci++)data[ci]=p;
804
    /*We didn't code any AC coefficients, so don't change the quantizer.*/
805
19.5M
    qi01=_pipe->qs[_pli].qi01;
806
19.5M
    qi12=_pipe->qs[_pli].qi12;
807
19.5M
    if(qi01>0)qii=1+qi12;
808
17.7M
    else if(qi01>=0)qii=0;
809
19.5M
  }
810
5.06M
  else{
811
5.06M
    idct[0]=dc*dequant_dc;
812
    /*Note: This clears idct[] back to zero for the next block.*/
813
5.06M
    oc_idct8x8(&_enc->state,data,idct,nonzero+1);
814
5.06M
  }
815
24.6M
  frags[_fragi].qii=qii;
816
24.6M
  if(nqis>1){
817
7.26M
    oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
818
7.26M
    ac_bits+=qs.bits-_pipe->qs[_pli].bits;
819
7.26M
  }
820
24.6M
  if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
821
834k
  else{
822
834k
    oc_enc_frag_recon_inter(_enc,dst,
823
834k
     nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
824
834k
  }
825
  /*If _fr is NULL, then this is an INTRA frame, and we can't skip blocks.*/
826
24.6M
#if !defined(OC_COLLECT_METRICS)
827
24.6M
  if(_fr!=NULL)
828
2.14M
#endif
829
2.14M
  {
830
    /*In retrospect, should we have skipped this block?*/
831
2.14M
    if(borderi<0){
832
1.63M
      coded_ssd=oc_enc_frag_ssd(_enc,src,dst,ystride);
833
1.63M
    }
834
510k
    else{
835
510k
      coded_ssd=oc_enc_frag_border_ssd(_enc,src,dst,ystride,
836
510k
       _enc->state.borders[borderi].mask);
837
510k
    }
838
    /*Scale to match DCT domain.*/
839
2.14M
    coded_ssd<<=4;
840
#if defined(OC_COLLECT_METRICS)
841
    _enc->frag_ssd[_fragi]=coded_ssd;
842
  }
843
  if(_fr!=NULL){
844
#endif
845
2.14M
    coded_ssd=OC_RD_SCALE(coded_ssd,_rd_scale);
846
2.14M
    uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]];
847
2.14M
    if(uncoded_ssd<UINT_MAX&&
848
     /*Don't allow luma blocks to be skipped in 4MV mode when VP3 compatibility
849
        is enabled.*/
850
2.14M
     (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
851
2.14M
      int overhead_bits;
852
2.14M
      overhead_bits=oc_fr_cost1(_fr);
853
      /*Although the fragment coding overhead determination is accurate, it is
854
         greedy, using very coarse-grained local information.
855
        Allowing it to mildly discourage coding turns out to be beneficial, but
856
         it's not clear that allowing it to encourage coding through negative
857
         coding overhead deltas is useful.
858
        For that reason, we disallow negative coding overheads.*/
859
2.14M
      if(overhead_bits<0)overhead_bits=0;
860
2.14M
      if(uncoded_ssd<=coded_ssd+(overhead_bits+ac_bits)*_enc->lambda){
861
        /*Hm, not worth it; roll back.*/
862
163k
        oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
863
163k
        *_stack=checkpoint;
864
163k
        frags[_fragi].coded=0;
865
163k
        frags[_fragi].refi=OC_FRAME_NONE;
866
163k
        oc_fr_skip_block(_fr);
867
163k
        return 0;
868
163k
      }
869
2.14M
    }
870
0
    else _mo->dc_flag=1;
871
1.98M
    _mo->uncoded_ac_ssd+=uncoded_ssd;
872
1.98M
    _mo->coded_ac_ssd+=coded_ssd;
873
1.98M
    _mo->ac_bits+=ac_bits;
874
1.98M
    oc_fr_code_block(_fr);
875
1.98M
  }
876
  /*GCC 4.4.4 generates a warning here because it can't tell that
877
     the init code in the nqis check above will run anytime this
878
     line runs.*/
879
24.4M
  if(nqis>1)*(_pipe->qs+_pli)=*&qs;
880
24.4M
  frags[_fragi].dc=dc;
881
24.4M
  frags[_fragi].coded=1;
882
24.4M
  return 1;
883
24.6M
}
884
885
static int oc_enc_mb_transform_quantize_inter_luma(oc_enc_ctx *_enc,
886
 oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead,
887
411k
 const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
888
  /*Worst case token stack usage for 4 fragments.*/
889
411k
  oc_token_checkpoint  stack[64*4];
890
411k
  oc_token_checkpoint *stackptr;
891
411k
  const oc_sb_map     *sb_maps;
892
411k
  signed char         *mb_modes;
893
411k
  oc_fragment         *frags;
894
411k
  ptrdiff_t           *coded_fragis;
895
411k
  ptrdiff_t            ncoded_fragis;
896
411k
  ptrdiff_t           *uncoded_fragis;
897
411k
  ptrdiff_t            nuncoded_fragis;
898
411k
  oc_rd_metric         mo;
899
411k
  oc_fr_state          fr_checkpoint;
900
411k
  oc_qii_state         qs_checkpoint;
901
411k
  int                  mb_mode;
902
411k
  int                  refi;
903
411k
  int                  ncoded;
904
411k
  ptrdiff_t            fragi;
905
411k
  int                  bi;
906
411k
  *&fr_checkpoint=*(_pipe->fr+0);
907
411k
  *&qs_checkpoint=*(_pipe->qs+0);
908
411k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
909
411k
  mb_modes=_enc->state.mb_modes;
910
411k
  frags=_enc->state.frags;
911
411k
  coded_fragis=_pipe->coded_fragis[0];
912
411k
  ncoded_fragis=_pipe->ncoded_fragis[0];
913
411k
  uncoded_fragis=_pipe->uncoded_fragis[0];
914
411k
  nuncoded_fragis=_pipe->nuncoded_fragis[0];
915
411k
  mb_mode=mb_modes[_mbi];
916
411k
  refi=OC_FRAME_FOR_MODE(mb_mode);
917
411k
  ncoded=0;
918
411k
  stackptr=stack;
919
411k
  memset(&mo,0,sizeof(mo));
920
2.05M
  for(bi=0;bi<4;bi++){
921
1.64M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
922
1.64M
    frags[fragi].refi=refi;
923
1.64M
    frags[fragi].mb_mode=mb_mode;
924
1.64M
    if(oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
925
1.64M
     _rd_scale[bi],_rd_iscale[bi],&mo,_pipe->fr+0,&stackptr)){
926
1.27M
      coded_fragis[ncoded_fragis++]=fragi;
927
1.27M
      ncoded++;
928
1.27M
    }
929
368k
    else *(uncoded_fragis-++nuncoded_fragis)=fragi;
930
1.64M
  }
931
411k
  if(ncoded>0&&!mo.dc_flag){
932
353k
    int cost;
933
    /*Some individual blocks were worth coding.
934
      See if that's still true when accounting for mode and MV overhead.*/
935
353k
    cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
936
353k
     +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead);
937
353k
    if(mo.uncoded_ac_ssd<=cost){
938
      /*Taking macroblock overhead into account, it is not worth coding this
939
         MB.*/
940
6.74k
      oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
941
6.74k
      *(_pipe->fr+0)=*&fr_checkpoint;
942
6.74k
      *(_pipe->qs+0)=*&qs_checkpoint;
943
33.7k
      for(bi=0;bi<4;bi++){
944
26.9k
        fragi=sb_maps[_mbi>>2][_mbi&3][bi];
945
26.9k
        if(frags[fragi].coded){
946
10.8k
          *(uncoded_fragis-++nuncoded_fragis)=fragi;
947
10.8k
          frags[fragi].coded=0;
948
10.8k
          frags[fragi].refi=OC_FRAME_NONE;
949
10.8k
        }
950
26.9k
        oc_fr_skip_block(_pipe->fr+0);
951
26.9k
      }
952
6.74k
      ncoded_fragis-=ncoded;
953
6.74k
      ncoded=0;
954
6.74k
    }
955
353k
  }
956
  /*If no luma blocks are coded, the mode is forced.*/
957
411k
  if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
958
  /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
959
     with a single coded block.
960
    This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
961
     skipped blocks, while a 1MV does not.*/
962
346k
  else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
963
66
    mb_modes[_mbi]=OC_MODE_INTER_MV;
964
66
  }
965
411k
  _pipe->ncoded_fragis[0]=ncoded_fragis;
966
411k
  _pipe->nuncoded_fragis[0]=nuncoded_fragis;
967
411k
  return ncoded;
968
411k
}
969
970
static void oc_enc_sb_transform_quantize_inter_chroma(oc_enc_ctx *_enc,
971
84.9k
 oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
972
84.9k
  const ogg_uint16_t *mcu_rd_scale;
973
84.9k
  const ogg_uint16_t *mcu_rd_iscale;
974
84.9k
  const oc_sb_map    *sb_maps;
975
84.9k
  oc_sb_flags        *sb_flags;
976
84.9k
  oc_fr_state        *fr;
977
84.9k
  ptrdiff_t          *coded_fragis;
978
84.9k
  ptrdiff_t           ncoded_fragis;
979
84.9k
  ptrdiff_t          *uncoded_fragis;
980
84.9k
  ptrdiff_t           nuncoded_fragis;
981
84.9k
  ptrdiff_t           froffset;
982
84.9k
  int                 sbi;
983
84.9k
  fr=_pipe->fr+_pli;
984
84.9k
  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
985
84.9k
  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
986
84.9k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
987
84.9k
  sb_flags=_enc->state.sb_flags;
988
84.9k
  coded_fragis=_pipe->coded_fragis[_pli];
989
84.9k
  ncoded_fragis=_pipe->ncoded_fragis[_pli];
990
84.9k
  uncoded_fragis=_pipe->uncoded_fragis[_pli];
991
84.9k
  nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
992
84.9k
  froffset=_pipe->froffset[_pli];
993
216k
  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
994
    /*Worst case token stack usage for 1 fragment.*/
995
131k
    oc_token_checkpoint stack[64];
996
131k
    oc_rd_metric        mo;
997
131k
    int                 quadi;
998
131k
    int                 bi;
999
131k
    memset(&mo,0,sizeof(mo));
1000
2.63M
    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1001
2.10M
      ptrdiff_t fragi;
1002
2.10M
      fragi=sb_maps[sbi][quadi][bi];
1003
2.10M
      if(fragi>=0){
1004
1.00M
        oc_token_checkpoint *stackptr;
1005
1.00M
        unsigned             rd_scale;
1006
1.00M
        unsigned             rd_iscale;
1007
1.00M
        rd_scale=mcu_rd_scale[fragi-froffset];
1008
1.00M
        rd_iscale=mcu_rd_iscale[fragi-froffset];
1009
1.00M
        stackptr=stack;
1010
1.00M
        if(oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
1011
1.00M
         rd_scale,rd_iscale,&mo,fr,&stackptr)){
1012
704k
          coded_fragis[ncoded_fragis++]=fragi;
1013
704k
        }
1014
302k
        else *(uncoded_fragis-++nuncoded_fragis)=fragi;
1015
1.00M
      }
1016
2.10M
    }
1017
131k
    oc_fr_state_flush_sb(fr);
1018
131k
    sb_flags[sbi].coded_fully=fr->sb_full;
1019
131k
    sb_flags[sbi].coded_partially=fr->sb_partial;
1020
131k
  }
1021
84.9k
  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
1022
84.9k
  _pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
1023
84.9k
}
1024
1025
/*Mode decision is done by exhaustively examining all potential choices.
1026
  Obviously, doing the motion compensation, fDCT, tokenization, and then
1027
   counting the bits each token uses is computationally expensive.
1028
  Theora's EOB runs can also split the cost of these tokens across multiple
1029
   fragments, and naturally we don't know what the optimal choice of Huffman
1030
   codes will be until we know all the tokens we're going to encode in all the
1031
   fragments.
1032
  So we use a simple approach to estimating the bit cost and distortion of each
1033
   mode based upon the SATD value of the residual before coding.
1034
  The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
1035
   the process (modified somewhat from that of the paper) is very simple.
1036
  We build a non-linear regression of the mappings from
1037
   (pre-transform+quantization) SATD to (post-transform+quantization) bits and
1038
   SSD for each qi.
1039
  A separate set of mappings is kept for each quantization type and color
1040
   plane.
1041
  The mappings are constructed by partitioning the SATD values into a small
1042
   number of bins (currently 24) and using a linear regression in each bin
1043
   (as opposed to the 0th-order regression used by Kim).
1044
  The bit counts and SSD measurements are obtained by examining actual encoded
1045
   frames, with appropriate lambda values and optimal Huffman codes selected.
1046
  EOB bits are assigned to the fragment that started the EOB run (as opposed to
1047
   dividing them among all the blocks in the run; the latter approach seems
1048
   more theoretically correct, but Monty's testing showed a small improvement
1049
   with the former, though that may have been merely statistical noise).
1050
1051
  @ARTICLE{Kim03,
1052
    author="Hyun Mun Kim",
1053
    title="Adaptive Rate Control Using Nonlinear Regression",
1054
    journal="IEEE Transactions on Circuits and Systems for Video Technology",
1055
    volume=13,
1056
    number=5,
1057
    pages="432--439",
1058
    month=May,
1059
    year=2003
1060
  }*/
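
A sketch of the fitting side of the scheme described above: partition observed (SATD, bits) samples into fixed-width bins and run an ordinary least-squares line fit within each bin. Everything here is illustrative; the shipped tables are produced offline from encodes instrumented with OC_COLLECT_METRICS, and the bin width below is an assumption:

    #define NBINS     24
    #define BIN_SHIFT 9/*Illustrative bin width of 512 SATD units.*/

    typedef struct{double sx,sy,sxx,sxy;int n;}bin_acc;

    /*Accumulate one (satd, bits) observation into its bin.*/
    static void bin_add(bin_acc _b[NBINS],int _satd,double _bits){
      int bin;
      bin=_satd>>BIN_SHIFT;
      if(bin>=NBINS)bin=NBINS-1;
      _b[bin].sx+=_satd;
      _b[bin].sy+=_bits;
      _b[bin].sxx+=(double)_satd*_satd;
      _b[bin].sxy+=(double)_satd*_bits;
      _b[bin].n++;
    }

    /*Least-squares line for one bin: bits ~= _a*satd + _b0.*/
    static void bin_fit(const bin_acc *_b,double *_a,double *_b0){
      double d;
      d=_b->n*_b->sxx-_b->sx*_b->sx;
      *_a=d!=0?(_b->n*_b->sxy-_b->sx*_b->sy)/d:0;
      *_b0=_b->n>0?(_b->sy-*_a*_b->sx)/_b->n:0;
    }

    int main(void){
      static bin_acc bins[NBINS];
      double a;
      double b0;
      bin_add(bins,600,30);
      bin_add(bins,900,45);
      bin_fit(bins+(600>>BIN_SHIFT),&a,&b0);
      /*Fitted slope should be (45-30)/(900-600)=0.05.*/
      return !(a>0.049&&a<0.051);
    }
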
1061
1062
/*Computes (_ssd+_lambda*_rate)/(1<<OC_BIT_SCALE) with rounding, avoiding
1063
   overflow for large lambda values.*/
1064
#define OC_MODE_RD_COST(_ssd,_rate,_lambda) \
1065
97.9M
 ((_ssd)>>OC_BIT_SCALE)+((_rate)>>OC_BIT_SCALE)*(_lambda) \
1066
97.9M
 +(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \
1067
97.9M
 +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE)
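
The macro works by splitting each operand into its high part (shifted down before the lambda multiply, so the product stays small) and its low OC_BIT_SCALE bits (combined and rounded separately, then folded back in). For non-negative inputs the split form agrees exactly with the naive wide-integer expression; a quick check, assuming OC_BIT_SCALE is 6, its usual value in this encoder:

    #include <assert.h>

    #define BIT_SCALE 6/*Assumed value of OC_BIT_SCALE.*/

    /*The split form, shaped like OC_MODE_RD_COST above.*/
    static long rd_cost_split(long _ssd,long _rate,long _lambda){
      return (_ssd>>BIT_SCALE)+(_rate>>BIT_SCALE)*_lambda
       +((_ssd&(1<<BIT_SCALE)-1)+(_rate&(1<<BIT_SCALE)-1)*_lambda
       +((1<<BIT_SCALE)>>1)>>BIT_SCALE);
    }

    /*The naive form, safe only when the product fits the wide type.*/
    static long rd_cost_wide(long long _ssd,long long _rate,
     long long _lambda){
      return (long)(_ssd+_lambda*_rate+((1<<BIT_SCALE)>>1)>>BIT_SCALE);
    }

    int main(void){
      long ssd;
      long rate;
      for(ssd=0;ssd<1<<12;ssd+=7)for(rate=0;rate<1<<12;rate+=5){
        assert(rd_cost_split(ssd,rate,3)==rd_cost_wide(ssd,rate,3));
      }
      return 0;
    }
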
1068
1069
62.0k
static void oc_enc_mode_rd_init(oc_enc_ctx *_enc){
1070
62.0k
#if !defined(OC_COLLECT_METRICS)
1071
62.0k
  const
1072
62.0k
#endif
1073
62.0k
  oc_mode_rd (*oc_mode_rd_table)[3][2][OC_COMP_BINS]=
1074
62.0k
   _enc->sp_level<OC_SP_LEVEL_NOSATD?OC_MODE_RD_SATD:OC_MODE_RD_SAD;
1075
62.0k
  int qii;
1076
#if defined(OC_COLLECT_METRICS)
1077
  oc_enc_mode_metrics_load(_enc);
1078
#endif
1079
164k
  for(qii=0;qii<_enc->state.nqis;qii++){
1080
102k
    int qi;
1081
102k
    int pli;
1082
102k
    qi=_enc->state.qis[qii];
1083
409k
    for(pli=0;pli<3;pli++){
1084
307k
      int qti;
1085
922k
      for(qti=0;qti<2;qti++){
1086
614k
        int log_plq;
1087
614k
        int modeline;
1088
614k
        int bin;
1089
614k
        int dx;
1090
614k
        int dq;
1091
614k
        log_plq=_enc->log_plq[qi][pli][qti];
1092
        /*Find the pair of rows in the mode table that bracket this quantizer.
1093
          If it falls outside the range the table covers, then we just use a
1094
           pair on the edge for linear extrapolation.*/
1095
2.89M
        for(modeline=0;modeline<OC_LOGQ_BINS-1&&
1096
2.89M
         OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
1097
        /*Interpolate a row for this quantizer.*/
1098
614k
        dx=OC_MODE_LOGQ[modeline][pli][qti]-log_plq;
1099
614k
        dq=OC_MODE_LOGQ[modeline][pli][qti]-OC_MODE_LOGQ[modeline+1][pli][qti];
1100
614k
        if(dq==0)dq=1;
1101
15.3M
        for(bin=0;bin<OC_COMP_BINS;bin++){
1102
14.7M
          int y0;
1103
14.7M
          int z0;
1104
14.7M
          int dy;
1105
14.7M
          int dz;
1106
14.7M
          y0=oc_mode_rd_table[modeline][pli][qti][bin].rate;
1107
14.7M
          z0=oc_mode_rd_table[modeline][pli][qti][bin].rmse;
1108
14.7M
          dy=oc_mode_rd_table[modeline+1][pli][qti][bin].rate-y0;
1109
14.7M
          dz=oc_mode_rd_table[modeline+1][pli][qti][bin].rmse-z0;
1110
14.7M
          _enc->mode_rd[qii][pli][qti][bin].rate=
1111
14.7M
           (ogg_int16_t)OC_CLAMPI(-32768,y0+(dy*dx+(dq>>1))/dq,32767);
1112
14.7M
          _enc->mode_rd[qii][pli][qti][bin].rmse=
1113
14.7M
           (ogg_int16_t)OC_CLAMPI(-32768,z0+(dz*dx+(dq>>1))/dq,32767);
1114
14.7M
        }
1115
614k
      }
1116
307k
    }
1117
102k
  }
1118
62.0k
}
1119
1120
/*Estimate the R-D cost of the DCT coefficients given the SATD of a block after
1121
   prediction.*/
1122
static unsigned oc_dct_cost2(oc_enc_ctx *_enc,unsigned *_ssd,
1123
62.1M
 int _qii,int _pli,int _qti,int _satd){
1124
62.1M
  unsigned rmse;
1125
62.1M
  int      shift;
1126
62.1M
  int      bin;
1127
62.1M
  int      dx;
1128
62.1M
  int      y0;
1129
62.1M
  int      z0;
1130
62.1M
  int      dy;
1131
62.1M
  int      dz;
1132
  /*SATD metrics for chroma planes vary much less than luma, so we scale them
1133
     by 4 to distribute them into the mode decision bins more evenly.*/
1134
62.1M
  _satd<<=_pli+1&2;
1135
62.1M
  shift=_enc->sp_level<OC_SP_LEVEL_NOSATD?OC_SATD_SHIFT:OC_SAD_SHIFT;
1136
62.1M
  bin=OC_MINI(_satd>>shift,OC_COMP_BINS-2);
1137
62.1M
  dx=_satd-(bin<<shift);
1138
62.1M
  y0=_enc->mode_rd[_qii][_pli][_qti][bin].rate;
1139
62.1M
  z0=_enc->mode_rd[_qii][_pli][_qti][bin].rmse;
1140
62.1M
  dy=_enc->mode_rd[_qii][_pli][_qti][bin+1].rate-y0;
1141
62.1M
  dz=_enc->mode_rd[_qii][_pli][_qti][bin+1].rmse-z0;
1142
62.1M
  rmse=OC_MAXI(z0+(dz*dx>>shift),0);
1143
62.1M
  *_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
1144
62.1M
  return OC_MAXI(y0+(dy*dx>>shift),0);
1145
62.1M
}
1146
1147
/*activity_avg must be positive, or flat regions could get a zero weight, which
1148
   confounds analysis.
1149
  We set the minimum to this value so that it also avoids the need for divide
1150
   by zero checks in oc_mb_masking().*/
1151
# define OC_ACTIVITY_AVG_MIN (1<<OC_RD_SCALE_BITS)
1152
1153
static unsigned oc_mb_activity(oc_enc_ctx *_enc,unsigned _mbi,
1154
3.23M
 unsigned _activity[4]){
1155
3.23M
  const unsigned char *src;
1156
3.23M
  const ptrdiff_t     *frag_buf_offs;
1157
3.23M
  const ptrdiff_t     *sb_map;
1158
3.23M
  unsigned             luma;
1159
3.23M
  int                  ystride;
1160
3.23M
  ptrdiff_t            frag_offs;
1161
3.23M
  ptrdiff_t            fragi;
1162
3.23M
  int                  bi;
1163
3.23M
  frag_buf_offs=_enc->state.frag_buf_offs;
1164
3.23M
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1165
3.23M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1166
3.23M
  ystride=_enc->state.ref_ystride[0];
1167
3.23M
  luma=0;
1168
16.1M
  for(bi=0;bi<4;bi++){
1169
12.9M
    const unsigned char *s;
1170
12.9M
    unsigned             x;
1171
12.9M
    unsigned             x2;
1172
12.9M
    unsigned             act;
1173
12.9M
    int                  i;
1174
12.9M
    int                  j;
1175
12.9M
    fragi=sb_map[bi];
1176
12.9M
    frag_offs=frag_buf_offs[fragi];
1177
    /*TODO: This could be replaced with SATD^2, since we already have to
1178
       compute SATD.*/
1179
12.9M
    x=x2=0;
1180
12.9M
    s=src+frag_offs;
1181
116M
    for(i=0;i<8;i++){
1182
930M
      for(j=0;j<8;j++){
1183
827M
        unsigned c;
1184
827M
        c=s[j];
1185
827M
        x+=c;
1186
827M
        x2+=c*c;
1187
827M
      }
1188
103M
      s+=ystride;
1189
103M
    }
1190
12.9M
    luma+=x;
1191
12.9M
    act=(x2<<6)-x*x;
1192
12.9M
    if(act<8<<12){
1193
      /*The region is flat.*/
1194
8.97M
      act=OC_MINI(act,5<<12);
1195
8.97M
    }
1196
3.94M
    else{
1197
3.94M
      unsigned e1;
1198
3.94M
      unsigned e2;
1199
3.94M
      unsigned e3;
1200
3.94M
      unsigned e4;
1201
      /*Test for an edge.
1202
        TODO: There are probably much simpler ways to do this (e.g., it could
1203
         probably be combined with the SATD calculation).
1204
        Alternatively, we could split the block around the mean and compute the
1205
         reduction in variance in each half.
1206
        For a Gaussian source the reduction should be
1207
         (1-2/pi) ~= 0.36338022763241865692446494650994.
1208
        Significantly more reduction is a good indication of a bi-level image.
1209
        This has the advantage of identifying, in addition to straight edges,
1210
         small text regions, which would otherwise be classified as "texture".*/
1211
3.94M
      e1=e2=e3=e4=0;
1212
3.94M
      s=src+frag_offs-1;
1213
35.5M
      for(i=0;i<8;i++){
1214
284M
        for(j=0;j<8;j++){
1215
252M
          e1+=abs((s[j+2]-s[j]<<1)+(s-ystride)[j+2]-(s-ystride)[j]
1216
252M
           +(s+ystride)[j+2]-(s+ystride)[j]);
1217
252M
          e2+=abs(((s+ystride)[j+1]-(s-ystride)[j+1]<<1)
1218
252M
           +(s+ystride)[j]-(s-ystride)[j]+(s+ystride)[j+2]-(s-ystride)[j+2]);
1219
252M
          e3+=abs(((s+ystride)[j+2]-(s-ystride)[j]<<1)
1220
252M
           +(s+ystride)[j+1]-s[j]+s[j+2]-(s-ystride)[j+1]);
1221
252M
          e4+=abs(((s+ystride)[j]-(s-ystride)[j+2]<<1)
1222
252M
           +(s+ystride)[j+1]-s[j+2]+s[j]-(s-ystride)[j+1]);
1223
252M
        }
1224
31.5M
        s+=ystride;
1225
31.5M
      }
1226
      /*If the largest component of the edge energy is at least 40% of the
1227
         total, then classify the block as an edge block.*/
1228
3.94M
      if(5*OC_MAXI(OC_MAXI(e1,e2),OC_MAXI(e3,e4))>2*(e1+e2+e3+e4)){
1229
         /*act=act_th*(act/act_th)**0.7
1230
              =exp(log(act_th)+0.7*(log(act)-log(act_th))).
1231
           Here act_th=5.0 and 0x394A=oc_blog32_q10(5<<12).*/
1232
44.5k
         act=oc_bexp32_q10(0x394A+(7*(oc_blog32_q10(act)-0x394A+5)/10));
1233
44.5k
      }
1234
3.94M
    }
1235
12.9M
    _activity[bi]=act;
1236
12.9M
  }
1237
3.23M
  return luma;
1238
3.23M
}
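
The act=(x2<<6)-x*x expression in the function above is the block variance in fixed point: with N=64 pixels, N*sum(c^2)-(sum(c))^2 equals N^2 times the population variance, so the flatness thresholds 5<<12 and 8<<12 correspond to pixel variances of 5 and 8 in units of N^2=4096. A small self-contained check of that identity:

    #include <assert.h>
    #include <math.h>

    /*Check that (x2<<6)-x*x equals 64*64 times the population variance of an
       8x8 block, where x is the pixel sum and x2 the sum of squares.*/
    int main(void){
      unsigned char px[64];
      unsigned      x;
      unsigned      x2;
      unsigned      act;
      double        mean;
      double        var;
      int           i;
      for(i=0;i<64;i++)px[i]=(unsigned char)(i*37&0xFF);
      x=x2=0;
      for(i=0;i<64;i++){
        x+=px[i];
        x2+=px[i]*px[i];
      }
      act=(x2<<6)-x*x;
      mean=x/64.0;
      var=x2/64.0-mean*mean;
      assert(fabs(act-4096.0*var)<1e-6);
      return 0;
    }
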
1239
1240
static void oc_mb_activity_fast(oc_enc_ctx *_enc,unsigned _mbi,
1241
0
 unsigned _activity[4],const unsigned _intra_satd[12]){
1242
0
  int bi;
1243
0
  for(bi=0;bi<4;bi++){
1244
0
    unsigned act;
1245
0
    act=(11*_intra_satd[bi]>>8)*_intra_satd[bi];
1246
0
    if(act<8<<12){
1247
      /*The region is flat.*/
1248
0
      act=OC_MINI(act,5<<12);
1249
0
    }
1250
0
    _activity[bi]=act;
1251
0
  }
1252
0
}
1253
1254
/*Compute the masking scales for the blocks in a macro block.
1255
  All masking is computed from the luma blocks.
1256
  We derive scaling factors for the chroma blocks from these, and use the same
1257
   ones for all chroma blocks, regardless of the subsampling.
1258
  It's possible for luma to be perfectly flat and yet have high chroma energy,
1259
   but this is unlikely in non-artificial images, and not a case that has been
1260
   addressed by any research to my knowledge.
1261
  The output of the masking process is two scale factors, which are fed into
1262
   the various R-D optimizations.
1263
  The first, rd_scale, is applied to D in the equation
1264
    D*rd_scale+lambda*R.
1265
  This is the form that must be used to properly combine scores from multiple
1266
   blocks, and can be interpreted as scaling distortions by their visibility.
1267
  The inverse, rd_iscale, is applied to lambda in the equation
1268
    D+rd_iscale*lambda*R.
1269
  This is equivalent to the first form within a single block, but much faster
1270
   to use when evaluating many possible distortions (e.g., during actual
1271
   quantization, where separate distortions are evaluated for every
1272
   coefficient).
1273
  The two macros OC_RD_SCALE(rd_scale,d) and OC_RD_ISCALE(rd_iscale,lambda) are
1274
   used to perform the multiplications with the proper re-scaling for the range
1275
   of the scaling factors.
1276
  Many researchers apply masking values directly to the quantizers used, and
1277
   not to the R-D cost.
1278
  Since we generally use MSE for D, rd_scale must use the square of their
1279
   values to generate an equivalent effect.*/
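
Within a single block the two forms above rank choices identically, since D*rd_scale+lambda*R is a positive multiple of D+(lambda/rd_scale)*R; rd_iscale is just the fixed-point reciprocal of rd_scale. A sketch with hypothetical Q8 macros standing in for OC_RD_SCALE/OC_RD_ISCALE (whose actual Q formats are defined elsewhere):

    /*Illustrative Q8 fixed-point forms; the encoder's OC_RD_SCALE and
       OC_RD_ISCALE macros play this role with their own Q formats.*/
    #define RD_SCALE_BITS  8
    #define RD_ISCALE_BITS 8
    #define RD_SCALE(_d,_s) \
     ((_d)*(_s)+((1<<RD_SCALE_BITS)>>1)>>RD_SCALE_BITS)
    #define RD_ISCALE(_l,_is) \
     ((_l)*(_is)+((1<<RD_ISCALE_BITS)>>1)>>RD_ISCALE_BITS)

    /*Form 1: scale the distortion; required when summing scores across
       blocks with different visibilities.*/
    static unsigned cost_scaled(unsigned _d,unsigned _r,
     unsigned _lambda,unsigned _s){
      return RD_SCALE(_d,_s)+_lambda*_r;
    }

    /*Form 2: scale lambda instead; picks the same minimizer within a single
       block when _is ~= (1<<RD_SCALE_BITS+RD_ISCALE_BITS)/_s.*/
    static unsigned cost_iscaled(unsigned _d,unsigned _r,
     unsigned _lambda,unsigned _is){
      return _d+RD_ISCALE(_lambda,_is)*_r;
    }
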
1280
static unsigned oc_mb_masking(unsigned _rd_scale[5],unsigned _rd_iscale[5],
1281
 const ogg_uint16_t _chroma_rd_scale[2],const unsigned _activity[4],
1282
3.23M
 unsigned _activity_avg,unsigned _luma,unsigned _luma_avg){
1283
3.23M
  unsigned activity_sum;
1284
3.23M
  unsigned la;
1285
3.23M
  unsigned lb;
1286
3.23M
  unsigned d;
1287
3.23M
  int      bi;
1288
3.23M
  int      bi_min;
1289
3.23M
  int      bi_min2;
1290
  /*The ratio lb/la is meant to approximate
1291
     ((((_luma-16)/219)*(255/128))**0.649**0.4**2), which is the
1292
     effective luminance masking from~\cite{LKW06} (including the self-masking
1293
     deflator).
1294
    The following actually turns out to be a pretty good approximation for
1295
     _luma>75 or so.
1296
    For smaller values luminance does not really follow Weber's Law anyway, and
1297
     this approximation gives a much less aggressive bitrate boost in this
1298
     region.
1299
    Though some researchers claim that contrast sensitivity actually decreases
1300
     for very low luminance values, in my experience excessive brightness on
1301
     LCDs or buggy color conversions (e.g., treating Y' as full-range instead
1302
     of the CCIR 601 range) make artifacts in such regions extremely visible.
1303
    We substitute _luma_avg for 128 to allow the strength of the masking to
1304
     vary with the actual average image luminance, within certain limits (the
1305
     caller has clamped _luma_avg to the range [90,160], inclusive).
1306
    @ARTICLE{LKW06,
1307
      author="Zhen Liu and Lina J. Karam and Andrew B. Watson",
1308
      title="{JPEG2000} Encoding With Perceptual Distortion Control",
1309
      journal="{IEEE} Transactions on Image Processing",
1310
      volume=15,
1311
      number=7,
1312
      pages="1763--1778",
1313
      month=Jul,
1314
      year=2006
1315
    }*/
1316
#if 0
1317
  la=_luma+4*_luma_avg;
1318
  lb=4*_luma+_luma_avg;
1319
#else
1320
  /*Disable luminance masking.*/
1321
3.23M
  la=lb=1;
1322
3.23M
#endif
1323
3.23M
  activity_sum=0;
1324
16.1M
  for(bi=0;bi<4;bi++){
1325
12.9M
    unsigned a;
1326
12.9M
    unsigned b;
1327
12.9M
    activity_sum+=_activity[bi];
1328
    /*Apply activity masking.*/
1329
12.9M
    a=_activity[bi]+4*_activity_avg;
1330
12.9M
    b=4*_activity[bi]+_activity_avg;
1331
12.9M
    d=OC_RD_SCALE(b,1);
1332
    /*And luminance masking.*/
1333
12.9M
    d=(a+(d>>1))/d;
1334
12.9M
    _rd_scale[bi]=(d*la+(lb>>1))/lb;
1335
    /*And now the inverse.*/
1336
12.9M
    d=OC_MAXI(OC_RD_ISCALE(a,1),1);
1337
12.9M
    d=(b+(d>>1))/d;
1338
12.9M
    _rd_iscale[bi]=(d*lb+(la>>1))/la;
1339
12.9M
  }
1340
  /*Now compute scaling factors for chroma blocks.
1341
    We start by finding the two smallest iscales from the luma blocks.*/
1342
3.23M
  bi_min=_rd_iscale[1]<_rd_iscale[0];
1343
3.23M
  bi_min2=1-bi_min;
1344
9.69M
  for(bi=2;bi<4;bi++){
1345
6.46M
    if(_rd_iscale[bi]<_rd_iscale[bi_min]){
1346
692k
      bi_min2=bi_min;
1347
692k
      bi_min=bi;
1348
692k
    }
1349
5.76M
    else if(_rd_iscale[bi]<_rd_iscale[bi_min2])bi_min2=bi;
1350
6.46M
  }
1351
  /*If the minimum iscale is less than 1.0, use the second smallest instead,
1352
     and force the value to at least 1.0 (inflating chroma is a waste).*/
1353
3.23M
  if(_rd_iscale[bi_min]<(1<<OC_RD_ISCALE_BITS))bi_min=bi_min2;
1354
3.23M
  d=OC_MINI(_rd_scale[bi_min],1<<OC_RD_SCALE_BITS);
1355
3.23M
  _rd_scale[4]=OC_RD_SCALE(d,_chroma_rd_scale[0]);
1356
3.23M
  d=OC_MAXI(_rd_iscale[bi_min],1<<OC_RD_ISCALE_BITS);
1357
3.23M
  _rd_iscale[4]=OC_RD_ISCALE(d,_chroma_rd_scale[1]);
1358
3.23M
  return activity_sum;
1359
3.23M
}
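
/*A standalone sketch of the two cost forms described above, using
   floating-point scale factors in place of the fixed-point OC_RD_SCALE and
   OC_RD_ISCALE macros (illustrative only; not used by the encoder).
  Within a single block the two forms rank candidates identically, since
   d*s+lambda*r==s*(d+(1/s)*lambda*r) and s>0, but only the first form yields
   scores that can be meaningfully summed across blocks with different scale
   factors.*/
static double rd_cost_scaled(double _d,double _r,double _lambda,double _s){
  return _d*_s+_lambda*_r;
}
static double rd_cost_iscaled(double _d,double _r,double _lambda,double _s){
  return _d+(1/_s)*_lambda*_r;
}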
1360
1361
static int oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
1362
411k
 unsigned _frag_satd[12]){
1363
411k
  const unsigned char   *src;
1364
411k
  const ptrdiff_t       *frag_buf_offs;
1365
411k
  const ptrdiff_t       *sb_map;
1366
411k
  const oc_mb_map_plane *mb_map;
1367
411k
  const unsigned char   *map_idxs;
1368
411k
  int                    map_nidxs;
1369
411k
  int                    mapii;
1370
411k
  int                    mapi;
1371
411k
  int                    ystride;
1372
411k
  int                    pli;
1373
411k
  int                    bi;
1374
411k
  ptrdiff_t              fragi;
1375
411k
  ptrdiff_t              frag_offs;
1376
411k
  unsigned               luma;
1377
411k
  int                    dc;
1378
411k
  frag_buf_offs=_enc->state.frag_buf_offs;
1379
411k
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1380
411k
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1381
411k
  ystride=_enc->state.ref_ystride[0];
1382
411k
  luma=0;
1383
2.05M
  for(bi=0;bi<4;bi++){
1384
1.64M
    fragi=sb_map[bi];
1385
1.64M
    frag_offs=frag_buf_offs[fragi];
1386
1.64M
    _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1387
1.64M
    luma+=dc;
1388
1.64M
  }
1389
411k
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
1390
411k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1391
411k
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1392
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
1393
411k
  ystride=_enc->state.ref_ystride[1];
1394
1.41M
  for(mapii=4;mapii<map_nidxs;mapii++){
1395
1.00M
    mapi=map_idxs[mapii];
1396
1.00M
    pli=mapi>>2;
1397
1.00M
    bi=mapi&3;
1398
1.00M
    fragi=mb_map[pli][bi];
1399
1.00M
    frag_offs=frag_buf_offs[fragi];
1400
1.00M
    _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1401
1.00M
  }
1402
411k
  return luma;
1403
411k
}
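
/*(The luma value returned above is the sum of the four Y' fragment DC terms;
   the inter analysis loop accumulates it into luma_sum and divides by the MB
   count to produce the running _luma_avg fed back into oc_mb_masking.)*/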
1404
1405
/*Select luma block-level quantizers for a MB in an INTRA frame.*/
1406
static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
1407
2.81M
 const oc_qii_state *_qs,unsigned _mbi,const unsigned _rd_scale[4]){
1408
2.81M
  const unsigned char *src;
1409
2.81M
  const ptrdiff_t     *frag_buf_offs;
1410
2.81M
  const oc_sb_map     *sb_maps;
1411
2.81M
  oc_fragment         *frags;
1412
2.81M
  ptrdiff_t            frag_offs;
1413
2.81M
  ptrdiff_t            fragi;
1414
2.81M
  oc_qii_state         qs[4][3];
1415
2.81M
  unsigned             cost[4][3];
1416
2.81M
  unsigned             ssd[4][3];
1417
2.81M
  unsigned             rate[4][3];
1418
2.81M
  int                  prev[3][3];
1419
2.81M
  unsigned             satd;
1420
2.81M
  int                  dc;
1421
2.81M
  unsigned             best_cost;
1422
2.81M
  unsigned             best_ssd;
1423
2.81M
  unsigned             best_rate;
1424
2.81M
  int                  best_qii;
1425
2.81M
  int                  qii;
1426
2.81M
  int                  lambda;
1427
2.81M
  int                  ystride;
1428
2.81M
  int                  nqis;
1429
2.81M
  int                  bi;
1430
2.81M
  frag_buf_offs=_enc->state.frag_buf_offs;
1431
2.81M
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1432
2.81M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1433
2.81M
  ystride=_enc->state.ref_ystride[0];
1434
2.81M
  fragi=sb_maps[_mbi>>2][_mbi&3][0];
1435
2.81M
  frag_offs=frag_buf_offs[fragi];
1436
2.81M
  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1437
2.81M
    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1438
2.81M
  }
1439
0
  else{
1440
0
    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1441
0
  }
1442
2.81M
  nqis=_enc->state.nqis;
1443
2.81M
  lambda=_enc->lambda;
1444
7.18M
  for(qii=0;qii<nqis;qii++){
1445
4.36M
    oc_qii_state_advance(qs[0]+qii,_qs,qii);
1446
4.36M
    rate[0][qii]=oc_dct_cost2(_enc,ssd[0]+qii,qii,0,0,satd)
1447
4.36M
     +(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE);
1448
4.36M
    ssd[0][qii]=OC_RD_SCALE(ssd[0][qii],_rd_scale[0]);
1449
4.36M
    cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda);
1450
4.36M
  }
1451
11.2M
  for(bi=1;bi<4;bi++){
1452
8.45M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1453
8.45M
    frag_offs=frag_buf_offs[fragi];
1454
8.45M
    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1455
8.45M
      satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1456
8.45M
    }
1457
0
    else{
1458
0
      satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1459
0
    }
1460
21.5M
    for(qii=0;qii<nqis;qii++){
1461
13.0M
      oc_qii_state qt[3];
1462
13.0M
      unsigned     cur_ssd;
1463
13.0M
      unsigned     cur_rate;
1464
13.0M
      int          best_qij;
1465
13.0M
      int          qij;
1466
13.0M
      oc_qii_state_advance(qt+0,qs[bi-1]+0,qii);
1467
13.0M
      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,0,satd);
1468
13.0M
      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
1469
13.0M
      best_ssd=ssd[bi-1][0]+cur_ssd;
1470
13.0M
      best_rate=rate[bi-1][0]+cur_rate
1471
13.0M
       +(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE);
1472
13.0M
      best_cost=OC_MODE_RD_COST(best_ssd,best_rate,lambda);
1473
13.0M
      best_qij=0;
1474
26.3M
      for(qij=1;qij<nqis;qij++){
1475
13.2M
        unsigned chain_ssd;
1476
13.2M
        unsigned chain_rate;
1477
13.2M
        unsigned chain_cost;
1478
13.2M
        oc_qii_state_advance(qt+qij,qs[bi-1]+qij,qii);
1479
13.2M
        chain_ssd=ssd[bi-1][qij]+cur_ssd;
1480
13.2M
        chain_rate=rate[bi-1][qij]+cur_rate
1481
13.2M
         +(qt[qij].bits-qs[bi-1][qij].bits<<OC_BIT_SCALE);
1482
13.2M
        chain_cost=OC_MODE_RD_COST(chain_ssd,chain_rate,lambda);
1483
13.2M
        if(chain_cost<best_cost){
1484
5.39M
          best_cost=chain_cost;
1485
5.39M
          best_ssd=chain_ssd;
1486
5.39M
          best_rate=chain_rate;
1487
5.39M
          best_qij=qij;
1488
5.39M
        }
1489
13.2M
      }
1490
13.0M
      *(qs[bi]+qii)=*(qt+best_qij);
1491
13.0M
      cost[bi][qii]=best_cost;
1492
13.0M
      ssd[bi][qii]=best_ssd;
1493
13.0M
      rate[bi][qii]=best_rate;
1494
13.0M
      prev[bi-1][qii]=best_qij;
1495
13.0M
    }
1496
8.45M
  }
1497
2.81M
  best_qii=0;
1498
2.81M
  best_cost=cost[3][0];
1499
4.36M
  for(qii=1;qii<nqis;qii++){
1500
1.54M
    if(cost[3][qii]<best_cost){
1501
634k
      best_cost=cost[3][qii];
1502
634k
      best_qii=qii;
1503
634k
    }
1504
1.54M
  }
1505
2.81M
  frags=_enc->state.frags;
1506
11.2M
  for(bi=3;;){
1507
11.2M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1508
11.2M
    frags[fragi].qii=best_qii;
1509
11.2M
    if(bi--<=0)break;
1510
8.45M
    best_qii=prev[bi][best_qii];
1511
8.45M
  }
1512
2.81M
  return best_cost;
1513
2.81M
}
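
/*A standalone, purely additive sketch of the search implemented above
   (illustrative only: qii_chain_search and its fixed _trans table are
   hypothetical stand-ins for the oc_qii_state flag-bit tracking and the
   oc_dct_cost2 costs, and the real code tracks ssd and rate separately,
   combining them with OC_MODE_RD_COST).
  Each of the four blocks picks one of _nqis quantizer indices; for every
   block and ending state we keep the cheapest predecessor, then back-trace
   from the cheapest final state, as the cost/prev arrays above do.*/
static unsigned qii_chain_search(const unsigned _node_cost[4][3],
 const unsigned _trans[3][3],int _nqis,int _path[4]){
  unsigned best[3];
  unsigned cur[3];
  int      prev[4][3];
  int      bi;
  int      qii;
  int      qij;
  for(qii=0;qii<_nqis;qii++)best[qii]=_node_cost[0][qii];
  for(bi=1;bi<4;bi++){
    for(qii=0;qii<_nqis;qii++){
      unsigned best_cost;
      int      best_qij;
      best_cost=UINT_MAX;
      best_qij=0;
      for(qij=0;qij<_nqis;qij++){
        unsigned cost;
        cost=best[qij]+_trans[qij][qii]+_node_cost[bi][qii];
        if(cost<best_cost){
          best_cost=cost;
          best_qij=qij;
        }
      }
      cur[qii]=best_cost;
      prev[bi][qii]=best_qij;
    }
    for(qii=0;qii<_nqis;qii++)best[qii]=cur[qii];
  }
  qii=0;
  for(qij=1;qij<_nqis;qij++)if(best[qij]<best[qii])qii=qij;
  for(bi=3;bi>=0;bi--){
    _path[bi]=qii;
    if(bi>0)qii=prev[bi][qii];
  }
  return best[_path[3]];
}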
1514
1515
/*Select a block-level quantizer for a single chroma block in an INTRA frame.*/
1516
static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
1517
11.2M
 const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi,unsigned _rd_scale){
1518
11.2M
  const unsigned char *src;
1519
11.2M
  oc_fragment         *frags;
1520
11.2M
  ptrdiff_t            frag_offs;
1521
11.2M
  oc_qii_state         qt[3];
1522
11.2M
  unsigned             cost[3];
1523
11.2M
  unsigned             satd;
1524
11.2M
  int                  dc;
1525
11.2M
  unsigned             best_cost;
1526
11.2M
  int                  best_qii;
1527
11.2M
  int                  qii;
1528
11.2M
  int                  lambda;
1529
11.2M
  int                  ystride;
1530
11.2M
  int                  nqis;
1531
11.2M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1532
11.2M
  ystride=_enc->state.ref_ystride[_pli];
1533
11.2M
  frag_offs=_enc->state.frag_buf_offs[_fragi];
1534
11.2M
  if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
1535
11.2M
    satd=oc_enc_frag_intra_satd(_enc,&dc,src+frag_offs,ystride);
1536
11.2M
  }
1537
0
  else{
1538
0
    satd=oc_enc_frag_intra_sad(_enc,src+frag_offs,ystride);
1539
0
  }
1540
  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
1541
     worth spending the bits to change the AC quantizer.
1542
    TODO: This may be worth revisiting when we separate out DC and AC
1543
     predictions from SATD.*/
1544
#if 0
1545
  nqis=_enc->state.nqis;
1546
#else
1547
11.2M
  nqis=1;
1548
11.2M
#endif
1549
11.2M
  lambda=_enc->lambda;
1550
11.2M
  best_qii=0;
1551
22.4M
  for(qii=0;qii<nqis;qii++){
1552
11.2M
    unsigned cur_rate;
1553
11.2M
    unsigned cur_ssd;
1554
11.2M
    oc_qii_state_advance(qt+qii,_qs,qii);
1555
11.2M
    cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,_pli,0,satd)
1556
11.2M
     +(qt[qii].bits-_qs->bits<<OC_BIT_SCALE);
1557
11.2M
    cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
1558
11.2M
    cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda);
1559
11.2M
  }
1560
11.2M
  best_cost=cost[0];
1561
11.2M
  for(qii=1;qii<nqis;qii++){
1562
0
    if(cost[qii]<best_cost){
1563
0
      best_cost=cost[qii];
1564
0
      best_qii=qii;
1565
0
    }
1566
0
  }
1567
11.2M
  frags=_enc->state.frags;
1568
11.2M
  frags[_fragi].qii=best_qii;
1569
11.2M
  return best_cost;
1570
11.2M
}
1571
1572
static void oc_enc_mb_transform_quantize_intra_luma(oc_enc_ctx *_enc,
1573
 oc_enc_pipeline_state *_pipe,unsigned _mbi,
1574
2.81M
 const unsigned _rd_scale[4],const unsigned _rd_iscale[4]){
1575
  /*Worst case token stack usage for 4 fragments.*/
1576
2.81M
  oc_token_checkpoint  stack[64*4];
1577
2.81M
  oc_token_checkpoint *stackptr;
1578
2.81M
  const oc_sb_map     *sb_maps;
1579
2.81M
  oc_fragment         *frags;
1580
2.81M
  ptrdiff_t           *coded_fragis;
1581
2.81M
  ptrdiff_t            ncoded_fragis;
1582
2.81M
  ptrdiff_t            fragi;
1583
2.81M
  int                  bi;
1584
2.81M
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1585
2.81M
  frags=_enc->state.frags;
1586
2.81M
  coded_fragis=_pipe->coded_fragis[0];
1587
2.81M
  ncoded_fragis=_pipe->ncoded_fragis[0];
1588
2.81M
  stackptr=stack;
1589
14.0M
  for(bi=0;bi<4;bi++){
1590
11.2M
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1591
11.2M
    frags[fragi].refi=OC_FRAME_SELF;
1592
11.2M
    frags[fragi].mb_mode=OC_MODE_INTRA;
1593
11.2M
    oc_enc_block_transform_quantize(_enc,_pipe,0,fragi,
1594
11.2M
     _rd_scale[bi],_rd_iscale[bi],NULL,NULL,&stackptr);
1595
11.2M
    coded_fragis[ncoded_fragis++]=fragi;
1596
11.2M
  }
1597
2.81M
  _pipe->ncoded_fragis[0]=ncoded_fragis;
1598
2.81M
}
1599
1600
static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
1601
346k
 oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
1602
346k
  const ogg_uint16_t *mcu_rd_scale;
1603
346k
  const ogg_uint16_t *mcu_rd_iscale;
1604
346k
  const oc_sb_map    *sb_maps;
1605
346k
  ptrdiff_t          *coded_fragis;
1606
346k
  ptrdiff_t           ncoded_fragis;
1607
346k
  ptrdiff_t           froffset;
1608
346k
  int                 sbi;
1609
346k
  mcu_rd_scale=(const ogg_uint16_t *)_enc->mcu_rd_scale;
1610
346k
  mcu_rd_iscale=(const ogg_uint16_t *)_enc->mcu_rd_iscale;
1611
346k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1612
346k
  coded_fragis=_pipe->coded_fragis[_pli];
1613
346k
  ncoded_fragis=_pipe->ncoded_fragis[_pli];
1614
346k
  froffset=_pipe->froffset[_pli];
1615
2.06M
  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
1616
    /*Worst case token stack usage for 1 fragment.*/
1617
1.72M
    oc_token_checkpoint stack[64];
1618
1.72M
    int                 quadi;
1619
1.72M
    int                 bi;
1620
34.4M
    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1621
27.5M
      ptrdiff_t fragi;
1622
27.5M
      fragi=sb_maps[sbi][quadi][bi];
1623
27.5M
      if(fragi>=0){
1624
11.2M
        oc_token_checkpoint *stackptr;
1625
11.2M
        unsigned             rd_scale;
1626
11.2M
        unsigned             rd_iscale;
1627
11.2M
        rd_scale=mcu_rd_scale[fragi-froffset];
1628
11.2M
        rd_iscale=mcu_rd_iscale[fragi-froffset];
1629
11.2M
        oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi,rd_scale);
1630
11.2M
        stackptr=stack;
1631
11.2M
        oc_enc_block_transform_quantize(_enc,_pipe,_pli,fragi,
1632
11.2M
         rd_scale,rd_iscale,NULL,NULL,&stackptr);
1633
11.2M
        coded_fragis[ncoded_fragis++]=fragi;
1634
11.2M
      }
1635
27.5M
    }
1636
1.72M
  }
1637
346k
  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
1638
346k
}
1639
1640
/*Analysis stage for an INTRA frame.*/
1641
22.6k
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
1642
22.6k
  ogg_int64_t             activity_sum;
1643
22.6k
  ogg_int64_t             luma_sum;
1644
22.6k
  unsigned                activity_avg;
1645
22.6k
  unsigned                luma_avg;
1646
22.6k
  const ogg_uint16_t     *chroma_rd_scale;
1647
22.6k
  ogg_uint16_t           *mcu_rd_scale;
1648
22.6k
  ogg_uint16_t           *mcu_rd_iscale;
1649
22.6k
  const unsigned char    *map_idxs;
1650
22.6k
  int                     nmap_idxs;
1651
22.6k
  oc_sb_flags            *sb_flags;
1652
22.6k
  signed char            *mb_modes;
1653
22.6k
  const oc_mb_map        *mb_maps;
1654
22.6k
  const oc_sb_map        *sb_maps;
1655
22.6k
  oc_fragment            *frags;
1656
22.6k
  unsigned                stripe_sby;
1657
22.6k
  unsigned                mcu_nvsbs;
1658
22.6k
  int                     notstart;
1659
22.6k
  int                     notdone;
1660
22.6k
  int                     refi;
1661
22.6k
  int                     pli;
1662
22.6k
  _enc->state.frame_type=OC_INTRA_FRAME;
1663
22.6k
  oc_enc_tokenize_start(_enc);
1664
22.6k
  oc_enc_pipeline_init(_enc,&_enc->pipe);
1665
22.6k
  oc_enc_mode_rd_init(_enc);
1666
22.6k
  activity_sum=luma_sum=0;
1667
22.6k
  activity_avg=_enc->activity_avg;
1668
22.6k
  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
1669
22.6k
  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTRA_FRAME][_enc->state.qis[0]];
1670
22.6k
  mcu_rd_scale=_enc->mcu_rd_scale;
1671
22.6k
  mcu_rd_iscale=_enc->mcu_rd_iscale;
1672
  /*Choose MVs and MB modes and quantize and code luma.
1673
    Must be done in Hilbert order.*/
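  /*(The stripe loop below works through mcu_nvsbs super-block rows at a
     time, coding all of the stripe's luma first and then each chroma plane,
     calling oc_enc_pipeline_finish_mcu_plane on every plane as it
     completes.)*/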
1674
22.6k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1675
22.6k
  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1676
22.6k
  _enc->state.ncoded_fragis[0]=0;
1677
22.6k
  _enc->state.ncoded_fragis[1]=0;
1678
22.6k
  _enc->state.ncoded_fragis[2]=0;
1679
22.6k
  sb_flags=_enc->state.sb_flags;
1680
22.6k
  mb_modes=_enc->state.mb_modes;
1681
22.6k
  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
1682
22.6k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1683
22.6k
  frags=_enc->state.frags;
1684
22.6k
  notstart=0;
1685
22.6k
  notdone=1;
1686
22.6k
  mcu_nvsbs=_enc->mcu_nvsbs;
1687
196k
  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
1688
173k
    ptrdiff_t cfroffset;
1689
173k
    unsigned  sbi;
1690
173k
    unsigned  sbi_end;
1691
173k
    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
1692
173k
    sbi_end=_enc->pipe.sbi_end[0];
1693
173k
    cfroffset=_enc->pipe.froffset[1];
1694
1.48M
    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
1695
1.31M
      int quadi;
1696
      /*Mode addressing is through Y plane, always 4 MB per SB.*/
1697
6.56M
      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
1698
2.81M
        unsigned  activity[4];
1699
2.81M
        unsigned  rd_scale[5];
1700
2.81M
        unsigned  rd_iscale[5];
1701
2.81M
        unsigned  luma;
1702
2.81M
        unsigned  mbi;
1703
2.81M
        int       mapii;
1704
2.81M
        int       mapi;
1705
2.81M
        int       bi;
1706
2.81M
        ptrdiff_t fragi;
1707
2.81M
        mbi=sbi<<2|quadi;
1708
        /*Activity masking.*/
1709
2.81M
        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
1710
2.81M
          luma=oc_mb_activity(_enc,mbi,activity);
1711
2.81M
        }
1712
0
        else{
1713
0
          unsigned intra_satd[12];
1714
0
          luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
1715
0
          oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
1716
0
          for(bi=0;bi<4;bi++)frags[sb_maps[mbi>>2][mbi&3][bi]].qii=0;
1717
0
        }
1718
2.81M
        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
1719
2.81M
         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
1720
2.81M
        luma_sum+=luma;
1721
        /*Motion estimation:
1722
          We do a basic 1MV search for all macroblocks, coded or not,
1723
           keyframe or not, unless we aren't using motion estimation at all.*/
1724
2.81M
        if(!_recode&&_enc->state.curframe_num>0&&
1725
2.81M
         _enc->sp_level<OC_SP_LEVEL_NOMC&&_enc->keyframe_frequency_force>1){
1726
648
          oc_mcenc_search(_enc,mbi);
1727
648
        }
1728
2.81M
        if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
1729
2.81M
          oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale);
1730
2.81M
        }
1731
2.81M
        mb_modes[mbi]=OC_MODE_INTRA;
1732
2.81M
        oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe,
1733
2.81M
         mbi,rd_scale,rd_iscale);
1734
        /*Propagate final MB mode and MVs to the chroma blocks.*/
1735
14.0M
        for(mapii=4;mapii<nmap_idxs;mapii++){
1736
11.2M
          mapi=map_idxs[mapii];
1737
11.2M
          pli=mapi>>2;
1738
11.2M
          bi=mapi&3;
1739
11.2M
          fragi=mb_maps[mbi][pli][bi];
1740
11.2M
          frags[fragi].refi=OC_FRAME_SELF;
1741
11.2M
          frags[fragi].mb_mode=OC_MODE_INTRA;
1742
11.2M
        }
1743
        /*Save masking scale factors for chroma blocks.*/
1744
8.42M
        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
1745
5.60M
          mapi=map_idxs[mapii];
1746
5.60M
          bi=mapi&3;
1747
5.60M
          fragi=mb_maps[mbi][1][bi];
1748
5.60M
          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
1749
5.60M
          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
1750
5.60M
        }
1751
2.81M
      }
1752
1.31M
    }
1753
173k
    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
1754
    /*Code chroma planes.*/
1755
520k
    for(pli=1;pli<3;pli++){
1756
346k
      oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe,
1757
346k
       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
1758
346k
      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
1759
346k
    }
1760
173k
    notstart=1;
1761
173k
  }
1762
  /*Compute the average block activity and MB luma score for the frame.*/
1763
22.6k
  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
1764
22.6k
   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
1765
22.6k
   _enc->state.fplanes[0].nfrags));
1766
22.6k
  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
1767
  /*Finish filling in the reference frame borders.*/
1768
22.6k
  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
1769
90.6k
  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
1770
22.6k
  _enc->state.ntotal_coded_fragis=_enc->state.nfrags;
1771
22.6k
}
1772
1773
1774
1775
/*Cost information about a MB mode.*/
1776
struct oc_mode_choice{
1777
  unsigned      cost;
1778
  unsigned      ssd;
1779
  unsigned      rate;
1780
  unsigned      overhead;
1781
  unsigned char qii[12];
1782
};
1783
1784
1785
1786
5.60M
static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
1787
5.60M
  _modec->cost=OC_MODE_RD_COST(_modec->ssd,
1788
5.60M
   _modec->rate+_modec->overhead,_lambda);
1789
5.60M
}
1790
1791
/*A set of skip SSDs used to disable early skipping.*/

1792
static const unsigned OC_NOSKIP[12]={
1793
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
1794
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
1795
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX
1796
};
1797
1798
/*The estimated number of bits used by a coded chroma block to specify the AC
1799
   quantizer.
1800
  TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression);
1801
   measurements suggest this is in the right ballpark, but it varies somewhat
1802
   with lambda.*/
1803
10.4M
#define OC_CHROMA_QII_RATE ((0xCAE00D1DU>>31-OC_BIT_SCALE)+1>>1)
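/*How the constant above decodes (a reading of the bits, not new behavior):
   0xCAE00D1D is log2(3)~=1.5849625 in Q31 fixed point, so shifting right by
   31-OC_BIT_SCALE re-expresses it with OC_BIT_SCALE fractional bits, and the
   final +1>>1 halves it with rounding, giving 0.5*log2(3)~=0.79 bits in the
   encoder's scaled rate units.
  E.g., if OC_BIT_SCALE were 6: (0xCAE00D1DU>>31-6)+1>>1==(101+1)>>1==51, and
   51/64~=0.797.*/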
1804
1805
static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
1806
 oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1807
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
1808
4.26M
 const unsigned _rd_scale[4],int _qti){
1809
4.26M
  oc_fr_state  fr;
1810
4.26M
  oc_qii_state qs;
1811
4.26M
  unsigned     ssd;
1812
4.26M
  unsigned     rate;
1813
4.26M
  unsigned     satd;
1814
4.26M
  unsigned     best_ssd;
1815
4.26M
  unsigned     best_rate;
1816
4.26M
  int          best_fri;
1817
4.26M
  int          best_qii;
1818
4.26M
  int          lambda;
1819
4.26M
  int          nqis;
1820
4.26M
  int          nskipped;
1821
4.26M
  int          bi;
1822
4.26M
  lambda=_enc->lambda;
1823
4.26M
  nqis=_enc->state.nqis;
1824
  /*We could do a trellis optimization here, but we don't make final skip
1825
     decisions until after transform+quantization, so the result wouldn't be
1826
     optimal anyway.
1827
    Instead we just use a greedy approach; for most SATD values, the
1828
     differences between the qiis are large enough to drown out the cost to
1829
     code the flags.*/
1830
4.26M
  *&fr=*_fr;
1831
4.26M
  *&qs=*_qs;
1832
4.26M
  ssd=rate=nskipped=0;
1833
21.3M
  for(bi=0;bi<4;bi++){
1834
17.0M
    oc_fr_state  ft[2];
1835
17.0M
    oc_qii_state qt[3];
1836
17.0M
    unsigned     best_cost;
1837
17.0M
    unsigned     cur_cost;
1838
17.0M
    unsigned     cur_ssd;
1839
17.0M
    unsigned     cur_rate;
1840
17.0M
    unsigned     cur_overhead;
1841
17.0M
    int          qii;
1842
17.0M
    satd=_frag_satd[bi];
1843
17.0M
    *(ft+0)=*&fr;
1844
17.0M
    oc_fr_code_block(ft+0);
1845
17.0M
    cur_overhead=ft[0].bits-fr.bits;
1846
17.0M
    best_rate=oc_dct_cost2(_enc,&best_ssd,0,0,_qti,satd)
1847
17.0M
     +(cur_overhead<<OC_BIT_SCALE);
1848
17.0M
    if(nqis>1){
1849
5.56M
      oc_qii_state_advance(qt+0,&qs,0);
1850
5.56M
      best_rate+=qt[0].bits-qs.bits<<OC_BIT_SCALE;
1851
5.56M
    }
1852
17.0M
    best_ssd=OC_RD_SCALE(best_ssd,_rd_scale[bi]);
1853
17.0M
    best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1854
17.0M
    best_fri=0;
1855
17.0M
    best_qii=0;
1856
23.0M
    for(qii=1;qii<nqis;qii++){
1857
5.96M
      oc_qii_state_advance(qt+qii,&qs,qii);
1858
5.96M
      cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,0,_qti,satd)
1859
5.96M
       +(cur_overhead+qt[qii].bits-qs.bits<<OC_BIT_SCALE);
1860
5.96M
      cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale[bi]);
1861
5.96M
      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1862
5.96M
      if(cur_cost<best_cost){
1863
1.67M
        best_cost=cur_cost;
1864
1.67M
        best_ssd=cur_ssd;
1865
1.67M
        best_rate=cur_rate;
1866
1.67M
        best_qii=qii;
1867
1.67M
      }
1868
5.96M
    }
1869
17.0M
    if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)&&nskipped<3){
1870
9.45M
      *(ft+1)=*&fr;
1871
9.45M
      oc_fr_skip_block(ft+1);
1872
9.45M
      cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE;
1873
9.45M
      cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1874
9.45M
      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda);
1875
9.45M
      if(cur_cost<=best_cost){
1876
2.67M
        best_ssd=cur_ssd;
1877
2.67M
        best_rate=cur_overhead;
1878
2.67M
        best_fri=1;
1879
2.67M
        best_qii+=4;
1880
2.67M
      }
1881
9.45M
    }
1882
17.0M
    rate+=best_rate;
1883
17.0M
    ssd+=best_ssd;
1884
17.0M
    *&fr=*(ft+best_fri);
1885
17.0M
    if(best_fri==0)*&qs=*(qt+best_qii);
1886
2.67M
    else nskipped++;
1887
17.0M
    _modec->qii[bi]=best_qii;
1888
17.0M
  }
1889
4.26M
  _modec->ssd=ssd;
1890
4.26M
  _modec->rate=rate;
1891
4.26M
}
1892
1893
static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
1894
 oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1895
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
1896
4.26M
 unsigned _rd_scale,int _qti){
1897
4.26M
  unsigned ssd;
1898
4.26M
  unsigned rate;
1899
4.26M
  unsigned satd;
1900
4.26M
  unsigned best_ssd;
1901
4.26M
  unsigned best_rate;
1902
4.26M
  int      best_qii;
1903
4.26M
  unsigned cur_cost;
1904
4.26M
  unsigned cur_ssd;
1905
4.26M
  unsigned cur_rate;
1906
4.26M
  int      lambda;
1907
4.26M
  int      nblocks;
1908
4.26M
  int      nqis;
1909
4.26M
  int      pli;
1910
4.26M
  int      bi;
1911
4.26M
  int      qii;
1912
4.26M
  lambda=_enc->lambda;
1913
  /*Most chroma blocks have no AC coefficients to speak of anyway, so it's not
1914
     worth spending the bits to change the AC quantizer.
1915
    TODO: This may be worth revisiting when we separate out DC and AC
1916
     predictions from SATD.*/
1917
#if 0
1918
  nqis=_enc->state.nqis;
1919
#else
1920
4.26M
  nqis=1;
1921
4.26M
#endif
1922
4.26M
  ssd=_modec->ssd;
1923
4.26M
  rate=_modec->rate;
1924
  /*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded
1925
     order, we assume a constant overhead for coded block and qii flags.*/
1926
4.26M
  nblocks=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1927
4.26M
  nblocks=(nblocks-4>>1)+4;
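  /*The map index list orders all Cb fragments before all Cr fragments, so
     halving the chroma count here makes the first pass cover exactly the Cb
     half; the matching (nblocks-4<<1)+4 at the bottom of the loop restores
     the full count so the second pass covers Cr.*/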
1928
4.26M
  bi=4;
1929
12.8M
  for(pli=1;pli<3;pli++){
1930
18.9M
    for(;bi<nblocks;bi++){
1931
10.4M
      unsigned best_cost;
1932
10.4M
      satd=_frag_satd[bi];
1933
10.4M
      best_rate=oc_dct_cost2(_enc,&best_ssd,0,pli,_qti,satd)
1934
10.4M
       +OC_CHROMA_QII_RATE;
1935
10.4M
      best_ssd=OC_RD_SCALE(best_ssd,_rd_scale);
1936
10.4M
      best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1937
10.4M
      best_qii=0;
1938
10.4M
      for(qii=1;qii<nqis;qii++){
1939
0
        cur_rate=oc_dct_cost2(_enc,&cur_ssd,qii,pli,_qti,satd)
1940
0
         +OC_CHROMA_QII_RATE;
1941
0
        cur_ssd=OC_RD_SCALE(cur_ssd,_rd_scale);
1942
0
        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1943
0
        if(cur_cost<best_cost){
1944
0
          best_cost=cur_cost;
1945
0
          best_ssd=cur_ssd;
1946
0
          best_rate=cur_rate;
1947
0
          best_qii=qii;
1948
0
        }
1949
0
      }
1950
10.4M
      if(_skip_ssd[bi]<(UINT_MAX>>OC_BIT_SCALE+2)){
1951
7.44M
        cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1952
7.44M
        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
1953
7.44M
        if(cur_cost<=best_cost){
1954
3.06M
          best_ssd=cur_ssd;
1955
3.06M
          best_rate=0;
1956
3.06M
          best_qii+=4;
1957
3.06M
        }
1958
7.44M
      }
1959
10.4M
      rate+=best_rate;
1960
10.4M
      ssd+=best_ssd;
1961
10.4M
      _modec->qii[bi]=best_qii;
1962
10.4M
    }
1963
8.53M
    nblocks=(nblocks-4<<1)+4;
1964
8.53M
  }
1965
4.26M
  _modec->ssd=ssd;
1966
4.26M
  _modec->rate=rate;
1967
4.26M
}
1968
1969
static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
1970
411k
 unsigned _mbi,const unsigned _rd_scale[4],unsigned _ssd[12]){
1971
411k
  const unsigned char   *src;
1972
411k
  const unsigned char   *ref;
1973
411k
  int                    ystride;
1974
411k
  const oc_fragment     *frags;
1975
411k
  const ptrdiff_t       *frag_buf_offs;
1976
411k
  const ptrdiff_t       *sb_map;
1977
411k
  const oc_mb_map_plane *mb_map;
1978
411k
  const unsigned char   *map_idxs;
1979
411k
  oc_mv                 *mvs;
1980
411k
  int                    map_nidxs;
1981
411k
  unsigned               uncoded_ssd;
1982
411k
  int                    mapii;
1983
411k
  int                    mapi;
1984
411k
  int                    pli;
1985
411k
  int                    bi;
1986
411k
  ptrdiff_t              fragi;
1987
411k
  ptrdiff_t              frag_offs;
1988
411k
  int                    borderi;
1989
411k
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
1990
411k
  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
1991
411k
  ystride=_enc->state.ref_ystride[0];
1992
411k
  frags=_enc->state.frags;
1993
411k
  frag_buf_offs=_enc->state.frag_buf_offs;
1994
411k
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1995
411k
  mvs=_enc->mb_info[_mbi].block_mv;
1996
2.05M
  for(bi=0;bi<4;bi++){
1997
1.64M
    fragi=sb_map[bi];
1998
1.64M
    borderi=frags[fragi].borderi;
1999
1.64M
    frag_offs=frag_buf_offs[fragi];
2000
1.64M
    if(borderi<0){
2001
1.19M
      uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
2002
1.19M
    }
2003
449k
    else{
2004
449k
      uncoded_ssd=oc_enc_frag_border_ssd(_enc,
2005
449k
       src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
2006
449k
    }
2007
    /*Scale to match DCT domain and RD.*/
2008
1.64M
    uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[bi]);
2009
    /*Motion is a special case; if there is more than a full-pixel motion
2010
       against the prior frame, penalize skipping.
2011
      TODO: The factor of two here is a kludge, but it tested out better than a
2012
       hard limit.*/
2013
1.64M
    if(mvs[bi]!=0)uncoded_ssd*=2;
2014
1.64M
    _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd;
2015
1.64M
  }
2016
411k
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2017
411k
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2018
411k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2019
411k
  map_nidxs=(map_nidxs-4>>1)+4;
2020
411k
  mapii=4;
2021
411k
  mvs=_enc->mb_info[_mbi].unref_mv;
2022
1.23M
  for(pli=1;pli<3;pli++){
2023
822k
    ystride=_enc->state.ref_ystride[pli];
2024
1.82M
    for(;mapii<map_nidxs;mapii++){
2025
1.00M
      mapi=map_idxs[mapii];
2026
1.00M
      bi=mapi&3;
2027
1.00M
      fragi=mb_map[pli][bi];
2028
1.00M
      borderi=frags[fragi].borderi;
2029
1.00M
      frag_offs=frag_buf_offs[fragi];
2030
1.00M
      if(borderi<0){
2031
716k
        uncoded_ssd=oc_enc_frag_ssd(_enc,src+frag_offs,ref+frag_offs,ystride);
2032
716k
      }
2033
289k
      else{
2034
289k
        uncoded_ssd=oc_enc_frag_border_ssd(_enc,
2035
289k
         src+frag_offs,ref+frag_offs,ystride,_enc->state.borders[borderi].mask);
2036
289k
      }
2037
      /*Scale to match DCT domain and RD.*/
2038
1.00M
      uncoded_ssd=OC_RD_SKIP_SCALE(uncoded_ssd,_rd_scale[4]);
2039
      /*Motion is a special case; if there is more than a full-pixel motion
2040
         against the prior frame, penalize skipping.
2041
        TODO: The factor of two here is a kludge, but it tested out better than
2042
         a hard limit.*/
2043
1.00M
      if(mvs[OC_FRAME_PREV]!=0)uncoded_ssd*=2;
2044
1.00M
      _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd;
2045
1.00M
    }
2046
822k
    map_nidxs=(map_nidxs-4<<1)+4;
2047
822k
  }
2048
411k
}
2049
2050
2051
static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2052
 unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
2053
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],
2054
822k
 const unsigned _rd_scale[5]){
2055
822k
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,_rd_scale,0);
2056
822k
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2057
822k
   _frag_satd,_skip_ssd,_rd_scale[4],0);
2058
822k
  _modec->overhead=
2059
822k
   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE;
2060
822k
  oc_mode_set_cost(_modec,_enc->lambda);
2061
822k
}
2062
2063
static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2064
 unsigned _mbi,int _mb_mode,oc_mv _mv,
2065
 const oc_fr_state *_fr,const oc_qii_state *_qs,
2066
2.98M
 const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2067
2.98M
  unsigned               frag_satd[12];
2068
2.98M
  const unsigned char   *src;
2069
2.98M
  const unsigned char   *ref;
2070
2.98M
  int                    ystride;
2071
2.98M
  const ptrdiff_t       *frag_buf_offs;
2072
2.98M
  const ptrdiff_t       *sb_map;
2073
2.98M
  const oc_mb_map_plane *mb_map;
2074
2.98M
  const unsigned char   *map_idxs;
2075
2.98M
  int                    map_nidxs;
2076
2.98M
  int                    mapii;
2077
2.98M
  int                    mapi;
2078
2.98M
  int                    mv_offs[2];
2079
2.98M
  int                    pli;
2080
2.98M
  int                    bi;
2081
2.98M
  ptrdiff_t              fragi;
2082
2.98M
  ptrdiff_t              frag_offs;
2083
2.98M
  int                    dc;
2084
2.98M
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
2085
2.98M
  ref=_enc->state.ref_frame_data[OC_FRAME_FOR_MODE(_mb_mode)];
2086
2.98M
  ystride=_enc->state.ref_ystride[0];
2087
2.98M
  frag_buf_offs=_enc->state.frag_buf_offs;
2088
2.98M
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
2089
2.98M
  _modec->rate=_modec->ssd=0;
2090
2.98M
  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv)>1){
2091
2.87M
    for(bi=0;bi<4;bi++){
2092
2.30M
      fragi=sb_map[bi];
2093
2.30M
      frag_offs=frag_buf_offs[fragi];
2094
2.30M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2095
2.30M
        frag_satd[bi]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2096
2.30M
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2097
2.30M
        frag_satd[bi]+=abs(dc);
2098
2.30M
      }
2099
0
      else{
2100
0
        frag_satd[bi]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
2101
0
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
2102
0
      }
2103
2.30M
    }
2104
575k
  }
2105
2.40M
  else{
2106
12.0M
    for(bi=0;bi<4;bi++){
2107
9.61M
      fragi=sb_map[bi];
2108
9.61M
      frag_offs=frag_buf_offs[fragi];
2109
9.61M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2110
9.61M
        frag_satd[bi]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2111
9.61M
         ref+frag_offs+mv_offs[0],ystride);
2112
9.61M
        frag_satd[bi]+=abs(dc);
2113
9.61M
      }
2114
0
      else{
2115
0
        frag_satd[bi]=oc_enc_frag_sad(_enc,src+frag_offs,
2116
0
         ref+frag_offs+mv_offs[0],ystride);
2117
0
      }
2118
9.61M
    }
2119
2.40M
  }
2120
2.98M
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2121
2.98M
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2122
2.98M
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2123
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
2124
2.98M
  ystride=_enc->state.ref_ystride[1];
2125
2.98M
  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,_mv)>1){
2126
3.50M
    for(mapii=4;mapii<map_nidxs;mapii++){
2127
2.42M
      mapi=map_idxs[mapii];
2128
2.42M
      pli=mapi>>2;
2129
2.42M
      bi=mapi&3;
2130
2.42M
      fragi=mb_map[pli][bi];
2131
2.42M
      frag_offs=frag_buf_offs[fragi];
2132
2.42M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2133
2.42M
        frag_satd[mapii]=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2134
2.42M
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2135
2.42M
        frag_satd[mapii]+=abs(dc);
2136
2.42M
      }
2137
0
      else{
2138
0
        frag_satd[mapii]=oc_enc_frag_sad2_thresh(_enc,src+frag_offs,
2139
0
         ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
2140
0
      }
2141
2.42M
    }
2142
1.08M
  }
2143
1.89M
  else{
2144
6.75M
    for(mapii=4;mapii<map_nidxs;mapii++){
2145
4.85M
      mapi=map_idxs[mapii];
2146
4.85M
      pli=mapi>>2;
2147
4.85M
      bi=mapi&3;
2148
4.85M
      fragi=mb_map[pli][bi];
2149
4.85M
      frag_offs=frag_buf_offs[fragi];
2150
4.85M
      if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
2151
4.85M
        frag_satd[mapii]=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2152
4.85M
         ref+frag_offs+mv_offs[0],ystride);
2153
4.85M
        frag_satd[mapii]+=abs(dc);
2154
4.85M
      }
2155
0
      else{
2156
0
        frag_satd[mapii]=oc_enc_frag_sad(_enc,src+frag_offs,
2157
0
         ref+frag_offs+mv_offs[0],ystride);
2158
0
      }
2159
4.85M
    }
2160
1.89M
  }
2161
2.98M
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,_rd_scale,1);
2162
2.98M
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2163
2.98M
   frag_satd,_skip_ssd,_rd_scale[4],1);
2164
2.98M
  _modec->overhead=
2165
2.98M
   oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE;
2166
2.98M
  oc_mode_set_cost(_modec,_enc->lambda);
2167
2.98M
}
2168
2169
static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2170
 unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs,
2171
822k
 const unsigned _skip_ssd[12],const unsigned _rd_scale[4]){
2172
822k
  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,0,_fr,_qs,_skip_ssd,_rd_scale);
2173
822k
}
2174
2175
static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2176
 unsigned _mbi,int _mb_mode,oc_mv _mv,
2177
 const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12],
2178
1.33M
 const unsigned _rd_scale[4]){
2179
1.33M
  int bits0;
2180
1.33M
  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd,_rd_scale);
2181
1.33M
  bits0=OC_MV_BITS[0][OC_MV_X(_mv)+31]+OC_MV_BITS[0][OC_MV_Y(_mv)+31];
2182
1.33M
  _modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12)
2183
1.33M
   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
2184
1.33M
  oc_mode_set_cost(_modec,_enc->lambda);
2185
1.33M
  return bits0;
2186
1.33M
}
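
/*A note on the overhead charged above (a reading of the code, not new
   behavior): the frame's MVs can be coded either with a VLC (running total
   _enc->mv_bits[0]) or with fixed 6-bit components (_enc->mv_bits[1], 12 bits
   per MV), and the cheaper running total wins, so the marginal cost of this
   MV is OC_MINI(mv_bits[0]+bits0,mv_bits[1]+12)-OC_MINI(mv_bits[0],
   mv_bits[1]).*/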
2187
2188
/*A mapping from oc_mb_map (raster) ordering to oc_sb_map (Hilbert) ordering.*/
2189
static const unsigned char OC_MB_PHASE[4][4]={
2190
  {0,1,3,2},{0,3,1,2},{0,3,1,2},{2,3,1,0}
2191
};
2192
2193
static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
2194
 unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs,
2195
465k
 const unsigned _skip_ssd[12],const unsigned _rd_scale[5]){
2196
465k
  unsigned               frag_satd[12];
2197
465k
  oc_mv                  lbmvs[4];
2198
465k
  oc_mv                  cbmvs[4];
2199
465k
  const unsigned char   *src;
2200
465k
  const unsigned char   *ref;
2201
465k
  int                    ystride;
2202
465k
  const ptrdiff_t       *frag_buf_offs;
2203
465k
  oc_mv                 *frag_mvs;
2204
465k
  const oc_mb_map_plane *mb_map;
2205
465k
  const unsigned char   *map_idxs;
2206
465k
  int                    map_nidxs;
2207
465k
  int                    nqis;
2208
465k
  int                    mapii;
2209
465k
  int                    mapi;
2210
465k
  int                    mv_offs[2];
2211
465k
  int                    pli;
2212
465k
  int                    bi;
2213
465k
  ptrdiff_t              fragi;
2214
465k
  ptrdiff_t              frag_offs;
2215
465k
  int                    bits0;
2216
465k
  int                    bits1;
2217
465k
  unsigned               satd;
2218
465k
  int                    dc;
2219
465k
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
2220
465k
  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
2221
465k
  ystride=_enc->state.ref_ystride[0];
2222
465k
  frag_buf_offs=_enc->state.frag_buf_offs;
2223
465k
  frag_mvs=_enc->state.frag_mvs;
2224
465k
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
2225
465k
  _modec->rate=_modec->ssd=0;
2226
2.32M
  for(bi=0;bi<4;bi++){
2227
1.86M
    fragi=mb_map[0][bi];
2228
    /*Save the block MVs as the current ones while we're here; we'll replace
2229
       them if we don't ultimately choose 4MV mode.*/
2230
1.86M
    frag_mvs[fragi]=_mv[bi];
2231
1.86M
    frag_offs=frag_buf_offs[fragi];
2232
1.86M
    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,_mv[bi])>1){
2233
184k
      satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2234
184k
       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2235
184k
    }
2236
1.67M
    else{
2237
1.67M
      satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2238
1.67M
       ref+frag_offs+mv_offs[0],ystride);
2239
1.67M
    }
2240
1.86M
    frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd+abs(dc);
2241
1.86M
  }
2242
465k
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
2243
465k
   _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,_rd_scale,1);
2244
  /*Figure out which blocks are being skipped and give them (0,0) MVs.*/
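  /*(A skipped block was flagged above by adding 4 to its qii, so the
     qii>=nqis test below identifies it; see oc_analyze_mb_mode_luma.)*/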
2245
465k
  bits0=0;
2246
465k
  bits1=0;
2247
465k
  nqis=_enc->state.nqis;
2248
2.32M
  for(bi=0;bi<4;bi++){
2249
1.86M
    if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis)lbmvs[bi]=0;
2250
1.56M
    else{
2251
1.56M
      lbmvs[bi]=_mv[bi];
2252
1.56M
      bits0+=OC_MV_BITS[0][OC_MV_X(_mv[bi])+31]
2253
1.56M
       +OC_MV_BITS[0][OC_MV_Y(_mv[bi])+31];
2254
1.56M
      bits1+=12;
2255
1.56M
    }
2256
1.86M
  }
2257
465k
  (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,lbmvs);
2258
465k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2259
465k
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2260
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
2261
465k
  ystride=_enc->state.ref_ystride[1];
2262
1.61M
  for(mapii=4;mapii<map_nidxs;mapii++){
2263
1.14M
    mapi=map_idxs[mapii];
2264
1.14M
    pli=mapi>>2;
2265
1.14M
    bi=mapi&3;
2266
1.14M
    fragi=mb_map[pli][bi];
2267
1.14M
    frag_offs=frag_buf_offs[fragi];
2268
    /*TODO: We could save half these calls by re-using the results for the Cb
2269
       and Cr planes; is it worth it?*/
2270
1.14M
    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,cbmvs[bi])>1){
2271
673k
      satd=oc_enc_frag_satd2(_enc,&dc,src+frag_offs,
2272
673k
       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride);
2273
673k
    }
2274
473k
    else{
2275
473k
      satd=oc_enc_frag_satd(_enc,&dc,src+frag_offs,
2276
473k
       ref+frag_offs+mv_offs[0],ystride);
2277
473k
    }
2278
1.14M
    frag_satd[mapii]=satd+abs(dc);
2279
1.14M
  }
2280
465k
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,
2281
465k
   frag_satd,_skip_ssd,_rd_scale[4],1);
2282
465k
  _modec->overhead=
2283
465k
   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR)
2284
465k
   +OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1)
2285
465k
   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
2286
465k
  oc_mode_set_cost(_modec,_enc->lambda);
2287
465k
}
2288
2289
39.4k
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
2290
39.4k
  oc_set_chroma_mvs_func  set_chroma_mvs;
2291
39.4k
  oc_qii_state            intra_luma_qs;
2292
39.4k
  oc_mv                   last_mv;
2293
39.4k
  oc_mv                   prior_mv;
2294
39.4k
  ogg_int64_t             interbits;
2295
39.4k
  ogg_int64_t             intrabits;
2296
39.4k
  ogg_int64_t             activity_sum;
2297
39.4k
  ogg_int64_t             luma_sum;
2298
39.4k
  unsigned                activity_avg;
2299
39.4k
  unsigned                luma_avg;
2300
39.4k
  const ogg_uint16_t     *chroma_rd_scale;
2301
39.4k
  ogg_uint16_t           *mcu_rd_scale;
2302
39.4k
  ogg_uint16_t           *mcu_rd_iscale;
2303
39.4k
  const unsigned char    *map_idxs;
2304
39.4k
  int                     nmap_idxs;
2305
39.4k
  unsigned               *coded_mbis;
2306
39.4k
  unsigned               *uncoded_mbis;
2307
39.4k
  size_t                  ncoded_mbis;
2308
39.4k
  size_t                  nuncoded_mbis;
2309
39.4k
  oc_sb_flags            *sb_flags;
2310
39.4k
  signed char            *mb_modes;
2311
39.4k
  const oc_sb_map        *sb_maps;
2312
39.4k
  const oc_mb_map        *mb_maps;
2313
39.4k
  oc_mb_enc_info         *embs;
2314
39.4k
  oc_fragment            *frags;
2315
39.4k
  oc_mv                  *frag_mvs;
2316
39.4k
  unsigned                stripe_sby;
2317
39.4k
  unsigned                mcu_nvsbs;
2318
39.4k
  int                     notstart;
2319
39.4k
  int                     notdone;
2320
39.4k
  unsigned                sbi;
2321
39.4k
  unsigned                sbi_end;
2322
39.4k
  int                     refi;
2323
39.4k
  int                     pli;
2324
39.4k
  int                     sp_level;
2325
39.4k
  sp_level=_enc->sp_level;
2326
39.4k
  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
2327
39.4k
  _enc->state.frame_type=OC_INTER_FRAME;
2328
39.4k
  oc_mode_scheme_chooser_reset(&_enc->chooser);
2329
39.4k
  oc_enc_tokenize_start(_enc);
2330
39.4k
  oc_enc_pipeline_init(_enc,&_enc->pipe);
2331
39.4k
  oc_enc_mode_rd_init(_enc);
2332
39.4k
  if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
2333
39.4k
  _enc->mv_bits[0]=_enc->mv_bits[1]=0;
2334
39.4k
  interbits=intrabits=0;
2335
39.4k
  activity_sum=luma_sum=0;
2336
39.4k
  activity_avg=_enc->activity_avg;
2337
39.4k
  luma_avg=OC_CLAMPI(90<<8,_enc->luma_avg,160<<8);
2338
39.4k
  chroma_rd_scale=_enc->chroma_rd_scale[OC_INTER_FRAME][_enc->state.qis[0]];
2339
39.4k
  mcu_rd_scale=_enc->mcu_rd_scale;
2340
39.4k
  mcu_rd_iscale=_enc->mcu_rd_iscale;
2341
39.4k
  last_mv=prior_mv=0;
2342
  /*Choose MVs and MB modes and quantize and code luma.
2343
    Must be done in Hilbert order.*/
2344
39.4k
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
2345
39.4k
  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
2346
39.4k
  coded_mbis=_enc->coded_mbis;
2347
39.4k
  uncoded_mbis=coded_mbis+_enc->state.nmbs;
2348
39.4k
  ncoded_mbis=0;
2349
39.4k
  nuncoded_mbis=0;
2350
39.4k
  _enc->state.ncoded_fragis[0]=0;
2351
39.4k
  _enc->state.ncoded_fragis[1]=0;
2352
39.4k
  _enc->state.ncoded_fragis[2]=0;
2353
39.4k
  sb_flags=_enc->state.sb_flags;
2354
39.4k
  mb_modes=_enc->state.mb_modes;
2355
39.4k
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
2356
39.4k
  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
2357
39.4k
  embs=_enc->mb_info;
2358
39.4k
  frags=_enc->state.frags;
2359
39.4k
  frag_mvs=_enc->state.frag_mvs;
2360
39.4k
  notstart=0;
2361
39.4k
  notdone=1;
2362
39.4k
  mcu_nvsbs=_enc->mcu_nvsbs;
2363
81.8k
  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
2364
42.4k
    ptrdiff_t cfroffset;
2365
42.4k
    notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
2366
42.4k
    sbi_end=_enc->pipe.sbi_end[0];
2367
42.4k
    cfroffset=_enc->pipe.froffset[1];
2368
166k
    for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
2369
123k
      int quadi;
2370
      /*Mode addressing is through Y plane, always 4 MB per SB.*/
2371
618k
      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
2372
411k
        oc_mode_choice modes[8];
2373
411k
        unsigned       activity[4];
2374
411k
        unsigned       rd_scale[5];
2375
411k
        unsigned       rd_iscale[5];
2376
411k
        unsigned       skip_ssd[12];
2377
411k
        unsigned       intra_satd[12];
2378
411k
        unsigned       luma;
2379
411k
        int            mb_mv_bits_0;
2380
411k
        int            mb_gmv_bits_0;
2381
411k
        int            inter_mv_pref;
2382
411k
        int            mb_mode;
2383
411k
        int            refi;
2384
411k
        int            mv;
2385
411k
        unsigned       mbi;
2386
411k
        int            mapii;
2387
411k
        int            mapi;
2388
411k
        int            bi;
2389
411k
        ptrdiff_t      fragi;
2390
411k
        mbi=sbi<<2|quadi;
2391
411k
        luma=oc_mb_intra_satd(_enc,mbi,intra_satd);
2392
        /*Activity masking.*/
2393
411k
        if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
2394
411k
          oc_mb_activity(_enc,mbi,activity);
2395
411k
        }
2396
0
        else oc_mb_activity_fast(_enc,mbi,activity,intra_satd);
2397
411k
        luma_sum+=luma;
2398
411k
        activity_sum+=oc_mb_masking(rd_scale,rd_iscale,
2399
411k
         chroma_rd_scale,activity,activity_avg,luma,luma_avg);
2400
        /*Motion estimation:
2401
          We always do a basic 1MV search for all macroblocks, coded or not,
2402
           keyframe or not.*/
2403
411k
        if(!_recode&&sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
2404
411k
        mv=0;
2405
        /*Find the block choice with the lowest estimated coding cost.
2406
          If a Cb or Cr block is coded but no Y' block from the same
2407
           macroblock, then the mode MUST be OC_MODE_INTER_NOMV.
2408
          This is the default state to which the mode data structure is
2409
           initialised in encoder and decoder at the start of each frame.*/
2410
        /*Block coding cost is estimated from correlated SATD metrics.*/
2411
        /*At this point, all blocks that are in frame are still marked coded.*/
2412
411k
        if(!_recode){
2413
313k
          embs[mbi].unref_mv[OC_FRAME_GOLD]=
2414
313k
           embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
2415
313k
          embs[mbi].unref_mv[OC_FRAME_PREV]=
2416
313k
           embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2417
313k
          embs[mbi].refined=0;
2418
313k
        }
2419
        /*Estimate the cost of coding this MB in a keyframe.*/
2420
411k
        if(_allow_keyframe){
2421
411k
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2422
411k
           _enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
2423
411k
          intrabits+=modes[OC_MODE_INTRA].rate;
2424
2.05M
          for(bi=0;bi<4;bi++){
2425
1.64M
            oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
2426
1.64M
             modes[OC_MODE_INTRA].qii[bi]);
2427
1.64M
          }
2428
411k
        }
2429
        /*Estimate the cost in a delta frame for various modes.*/
2430
411k
        oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd);
2431
411k
        if(sp_level<OC_SP_LEVEL_NOMC){
2432
411k
          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
2433
411k
           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2434
411k
           skip_ssd,rd_scale);
2435
411k
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2436
411k
           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
2437
411k
          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
2438
411k
           OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
2439
411k
           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2440
411k
          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
2441
411k
           OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2442
411k
           skip_ssd,rd_scale);
2443
411k
          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
2444
411k
           OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2445
411k
           skip_ssd,rd_scale);
2446
411k
          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
2447
411k
           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2448
411k
           skip_ssd,rd_scale);
2449
411k
          mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
2450
411k
           OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
2451
411k
           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2452
          /*The explicit MV modes (2,6,7) have not yet gone through halfpel
2453
             refinement.
2454
            We choose the explicit MV mode that's already furthest ahead on
2455
             R-D cost and refine only that one.
2456
            We have to be careful to remember which ones we've refined so that
2457
             we don't refine them again if we re-encode this frame.*/
2458
411k
          inter_mv_pref=_enc->lambda*3;
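          /*(This bias means OC_MODE_INTER_MV is selected below even when it
             costs up to 3*lambda more than the best competing mode, except
             against LAST and LAST2, which zero the preference.)*/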
2459
411k
          if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
2460
411k
            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
2461
411k
             embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2462
411k
             skip_ssd,rd_scale);
2463
411k
          }
2464
0
          else{
2465
0
            modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX;
2466
0
          }
2467
411k
          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
2468
411k
           modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
2469
53.8k
            if(!(embs[mbi].refined&0x80)){
2470
41.8k
              oc_mcenc_refine4mv(_enc,mbi);
2471
41.8k
              embs[mbi].refined|=0x80;
2472
41.8k
            }
2473
53.8k
            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
2474
53.8k
             embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
2475
53.8k
             skip_ssd,rd_scale);
2476
53.8k
          }
2477
357k
          else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
2478
357k
           modes[OC_MODE_INTER_MV].cost){
2479
100k
            if(!(embs[mbi].refined&0x40)){
2480
86.6k
              oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
2481
86.6k
              embs[mbi].refined|=0x40;
2482
86.6k
            }
2483
100k
            mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
2484
100k
             OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
2485
100k
             _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2486
100k
          }
2487
411k
          if(!(embs[mbi].refined&0x04)){
2488
313k
            oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
2489
313k
            embs[mbi].refined|=0x04;
2490
313k
          }
2491
411k
          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
2492
411k
           OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
2493
411k
           _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
2494
          /*Finally, pick the mode with the cheapest estimated R-D cost.*/
2495
411k
          mb_mode=OC_MODE_INTER_NOMV;
2496
411k
          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
2497
319k
            mb_mode=OC_MODE_INTRA;
2498
319k
          }
2499
411k
          if(modes[OC_MODE_INTER_MV_LAST].cost<modes[mb_mode].cost){
2500
61.9k
            mb_mode=OC_MODE_INTER_MV_LAST;
2501
61.9k
          }
2502
411k
          if(modes[OC_MODE_INTER_MV_LAST2].cost<modes[mb_mode].cost){
2503
10.9k
            mb_mode=OC_MODE_INTER_MV_LAST2;
2504
10.9k
          }
2505
411k
          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
2506
16.5k
            mb_mode=OC_MODE_GOLDEN_NOMV;
2507
16.5k
          }
2508
411k
          if(modes[OC_MODE_GOLDEN_MV].cost<modes[mb_mode].cost){
2509
33.0k
            mb_mode=OC_MODE_GOLDEN_MV;
2510
33.0k
          }
2511
411k
          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[mb_mode].cost){
2512
17.0k
            mb_mode=OC_MODE_INTER_MV_FOUR;
2513
17.0k
          }
2514
          /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
2515
411k
          if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
2516
53.6k
            inter_mv_pref=0;
2517
53.6k
          }
2518
411k
          if(modes[OC_MODE_INTER_MV].cost<modes[mb_mode].cost+inter_mv_pref){
2519
31.0k
            mb_mode=OC_MODE_INTER_MV;
2520
31.0k
          }
2521
411k
        }
2522
0
        else{
2523
0
          oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
2524
0
           OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2525
0
           skip_ssd,rd_scale);
2526
0
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
2527
0
           _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
2528
0
          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
2529
0
           OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
2530
0
           skip_ssd,rd_scale);
2531
0
          mb_mode=OC_MODE_INTER_NOMV;
2532
0
          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
2533
0
            mb_mode=OC_MODE_INTRA;
2534
0
          }
2535
0
          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
2536
0
            mb_mode=OC_MODE_GOLDEN_NOMV;
2537
0
          }
2538
0
          mb_mv_bits_0=mb_gmv_bits_0=0;
2539
0
        }
2540
411k
        mb_modes[mbi]=mb_mode;
2541
        /*Propagate the MVs to the luma blocks.*/
2542
411k
        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
2543
401k
          switch(mb_mode){
2544
31.0k
            case OC_MODE_INTER_MV:{
2545
31.0k
              mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2546
31.0k
            }break;
2547
42.2k
            case OC_MODE_INTER_MV_LAST:mv=last_mv;break;
2548
7.46k
            case OC_MODE_INTER_MV_LAST2:mv=prior_mv;break;
2549
23.2k
            case OC_MODE_GOLDEN_MV:{
2550
23.2k
              mv=embs[mbi].analysis_mv[0][OC_FRAME_GOLD];
2551
23.2k
            }break;
2552
401k
          }
2553
2.00M
          for(bi=0;bi<4;bi++){
2554
1.60M
            fragi=mb_maps[mbi][0][bi];
2555
1.60M
            frag_mvs[fragi]=mv;
2556
1.60M
          }
2557
401k
        }
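        /*A single macro-block-level MV was just copied to all four luma
           fragments; 4MV is excluded because each of its luma blocks carries
           its own vector, set earlier when the 4MV candidate was evaluated.*/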
2558
2.05M
        for(bi=0;bi<4;bi++){
2559
1.64M
          fragi=sb_maps[mbi>>2][mbi&3][bi];
2560
1.64M
          frags[fragi].qii=modes[mb_mode].qii[bi];
2561
1.64M
        }
2562
411k
        if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi,
2563
411k
         modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){
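          /*A positive return means at least one luma fragment was actually
             coded; quantization may also have demoted the MB's mode, so it is
             re-read from mb_modes[mbi] below.*/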
2564
346k
          int orig_mb_mode;
2565
346k
          orig_mb_mode=mb_mode;
2566
346k
          mb_mode=mb_modes[mbi];
2567
346k
          refi=OC_FRAME_FOR_MODE(mb_mode);
2568
346k
          switch(mb_mode){
2569
18.5k
            case OC_MODE_INTER_MV:{
2570
18.5k
              prior_mv=last_mv;
2571
              /*If we're backing out from 4MV, find the MV we're actually
2572
                 using.*/
2573
18.5k
              if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
2574
186
                for(bi=0;;bi++){
2575
186
                  fragi=mb_maps[mbi][0][bi];
2576
186
                  if(frags[fragi].coded){
2577
66
                    mv=last_mv=frag_mvs[fragi];
2578
66
                    break;
2579
66
                  }
2580
186
                }
2581
66
                mb_mv_bits_0=OC_MV_BITS[0][OC_MV_X(mv)+31]
2582
66
                 +OC_MV_BITS[0][OC_MV_Y(mv)+31];
2583
66
              }
2584
              /*Otherwise we used the original analysis MV.*/
2585
18.5k
              else last_mv=embs[mbi].analysis_mv[0][OC_FRAME_PREV];
2586
18.5k
              _enc->mv_bits[0]+=mb_mv_bits_0;
2587
18.5k
              _enc->mv_bits[1]+=12;
2588
18.5k
            }break;
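            /*mv_bits[0] accumulates the cost under Theora's VLC MV scheme and
               mv_bits[1] under the fixed-length scheme (6 bits per component,
               hence 12), letting the packer pick the cheaper scheme for the
               whole frame.*/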
2589
5.86k
            case OC_MODE_INTER_MV_LAST2:{
2590
5.86k
              oc_mv tmp_mv;
2591
5.86k
              tmp_mv=prior_mv;
2592
5.86k
              prior_mv=last_mv;
2593
5.86k
              last_mv=tmp_mv;
2594
5.86k
            }break;
2595
19.0k
            case OC_MODE_GOLDEN_MV:{
2596
19.0k
              _enc->mv_bits[0]+=mb_gmv_bits_0;
2597
19.0k
              _enc->mv_bits[1]+=12;
2598
19.0k
            }break;
2599
7.75k
            case OC_MODE_INTER_MV_FOUR:{
2600
7.75k
              oc_mv lbmvs[4];
2601
7.75k
              oc_mv cbmvs[4];
2602
7.75k
              prior_mv=last_mv;
2603
38.7k
              for(bi=0;bi<4;bi++){
2604
31.0k
                fragi=mb_maps[mbi][0][bi];
2605
31.0k
                if(frags[fragi].coded){
2606
29.5k
                  lbmvs[bi]=last_mv=frag_mvs[fragi];
2607
29.5k
                  _enc->mv_bits[0]+=OC_MV_BITS[0][OC_MV_X(last_mv)+31]
2608
29.5k
                   +OC_MV_BITS[0][OC_MV_Y(last_mv)+31];
2609
29.5k
                  _enc->mv_bits[1]+=12;
2610
29.5k
                }
2611
                /*Replace the block MVs for not-coded blocks with (0,0).*/
2612
1.48k
                else lbmvs[bi]=0;
2613
31.0k
              }
2614
7.75k
              (*set_chroma_mvs)(cbmvs,lbmvs);
2615
26.8k
              for(mapii=4;mapii<nmap_idxs;mapii++){
2616
19.0k
                mapi=map_idxs[mapii];
2617
19.0k
                pli=mapi>>2;
2618
19.0k
                bi=mapi&3;
2619
19.0k
                fragi=mb_maps[mbi][pli][bi];
2620
19.0k
                frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
2621
19.0k
                frags[fragi].refi=refi;
2622
19.0k
                frags[fragi].mb_mode=mb_mode;
2623
19.0k
                frag_mvs[fragi]=cbmvs[bi];
2624
19.0k
              }
2625
7.75k
            }break;
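            /*set_chroma_mvs is the pixel-format-specific hook that derives
               the chroma fragments' MVs from the four luma vectors (for
               4:2:0, a rounded average), which is why 4MV skips the generic
               chroma propagation below.*/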
2626
346k
          }
2627
346k
          coded_mbis[ncoded_mbis++]=mbi;
2628
346k
          oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
2629
346k
          interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
2630
346k
        }
2631
65.1k
        else{
2632
65.1k
          *(uncoded_mbis-++nuncoded_mbis)=mbi;
2633
65.1k
          mb_mode=OC_MODE_INTER_NOMV;
2634
65.1k
          refi=OC_FRAME_PREV;
2635
65.1k
          mv=0;
2636
65.1k
        }
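        /*Uncoded MBs are written backward from uncoded_mbis while coded MBs
           are appended forward to coded_mbis, so the two lists can share a
           single allocation.*/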
2637
        /*Propagate final MB mode and MVs to the chroma blocks.
2638
          This has already been done for 4MV mode, since it requires individual
2639
           block motion vectors.*/
2640
411k
        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
2641
1.39M
          for(mapii=4;mapii<nmap_idxs;mapii++){
2642
987k
            mapi=map_idxs[mapii];
2643
987k
            pli=mapi>>2;
2644
987k
            bi=mapi&3;
2645
987k
            fragi=mb_maps[mbi][pli][bi];
2646
            /*If we switched from 4MV mode to INTER_MV mode, then the qii
2647
               values won't have been chosen with the right MV, but it's
2648
               probably not worth re-estimating them.*/
2649
987k
            frags[fragi].qii=modes[mb_mode].qii[mapii];
2650
987k
            frags[fragi].refi=refi;
2651
987k
            frags[fragi].mb_mode=mb_mode;
2652
987k
            frag_mvs[fragi]=mv;
2653
987k
          }
2654
403k
        }
2655
        /*Save masking scale factors for chroma blocks.*/
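        /*Shift binds looser than subtraction, so the bound parses as
           ((nmap_idxs-4)>>1)+4: the loop visits only the first chroma plane's
           map entries, and rd_scale[4] holds the single scale shared by both
           chroma planes.*/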
2656
914k
        for(mapii=4;mapii<(nmap_idxs-4>>1)+4;mapii++){
2657
503k
          mapi=map_idxs[mapii];
2658
503k
          bi=mapi&3;
2659
503k
          fragi=mb_maps[mbi][1][bi];
2660
503k
          mcu_rd_scale[fragi-cfroffset]=(ogg_uint16_t)rd_scale[4];
2661
503k
          mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
2662
503k
        }
2663
411k
      }
2664
123k
      oc_fr_state_flush_sb(_enc->pipe.fr+0);
2665
123k
      sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full;
2666
123k
      sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial;
2667
123k
    }
2668
42.4k
    oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
2669
    /*Code chroma planes.*/
2670
127k
    for(pli=1;pli<3;pli++){
2671
84.9k
      oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe,
2672
84.9k
       pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
2673
84.9k
      oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
2674
84.9k
    }
2675
42.4k
    notstart=1;
2676
42.4k
  }
2677
  /*Update the average block activity and MB luma score for the frame.
2678
    We could use a Bessel follower here, but fast reaction is probably almost
2679
     always best.*/
2680
39.4k
  _enc->activity_avg=OC_MAXI(OC_ACTIVITY_AVG_MIN,
2681
39.4k
   (unsigned)((activity_sum+(_enc->state.fplanes[0].nfrags>>1))/
2682
39.4k
   _enc->state.fplanes[0].nfrags));
2683
39.4k
  _enc->luma_avg=(unsigned)((luma_sum+(_enc->state.nmbs>>1))/_enc->state.nmbs);
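  /*In both averages, adding half the divisor before dividing rounds to the
     nearest integer instead of truncating.*/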
2684
  /*Finish filling in the reference frame borders.*/
2685
39.4k
  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
2686
157k
  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
2687
  /*Finish adding flagging overhead costs to inter bit counts to determine if
2688
     we should have coded a key frame instead.*/
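  /*Returning 1 tells the caller to discard this analysis and re-encode the
     frame as a key frame; intrabits was accumulated from the per-MB
     OC_MODE_INTRA rate estimates above.*/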
2689
39.4k
  if(_allow_keyframe){
2690
    /*Technically the chroma plane counts are over-estimations, because they
2691
       don't account for continuing runs from the luma planes, but the
2692
       inaccuracy is small.
2693
      We don't need to add the luma plane coding flag costs, because they are
2694
       already included in the MB rate estimates.*/
2695
118k
    for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE;
2696
39.4k
    if(interbits>intrabits)return 1;
2697
39.4k
  }
2698
24.7k
  _enc->ncoded_mbis=ncoded_mbis;
2699
  /*Compact the coded fragment list.*/
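  /*Each plane's list was built at its own fragment-plane offset; the
     memmoves below pack all three into one contiguous run so later passes
     can walk a single array.*/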
2700
24.7k
  {
2701
24.7k
    ptrdiff_t ncoded_fragis;
2702
24.7k
    ncoded_fragis=_enc->state.ncoded_fragis[0];
2703
74.1k
    for(pli=1;pli<3;pli++){
2704
49.4k
      memmove(_enc->state.coded_fragis+ncoded_fragis,
2705
49.4k
       _enc->state.coded_fragis+_enc->state.fplanes[pli].froffset,
2706
49.4k
       _enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis));
2707
49.4k
      ncoded_fragis+=_enc->state.ncoded_fragis[pli];
2708
49.4k
    }
2709
24.7k
    _enc->state.ntotal_coded_fragis=ncoded_fragis;
2710
24.7k
  }
2711
24.7k
  return 0;
2712
39.4k
}
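The mode decision above reduces to an argmin over per-mode rate-distortion
cost estimates, with strict comparisons breaking ties toward earlier modes and
one bias term favoring OC_MODE_INTER_MV. A minimal standalone sketch of that
selection rule follows; pick_mode, MODE_INTER_MV, and the cost values are
illustrative stand-ins, not libtheora API:

#include <stdio.h>

#define N_MODES       8
#define MODE_INTER_MV 4  /*Illustrative index for the biased mode.*/

/*Return the index of the cheapest mode.
  Strict '<' keeps the earliest mode on ties; the biased mode is checked last
   and wins whenever it is within _pref scaled bits of the best.*/
static int pick_mode(const unsigned _cost[N_MODES],unsigned _pref){
  int mode;
  int mi;
  mode=0;
  for(mi=1;mi<N_MODES;mi++){
    if(mi==MODE_INTER_MV)continue;
    if(_cost[mi]<_cost[mode])mode=mi;
  }
  if(_cost[MODE_INTER_MV]<_cost[mode]+_pref)mode=MODE_INTER_MV;
  return mode;
}

int main(void){
  /*Fabricated costs: mode 4 is two bits worse than mode 2, but the bias of
     8 scaled bits still lets it win.*/
  static const unsigned cost[N_MODES]={90,80,64,75,66,99,88,70};
  printf("chosen mode: %d\n",pick_mode(cost,8));
  return 0;
}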