Coverage Report

Created: 2024-09-06 07:53

/src/theora/lib/mcenc.c
Line
Count
Source (jump to first uncovered line)
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function:
14
  last mod: $Id$
15
16
 ********************************************************************/
17
#include <stdlib.h>
18
#include <limits.h>
19
#include <string.h>
20
#include "encint.h"
21
22
23
24
typedef struct oc_mcenc_ctx           oc_mcenc_ctx;



/*Temporary state used for motion estimation.
  Slot 0 of candidates holds the median predictor, slots 1 through setb0-1
   hold set A, and slots setb0 through ncandidates-1 hold set B.*/
struct oc_mcenc_ctx{
  /*The candidate motion vectors, stored as (x,y) pairs.*/
  int                candidates[13][2];
  /*The start of the Set B candidates.*/
  int                setb0;
  /*The total number of candidates.*/
  int                ncandidates;
};
37
38
39
40
/*The maximum Y plane SAD value for accepting the median predictor.*/
#define OC_YSAD_THRESH1            (256)
/*The amount to right shift the minimum error by when inflating it for
   computing the second maximum Y plane SAD threshold.*/
#define OC_YSAD_THRESH2_SCALE_BITS (4)
/*The amount to add to the second maximum Y plane threshold when inflating
   it.*/
#define OC_YSAD_THRESH2_OFFSET     (64)
48
49
/*The vector offsets in the X direction for each search site in the square
   pattern.
  Sites are numbered 0 through 8 in raster order; site 4 is the center.*/
static const int OC_SQUARE_DX[9]={-1,0,1,-1,0,1,-1,0,1};
/*The vector offsets in the Y direction for each search site in the square
   pattern.*/
static const int OC_SQUARE_DY[9]={-1,-1,-1,0,0,0,1,1,1};
/*The number of sites to search for each boundary condition in the square
   pattern.
  Bit flags for the boundary conditions are as follows:
  1: -16==dx
  2:      dx==15(.5)
  4: -16==dy
  8:      dy==15(.5)
  Combinations that set both flags of one axis (3 and 7) cannot occur and
   have no sites.*/
static const int OC_SQUARE_NSITES[11]={8,5,5,0,5,3,3,0,5,3,3};
/*The list of sites to search for each boundary condition in the square
   pattern.
  Only the first OC_SQUARE_NSITES[b] entries of row b are meaningful.*/
static const int OC_SQUARE_SITES[11][8]={
  /* -15.5<dx<15(.5),   -15.5<dy<15(.5)*/
  {0,1,2,3,5,6,7,8},
  /*-15.5==dx,          -15.5<dy<15(.5)*/
  {1,2,5,7,8},
  /*     dx==15(.5),    -15.5<dy<15(.5)*/
  {0,1,3,6,7},
  /*-15.5==dx==15(.5),  -15.5<dy<15(.5)*/
  {-1},
  /* -15.5<dx<15(.5),  -15.5==dy*/
  {3,5,6,7,8},
  /*-15.5==dx,         -15.5==dy*/
  {5,7,8},
  /*     dx==15(.5),   -15.5==dy*/
  {3,6,7},
  /*-15.5==dx==15(.5), -15.5==dy*/
  {-1},
  /* -15.5<dx<15(.5),         dy==15(.5)*/
  {0,1,2,3,5},
  /*-15.5==dx,                dy==15(.5)*/
  {1,2,5},
  /*       dx==15(.5),        dy==15(.5)*/
  {0,1,3}
};
89
90
91
static void oc_mcenc_find_candidates_a(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
92
627k
 oc_mv _accum,int _mbi,int _frame){
93
627k
  oc_mb_enc_info *embs;
94
627k
  int             accum_x;
95
627k
  int             accum_y;
96
627k
  int             a[3][2];
97
627k
  int             ncandidates;
98
627k
  unsigned        nmbi;
99
627k
  int             i;
100
627k
  embs=_enc->mb_info;
101
  /*Skip a position to store the median predictor in.*/
102
627k
  ncandidates=1;
103
627k
  if(embs[_mbi].ncneighbors>0){
104
    /*Fill in the first part of set A: the vectors from adjacent blocks.*/
105
1.79M
    for(i=0;i<embs[_mbi].ncneighbors;i++){
106
1.23M
      nmbi=embs[_mbi].cneighbors[i];
107
1.23M
      _mcenc->candidates[ncandidates][0]=
108
1.23M
       OC_MV_X(embs[nmbi].analysis_mv[0][_frame]);
109
1.23M
      _mcenc->candidates[ncandidates][1]=
110
1.23M
       OC_MV_Y(embs[nmbi].analysis_mv[0][_frame]);
111
1.23M
      ncandidates++;
112
1.23M
    }
113
551k
  }
114
627k
  accum_x=OC_MV_X(_accum);
115
627k
  accum_y=OC_MV_Y(_accum);
116
  /*Add a few additional vectors to set A: the vectors used in the previous
117
     frames and the (0,0) vector.*/
118
627k
  _mcenc->candidates[ncandidates][0]=accum_x;
119
627k
  _mcenc->candidates[ncandidates][1]=accum_y;
120
627k
  ncandidates++;
121
627k
  _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
122
627k
   OC_MV_X(embs[_mbi].analysis_mv[1][_frame])+accum_x,31);
123
627k
  _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
124
627k
   OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])+accum_y,31);
125
627k
  ncandidates++;
126
627k
  _mcenc->candidates[ncandidates][0]=0;
127
627k
  _mcenc->candidates[ncandidates][1]=0;
128
627k
  ncandidates++;
129
  /*Use the first three vectors of set A to find our best predictor: their
130
     median.*/
131
627k
  memcpy(a,_mcenc->candidates+1,sizeof(a));
132
627k
  OC_SORT2I(a[0][0],a[1][0]);
133
627k
  OC_SORT2I(a[0][1],a[1][1]);
134
627k
  OC_SORT2I(a[1][0],a[2][0]);
135
627k
  OC_SORT2I(a[1][1],a[2][1]);
136
627k
  OC_SORT2I(a[0][0],a[1][0]);
137
627k
  OC_SORT2I(a[0][1],a[1][1]);
138
627k
  _mcenc->candidates[0][0]=a[1][0];
139
627k
  _mcenc->candidates[0][1]=a[1][1];
140
627k
  _mcenc->setb0=ncandidates;
141
627k
}
142
143
static void oc_mcenc_find_candidates_b(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
144
135k
 oc_mv _accum,int _mbi,int _frame){
145
135k
  oc_mb_enc_info *embs;
146
135k
  int             accum_x;
147
135k
  int             accum_y;
148
135k
  int             ncandidates;
149
135k
  embs=_enc->mb_info;
150
135k
  accum_x=OC_MV_X(_accum);
151
135k
  accum_y=OC_MV_Y(_accum);
152
  /*Fill in set B: accelerated predictors for this and adjacent macro blocks.*/
153
135k
  ncandidates=_mcenc->setb0;
154
  /*Use only the current block. Using more did not appear to be helpful
155
    with the current selection logic due to escaping the local search too
156
    quickly.*/
157
135k
  _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
158
135k
   2*OC_MV_X(embs[_mbi].analysis_mv[1][_frame])
159
135k
   -OC_MV_X(embs[_mbi].analysis_mv[2][_frame])+accum_x,31);
160
135k
  _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
161
135k
   2*OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])
162
135k
   -OC_MV_Y(embs[_mbi].analysis_mv[2][_frame])+accum_y,31);
163
135k
  ncandidates++;
164
135k
  _mcenc->ncandidates=ncandidates;
165
135k
}
166
167
/*Sums oc_enc_frag_sad2_thresh() over the four blocks of a macro block, each
   measured against the pair of reference positions given by the two offsets.
  _enc:           The encoder context, passed through to the SAD routine.
  _frag_buf_offs: The buffer offset of each fragment.
  _fragis:        The indices of the four fragments in the macro block.
  _mvoffset0:     The offset of the first reference position.
  _mvoffset1:     The offset of the second reference position.
  _src:           The base of the source plane.
  _ref:           The base of the reference plane.
  _ystride:       The stride passed to the SAD routine.
  _best_err:      The best error so far, used to derive the threshold.
  Return: The accumulated error.*/
static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc,
 const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
 int _mvoffset0,int _mvoffset1,const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _best_err){
  unsigned err;
  int      bi;
  err=0;
  for(bi=0;bi<4;bi++){
    ptrdiff_t frag_offs;
    frag_offs=_frag_buf_offs[_fragis[bi]];
    /*NOTE(review): _best_err-err is unsigned, so it wraps to a very large
       threshold once err exceeds _best_err; confirm that is acceptable to
       oc_enc_frag_sad2_thresh().*/
    err+=oc_enc_frag_sad2_thresh(_enc,_src+frag_offs,_ref+frag_offs+_mvoffset0,
     _ref+frag_offs+_mvoffset1,_ystride,_best_err-err);
  }
  return err;
}
182
183
/*Sums oc_enc_frag_satd2() plus the magnitude of the DC value it reports over
   the four blocks of a macro block.
  The parameters mirror oc_sad16_halfpel() so the two can be called
   interchangeably; _best_err is accepted but not used here.
  Return: The accumulated error.*/
static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc,
 const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
 int _mvoffset0,int _mvoffset1,const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _best_err){
  unsigned err;
  int      dc;
  int      bi;
  err=0;
  for(bi=0;bi<4;bi++){
    ptrdiff_t frag_offs;
    frag_offs=_frag_buf_offs[_fragis[bi]];
    err+=oc_enc_frag_satd2(_enc,&dc,_src+frag_offs,
     _ref+frag_offs+_mvoffset0,_ref+frag_offs+_mvoffset1,_ystride);
    /*The DC term is returned separately; fold its magnitude into the error.*/
    err+=abs(dc);
  }
  return err;
}
200
201
/*Computes the SAD of each of the four blocks of a macro block for one
   candidate displacement.
  _enc:           The encoder context, passed through to the SAD routine.
  _frag_buf_offs: The buffer offset of each fragment.
  _fragis:        The indices of the four fragments in the macro block.
  _dx:            The horizontal displacement, in pixels.
  _dy:            The vertical displacement, in rows of _ystride bytes.
  _src:           The base of the source plane.
  _ref:           The base of the reference plane.
  _ystride:       The stride of both planes.
  _block_err:     Receives the SAD of each of the four blocks.
  Return: The sum of the four block SADs.*/
static unsigned oc_mcenc_ysad_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
 const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy,
 const unsigned char *_src,const unsigned char *_ref,int _ystride,
 unsigned _block_err[4]){
  unsigned err;
  int      mvoffset;
  int      bi;
  mvoffset=_dx+_dy*_ystride;
  err=0;
  for(bi=0;bi<4;bi++){
    ptrdiff_t frag_offs;
    unsigned  block_err;
    frag_offs=_frag_buf_offs[_fragis[bi]];
    block_err=oc_enc_frag_sad(_enc,
     _src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
    _block_err[bi]=block_err;
    err+=block_err;
  }
  return err;
}
221
222
static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
223
 const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy,
224
627k
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
225
627k
  int mvoffset;
226
627k
  int err;
227
627k
  int bi;
228
627k
  mvoffset=_dx+_dy*_ystride;
229
627k
  err=0;
230
3.13M
  for(bi=0;bi<4;bi++){
231
2.51M
    ptrdiff_t frag_offs;
232
2.51M
    int       dc;
233
2.51M
    frag_offs=_frag_buf_offs[_fragis[bi]];
234
2.51M
    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
235
2.51M
      err+=oc_enc_frag_satd(_enc,&dc,
236
2.51M
       _src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
237
2.51M
      err+=abs(dc);
238
2.51M
    }
239
0
    else{
240
0
      err+=oc_enc_frag_sad(_enc,
241
0
       _src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
242
0
    }
243
2.51M
  }
244
627k
  return err;
245
627k
}
246
247
/*Computes the SATD plus DC magnitude of a single block for one candidate
   displacement.
  _enc:       The encoder context, passed through to the SATD routine.
  _frag_offs: The buffer offset of the block.
  _dx:        The horizontal displacement, in pixels.
  _dy:        The vertical displacement, in rows of _ystride bytes.
  _src:       The base of the source plane.
  _ref:       The base of the reference plane.
  _ystride:   The stride of both planes.
  Return: The block error.*/
static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc,
 ptrdiff_t _frag_offs,int _dx,int _dy,
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  unsigned err;
  int      dc;
  err=oc_enc_frag_satd(_enc,&dc,
   _src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride);
  return err+abs(dc);
}
256
257
/*Perform a motion vector search for this macro block against a single
258
   reference frame.
259
  As a bonus, individual block motion vectors are computed as well, as much of
260
   the work can be shared.
261
  The actual motion vector is stored in the appropriate place in the
262
   oc_mb_enc_info structure.
263
  _accum:      Drop frame/golden MV accumulators.
264
  _mbi:        The macro block index.
265
  _frame:      The frame to use for SATD calculations and refinement,
266
                either OC_FRAME_PREV or OC_FRAME_GOLD.
267
  _frame_full: The frame to perform the 1px search on, one of OC_FRAME_PREV,
268
                OC_FRAME_GOLD, OC_FRAME_PREV_ORIG, or OC_FRAME_GOLD_ORIG.*/
269
void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame,
270
627k
 int _frame_full){
271
  /*Note: Traditionally this search is done using a rate-distortion objective
272
     function of the form D+lambda*R.
273
    However, xiphmont tested this and found it produced a small degredation,
274
     while requiring extra computation.
275
    This is most likely due to Theora's peculiar MV encoding scheme: MVs are
276
     not coded relative to a predictor, and the only truly cheap way to use a
277
     MV is in the LAST or LAST2 MB modes, which are not being considered here.
278
    Therefore if we use the MV found here, it's only because both LAST and
279
     LAST2 performed poorly, and therefore the MB is not likely to be uniform
280
     or suffer from the aperture problem.
281
    Furthermore we would like to re-use the MV found here for as many MBs as
282
     possible, so picking a slightly sub-optimal vector to save a bit or two
283
     may cause increased degredation in many blocks to come.
284
    We could artificially reduce lambda to compensate, but it's faster to just
285
     disable it entirely, and use D (the distortion) as the sole criterion.*/
286
627k
  oc_mcenc_ctx         mcenc;
287
627k
  const ptrdiff_t     *frag_buf_offs;
288
627k
  const ptrdiff_t     *fragis;
289
627k
  const unsigned char *src;
290
627k
  const unsigned char *ref;
291
627k
  const unsigned char *satd_ref;
292
627k
  int                  ystride;
293
627k
  oc_mb_enc_info      *embs;
294
627k
  ogg_int32_t          hit_cache[31];
295
627k
  ogg_int32_t          hitbit;
296
627k
  unsigned             best_block_err[4];
297
627k
  unsigned             block_err[4];
298
627k
  unsigned             best_err;
299
627k
  int                  best_vec[2];
300
627k
  int                  best_block_vec[4][2];
301
627k
  int                  candx;
302
627k
  int                  candy;
303
627k
  int                  bi;
304
627k
  embs=_enc->mb_info;
305
  /*Find some candidate motion vectors.*/
306
627k
  oc_mcenc_find_candidates_a(_enc,&mcenc,_accum,_mbi,_frame);
307
  /*Clear the cache of locations we've examined.*/
308
627k
  memset(hit_cache,0,sizeof(hit_cache));
309
  /*Start with the median predictor.*/
310
627k
  candx=OC_DIV2(mcenc.candidates[0][0]);
311
627k
  candy=OC_DIV2(mcenc.candidates[0][1]);
312
627k
  hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15;
313
627k
  frag_buf_offs=_enc->state.frag_buf_offs;
314
627k
  fragis=_enc->state.mb_maps[_mbi][0];
315
627k
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
316
627k
  ref=_enc->state.ref_frame_data[_frame_full];
317
627k
  satd_ref=_enc->state.ref_frame_data[_frame];
318
627k
  ystride=_enc->state.ref_ystride[0];
319
  /*TODO: customize error function for speed/(quality+size) tradeoff.*/
320
627k
  best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
321
627k
   frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
322
627k
  best_vec[0]=candx;
323
627k
  best_vec[1]=candy;
324
627k
  if(_frame==OC_FRAME_PREV){
325
1.56M
    for(bi=0;bi<4;bi++){
326
1.25M
      best_block_err[bi]=block_err[bi];
327
1.25M
      best_block_vec[bi][0]=candx;
328
1.25M
      best_block_vec[bi][1]=candy;
329
1.25M
    }
330
313k
  }
331
  /*If this predictor fails, move on to set A.*/
332
627k
  if(best_err>OC_YSAD_THRESH1){
333
559k
    unsigned err;
334
559k
    unsigned t2;
335
559k
    int      ncs;
336
559k
    int      ci;
337
    /*Compute the early termination threshold for set A.*/
338
559k
    t2=embs[_mbi].error[_frame];
339
559k
    ncs=OC_MINI(3,embs[_mbi].ncneighbors);
340
1.58M
    for(ci=0;ci<ncs;ci++){
341
1.02M
      t2=OC_MAXI(t2,embs[embs[_mbi].cneighbors[ci]].error[_frame]);
342
1.02M
    }
343
559k
    t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET;
344
    /*Examine the candidates in set A.*/
345
3.34M
    for(ci=1;ci<mcenc.setb0;ci++){
346
2.78M
      candx=OC_DIV2(mcenc.candidates[ci][0]);
347
2.78M
      candy=OC_DIV2(mcenc.candidates[ci][1]);
348
      /*If we've already examined this vector, then we would be using it if it
349
         was better than what we are using.*/
350
2.78M
      hitbit=(ogg_int32_t)1<<candx+15;
351
2.78M
      if(hit_cache[candy+15]&hitbit)continue;
352
1.06M
      hit_cache[candy+15]|=hitbit;
353
1.06M
      err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
354
1.06M
       frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
355
1.06M
      if(err<best_err){
356
383k
        best_err=err;
357
383k
        best_vec[0]=candx;
358
383k
        best_vec[1]=candy;
359
383k
      }
360
1.06M
      if(_frame==OC_FRAME_PREV){
361
2.31M
        for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
362
687k
          best_block_err[bi]=block_err[bi];
363
687k
          best_block_vec[bi][0]=candx;
364
687k
          best_block_vec[bi][1]=candy;
365
687k
        }
366
462k
      }
367
1.06M
    }
368
559k
    if(best_err>t2){
369
135k
      oc_mcenc_find_candidates_b(_enc,&mcenc,_accum,_mbi,_frame);
370
      /*Examine the candidates in set B.*/
371
271k
      for(;ci<mcenc.ncandidates;ci++){
372
135k
        candx=OC_DIV2(mcenc.candidates[ci][0]);
373
135k
        candy=OC_DIV2(mcenc.candidates[ci][1]);
374
135k
        hitbit=(ogg_int32_t)1<<candx+15;
375
135k
        if(hit_cache[candy+15]&hitbit)continue;
376
59.5k
        hit_cache[candy+15]|=hitbit;
377
59.5k
        err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
378
59.5k
         frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
379
59.5k
        if(err<best_err){
380
17.4k
          best_err=err;
381
17.4k
          best_vec[0]=candx;
382
17.4k
          best_vec[1]=candy;
383
17.4k
        }
384
59.5k
        if(_frame==OC_FRAME_PREV){
385
161k
          for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
386
38.2k
            best_block_err[bi]=block_err[bi];
387
38.2k
            best_block_vec[bi][0]=candx;
388
38.2k
            best_block_vec[bi][1]=candy;
389
38.2k
          }
390
32.3k
        }
391
59.5k
      }
392
      /*Use the same threshold for set B as in set A.*/
393
135k
      if(best_err>t2){
394
130k
        int best_site;
395
130k
        int nsites;
396
130k
        int sitei;
397
130k
        int site;
398
130k
        int b;
399
        /*Square pattern search.*/
400
531k
        for(;;){
401
531k
          best_site=4;
402
          /*Compose the bit flags for boundary conditions.*/
403
531k
          b=OC_DIV16(-best_vec[0]+1)|OC_DIV16(best_vec[0]+1)<<1|
404
531k
           OC_DIV16(-best_vec[1]+1)<<2|OC_DIV16(best_vec[1]+1)<<3;
405
531k
          nsites=OC_SQUARE_NSITES[b];
406
4.51M
          for(sitei=0;sitei<nsites;sitei++){
407
3.98M
            site=OC_SQUARE_SITES[b][sitei];
408
3.98M
            candx=best_vec[0]+OC_SQUARE_DX[site];
409
3.98M
            candy=best_vec[1]+OC_SQUARE_DY[site];
410
3.98M
            hitbit=(ogg_int32_t)1<<candx+15;
411
3.98M
            if(hit_cache[candy+15]&hitbit)continue;
412
2.32M
            hit_cache[candy+15]|=hitbit;
413
2.32M
            err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
414
2.32M
             frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
415
2.32M
            if(err<best_err){
416
636k
              best_err=err;
417
636k
              best_site=site;
418
636k
            }
419
2.32M
            if(_frame==OC_FRAME_PREV){
420
6.34M
              for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
421
795k
                best_block_err[bi]=block_err[bi];
422
795k
                best_block_vec[bi][0]=candx;
423
795k
                best_block_vec[bi][1]=candy;
424
795k
              }
425
1.26M
            }
426
2.32M
          }
427
531k
          if(best_site==4)break;
428
400k
          best_vec[0]+=OC_SQUARE_DX[best_site];
429
400k
          best_vec[1]+=OC_SQUARE_DY[best_site];
430
400k
        }
431
        /*Final 4-MV search.*/
432
        /*Simply use 1/4 of the macro block set A and B threshold as the
433
           individual block threshold.*/
434
130k
        if(_frame==OC_FRAME_PREV){
435
70.4k
          t2>>=2;
436
352k
          for(bi=0;bi<4;bi++){
437
281k
            if(best_block_err[bi]>t2){
438
              /*Square pattern search.
439
                We do this in a slightly interesting manner.
440
                We continue to check the SAD of all four blocks in the
441
                 macro block.
442
                This gives us two things:
443
                 1) We can continue to use the hit_cache to avoid duplicate
444
                     checks.
445
                    Otherwise we could continue to read it, but not write to it
446
                     without saving and restoring it for each block.
447
                    Note that we could still eliminate a large number of
448
                     duplicate checks by taking into account the site we came
449
                     from when choosing the site list.
450
                    We can still do that to avoid extra hit_cache queries, and
451
                     it might even be a speed win.
452
                 2) It gives us a slightly better chance of escaping local
453
                     minima.
454
                    We would not be here if we weren't doing a fairly bad job
455
                     in finding a good vector, and checking these vectors can
456
                     save us from 100 to several thousand points off our SAD 1
457
                     in 15 times.
458
                TODO: Is this a good idea?
459
                Who knows.
460
                It needs more testing.*/
461
257k
              for(;;){
462
257k
                int bestx;
463
257k
                int besty;
464
257k
                int bj;
465
257k
                bestx=best_block_vec[bi][0];
466
257k
                besty=best_block_vec[bi][1];
467
                /*Compose the bit flags for boundary conditions.*/
468
257k
                b=OC_DIV16(-bestx+1)|OC_DIV16(bestx+1)<<1|
469
257k
                 OC_DIV16(-besty+1)<<2|OC_DIV16(besty+1)<<3;
470
257k
                nsites=OC_SQUARE_NSITES[b];
471
2.21M
                for(sitei=0;sitei<nsites;sitei++){
472
1.95M
                  site=OC_SQUARE_SITES[b][sitei];
473
1.95M
                  candx=bestx+OC_SQUARE_DX[site];
474
1.95M
                  candy=besty+OC_SQUARE_DY[site];
475
1.95M
                  hitbit=(ogg_int32_t)1<<candx+15;
476
1.95M
                  if(hit_cache[candy+15]&hitbit)continue;
477
761k
                  hit_cache[candy+15]|=hitbit;
478
761k
                  err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
479
761k
                   frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
480
761k
                  if(err<best_err){
481
61.3k
                    best_err=err;
482
61.3k
                    best_vec[0]=candx;
483
61.3k
                    best_vec[1]=candy;
484
61.3k
                  }
485
3.80M
                  for(bj=0;bj<4;bj++)if(block_err[bj]<best_block_err[bj]){
486
315k
                    best_block_err[bj]=block_err[bj];
487
315k
                    best_block_vec[bj][0]=candx;
488
315k
                    best_block_vec[bj][1]=candy;
489
315k
                  }
490
761k
                }
491
257k
                if(best_block_vec[bi][0]==bestx&&best_block_vec[bi][1]==besty){
492
120k
                  break;
493
120k
                }
494
257k
              }
495
120k
            }
496
281k
          }
497
70.4k
        }
498
130k
      }
499
135k
    }
500
559k
  }
501
627k
  embs[_mbi].error[_frame]=(ogg_uint16_t)best_err;
502
627k
  candx=best_vec[0];
503
627k
  candy=best_vec[1];
504
627k
  embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc,
505
627k
   frag_buf_offs,fragis,candx,candy,src,satd_ref,ystride);
506
627k
  embs[_mbi].analysis_mv[0][_frame]=OC_MV(candx<<1,candy<<1);
507
627k
  if(_frame==OC_FRAME_PREV&&_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
508
1.56M
    for(bi=0;bi<4;bi++){
509
1.25M
      candx=best_block_vec[bi][0];
510
1.25M
      candy=best_block_vec[bi][1];
511
1.25M
      embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(_enc,
512
1.25M
       frag_buf_offs[fragis[bi]],candx,candy,src,satd_ref,ystride);
513
1.25M
      embs[_mbi].block_mv[bi]=OC_MV(candx<<1,candy<<1);
514
1.25M
    }
515
313k
  }
516
627k
}
517
518
313k
/*Runs the motion search for macro block _mbi against both the previous and
   the golden reference frames, ageing the stored analysis MV history by one
   frame first.
  _enc: The encoder context.
  _mbi: The macro block index.*/
void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi){
  oc_mv2 *mvs;
  oc_mv   accum_p;
  oc_mv   accum_g;
  oc_mv   mv2_p;
  mvs=_enc->mb_info[_mbi].analysis_mv;
  /*Only carry the previous vector forward when the last frame was dropped.*/
  if(_enc->prevframe_dropped)accum_p=mvs[0][OC_FRAME_PREV];
  else accum_p=0;
  accum_g=mvs[2][OC_FRAME_GOLD];
  /*Move the motion vector predictors back a frame.*/
  mv2_p=mvs[2][OC_FRAME_PREV];
  mvs[2][OC_FRAME_GOLD]=mvs[1][OC_FRAME_GOLD];
  mvs[2][OC_FRAME_PREV]=mvs[1][OC_FRAME_PREV];
  mvs[1][OC_FRAME_GOLD]=mvs[0][OC_FRAME_GOLD];
  mvs[1][OC_FRAME_PREV]=OC_MV_SUB(mvs[0][OC_FRAME_PREV],mv2_p);
  /*Search the last frame.*/
  oc_mcenc_search_frame(_enc,accum_p,_mbi,OC_FRAME_PREV,OC_FRAME_PREV_ORIG);
  mvs[2][OC_FRAME_PREV]=accum_p;
  /*GOLDEN MVs are different from PREV MVs in that they're each absolute
     offsets from some frame in the past rather than relative offsets from the
     frame before.
    For predictor calculation to make sense, we need them to be in the same
     form as PREV MVs.*/
  mvs[1][OC_FRAME_GOLD]=OC_MV_SUB(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]);
  mvs[2][OC_FRAME_GOLD]=OC_MV_SUB(mvs[2][OC_FRAME_GOLD],accum_g);
  /*Search the golden frame.*/
  oc_mcenc_search_frame(_enc,accum_g,_mbi,OC_FRAME_GOLD,OC_FRAME_GOLD_ORIG);
  /*Put GOLDEN MVs back into absolute offset form.
    The newest MV is already an absolute offset.*/
  mvs[2][OC_FRAME_GOLD]=OC_MV_ADD(mvs[2][OC_FRAME_GOLD],accum_g);
  mvs[1][OC_FRAME_GOLD]=OC_MV_ADD(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]);
}
550
551
#if 0
/*Disabled SAD-only variant of oc_mcenc_ysatd_halfpel_mbrefine().
  Kept compilable: it previously referenced an undeclared _framei.*/
static int oc_mcenc_ysad_halfpel_mbrefine(const oc_enc_ctx *_enc,int _mbi,
 int _vec[2],int _best_err,int _frame){
  const unsigned char *src;
  const unsigned char *ref;
  const ptrdiff_t     *frag_buf_offs;
  const ptrdiff_t     *fragis;
  int                  offset_y[9];
  int                  ystride;
  int                  mvoffset_base;
  int                  best_site;
  int                  sitei;
  int                  err;
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ref=_enc->state.ref_frame_data[_frame];
  frag_buf_offs=_enc->state.frag_buf_offs;
  fragis=_enc->state.mb_maps[_mbi][0];
  ystride=_enc->state.ref_ystride[0];
  mvoffset_base=_vec[0]+_vec[1]*ystride;
  /*Entry 4 (the center) is never read: site list 0 omits it.*/
  offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
  offset_y[3]=offset_y[5]=0;
  offset_y[6]=offset_y[7]=offset_y[8]=ystride;
  best_site=4;
  for(sitei=0;sitei<8;sitei++){
    int site;
    int xmask;
    int ymask;
    int dx;
    int dy;
    int mvoffset0;
    int mvoffset1;
    site=OC_SQUARE_SITES[0][sitei];
    dx=OC_SQUARE_DX[site];
    dy=OC_SQUARE_DY[site];
    /*The following code SHOULD be equivalent to
        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
      However, it should also be much faster, as it involves no multiplies and
       doesn't have to handle chroma vectors.
      The doubling is written as *2 because _vec may be negative, and
       left-shifting a negative value is undefined behavior in C.*/
    xmask=OC_SIGNMASK((_vec[0]*2+dx)^dx);
    ymask=OC_SIGNMASK((_vec[1]*2+dy)^dy);
    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
    err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis,
     mvoffset0,mvoffset1,src,ref,ystride,_best_err);
    if(err<_best_err){
      _best_err=err;
      best_site=site;
    }
  }
  _vec[0]=_vec[0]*2+OC_SQUARE_DX[best_site];
  _vec[1]=_vec[1]*2+OC_SQUARE_DY[best_site];
  return _best_err;
}
#endif
606
607
static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc,
608
399k
 int _mbi,int _vec[2],unsigned _best_err,int _frame){
609
399k
  const unsigned char *src;
610
399k
  const unsigned char *ref;
611
399k
  const ptrdiff_t     *frag_buf_offs;
612
399k
  const ptrdiff_t     *fragis;
613
399k
  int                  offset_y[9];
614
399k
  int                  ystride;
615
399k
  int                  mvoffset_base;
616
399k
  int                  best_site;
617
399k
  int                  sitei;
618
399k
  int                  err;
619
399k
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
620
399k
  ref=_enc->state.ref_frame_data[_frame];
621
399k
  frag_buf_offs=_enc->state.frag_buf_offs;
622
399k
  fragis=_enc->state.mb_maps[_mbi][0];
623
399k
  ystride=_enc->state.ref_ystride[0];
624
399k
  mvoffset_base=_vec[0]+_vec[1]*ystride;
625
399k
  offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
626
399k
  offset_y[3]=offset_y[5]=0;
627
399k
  offset_y[6]=offset_y[7]=offset_y[8]=ystride;
628
399k
  best_site=4;
629
3.59M
  for(sitei=0;sitei<8;sitei++){
630
3.19M
    int site;
631
3.19M
    int xmask;
632
3.19M
    int ymask;
633
3.19M
    int dx;
634
3.19M
    int dy;
635
3.19M
    int mvoffset0;
636
3.19M
    int mvoffset1;
637
3.19M
    site=OC_SQUARE_SITES[0][sitei];
638
3.19M
    dx=OC_SQUARE_DX[site];
639
3.19M
    dy=OC_SQUARE_DY[site];
640
    /*The following code SHOULD be equivalent to
641
        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
642
         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
643
      However, it should also be much faster, as it involves no multiplies and
644
       doesn't have to handle chroma vectors.*/
645
3.19M
    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
646
3.19M
    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
647
3.19M
    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
648
3.19M
    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
649
3.19M
    if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
650
3.19M
      err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis,
651
3.19M
       mvoffset0,mvoffset1,src,ref,ystride,_best_err);
652
3.19M
    }
653
0
    else{
654
0
      err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis,
655
0
           mvoffset0,mvoffset1,src,ref,ystride,_best_err);
656
0
    }
657
3.19M
    if(err<_best_err){
658
706k
      _best_err=err;
659
706k
      best_site=site;
660
706k
    }
661
3.19M
  }
662
399k
  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
663
399k
  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
664
399k
  return _best_err;
665
399k
}
666
667
399k
/*Refines the stored analysis MV of macro block _mbi for one reference frame
   and updates the stored SATD to match.
  _enc:   The encoder context.
  _mbi:   The macro block index.
  _frame: The reference frame index.*/
void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame){
  oc_mb_enc_info *embs;
  int             vec[2];
  embs=_enc->mb_info;
  /*The search stored the vector doubled; halve it before refining.*/
  vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].analysis_mv[0][_frame]));
  vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].analysis_mv[0][_frame]));
  embs[_mbi].satd[_frame]=oc_mcenc_ysatd_halfpel_mbrefine(_enc,
   _mbi,vec,embs[_mbi].satd[_frame],_frame);
  /*The refinement hands vec back already doubled.*/
  embs[_mbi].analysis_mv[0][_frame]=OC_MV(vec[0],vec[1]);
}
677
678
#if 0
/*Disabled SAD-only variant of oc_mcenc_ysatd_halfpel_brefine().
  Kept compilable: it previously referenced an undeclared ystride.*/
static int oc_mcenc_ysad_halfpel_brefine(const oc_enc_ctx *_enc,
 int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride,
 int _offset_y[9],unsigned _best_err){
  int mvoffset_base;
  int best_site;
  int sitei;
  mvoffset_base=_vec[0]+_vec[1]*_ystride;
  best_site=4;
  for(sitei=0;sitei<8;sitei++){
    unsigned err;
    int      site;
    int      xmask;
    int      ymask;
    int      dx;
    int      dy;
    int      mvoffset0;
    int      mvoffset1;
    site=OC_SQUARE_SITES[0][sitei];
    dx=OC_SQUARE_DX[site];
    dy=OC_SQUARE_DY[site];
    /*The following code SHOULD be equivalent to
        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
      However, it should also be much faster, as it involves no multiplies and
       doesn't have to handle chroma vectors.
      The doubling is written as *2 because _vec may be negative, and
       left-shifting a negative value is undefined behavior in C.*/
    xmask=OC_SIGNMASK((_vec[0]*2+dx)^dx);
    ymask=OC_SIGNMASK((_vec[1]*2+dy)^dy);
    mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
    mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
    err=oc_enc_frag_sad2_thresh(_enc,_src,
     _ref+mvoffset0,_ref+mvoffset1,_ystride,_best_err);
    if(err<_best_err){
      _best_err=err;
      best_site=site;
    }
  }
  _vec[0]=_vec[0]*2+OC_SQUARE_DX[best_site];
  _vec[1]=_vec[1]*2+OC_SQUARE_DY[best_site];
  return _best_err;
}
#endif
720
721
static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc,
722
 int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride,
723
167k
 int _offset_y[9],unsigned _best_err){
724
167k
  int mvoffset_base;
725
167k
  int best_site;
726
167k
  int sitei;
727
167k
  mvoffset_base=_vec[0]+_vec[1]*_ystride;
728
167k
  best_site=4;
729
1.50M
  for(sitei=0;sitei<8;sitei++){
730
1.34M
    unsigned err;
731
1.34M
    int      dc;
732
1.34M
    int      site;
733
1.34M
    int      xmask;
734
1.34M
    int      ymask;
735
1.34M
    int      dx;
736
1.34M
    int      dy;
737
1.34M
    int      mvoffset0;
738
1.34M
    int      mvoffset1;
739
1.34M
    site=OC_SQUARE_SITES[0][sitei];
740
1.34M
    dx=OC_SQUARE_DX[site];
741
1.34M
    dy=OC_SQUARE_DY[site];
742
    /*The following code SHOULD be equivalent to
743
        oc_state_get_mv_offsets(&_enc->state,&mvoffsets,0,
744
         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy);
745
      However, it should also be much faster, as it involves no multiplies and
746
       doesn't have to handle chroma vectors.*/
747
1.34M
    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
748
1.34M
    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
749
1.34M
    mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
750
1.34M
    mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
751
1.34M
    err=oc_enc_frag_satd2(_enc,&dc,_src,
752
1.34M
     _ref+mvoffset0,_ref+mvoffset1,_ystride);
753
1.34M
    err+=abs(dc);
754
1.34M
    if(err<_best_err){
755
336k
      _best_err=err;
756
336k
      best_site=site;
757
336k
    }
758
1.34M
  }
759
167k
  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
760
167k
  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
761
167k
  return _best_err;
762
167k
}
763
764
41.8k
/*Refines the four per-block vectors of macro block _mbi against
   OC_FRAME_PREV, updating block_satd and storing the results in ref_mv.
  _enc: The encoder context.
  _mbi: The macro block index.*/
void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){
  oc_mb_enc_info      *embs;
  const ptrdiff_t     *frag_buf_offs;
  const ptrdiff_t     *fragis;
  const unsigned char *src;
  const unsigned char *ref;
  int                  offset_y[9];
  int                  ystride;
  int                  bi;
  ystride=_enc->state.ref_ystride[0];
  frag_buf_offs=_enc->state.frag_buf_offs;
  fragis=_enc->state.mb_maps[_mbi][0];
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
  /*Vertical offset of each square pattern site.
    Entry 4 (the center) is left unset; the refinement never reads it.*/
  offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
  offset_y[3]=offset_y[5]=0;
  offset_y[6]=offset_y[7]=offset_y[8]=ystride;
  embs=_enc->mb_info;
  for(bi=0;bi<4;bi++){
    ptrdiff_t frag_offs;
    int       vec[2];
    frag_offs=frag_buf_offs[fragis[bi]];
    /*The search stored the vector doubled; halve it before refining.*/
    vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].block_mv[bi]));
    vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].block_mv[bi]));
    embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_halfpel_brefine(_enc,vec,
     src+frag_offs,ref+frag_offs,ystride,offset_y,embs[_mbi].block_satd[bi]);
    embs[_mbi].ref_mv[bi]=OC_MV(vec[0],vec[1]);
  }
}