Line | Count | Source (jump to first uncovered line) |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | | * by the Xiph.Org Foundation http://www.xiph.org/ * |
10 | | * * |
11 | | ******************************************************************** |
12 | | |
13 | | function: |
14 | | last mod: $Id$ |
15 | | |
16 | | ********************************************************************/ |
17 | | #include <stdlib.h> |
18 | | #include <limits.h> |
19 | | #include <string.h> |
20 | | #include "encint.h" |
21 | | |
22 | | |
23 | | |
typedef struct oc_mcenc_ctx oc_mcenc_ctx;



/*Temporary state used for motion estimation.
  oc_mcenc_search_frame() keeps one of these on its stack for the duration of
   a single macro block search.*/
struct oc_mcenc_ctx{
  /*The candidate motion vectors, stored as (x,y) pairs.
    Entry 0 holds the median predictor, Set A follows it, and Set B starts at
     index setb0.
    The search halves each component with OC_DIV2() before using it as a
     full-pel offset.*/
  int candidates[13][2];
  /*The start of the Set B candidates (one past the last Set A candidate).*/
  int setb0;
  /*The total number of candidates.
    This is only written by oc_mcenc_find_candidates_b().*/
  int ncandidates;
};
37 | | |
38 | | |
39 | | |
/*The maximum Y plane SAD value for accepting the median predictor.
  When the median predictor's SAD is above this, the Set A candidates are
   examined as well.*/
#define OC_YSAD_THRESH1            (256)
/*The amount to right shift the reference error by when inflating it for
   computing the second maximum Y plane SAD threshold.
  The reference error is the largest stored error among this macro block and
   up to three of its neighbors.*/
#define OC_YSAD_THRESH2_SCALE_BITS (4)
/*The amount to add to the second maximum Y plane threshold when inflating
   it.*/
#define OC_YSAD_THRESH2_OFFSET     (64)
48 | | |
/*The vector offsets in the X direction for each search site in the square
   pattern.
  Sites are numbered 0...8 in raster order over a 3x3 grid; site 4 is the
   center (no offset).*/
static const int OC_SQUARE_DX[9]={-1,0,1,-1,0,1,-1,0,1};
/*The vector offsets in the Y direction for each search site in the square
   pattern.*/
static const int OC_SQUARE_DY[9]={-1,-1,-1,0,0,0,1,1,1};
/*The number of sites to search for each boundary condition in the square
   pattern.
  Bit flags for the boundary conditions are as follows:
  1: dx is at its lower limit
  2: dx is at its upper limit, 15(.5)
  4: dy is at its lower limit
  8: dy is at its upper limit, 15(.5)
  Combinations that set both flags of one axis cannot occur and have a count
   of 0.
  NOTE(review): The comments in this file give the lower limit as both -16 and
   -15.5; the flags are composed from OC_DIV16(-v+1) and OC_DIV16(v+1), which
   presumably trigger at v==-15 and v==15 in full-pel units.
  Confirm against the definition of OC_DIV16.*/
static const int OC_SQUARE_NSITES[11]={8,5,5,0,5,3,3,0,5,3,3};
/*The list of sites to search for each boundary condition in the square
   pattern.
  Rows are indexed by the same bit flags as OC_SQUARE_NSITES, and only the
   first OC_SQUARE_NSITES[b] entries of row b are meaningful.
  Row 0 lists all eight non-center sites and is also used directly by the
   half-pel refinement routines.*/
static const int OC_SQUARE_SITES[11][8]={
  /* -15.5<dx<15(.5),  -15.5<dy<15(.5)*/
  {0,1,2,3,5,6,7,8},
  /*-15.5==dx,         -15.5<dy<15(.5)*/
  {1,2,5,7,8},
  /*      dx==15(.5),  -15.5<dy<15(.5)*/
  {0,1,3,6,7},
  /*-15.5==dx==15(.5), -15.5<dy<15(.5) (impossible)*/
  {-1},
  /* -15.5<dx<15(.5), -15.5==dy*/
  {3,5,6,7,8},
  /*-15.5==dx,        -15.5==dy*/
  {5,7,8},
  /*      dx==15(.5), -15.5==dy*/
  {3,6,7},
  /*-15.5==dx==15(.5), -15.5==dy (impossible)*/
  {-1},
  /* -15.5<dx<15(.5),       dy==15(.5)*/
  {0,1,2,3,5},
  /*-15.5==dx,              dy==15(.5)*/
  {1,2,5},
  /*      dx==15(.5),       dy==15(.5)*/
  {0,1,3}
};
89 | | |
90 | | |
91 | | static void oc_mcenc_find_candidates_a(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc, |
92 | 627k | oc_mv _accum,int _mbi,int _frame){ |
93 | 627k | oc_mb_enc_info *embs; |
94 | 627k | int accum_x; |
95 | 627k | int accum_y; |
96 | 627k | int a[3][2]; |
97 | 627k | int ncandidates; |
98 | 627k | unsigned nmbi; |
99 | 627k | int i; |
100 | 627k | embs=_enc->mb_info; |
101 | | /*Skip a position to store the median predictor in.*/ |
102 | 627k | ncandidates=1; |
103 | 627k | if(embs[_mbi].ncneighbors>0){ |
104 | | /*Fill in the first part of set A: the vectors from adjacent blocks.*/ |
105 | 1.79M | for(i=0;i<embs[_mbi].ncneighbors;i++){ |
106 | 1.23M | nmbi=embs[_mbi].cneighbors[i]; |
107 | 1.23M | _mcenc->candidates[ncandidates][0]= |
108 | 1.23M | OC_MV_X(embs[nmbi].analysis_mv[0][_frame]); |
109 | 1.23M | _mcenc->candidates[ncandidates][1]= |
110 | 1.23M | OC_MV_Y(embs[nmbi].analysis_mv[0][_frame]); |
111 | 1.23M | ncandidates++; |
112 | 1.23M | } |
113 | 551k | } |
114 | 627k | accum_x=OC_MV_X(_accum); |
115 | 627k | accum_y=OC_MV_Y(_accum); |
116 | | /*Add a few additional vectors to set A: the vectors used in the previous |
117 | | frames and the (0,0) vector.*/ |
118 | 627k | _mcenc->candidates[ncandidates][0]=accum_x; |
119 | 627k | _mcenc->candidates[ncandidates][1]=accum_y; |
120 | 627k | ncandidates++; |
121 | 627k | _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31, |
122 | 627k | OC_MV_X(embs[_mbi].analysis_mv[1][_frame])+accum_x,31); |
123 | 627k | _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31, |
124 | 627k | OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])+accum_y,31); |
125 | 627k | ncandidates++; |
126 | 627k | _mcenc->candidates[ncandidates][0]=0; |
127 | 627k | _mcenc->candidates[ncandidates][1]=0; |
128 | 627k | ncandidates++; |
129 | | /*Use the first three vectors of set A to find our best predictor: their |
130 | | median.*/ |
131 | 627k | memcpy(a,_mcenc->candidates+1,sizeof(a)); |
132 | 627k | OC_SORT2I(a[0][0],a[1][0]); |
133 | 627k | OC_SORT2I(a[0][1],a[1][1]); |
134 | 627k | OC_SORT2I(a[1][0],a[2][0]); |
135 | 627k | OC_SORT2I(a[1][1],a[2][1]); |
136 | 627k | OC_SORT2I(a[0][0],a[1][0]); |
137 | 627k | OC_SORT2I(a[0][1],a[1][1]); |
138 | 627k | _mcenc->candidates[0][0]=a[1][0]; |
139 | 627k | _mcenc->candidates[0][1]=a[1][1]; |
140 | 627k | _mcenc->setb0=ncandidates; |
141 | 627k | } |
142 | | |
143 | | static void oc_mcenc_find_candidates_b(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc, |
144 | 135k | oc_mv _accum,int _mbi,int _frame){ |
145 | 135k | oc_mb_enc_info *embs; |
146 | 135k | int accum_x; |
147 | 135k | int accum_y; |
148 | 135k | int ncandidates; |
149 | 135k | embs=_enc->mb_info; |
150 | 135k | accum_x=OC_MV_X(_accum); |
151 | 135k | accum_y=OC_MV_Y(_accum); |
152 | | /*Fill in set B: accelerated predictors for this and adjacent macro blocks.*/ |
153 | 135k | ncandidates=_mcenc->setb0; |
154 | | /*Use only the current block. Using more did not appear to be helpful |
155 | | with the current selection logic due to escaping the local search too |
156 | | quickly.*/ |
157 | 135k | _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31, |
158 | 135k | 2*OC_MV_X(embs[_mbi].analysis_mv[1][_frame]) |
159 | 135k | -OC_MV_X(embs[_mbi].analysis_mv[2][_frame])+accum_x,31); |
160 | 135k | _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31, |
161 | 135k | 2*OC_MV_Y(embs[_mbi].analysis_mv[1][_frame]) |
162 | 135k | -OC_MV_Y(embs[_mbi].analysis_mv[2][_frame])+accum_y,31); |
163 | 135k | ncandidates++; |
164 | 135k | _mcenc->ncandidates=ncandidates; |
165 | 135k | } |
166 | | |
167 | | static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc, |
168 | | const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4], |
169 | | int _mvoffset0,int _mvoffset1,const unsigned char *_src, |
170 | 0 | const unsigned char *_ref,int _ystride,unsigned _best_err){ |
171 | 0 | unsigned err; |
172 | 0 | int bi; |
173 | 0 | err=0; |
174 | 0 | for(bi=0;bi<4;bi++){ |
175 | 0 | ptrdiff_t frag_offs; |
176 | 0 | frag_offs=_frag_buf_offs[_fragis[bi]]; |
177 | 0 | err+=oc_enc_frag_sad2_thresh(_enc,_src+frag_offs,_ref+frag_offs+_mvoffset0, |
178 | 0 | _ref+frag_offs+_mvoffset1,_ystride,_best_err-err); |
179 | 0 | } |
180 | 0 | return err; |
181 | 0 | } |
182 | | |
183 | | static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc, |
184 | | const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4], |
185 | | int _mvoffset0,int _mvoffset1,const unsigned char *_src, |
186 | 3.19M | const unsigned char *_ref,int _ystride,unsigned _best_err){ |
187 | 3.19M | unsigned err; |
188 | 3.19M | int dc; |
189 | 3.19M | int bi; |
190 | 3.19M | err=0; |
191 | 15.9M | for(bi=0;bi<4;bi++){ |
192 | 12.7M | ptrdiff_t frag_offs; |
193 | 12.7M | frag_offs=_frag_buf_offs[_fragis[bi]]; |
194 | 12.7M | err+=oc_enc_frag_satd2(_enc,&dc,_src+frag_offs, |
195 | 12.7M | _ref+frag_offs+_mvoffset0,_ref+frag_offs+_mvoffset1,_ystride); |
196 | 12.7M | err+=abs(dc); |
197 | 12.7M | } |
198 | 3.19M | return err; |
199 | 3.19M | } |
200 | | |
201 | | static unsigned oc_mcenc_ysad_check_mbcandidate_fullpel(const oc_enc_ctx *_enc, |
202 | | const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy, |
203 | | const unsigned char *_src,const unsigned char *_ref,int _ystride, |
204 | 4.83M | unsigned _block_err[4]){ |
205 | 4.83M | unsigned err; |
206 | 4.83M | int mvoffset; |
207 | 4.83M | int bi; |
208 | 4.83M | mvoffset=_dx+_dy*_ystride; |
209 | 4.83M | err=0; |
210 | 24.1M | for(bi=0;bi<4;bi++){ |
211 | 19.3M | ptrdiff_t frag_offs; |
212 | 19.3M | unsigned block_err; |
213 | 19.3M | frag_offs=_frag_buf_offs[_fragis[bi]]; |
214 | 19.3M | block_err=oc_enc_frag_sad(_enc, |
215 | 19.3M | _src+frag_offs,_ref+frag_offs+mvoffset,_ystride); |
216 | 19.3M | _block_err[bi]=block_err; |
217 | 19.3M | err+=block_err; |
218 | 19.3M | } |
219 | 4.83M | return err; |
220 | 4.83M | } |
221 | | |
222 | | static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc, |
223 | | const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy, |
224 | 627k | const unsigned char *_src,const unsigned char *_ref,int _ystride){ |
225 | 627k | int mvoffset; |
226 | 627k | int err; |
227 | 627k | int bi; |
228 | 627k | mvoffset=_dx+_dy*_ystride; |
229 | 627k | err=0; |
230 | 3.13M | for(bi=0;bi<4;bi++){ |
231 | 2.51M | ptrdiff_t frag_offs; |
232 | 2.51M | int dc; |
233 | 2.51M | frag_offs=_frag_buf_offs[_fragis[bi]]; |
234 | 2.51M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
235 | 2.51M | err+=oc_enc_frag_satd(_enc,&dc, |
236 | 2.51M | _src+frag_offs,_ref+frag_offs+mvoffset,_ystride); |
237 | 2.51M | err+=abs(dc); |
238 | 2.51M | } |
239 | 0 | else{ |
240 | 0 | err+=oc_enc_frag_sad(_enc, |
241 | 0 | _src+frag_offs,_ref+frag_offs+mvoffset,_ystride); |
242 | 0 | } |
243 | 2.51M | } |
244 | 627k | return err; |
245 | 627k | } |
246 | | |
247 | | static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc, |
248 | | ptrdiff_t _frag_offs,int _dx,int _dy, |
249 | 1.25M | const unsigned char *_src,const unsigned char *_ref,int _ystride){ |
250 | 1.25M | unsigned err; |
251 | 1.25M | int dc; |
252 | 1.25M | err=oc_enc_frag_satd(_enc,&dc, |
253 | 1.25M | _src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride); |
254 | 1.25M | return err+abs(dc); |
255 | 1.25M | } |
256 | | |
257 | | /*Perform a motion vector search for this macro block against a single |
258 | | reference frame. |
259 | | As a bonus, individual block motion vectors are computed as well, as much of |
260 | | the work can be shared. |
261 | | The actual motion vector is stored in the appropriate place in the |
262 | | oc_mb_enc_info structure. |
263 | | _accum: Drop frame/golden MV accumulators. |
264 | | _mbi: The macro block index. |
265 | | _frame: The frame to use for SATD calculations and refinement, |
266 | | either OC_FRAME_PREV or OC_FRAME_GOLD. |
267 | | _frame_full: The frame to perform the 1px search on, one of OC_FRAME_PREV, |
268 | | OC_FRAME_GOLD, OC_FRAME_PREV_ORIG, or OC_FRAME_GOLD_ORIG.*/ |
269 | | void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame, |
270 | 627k | int _frame_full){ |
271 | | /*Note: Traditionally this search is done using a rate-distortion objective |
272 | | function of the form D+lambda*R. |
273 | | However, xiphmont tested this and found it produced a small degredation, |
274 | | while requiring extra computation. |
275 | | This is most likely due to Theora's peculiar MV encoding scheme: MVs are |
276 | | not coded relative to a predictor, and the only truly cheap way to use a |
277 | | MV is in the LAST or LAST2 MB modes, which are not being considered here. |
278 | | Therefore if we use the MV found here, it's only because both LAST and |
279 | | LAST2 performed poorly, and therefore the MB is not likely to be uniform |
280 | | or suffer from the aperture problem. |
281 | | Furthermore we would like to re-use the MV found here for as many MBs as |
282 | | possible, so picking a slightly sub-optimal vector to save a bit or two |
283 | | may cause increased degredation in many blocks to come. |
284 | | We could artificially reduce lambda to compensate, but it's faster to just |
285 | | disable it entirely, and use D (the distortion) as the sole criterion.*/ |
286 | 627k | oc_mcenc_ctx mcenc; |
287 | 627k | const ptrdiff_t *frag_buf_offs; |
288 | 627k | const ptrdiff_t *fragis; |
289 | 627k | const unsigned char *src; |
290 | 627k | const unsigned char *ref; |
291 | 627k | const unsigned char *satd_ref; |
292 | 627k | int ystride; |
293 | 627k | oc_mb_enc_info *embs; |
294 | 627k | ogg_int32_t hit_cache[31]; |
295 | 627k | ogg_int32_t hitbit; |
296 | 627k | unsigned best_block_err[4]; |
297 | 627k | unsigned block_err[4]; |
298 | 627k | unsigned best_err; |
299 | 627k | int best_vec[2]; |
300 | 627k | int best_block_vec[4][2]; |
301 | 627k | int candx; |
302 | 627k | int candy; |
303 | 627k | int bi; |
304 | 627k | embs=_enc->mb_info; |
305 | | /*Find some candidate motion vectors.*/ |
306 | 627k | oc_mcenc_find_candidates_a(_enc,&mcenc,_accum,_mbi,_frame); |
307 | | /*Clear the cache of locations we've examined.*/ |
308 | 627k | memset(hit_cache,0,sizeof(hit_cache)); |
309 | | /*Start with the median predictor.*/ |
310 | 627k | candx=OC_DIV2(mcenc.candidates[0][0]); |
311 | 627k | candy=OC_DIV2(mcenc.candidates[0][1]); |
312 | 627k | hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15; |
313 | 627k | frag_buf_offs=_enc->state.frag_buf_offs; |
314 | 627k | fragis=_enc->state.mb_maps[_mbi][0]; |
315 | 627k | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
316 | 627k | ref=_enc->state.ref_frame_data[_frame_full]; |
317 | 627k | satd_ref=_enc->state.ref_frame_data[_frame]; |
318 | 627k | ystride=_enc->state.ref_ystride[0]; |
319 | | /*TODO: customize error function for speed/(quality+size) tradeoff.*/ |
320 | 627k | best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
321 | 627k | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
322 | 627k | best_vec[0]=candx; |
323 | 627k | best_vec[1]=candy; |
324 | 627k | if(_frame==OC_FRAME_PREV){ |
325 | 1.56M | for(bi=0;bi<4;bi++){ |
326 | 1.25M | best_block_err[bi]=block_err[bi]; |
327 | 1.25M | best_block_vec[bi][0]=candx; |
328 | 1.25M | best_block_vec[bi][1]=candy; |
329 | 1.25M | } |
330 | 313k | } |
331 | | /*If this predictor fails, move on to set A.*/ |
332 | 627k | if(best_err>OC_YSAD_THRESH1){ |
333 | 559k | unsigned err; |
334 | 559k | unsigned t2; |
335 | 559k | int ncs; |
336 | 559k | int ci; |
337 | | /*Compute the early termination threshold for set A.*/ |
338 | 559k | t2=embs[_mbi].error[_frame]; |
339 | 559k | ncs=OC_MINI(3,embs[_mbi].ncneighbors); |
340 | 1.58M | for(ci=0;ci<ncs;ci++){ |
341 | 1.02M | t2=OC_MAXI(t2,embs[embs[_mbi].cneighbors[ci]].error[_frame]); |
342 | 1.02M | } |
343 | 559k | t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET; |
344 | | /*Examine the candidates in set A.*/ |
345 | 3.34M | for(ci=1;ci<mcenc.setb0;ci++){ |
346 | 2.78M | candx=OC_DIV2(mcenc.candidates[ci][0]); |
347 | 2.78M | candy=OC_DIV2(mcenc.candidates[ci][1]); |
348 | | /*If we've already examined this vector, then we would be using it if it |
349 | | was better than what we are using.*/ |
350 | 2.78M | hitbit=(ogg_int32_t)1<<candx+15; |
351 | 2.78M | if(hit_cache[candy+15]&hitbit)continue; |
352 | 1.06M | hit_cache[candy+15]|=hitbit; |
353 | 1.06M | err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
354 | 1.06M | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
355 | 1.06M | if(err<best_err){ |
356 | 383k | best_err=err; |
357 | 383k | best_vec[0]=candx; |
358 | 383k | best_vec[1]=candy; |
359 | 383k | } |
360 | 1.06M | if(_frame==OC_FRAME_PREV){ |
361 | 2.31M | for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ |
362 | 687k | best_block_err[bi]=block_err[bi]; |
363 | 687k | best_block_vec[bi][0]=candx; |
364 | 687k | best_block_vec[bi][1]=candy; |
365 | 687k | } |
366 | 462k | } |
367 | 1.06M | } |
368 | 559k | if(best_err>t2){ |
369 | 135k | oc_mcenc_find_candidates_b(_enc,&mcenc,_accum,_mbi,_frame); |
370 | | /*Examine the candidates in set B.*/ |
371 | 271k | for(;ci<mcenc.ncandidates;ci++){ |
372 | 135k | candx=OC_DIV2(mcenc.candidates[ci][0]); |
373 | 135k | candy=OC_DIV2(mcenc.candidates[ci][1]); |
374 | 135k | hitbit=(ogg_int32_t)1<<candx+15; |
375 | 135k | if(hit_cache[candy+15]&hitbit)continue; |
376 | 59.5k | hit_cache[candy+15]|=hitbit; |
377 | 59.5k | err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
378 | 59.5k | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
379 | 59.5k | if(err<best_err){ |
380 | 17.4k | best_err=err; |
381 | 17.4k | best_vec[0]=candx; |
382 | 17.4k | best_vec[1]=candy; |
383 | 17.4k | } |
384 | 59.5k | if(_frame==OC_FRAME_PREV){ |
385 | 161k | for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ |
386 | 38.2k | best_block_err[bi]=block_err[bi]; |
387 | 38.2k | best_block_vec[bi][0]=candx; |
388 | 38.2k | best_block_vec[bi][1]=candy; |
389 | 38.2k | } |
390 | 32.3k | } |
391 | 59.5k | } |
392 | | /*Use the same threshold for set B as in set A.*/ |
393 | 135k | if(best_err>t2){ |
394 | 130k | int best_site; |
395 | 130k | int nsites; |
396 | 130k | int sitei; |
397 | 130k | int site; |
398 | 130k | int b; |
399 | | /*Square pattern search.*/ |
400 | 531k | for(;;){ |
401 | 531k | best_site=4; |
402 | | /*Compose the bit flags for boundary conditions.*/ |
403 | 531k | b=OC_DIV16(-best_vec[0]+1)|OC_DIV16(best_vec[0]+1)<<1| |
404 | 531k | OC_DIV16(-best_vec[1]+1)<<2|OC_DIV16(best_vec[1]+1)<<3; |
405 | 531k | nsites=OC_SQUARE_NSITES[b]; |
406 | 4.51M | for(sitei=0;sitei<nsites;sitei++){ |
407 | 3.98M | site=OC_SQUARE_SITES[b][sitei]; |
408 | 3.98M | candx=best_vec[0]+OC_SQUARE_DX[site]; |
409 | 3.98M | candy=best_vec[1]+OC_SQUARE_DY[site]; |
410 | 3.98M | hitbit=(ogg_int32_t)1<<candx+15; |
411 | 3.98M | if(hit_cache[candy+15]&hitbit)continue; |
412 | 2.32M | hit_cache[candy+15]|=hitbit; |
413 | 2.32M | err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
414 | 2.32M | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
415 | 2.32M | if(err<best_err){ |
416 | 636k | best_err=err; |
417 | 636k | best_site=site; |
418 | 636k | } |
419 | 2.32M | if(_frame==OC_FRAME_PREV){ |
420 | 6.34M | for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ |
421 | 795k | best_block_err[bi]=block_err[bi]; |
422 | 795k | best_block_vec[bi][0]=candx; |
423 | 795k | best_block_vec[bi][1]=candy; |
424 | 795k | } |
425 | 1.26M | } |
426 | 2.32M | } |
427 | 531k | if(best_site==4)break; |
428 | 400k | best_vec[0]+=OC_SQUARE_DX[best_site]; |
429 | 400k | best_vec[1]+=OC_SQUARE_DY[best_site]; |
430 | 400k | } |
431 | | /*Final 4-MV search.*/ |
432 | | /*Simply use 1/4 of the macro block set A and B threshold as the |
433 | | individual block threshold.*/ |
434 | 130k | if(_frame==OC_FRAME_PREV){ |
435 | 70.4k | t2>>=2; |
436 | 352k | for(bi=0;bi<4;bi++){ |
437 | 281k | if(best_block_err[bi]>t2){ |
438 | | /*Square pattern search. |
439 | | We do this in a slightly interesting manner. |
440 | | We continue to check the SAD of all four blocks in the |
441 | | macro block. |
442 | | This gives us two things: |
443 | | 1) We can continue to use the hit_cache to avoid duplicate |
444 | | checks. |
445 | | Otherwise we could continue to read it, but not write to it |
446 | | without saving and restoring it for each block. |
447 | | Note that we could still eliminate a large number of |
448 | | duplicate checks by taking into account the site we came |
449 | | from when choosing the site list. |
450 | | We can still do that to avoid extra hit_cache queries, and |
451 | | it might even be a speed win. |
452 | | 2) It gives us a slightly better chance of escaping local |
453 | | minima. |
454 | | We would not be here if we weren't doing a fairly bad job |
455 | | in finding a good vector, and checking these vectors can |
456 | | save us from 100 to several thousand points off our SAD 1 |
457 | | in 15 times. |
458 | | TODO: Is this a good idea? |
459 | | Who knows. |
460 | | It needs more testing.*/ |
461 | 257k | for(;;){ |
462 | 257k | int bestx; |
463 | 257k | int besty; |
464 | 257k | int bj; |
465 | 257k | bestx=best_block_vec[bi][0]; |
466 | 257k | besty=best_block_vec[bi][1]; |
467 | | /*Compose the bit flags for boundary conditions.*/ |
468 | 257k | b=OC_DIV16(-bestx+1)|OC_DIV16(bestx+1)<<1| |
469 | 257k | OC_DIV16(-besty+1)<<2|OC_DIV16(besty+1)<<3; |
470 | 257k | nsites=OC_SQUARE_NSITES[b]; |
471 | 2.21M | for(sitei=0;sitei<nsites;sitei++){ |
472 | 1.95M | site=OC_SQUARE_SITES[b][sitei]; |
473 | 1.95M | candx=bestx+OC_SQUARE_DX[site]; |
474 | 1.95M | candy=besty+OC_SQUARE_DY[site]; |
475 | 1.95M | hitbit=(ogg_int32_t)1<<candx+15; |
476 | 1.95M | if(hit_cache[candy+15]&hitbit)continue; |
477 | 761k | hit_cache[candy+15]|=hitbit; |
478 | 761k | err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
479 | 761k | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
480 | 761k | if(err<best_err){ |
481 | 61.3k | best_err=err; |
482 | 61.3k | best_vec[0]=candx; |
483 | 61.3k | best_vec[1]=candy; |
484 | 61.3k | } |
485 | 3.80M | for(bj=0;bj<4;bj++)if(block_err[bj]<best_block_err[bj]){ |
486 | 315k | best_block_err[bj]=block_err[bj]; |
487 | 315k | best_block_vec[bj][0]=candx; |
488 | 315k | best_block_vec[bj][1]=candy; |
489 | 315k | } |
490 | 761k | } |
491 | 257k | if(best_block_vec[bi][0]==bestx&&best_block_vec[bi][1]==besty){ |
492 | 120k | break; |
493 | 120k | } |
494 | 257k | } |
495 | 120k | } |
496 | 281k | } |
497 | 70.4k | } |
498 | 130k | } |
499 | 135k | } |
500 | 559k | } |
501 | 627k | embs[_mbi].error[_frame]=(ogg_uint16_t)best_err; |
502 | 627k | candx=best_vec[0]; |
503 | 627k | candy=best_vec[1]; |
504 | 627k | embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc, |
505 | 627k | frag_buf_offs,fragis,candx,candy,src,satd_ref,ystride); |
506 | 627k | embs[_mbi].analysis_mv[0][_frame]=OC_MV(candx<<1,candy<<1); |
507 | 627k | if(_frame==OC_FRAME_PREV&&_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ |
508 | 1.56M | for(bi=0;bi<4;bi++){ |
509 | 1.25M | candx=best_block_vec[bi][0]; |
510 | 1.25M | candy=best_block_vec[bi][1]; |
511 | 1.25M | embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(_enc, |
512 | 1.25M | frag_buf_offs[fragis[bi]],candx,candy,src,satd_ref,ystride); |
513 | 1.25M | embs[_mbi].block_mv[bi]=OC_MV(candx<<1,candy<<1); |
514 | 1.25M | } |
515 | 313k | } |
516 | 627k | } |
517 | | |
/*Runs the motion search for one macro block against both the previous frame
   and the golden frame, maintaining the macro block's three-deep MV history
   (analysis_mv[0] newest, analysis_mv[2] oldest) around the two searches.
  The statement order below matters: each history slot is read before it is
   overwritten.
  _enc: The encoder context.
  _mbi: The macro block index.*/
void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi){
  oc_mv2 *mvs;
  oc_mv   accum_p;
  oc_mv   accum_g;
  oc_mv   mv2_p;
  mvs=_enc->mb_info[_mbi].analysis_mv;
  /*The previous-frame accumulator is the newest PREV vector when the previous
     frame was dropped, and zero otherwise.*/
  if(_enc->prevframe_dropped)accum_p=mvs[0][OC_FRAME_PREV];
  else accum_p=0;
  /*The golden accumulator is read from the oldest GOLD slot before that slot
     is overwritten below.*/
  accum_g=mvs[2][OC_FRAME_GOLD];
  /*Move the motion vector predictors back a frame.
    The oldest PREV vector is saved first, as it is subtracted from the newest
     one when that moves into slot 1.*/
  mv2_p=mvs[2][OC_FRAME_PREV];
  mvs[2][OC_FRAME_GOLD]=mvs[1][OC_FRAME_GOLD];
  mvs[2][OC_FRAME_PREV]=mvs[1][OC_FRAME_PREV];
  mvs[1][OC_FRAME_GOLD]=mvs[0][OC_FRAME_GOLD];
  mvs[1][OC_FRAME_PREV]=OC_MV_SUB(mvs[0][OC_FRAME_PREV],mv2_p);
  /*Search the last frame.*/
  oc_mcenc_search_frame(_enc,accum_p,_mbi,OC_FRAME_PREV,OC_FRAME_PREV_ORIG);
  /*Record the accumulator that was used in the oldest PREV slot; the next
     call reads it back as mv2_p.*/
  mvs[2][OC_FRAME_PREV]=accum_p;
  /*GOLDEN MVs are different from PREV MVs in that they're each absolute
     offsets from some frame in the past rather than relative offsets from the
     frame before.
    For predictor calculation to make sense, we need them to be in the same
     form as PREV MVs.*/
  mvs[1][OC_FRAME_GOLD]=OC_MV_SUB(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]);
  mvs[2][OC_FRAME_GOLD]=OC_MV_SUB(mvs[2][OC_FRAME_GOLD],accum_g);
  /*Search the golden frame.*/
  oc_mcenc_search_frame(_enc,accum_g,_mbi,OC_FRAME_GOLD,OC_FRAME_GOLD_ORIG);
  /*Put GOLDEN MVs back into absolute offset form.
    The newest MV is already an absolute offset.
    These two statements undo the two subtractions above in reverse order.*/
  mvs[2][OC_FRAME_GOLD]=OC_MV_ADD(mvs[2][OC_FRAME_GOLD],accum_g);
  mvs[1][OC_FRAME_GOLD]=OC_MV_ADD(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]);
}
550 | | |
#if 0
/*Disabled SAD-based variant of oc_mcenc_ysatd_halfpel_mbrefine().
  Refines a full-pel macro block vector to half-pel precision by testing the
   eight half-pel sites around it with oc_sad16_halfpel().
  _mbi:      The macro block index.
  _vec:      On entry, the full-pel vector; on exit, the vector doubled plus
              the offset of the best site.
  _best_err: The error of the full-pel vector.
  _frame:    The reference frame to refine against.
  Return: The best error found.*/
static int oc_mcenc_ysad_halfpel_mbrefine(const oc_enc_ctx *_enc,int _mbi,
 int _vec[2],int _best_err,int _frame){
  const unsigned char *src;
  const unsigned char *ref;
  const ptrdiff_t     *frag_buf_offs;
  const ptrdiff_t     *fragis;
  int                  offset_y[9];
  int                  ystride;
  int                  mvoffset_base;
  int                  best_site;
  int                  sitei;
  int                  err;
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ref=_enc->state.ref_frame_data[_frame];
  frag_buf_offs=_enc->state.frag_buf_offs;
  fragis=_enc->state.mb_maps[_mbi][0];
  ystride=_enc->state.ref_ystride[0];
  mvoffset_base=_vec[0]+_vec[1]*ystride;
  /*offset_y[4] (the center site) is never read: site list 0 omits it.*/
  offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
  offset_y[3]=offset_y[5]=0;
  offset_y[6]=offset_y[7]=offset_y[8]=ystride;
  best_site=4;
  for(sitei=0;sitei<8;sitei++){
    int site;
    int xmask;
    int ymask;
    int dx;
    int dy;
    int mvoffset0;
    int mvoffset1;
    site=OC_SQUARE_SITES[0][sitei];
    dx=OC_SQUARE_DX[site];
    dy=OC_SQUARE_DY[site];
    /*The following code SHOULD be equivalent to
        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
      However, it should also be much faster, as it involves no multiplies and
       doesn't have to handle chroma vectors.
      The vector is doubled by multiplication, as left shifting a negative
       value is undefined behavior.*/
    xmask=OC_SIGNMASK((_vec[0]*2+dx)^dx);
    ymask=OC_SIGNMASK((_vec[1]*2+dy)^dy);
    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
    err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis,
     mvoffset0,mvoffset1,src,ref,ystride,_best_err);
    if(err<_best_err){
      _best_err=err;
      best_site=site;
    }
  }
  _vec[0]=_vec[0]*2+OC_SQUARE_DX[best_site];
  _vec[1]=_vec[1]*2+OC_SQUARE_DY[best_site];
  return _best_err;
}
#endif
606 | | |
607 | | static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc, |
608 | 399k | int _mbi,int _vec[2],unsigned _best_err,int _frame){ |
609 | 399k | const unsigned char *src; |
610 | 399k | const unsigned char *ref; |
611 | 399k | const ptrdiff_t *frag_buf_offs; |
612 | 399k | const ptrdiff_t *fragis; |
613 | 399k | int offset_y[9]; |
614 | 399k | int ystride; |
615 | 399k | int mvoffset_base; |
616 | 399k | int best_site; |
617 | 399k | int sitei; |
618 | 399k | int err; |
619 | 399k | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
620 | 399k | ref=_enc->state.ref_frame_data[_frame]; |
621 | 399k | frag_buf_offs=_enc->state.frag_buf_offs; |
622 | 399k | fragis=_enc->state.mb_maps[_mbi][0]; |
623 | 399k | ystride=_enc->state.ref_ystride[0]; |
624 | 399k | mvoffset_base=_vec[0]+_vec[1]*ystride; |
625 | 399k | offset_y[0]=offset_y[1]=offset_y[2]=-ystride; |
626 | 399k | offset_y[3]=offset_y[5]=0; |
627 | 399k | offset_y[6]=offset_y[7]=offset_y[8]=ystride; |
628 | 399k | best_site=4; |
629 | 3.59M | for(sitei=0;sitei<8;sitei++){ |
630 | 3.19M | int site; |
631 | 3.19M | int xmask; |
632 | 3.19M | int ymask; |
633 | 3.19M | int dx; |
634 | 3.19M | int dy; |
635 | 3.19M | int mvoffset0; |
636 | 3.19M | int mvoffset1; |
637 | 3.19M | site=OC_SQUARE_SITES[0][sitei]; |
638 | 3.19M | dx=OC_SQUARE_DX[site]; |
639 | 3.19M | dy=OC_SQUARE_DY[site]; |
640 | | /*The following code SHOULD be equivalent to |
641 | | oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1, |
642 | | (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0); |
643 | | However, it should also be much faster, as it involves no multiplies and |
644 | | doesn't have to handle chroma vectors.*/ |
645 | 3.19M | xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx); |
646 | 3.19M | ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy); |
647 | 3.19M | mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask); |
648 | 3.19M | mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask); |
649 | 3.19M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
650 | 3.19M | err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis, |
651 | 3.19M | mvoffset0,mvoffset1,src,ref,ystride,_best_err); |
652 | 3.19M | } |
653 | 0 | else{ |
654 | 0 | err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis, |
655 | 0 | mvoffset0,mvoffset1,src,ref,ystride,_best_err); |
656 | 0 | } |
657 | 3.19M | if(err<_best_err){ |
658 | 706k | _best_err=err; |
659 | 706k | best_site=site; |
660 | 706k | } |
661 | 3.19M | } |
662 | 399k | _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site]; |
663 | 399k | _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site]; |
664 | 399k | return _best_err; |
665 | 399k | } |
666 | | |
667 | 399k | void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame){ |
668 | 399k | oc_mb_enc_info *embs; |
669 | 399k | int vec[2]; |
670 | 399k | embs=_enc->mb_info; |
671 | 399k | vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].analysis_mv[0][_frame])); |
672 | 399k | vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].analysis_mv[0][_frame])); |
673 | 399k | embs[_mbi].satd[_frame]=oc_mcenc_ysatd_halfpel_mbrefine(_enc, |
674 | 399k | _mbi,vec,embs[_mbi].satd[_frame],_frame); |
675 | 399k | embs[_mbi].analysis_mv[0][_frame]=OC_MV(vec[0],vec[1]); |
676 | 399k | } |
677 | | |
#if 0
/*Disabled SAD-based variant of oc_mcenc_ysatd_halfpel_brefine().
  Refines a full-pel block vector to half-pel precision by testing the eight
   half-pel sites around it with oc_enc_frag_sad2_thresh().
  _vec:      On entry, the full-pel vector; on exit, the vector doubled plus
              the offset of the best site.
  _src:      The source block.
  _ref:      The co-located reference block.
  _ystride:  The stride between rows.
  _offset_y: The row offset of each search site.
  _best_err: The error of the full-pel vector.
  Return: The best error found.*/
static int oc_mcenc_ysad_halfpel_brefine(const oc_enc_ctx *_enc,
 int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride,
 int _offset_y[9],unsigned _best_err){
  int mvoffset_base;
  int best_site;
  int sitei;
  mvoffset_base=_vec[0]+_vec[1]*_ystride;
  best_site=4;
  for(sitei=0;sitei<8;sitei++){
    unsigned err;
    int      site;
    int      xmask;
    int      ymask;
    int      dx;
    int      dy;
    int      mvoffset0;
    int      mvoffset1;
    site=OC_SQUARE_SITES[0][sitei];
    dx=OC_SQUARE_DX[site];
    dy=OC_SQUARE_DY[site];
    /*The following code SHOULD be equivalent to
        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
      However, it should also be much faster, as it involves no multiplies and
       doesn't have to handle chroma vectors.
      The vector is doubled by multiplication, as left shifting a negative
       value is undefined behavior.*/
    xmask=OC_SIGNMASK((_vec[0]*2+dx)^dx);
    ymask=OC_SIGNMASK((_vec[1]*2+dy)^dy);
    mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
    mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
    err=oc_enc_frag_sad2_thresh(_enc,_src,
     _ref+mvoffset0,_ref+mvoffset1,_ystride,_best_err);
    if(err<_best_err){
      _best_err=err;
      best_site=site;
    }
  }
  _vec[0]=_vec[0]*2+OC_SQUARE_DX[best_site];
  _vec[1]=_vec[1]*2+OC_SQUARE_DY[best_site];
  return _best_err;
}
#endif
720 | | |
721 | | static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc, |
722 | | int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride, |
723 | 167k | int _offset_y[9],unsigned _best_err){ |
724 | 167k | int mvoffset_base; |
725 | 167k | int best_site; |
726 | 167k | int sitei; |
727 | 167k | mvoffset_base=_vec[0]+_vec[1]*_ystride; |
728 | 167k | best_site=4; |
729 | 1.50M | for(sitei=0;sitei<8;sitei++){ |
730 | 1.34M | unsigned err; |
731 | 1.34M | int dc; |
732 | 1.34M | int site; |
733 | 1.34M | int xmask; |
734 | 1.34M | int ymask; |
735 | 1.34M | int dx; |
736 | 1.34M | int dy; |
737 | 1.34M | int mvoffset0; |
738 | 1.34M | int mvoffset1; |
739 | 1.34M | site=OC_SQUARE_SITES[0][sitei]; |
740 | 1.34M | dx=OC_SQUARE_DX[site]; |
741 | 1.34M | dy=OC_SQUARE_DY[site]; |
742 | | /*The following code SHOULD be equivalent to |
743 | | oc_state_get_mv_offsets(&_enc->state,&mvoffsets,0, |
744 | | (_vec[0]<<1)+dx,(_vec[1]<<1)+dy); |
745 | | However, it should also be much faster, as it involves no multiplies and |
746 | | doesn't have to handle chroma vectors.*/ |
747 | 1.34M | xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx); |
748 | 1.34M | ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy); |
749 | 1.34M | mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask); |
750 | 1.34M | mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask); |
751 | 1.34M | err=oc_enc_frag_satd2(_enc,&dc,_src, |
752 | 1.34M | _ref+mvoffset0,_ref+mvoffset1,_ystride); |
753 | 1.34M | err+=abs(dc); |
754 | 1.34M | if(err<_best_err){ |
755 | 336k | _best_err=err; |
756 | 336k | best_site=site; |
757 | 336k | } |
758 | 1.34M | } |
759 | 167k | _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site]; |
760 | 167k | _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site]; |
761 | 167k | return _best_err; |
762 | 167k | } |
763 | | |
764 | 41.8k | void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){ |
765 | 41.8k | oc_mb_enc_info *embs; |
766 | 41.8k | const ptrdiff_t *frag_buf_offs; |
767 | 41.8k | const ptrdiff_t *fragis; |
768 | 41.8k | const unsigned char *src; |
769 | 41.8k | const unsigned char *ref; |
770 | 41.8k | int offset_y[9]; |
771 | 41.8k | int ystride; |
772 | 41.8k | int bi; |
773 | 41.8k | ystride=_enc->state.ref_ystride[0]; |
774 | 41.8k | frag_buf_offs=_enc->state.frag_buf_offs; |
775 | 41.8k | fragis=_enc->state.mb_maps[_mbi][0]; |
776 | 41.8k | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
777 | 41.8k | ref=_enc->state.ref_frame_data[OC_FRAME_PREV]; |
778 | 41.8k | offset_y[0]=offset_y[1]=offset_y[2]=-ystride; |
779 | 41.8k | offset_y[3]=offset_y[5]=0; |
780 | 41.8k | offset_y[6]=offset_y[7]=offset_y[8]=ystride; |
781 | 41.8k | embs=_enc->mb_info; |
782 | 209k | for(bi=0;bi<4;bi++){ |
783 | 167k | ptrdiff_t frag_offs; |
784 | 167k | int vec[2]; |
785 | 167k | frag_offs=frag_buf_offs[fragis[bi]]; |
786 | 167k | vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].block_mv[bi])); |
787 | 167k | vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].block_mv[bi])); |
788 | 167k | embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_halfpel_brefine(_enc,vec, |
789 | 167k | src+frag_offs,ref+frag_offs,ystride,offset_y,embs[_mbi].block_satd[bi]); |
790 | 167k | embs[_mbi].ref_mv[bi]=OC_MV(vec[0],vec[1]); |
791 | 167k | } |
792 | 41.8k | } |