Line | Count | Source (jump to first uncovered line) |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | | * by the Xiph.Org Foundation https://www.xiph.org/ * |
10 | | * * |
11 | | ******************************************************************** |
12 | | |
13 | | function: |
14 | | |
15 | | ********************************************************************/ |
16 | | #include <stdlib.h> |
17 | | #include <limits.h> |
18 | | #include <string.h> |
19 | | #include "encint.h" |
20 | | |
21 | | |
22 | | |
/*Scratch state shared by the stages of the motion vector search for a single
   macro block.*/
typedef struct oc_mcenc_ctx{
  /*The candidate motion vectors, stored as (x,y) pairs.
    Slot 0 is reserved for the median predictor.*/
  int candidates[13][2];
  /*The index of the first Set B candidate (one past the end of Set A).*/
  int setb0;
  /*The total number of candidates stored.*/
  int ncandidates;
}oc_mcenc_ctx;
36 | | |
37 | | |
38 | | |
/*The maximum Y plane SAD value for accepting the median predictor.
  When the median predictor's SAD does not exceed this value, the search
   stops without examining any other candidate.*/
#define OC_YSAD_THRESH1 (256)
/*The amount to right shift the minimum error by when inflating it for
   computing the second maximum Y plane SAD threshold.
  The shifted value is added back to the base threshold.*/
#define OC_YSAD_THRESH2_SCALE_BITS (4)
/*The amount to add to the second maximum Y plane threshold when inflating
   it.*/
#define OC_YSAD_THRESH2_OFFSET (64)
47 | | |
/*The vector offsets in the X direction for each search site in the square
   pattern.
  Sites are numbered 0...8 in raster order over a 3x3 grid; site 4 is the
   center (zero offset).*/
static const int OC_SQUARE_DX[9]={-1,0,1,-1,0,1,-1,0,1};
/*The vector offsets in the Y direction for each search site in the square
   pattern.*/
static const int OC_SQUARE_DY[9]={-1,-1,-1,0,0,0,1,1,1};
/*The number of sites to search for each boundary condition in the square
   pattern.
  The table index is the bitwise OR of the flags that apply.
  Bit flags for the boundary conditions are as follows:
  1: -16==dx
  2: dx==15(.5)
  4: -16==dy
  8: dy==15(.5)
  Indices 3 and 7 (both X flags set at once) have no sites.
  NOTE(review): this list gives the lower bound as -16, while the per-row
   comments on OC_SQUARE_SITES use -15.5; confirm which is intended.*/
static const int OC_SQUARE_NSITES[11]={8,5,5,0,5,3,3,0,5,3,3};
/*The list of sites to search for each boundary condition in the square
   pattern.
  Only the first OC_SQUARE_NSITES[b] entries of row b are meaningful.*/
static const int OC_SQUARE_SITES[11][8]={
  /* -15.5<dx<15(.5),   -15.5<dy<15(.5)*/
  {0,1,2,3,5,6,7,8},
  /*-15.5==dx,          -15.5<dy<15(.5)*/
  {1,2,5,7,8},
  /*     dx==15(.5),    -15.5<dy<15(.5)*/
  {0,1,3,6,7},
  /*-15.5==dx==15(.5),  -15.5<dy<15(.5) (no sites)*/
  {-1},
  /* -15.5<dx<15(.5),   -15.5==dy*/
  {3,5,6,7,8},
  /*-15.5==dx,          -15.5==dy*/
  {5,7,8},
  /*     dx==15(.5),    -15.5==dy*/
  {3,6,7},
  /*-15.5==dx==15(.5),  -15.5==dy (no sites)*/
  {-1},
  /* -15.5<dx<15(.5),         dy==15(.5)*/
  {0,1,2,3,5},
  /*-15.5==dx,                dy==15(.5)*/
  {1,2,5},
  /*     dx==15(.5),          dy==15(.5)*/
  {0,1,3}
};
88 | | |
89 | | |
90 | | static void oc_mcenc_find_candidates_a(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc, |
91 | 477k | oc_mv _accum,int _mbi,int _frame){ |
92 | 477k | oc_mb_enc_info *embs; |
93 | 477k | int accum_x; |
94 | 477k | int accum_y; |
95 | 477k | int a[3][2]; |
96 | 477k | int ncandidates; |
97 | 477k | unsigned nmbi; |
98 | 477k | int i; |
99 | 477k | embs=_enc->mb_info; |
100 | | /*Skip a position to store the median predictor in.*/ |
101 | 477k | ncandidates=1; |
102 | 477k | if(embs[_mbi].ncneighbors>0){ |
103 | | /*Fill in the first part of set A: the vectors from adjacent blocks.*/ |
104 | 1.26M | for(i=0;i<embs[_mbi].ncneighbors;i++){ |
105 | 846k | nmbi=embs[_mbi].cneighbors[i]; |
106 | 846k | _mcenc->candidates[ncandidates][0]= |
107 | 846k | OC_MV_X(embs[nmbi].analysis_mv[0][_frame]); |
108 | 846k | _mcenc->candidates[ncandidates][1]= |
109 | 846k | OC_MV_Y(embs[nmbi].analysis_mv[0][_frame]); |
110 | 846k | ncandidates++; |
111 | 846k | } |
112 | 418k | } |
113 | 477k | accum_x=OC_MV_X(_accum); |
114 | 477k | accum_y=OC_MV_Y(_accum); |
115 | | /*Add a few additional vectors to set A: the vectors used in the previous |
116 | | frames and the (0,0) vector.*/ |
117 | 477k | _mcenc->candidates[ncandidates][0]=accum_x; |
118 | 477k | _mcenc->candidates[ncandidates][1]=accum_y; |
119 | 477k | ncandidates++; |
120 | 477k | _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31, |
121 | 477k | OC_MV_X(embs[_mbi].analysis_mv[1][_frame])+accum_x,31); |
122 | 477k | _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31, |
123 | 477k | OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])+accum_y,31); |
124 | 477k | ncandidates++; |
125 | 477k | _mcenc->candidates[ncandidates][0]=0; |
126 | 477k | _mcenc->candidates[ncandidates][1]=0; |
127 | 477k | ncandidates++; |
128 | | /*Use the first three vectors of set A to find our best predictor: their |
129 | | median.*/ |
130 | 477k | memcpy(a,_mcenc->candidates+1,sizeof(a)); |
131 | 477k | OC_SORT2I(a[0][0],a[1][0]); |
132 | 477k | OC_SORT2I(a[0][1],a[1][1]); |
133 | 477k | OC_SORT2I(a[1][0],a[2][0]); |
134 | 477k | OC_SORT2I(a[1][1],a[2][1]); |
135 | 477k | OC_SORT2I(a[0][0],a[1][0]); |
136 | 477k | OC_SORT2I(a[0][1],a[1][1]); |
137 | 477k | _mcenc->candidates[0][0]=a[1][0]; |
138 | 477k | _mcenc->candidates[0][1]=a[1][1]; |
139 | 477k | _mcenc->setb0=ncandidates; |
140 | 477k | } |
141 | | |
142 | | static void oc_mcenc_find_candidates_b(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc, |
143 | 113k | oc_mv _accum,int _mbi,int _frame){ |
144 | 113k | oc_mb_enc_info *embs; |
145 | 113k | int accum_x; |
146 | 113k | int accum_y; |
147 | 113k | int ncandidates; |
148 | 113k | embs=_enc->mb_info; |
149 | 113k | accum_x=OC_MV_X(_accum); |
150 | 113k | accum_y=OC_MV_Y(_accum); |
151 | | /*Fill in set B: accelerated predictors for this and adjacent macro blocks.*/ |
152 | 113k | ncandidates=_mcenc->setb0; |
153 | | /*Use only the current block. Using more did not appear to be helpful |
154 | | with the current selection logic due to escaping the local search too |
155 | | quickly.*/ |
156 | 113k | _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31, |
157 | 113k | 2*OC_MV_X(embs[_mbi].analysis_mv[1][_frame]) |
158 | 113k | -OC_MV_X(embs[_mbi].analysis_mv[2][_frame])+accum_x,31); |
159 | 113k | _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31, |
160 | 113k | 2*OC_MV_Y(embs[_mbi].analysis_mv[1][_frame]) |
161 | 113k | -OC_MV_Y(embs[_mbi].analysis_mv[2][_frame])+accum_y,31); |
162 | 113k | ncandidates++; |
163 | 113k | _mcenc->ncandidates=ncandidates; |
164 | 113k | } |
165 | | |
166 | | static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc, |
167 | | const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4], |
168 | | int _mvoffset0,int _mvoffset1,const unsigned char *_src, |
169 | 0 | const unsigned char *_ref,int _ystride,unsigned _best_err){ |
170 | 0 | unsigned err; |
171 | 0 | int bi; |
172 | 0 | err=0; |
173 | 0 | for(bi=0;bi<4;bi++){ |
174 | 0 | ptrdiff_t frag_offs; |
175 | 0 | frag_offs=_frag_buf_offs[_fragis[bi]]; |
176 | 0 | err+=oc_enc_frag_sad2_thresh(_enc,_src+frag_offs,_ref+frag_offs+_mvoffset0, |
177 | 0 | _ref+frag_offs+_mvoffset1,_ystride,_best_err-err); |
178 | 0 | } |
179 | 0 | return err; |
180 | 0 | } |
181 | | |
182 | | static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc, |
183 | | const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4], |
184 | | int _mvoffset0,int _mvoffset1,const unsigned char *_src, |
185 | 2.36M | const unsigned char *_ref,int _ystride,unsigned _best_err){ |
186 | 2.36M | unsigned err; |
187 | 2.36M | int dc; |
188 | 2.36M | int bi; |
189 | 2.36M | err=0; |
190 | 11.8M | for(bi=0;bi<4;bi++){ |
191 | 9.45M | ptrdiff_t frag_offs; |
192 | 9.45M | frag_offs=_frag_buf_offs[_fragis[bi]]; |
193 | 9.45M | err+=oc_enc_frag_satd2(_enc,&dc,_src+frag_offs, |
194 | 9.45M | _ref+frag_offs+_mvoffset0,_ref+frag_offs+_mvoffset1,_ystride); |
195 | 9.45M | err+=abs(dc); |
196 | 9.45M | } |
197 | 2.36M | return err; |
198 | 2.36M | } |
199 | | |
200 | | static unsigned oc_mcenc_ysad_check_mbcandidate_fullpel(const oc_enc_ctx *_enc, |
201 | | const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy, |
202 | | const unsigned char *_src,const unsigned char *_ref,int _ystride, |
203 | 4.18M | unsigned _block_err[4]){ |
204 | 4.18M | unsigned err; |
205 | 4.18M | int mvoffset; |
206 | 4.18M | int bi; |
207 | 4.18M | mvoffset=_dx+_dy*_ystride; |
208 | 4.18M | err=0; |
209 | 20.9M | for(bi=0;bi<4;bi++){ |
210 | 16.7M | ptrdiff_t frag_offs; |
211 | 16.7M | unsigned block_err; |
212 | 16.7M | frag_offs=_frag_buf_offs[_fragis[bi]]; |
213 | 16.7M | block_err=oc_enc_frag_sad(_enc, |
214 | 16.7M | _src+frag_offs,_ref+frag_offs+mvoffset,_ystride); |
215 | 16.7M | _block_err[bi]=block_err; |
216 | 16.7M | err+=block_err; |
217 | 16.7M | } |
218 | 4.18M | return err; |
219 | 4.18M | } |
220 | | |
221 | | static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc, |
222 | | const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy, |
223 | 477k | const unsigned char *_src,const unsigned char *_ref,int _ystride){ |
224 | 477k | int mvoffset; |
225 | 477k | int err; |
226 | 477k | int bi; |
227 | 477k | mvoffset=_dx+_dy*_ystride; |
228 | 477k | err=0; |
229 | 2.38M | for(bi=0;bi<4;bi++){ |
230 | 1.90M | ptrdiff_t frag_offs; |
231 | 1.90M | int dc; |
232 | 1.90M | frag_offs=_frag_buf_offs[_fragis[bi]]; |
233 | 1.90M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
234 | 1.90M | err+=oc_enc_frag_satd(_enc,&dc, |
235 | 1.90M | _src+frag_offs,_ref+frag_offs+mvoffset,_ystride); |
236 | 1.90M | err+=abs(dc); |
237 | 1.90M | } |
238 | 0 | else{ |
239 | 0 | err+=oc_enc_frag_sad(_enc, |
240 | 0 | _src+frag_offs,_ref+frag_offs+mvoffset,_ystride); |
241 | 0 | } |
242 | 1.90M | } |
243 | 477k | return err; |
244 | 477k | } |
245 | | |
246 | | static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc, |
247 | | ptrdiff_t _frag_offs,int _dx,int _dy, |
248 | 954k | const unsigned char *_src,const unsigned char *_ref,int _ystride){ |
249 | 954k | unsigned err; |
250 | 954k | int dc; |
251 | 954k | err=oc_enc_frag_satd(_enc,&dc, |
252 | 954k | _src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride); |
253 | 954k | return err+abs(dc); |
254 | 954k | } |
255 | | |
256 | | /*Perform a motion vector search for this macro block against a single |
257 | | reference frame. |
258 | | As a bonus, individual block motion vectors are computed as well, as much of |
259 | | the work can be shared. |
260 | | The actual motion vector is stored in the appropriate place in the |
261 | | oc_mb_enc_info structure. |
262 | | _accum: Drop frame/golden MV accumulators. |
263 | | _mbi: The macro block index. |
264 | | _frame: The frame to use for SATD calculations and refinement, |
265 | | either OC_FRAME_PREV or OC_FRAME_GOLD. |
266 | | _frame_full: The frame to perform the 1px search on, one of OC_FRAME_PREV, |
267 | | OC_FRAME_GOLD, OC_FRAME_PREV_ORIG, or OC_FRAME_GOLD_ORIG.*/ |
268 | | void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame, |
269 | 477k | int _frame_full){ |
270 | | /*Note: Traditionally this search is done using a rate-distortion objective |
271 | | function of the form D+lambda*R. |
272 | | However, xiphmont tested this and found it produced a small degradation, |
273 | | while requiring extra computation. |
274 | | This is most likely due to Theora's peculiar MV encoding scheme: MVs are |
275 | | not coded relative to a predictor, and the only truly cheap way to use a |
276 | | MV is in the LAST or LAST2 MB modes, which are not being considered here. |
277 | | Therefore if we use the MV found here, it's only because both LAST and |
278 | | LAST2 performed poorly, and therefore the MB is not likely to be uniform |
279 | | or suffer from the aperture problem. |
280 | | Furthermore we would like to reuse the MV found here for as many MBs as |
281 | | possible, so picking a slightly sub-optimal vector to save a bit or two |
282 | | may cause increased degradation in many blocks to come. |
283 | | We could artificially reduce lambda to compensate, but it's faster to just |
284 | | disable it entirely, and use D (the distortion) as the sole criterion.*/ |
285 | 477k | oc_mcenc_ctx mcenc; |
286 | 477k | const ptrdiff_t *frag_buf_offs; |
287 | 477k | const ptrdiff_t *fragis; |
288 | 477k | const unsigned char *src; |
289 | 477k | const unsigned char *ref; |
290 | 477k | const unsigned char *satd_ref; |
291 | 477k | int ystride; |
292 | 477k | oc_mb_enc_info *embs; |
293 | 477k | ogg_int32_t hit_cache[31]; |
294 | 477k | ogg_int32_t hitbit; |
295 | 477k | unsigned best_block_err[4]; |
296 | 477k | unsigned block_err[4]; |
297 | 477k | unsigned best_err; |
298 | 477k | int best_vec[2]; |
299 | 477k | int best_block_vec[4][2]; |
300 | 477k | int candx; |
301 | 477k | int candy; |
302 | 477k | int bi; |
303 | 477k | embs=_enc->mb_info; |
304 | | /*Find some candidate motion vectors.*/ |
305 | 477k | oc_mcenc_find_candidates_a(_enc,&mcenc,_accum,_mbi,_frame); |
306 | | /*Clear the cache of locations we've examined.*/ |
307 | 477k | memset(hit_cache,0,sizeof(hit_cache)); |
308 | | /*Start with the median predictor.*/ |
309 | 477k | candx=OC_DIV2(mcenc.candidates[0][0]); |
310 | 477k | candy=OC_DIV2(mcenc.candidates[0][1]); |
311 | 477k | hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15; |
312 | 477k | frag_buf_offs=_enc->state.frag_buf_offs; |
313 | 477k | fragis=_enc->state.mb_maps[_mbi][0]; |
314 | 477k | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
315 | 477k | ref=_enc->state.ref_frame_data[_frame_full]; |
316 | 477k | satd_ref=_enc->state.ref_frame_data[_frame]; |
317 | 477k | ystride=_enc->state.ref_ystride[0]; |
318 | | /*TODO: customize error function for speed/(quality+size) tradeoff.*/ |
319 | 477k | best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
320 | 477k | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
321 | 477k | best_vec[0]=candx; |
322 | 477k | best_vec[1]=candy; |
323 | 477k | if(_frame==OC_FRAME_PREV){ |
324 | 1.19M | for(bi=0;bi<4;bi++){ |
325 | 954k | best_block_err[bi]=block_err[bi]; |
326 | 954k | best_block_vec[bi][0]=candx; |
327 | 954k | best_block_vec[bi][1]=candy; |
328 | 954k | } |
329 | 238k | } |
330 | | /*If this predictor fails, move on to set A.*/ |
331 | 477k | if(best_err>OC_YSAD_THRESH1){ |
332 | 427k | unsigned err; |
333 | 427k | unsigned t2; |
334 | 427k | int ncs; |
335 | 427k | int ci; |
336 | | /*Compute the early termination threshold for set A.*/ |
337 | 427k | t2=embs[_mbi].error[_frame]; |
338 | 427k | ncs=OC_MINI(3,embs[_mbi].ncneighbors); |
339 | 1.17M | for(ci=0;ci<ncs;ci++){ |
340 | 742k | t2=OC_MAXI(t2,embs[embs[_mbi].cneighbors[ci]].error[_frame]); |
341 | 742k | } |
342 | 427k | t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET; |
343 | | /*Examine the candidates in set A.*/ |
344 | 2.48M | for(ci=1;ci<mcenc.setb0;ci++){ |
345 | 2.06M | candx=OC_DIV2(mcenc.candidates[ci][0]); |
346 | 2.06M | candy=OC_DIV2(mcenc.candidates[ci][1]); |
347 | | /*If we've already examined this vector, then we would be using it if it |
348 | | was better than what we are using.*/ |
349 | 2.06M | hitbit=(ogg_int32_t)1<<candx+15; |
350 | 2.06M | if(hit_cache[candy+15]&hitbit)continue; |
351 | 779k | hit_cache[candy+15]|=hitbit; |
352 | 779k | err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
353 | 779k | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
354 | 779k | if(err<best_err){ |
355 | 293k | best_err=err; |
356 | 293k | best_vec[0]=candx; |
357 | 293k | best_vec[1]=candy; |
358 | 293k | } |
359 | 779k | if(_frame==OC_FRAME_PREV){ |
360 | 1.77M | for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ |
361 | 538k | best_block_err[bi]=block_err[bi]; |
362 | 538k | best_block_vec[bi][0]=candx; |
363 | 538k | best_block_vec[bi][1]=candy; |
364 | 538k | } |
365 | 355k | } |
366 | 779k | } |
367 | 427k | if(best_err>t2){ |
368 | 113k | oc_mcenc_find_candidates_b(_enc,&mcenc,_accum,_mbi,_frame); |
369 | | /*Examine the candidates in set B.*/ |
370 | 227k | for(;ci<mcenc.ncandidates;ci++){ |
371 | 113k | candx=OC_DIV2(mcenc.candidates[ci][0]); |
372 | 113k | candy=OC_DIV2(mcenc.candidates[ci][1]); |
373 | 113k | hitbit=(ogg_int32_t)1<<candx+15; |
374 | 113k | if(hit_cache[candy+15]&hitbit)continue; |
375 | 41.3k | hit_cache[candy+15]|=hitbit; |
376 | 41.3k | err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
377 | 41.3k | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
378 | 41.3k | if(err<best_err){ |
379 | 12.0k | best_err=err; |
380 | 12.0k | best_vec[0]=candx; |
381 | 12.0k | best_vec[1]=candy; |
382 | 12.0k | } |
383 | 41.3k | if(_frame==OC_FRAME_PREV){ |
384 | 110k | for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ |
385 | 26.3k | best_block_err[bi]=block_err[bi]; |
386 | 26.3k | best_block_vec[bi][0]=candx; |
387 | 26.3k | best_block_vec[bi][1]=candy; |
388 | 26.3k | } |
389 | 22.1k | } |
390 | 41.3k | } |
391 | | /*Use the same threshold for set B as in set A.*/ |
392 | 113k | if(best_err>t2){ |
393 | 110k | int best_site; |
394 | 110k | int nsites; |
395 | 110k | int sitei; |
396 | 110k | int site; |
397 | 110k | int b; |
398 | | /*Square pattern search.*/ |
399 | 528k | for(;;){ |
400 | 528k | best_site=4; |
401 | | /*Compose the bit flags for boundary conditions.*/ |
402 | 528k | b=OC_DIV16(-best_vec[0]+1)|OC_DIV16(best_vec[0]+1)<<1| |
403 | 528k | OC_DIV16(-best_vec[1]+1)<<2|OC_DIV16(best_vec[1]+1)<<3; |
404 | 528k | nsites=OC_SQUARE_NSITES[b]; |
405 | 4.46M | for(sitei=0;sitei<nsites;sitei++){ |
406 | 3.94M | site=OC_SQUARE_SITES[b][sitei]; |
407 | 3.94M | candx=best_vec[0]+OC_SQUARE_DX[site]; |
408 | 3.94M | candy=best_vec[1]+OC_SQUARE_DY[site]; |
409 | 3.94M | hitbit=(ogg_int32_t)1<<candx+15; |
410 | 3.94M | if(hit_cache[candy+15]&hitbit)continue; |
411 | 2.23M | hit_cache[candy+15]|=hitbit; |
412 | 2.23M | err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
413 | 2.23M | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
414 | 2.23M | if(err<best_err){ |
415 | 675k | best_err=err; |
416 | 675k | best_site=site; |
417 | 675k | } |
418 | 2.23M | if(_frame==OC_FRAME_PREV){ |
419 | 5.99M | for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ |
420 | 810k | best_block_err[bi]=block_err[bi]; |
421 | 810k | best_block_vec[bi][0]=candx; |
422 | 810k | best_block_vec[bi][1]=candy; |
423 | 810k | } |
424 | 1.19M | } |
425 | 2.23M | } |
426 | 528k | if(best_site==4)break; |
427 | 418k | best_vec[0]+=OC_SQUARE_DX[best_site]; |
428 | 418k | best_vec[1]+=OC_SQUARE_DY[best_site]; |
429 | 418k | } |
430 | | /*Final 4-MV search.*/ |
431 | | /*Simply use 1/4 of the macro block set A and B threshold as the |
432 | | individual block threshold.*/ |
433 | 110k | if(_frame==OC_FRAME_PREV){ |
434 | 59.0k | t2>>=2; |
435 | 295k | for(bi=0;bi<4;bi++){ |
436 | 236k | if(best_block_err[bi]>t2){ |
437 | | /*Square pattern search. |
438 | | We do this in a slightly interesting manner. |
439 | | We continue to check the SAD of all four blocks in the |
440 | | macro block. |
441 | | This gives us two things: |
442 | | 1) We can continue to use the hit_cache to avoid duplicate |
443 | | checks. |
444 | | Otherwise we could continue to read it, but not write to it |
445 | | without saving and restoring it for each block. |
446 | | Note that we could still eliminate a large number of |
447 | | duplicate checks by taking into account the site we came |
448 | | from when choosing the site list. |
449 | | We can still do that to avoid extra hit_cache queries, and |
450 | | it might even be a speed win. |
451 | | 2) It gives us a slightly better chance of escaping local |
452 | | minima. |
453 | | We would not be here if we weren't doing a fairly bad job |
454 | | in finding a good vector, and checking these vectors can |
455 | | save us from 100 to several thousand points off our SAD 1 |
456 | | in 15 times. |
457 | | TODO: Is this a good idea? |
458 | | Who knows. |
459 | | It needs more testing.*/ |
460 | 222k | for(;;){ |
461 | 222k | int bestx; |
462 | 222k | int besty; |
463 | 222k | int bj; |
464 | 222k | bestx=best_block_vec[bi][0]; |
465 | 222k | besty=best_block_vec[bi][1]; |
466 | | /*Compose the bit flags for boundary conditions.*/ |
467 | 222k | b=OC_DIV16(-bestx+1)|OC_DIV16(bestx+1)<<1| |
468 | 222k | OC_DIV16(-besty+1)<<2|OC_DIV16(besty+1)<<3; |
469 | 222k | nsites=OC_SQUARE_NSITES[b]; |
470 | 1.88M | for(sitei=0;sitei<nsites;sitei++){ |
471 | 1.66M | site=OC_SQUARE_SITES[b][sitei]; |
472 | 1.66M | candx=bestx+OC_SQUARE_DX[site]; |
473 | 1.66M | candy=besty+OC_SQUARE_DY[site]; |
474 | 1.66M | hitbit=(ogg_int32_t)1<<candx+15; |
475 | 1.66M | if(hit_cache[candy+15]&hitbit)continue; |
476 | 648k | hit_cache[candy+15]|=hitbit; |
477 | 648k | err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
478 | 648k | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
479 | 648k | if(err<best_err){ |
480 | 53.8k | best_err=err; |
481 | 53.8k | best_vec[0]=candx; |
482 | 53.8k | best_vec[1]=candy; |
483 | 53.8k | } |
484 | 3.24M | for(bj=0;bj<4;bj++)if(block_err[bj]<best_block_err[bj]){ |
485 | 280k | best_block_err[bj]=block_err[bj]; |
486 | 280k | best_block_vec[bj][0]=candx; |
487 | 280k | best_block_vec[bj][1]=candy; |
488 | 280k | } |
489 | 648k | } |
490 | 222k | if(best_block_vec[bi][0]==bestx&&best_block_vec[bi][1]==besty){ |
491 | 98.4k | break; |
492 | 98.4k | } |
493 | 222k | } |
494 | 98.4k | } |
495 | 236k | } |
496 | 59.0k | } |
497 | 110k | } |
498 | 113k | } |
499 | 427k | } |
500 | 477k | embs[_mbi].error[_frame]=(ogg_uint16_t)best_err; |
501 | 477k | candx=best_vec[0]; |
502 | 477k | candy=best_vec[1]; |
503 | 477k | embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc, |
504 | 477k | frag_buf_offs,fragis,candx,candy,src,satd_ref,ystride); |
505 | 477k | embs[_mbi].analysis_mv[0][_frame]=OC_MV(candx<<1,candy<<1); |
506 | 477k | if(_frame==OC_FRAME_PREV&&_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ |
507 | 1.19M | for(bi=0;bi<4;bi++){ |
508 | 954k | candx=best_block_vec[bi][0]; |
509 | 954k | candy=best_block_vec[bi][1]; |
510 | 954k | embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(_enc, |
511 | 954k | frag_buf_offs[fragis[bi]],candx,candy,src,satd_ref,ystride); |
512 | 954k | embs[_mbi].block_mv[bi]=OC_MV(candx<<1,candy<<1); |
513 | 954k | } |
514 | 238k | } |
515 | 477k | } |
516 | | |
517 | 238k | void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi){ |
518 | 238k | oc_mv2 *mvs; |
519 | 238k | oc_mv accum_p; |
520 | 238k | oc_mv accum_g; |
521 | 238k | oc_mv mv2_p; |
522 | 238k | mvs=_enc->mb_info[_mbi].analysis_mv; |
523 | 238k | if(_enc->prevframe_dropped)accum_p=mvs[0][OC_FRAME_PREV]; |
524 | 206k | else accum_p=0; |
525 | 238k | accum_g=mvs[2][OC_FRAME_GOLD]; |
526 | | /*Move the motion vector predictors back a frame.*/ |
527 | 238k | mv2_p=mvs[2][OC_FRAME_PREV]; |
528 | 238k | mvs[2][OC_FRAME_GOLD]=mvs[1][OC_FRAME_GOLD]; |
529 | 238k | mvs[2][OC_FRAME_PREV]=mvs[1][OC_FRAME_PREV]; |
530 | 238k | mvs[1][OC_FRAME_GOLD]=mvs[0][OC_FRAME_GOLD]; |
531 | 238k | mvs[1][OC_FRAME_PREV]=OC_MV_SUB(mvs[0][OC_FRAME_PREV],mv2_p); |
532 | | /*Search the last frame.*/ |
533 | 238k | oc_mcenc_search_frame(_enc,accum_p,_mbi,OC_FRAME_PREV,OC_FRAME_PREV_ORIG); |
534 | 238k | mvs[2][OC_FRAME_PREV]=accum_p; |
535 | | /*GOLDEN MVs are different from PREV MVs in that they're each absolute |
536 | | offsets from some frame in the past rather than relative offsets from the |
537 | | frame before. |
538 | | For predictor calculation to make sense, we need them to be in the same |
539 | | form as PREV MVs.*/ |
540 | 238k | mvs[1][OC_FRAME_GOLD]=OC_MV_SUB(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]); |
541 | 238k | mvs[2][OC_FRAME_GOLD]=OC_MV_SUB(mvs[2][OC_FRAME_GOLD],accum_g); |
542 | | /*Search the golden frame.*/ |
543 | 238k | oc_mcenc_search_frame(_enc,accum_g,_mbi,OC_FRAME_GOLD,OC_FRAME_GOLD_ORIG); |
544 | | /*Put GOLDEN MVs back into absolute offset form. |
545 | | The newest MV is already an absolute offset.*/ |
546 | 238k | mvs[2][OC_FRAME_GOLD]=OC_MV_ADD(mvs[2][OC_FRAME_GOLD],accum_g); |
547 | 238k | mvs[1][OC_FRAME_GOLD]=OC_MV_ADD(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]); |
548 | 238k | } |
549 | | |
#if 0
/*Disabled SAD-based variant of oc_mcenc_ysatd_halfpel_mbrefine().
  Checks the eight half-pel sites around a vector for a whole macro block.
  _mbi:      The macro block index.
  _vec:      On input, the starting vector; on output, twice that vector
              plus the offset of the best site.
  _best_err: The error of the starting vector.
  _frame:    The reference frame to refine against.
  Return: The smallest error found.*/
static int oc_mcenc_ysad_halfpel_mbrefine(const oc_enc_ctx *_enc,int _mbi,
 int _vec[2],int _best_err,int _frame){
  const unsigned char *src;
  const unsigned char *ref;
  const ptrdiff_t     *frag_buf_offs;
  const ptrdiff_t     *fragis;
  int                  offset_y[9];
  int                  ystride;
  int                  mvoffset_base;
  int                  best_site;
  int                  sitei;
  int                  err;
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ref=_enc->state.ref_frame_data[_frame];
  frag_buf_offs=_enc->state.frag_buf_offs;
  fragis=_enc->state.mb_maps[_mbi][0];
  ystride=_enc->state.ref_ystride[0];
  mvoffset_base=_vec[0]+_vec[1]*ystride;
  offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
  /*Site 4 (the center) is never searched, but keep its entry defined.*/
  offset_y[3]=offset_y[4]=offset_y[5]=0;
  offset_y[6]=offset_y[7]=offset_y[8]=ystride;
  best_site=4;
  for(sitei=0;sitei<8;sitei++){
    int site;
    int xmask;
    int ymask;
    int dx;
    int dy;
    int mvoffset0;
    int mvoffset1;
    site=OC_SQUARE_SITES[0][sitei];
    dx=OC_SQUARE_DX[site];
    dy=OC_SQUARE_DY[site];
    /*The following code SHOULD be equivalent to
        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
      However, it should also be much faster, as it involves no multiplies and
       doesn't have to handle chroma vectors.
      The doubling is written as a multiply by 2 because _vec may be negative,
       and left shifting a negative value is undefined behavior in C.*/
    xmask=OC_SIGNMASK((_vec[0]*2+dx)^dx);
    ymask=OC_SIGNMASK((_vec[1]*2+dy)^dy);
    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
    err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis,
     mvoffset0,mvoffset1,src,ref,ystride,_best_err);
    if(err<_best_err){
      _best_err=err;
      best_site=site;
    }
  }
  _vec[0]=_vec[0]*2+OC_SQUARE_DX[best_site];
  _vec[1]=_vec[1]*2+OC_SQUARE_DY[best_site];
  return _best_err;
}
#endif
605 | | |
606 | | static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc, |
607 | 295k | int _mbi,int _vec[2],unsigned _best_err,int _frame){ |
608 | 295k | const unsigned char *src; |
609 | 295k | const unsigned char *ref; |
610 | 295k | const ptrdiff_t *frag_buf_offs; |
611 | 295k | const ptrdiff_t *fragis; |
612 | 295k | int offset_y[9]; |
613 | 295k | int ystride; |
614 | 295k | int mvoffset_base; |
615 | 295k | int best_site; |
616 | 295k | int sitei; |
617 | 295k | int err; |
618 | 295k | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
619 | 295k | ref=_enc->state.ref_frame_data[_frame]; |
620 | 295k | frag_buf_offs=_enc->state.frag_buf_offs; |
621 | 295k | fragis=_enc->state.mb_maps[_mbi][0]; |
622 | 295k | ystride=_enc->state.ref_ystride[0]; |
623 | 295k | mvoffset_base=_vec[0]+_vec[1]*ystride; |
624 | 295k | offset_y[0]=offset_y[1]=offset_y[2]=-ystride; |
625 | 295k | offset_y[3]=offset_y[5]=0; |
626 | 295k | offset_y[6]=offset_y[7]=offset_y[8]=ystride; |
627 | 295k | best_site=4; |
628 | 2.65M | for(sitei=0;sitei<8;sitei++){ |
629 | 2.36M | int site; |
630 | 2.36M | int xmask; |
631 | 2.36M | int ymask; |
632 | 2.36M | int dx; |
633 | 2.36M | int dy; |
634 | 2.36M | int mvoffset0; |
635 | 2.36M | int mvoffset1; |
636 | 2.36M | site=OC_SQUARE_SITES[0][sitei]; |
637 | 2.36M | dx=OC_SQUARE_DX[site]; |
638 | 2.36M | dy=OC_SQUARE_DY[site]; |
639 | | /*The following code SHOULD be equivalent to |
640 | | oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1, |
641 | | (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0); |
642 | | However, it should also be much faster, as it involves no multiplies and |
643 | | doesn't have to handle chroma vectors.*/ |
644 | 2.36M | xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx); |
645 | 2.36M | ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy); |
646 | 2.36M | mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask); |
647 | 2.36M | mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask); |
648 | 2.36M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
649 | 2.36M | err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis, |
650 | 2.36M | mvoffset0,mvoffset1,src,ref,ystride,_best_err); |
651 | 2.36M | } |
652 | 0 | else{ |
653 | 0 | err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis, |
654 | 0 | mvoffset0,mvoffset1,src,ref,ystride,_best_err); |
655 | 0 | } |
656 | 2.36M | if(err<_best_err){ |
657 | 566k | _best_err=err; |
658 | 566k | best_site=site; |
659 | 566k | } |
660 | 2.36M | } |
661 | 295k | _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site]; |
662 | 295k | _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site]; |
663 | 295k | return _best_err; |
664 | 295k | } |
665 | | |
666 | 295k | void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame){ |
667 | 295k | oc_mb_enc_info *embs; |
668 | 295k | int vec[2]; |
669 | 295k | embs=_enc->mb_info; |
670 | 295k | vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].analysis_mv[0][_frame])); |
671 | 295k | vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].analysis_mv[0][_frame])); |
672 | 295k | embs[_mbi].satd[_frame]=oc_mcenc_ysatd_halfpel_mbrefine(_enc, |
673 | 295k | _mbi,vec,embs[_mbi].satd[_frame],_frame); |
674 | 295k | embs[_mbi].analysis_mv[0][_frame]=OC_MV(vec[0],vec[1]); |
675 | 295k | } |
676 | | |
#if 0
/*Disabled SAD-based variant of oc_mcenc_ysatd_halfpel_brefine().
  Checks the eight half-pel sites around a vector for a single block.
  _vec:      On input, the starting vector; on output, twice that vector
              plus the offset of the best site.
  _src:      The source block.
  _ref:      The co-located reference block.
  _ystride:  The stride of both frames.
  _offset_y: The row offset for each site of the square pattern.
  _best_err: The error of the starting vector.
  Return: The smallest error found.*/
static int oc_mcenc_ysad_halfpel_brefine(const oc_enc_ctx *_enc,
 int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride,
 int _offset_y[9],unsigned _best_err){
  int mvoffset_base;
  int best_site;
  int sitei;
  mvoffset_base=_vec[0]+_vec[1]*_ystride;
  best_site=4;
  for(sitei=0;sitei<8;sitei++){
    unsigned err;
    int      site;
    int      xmask;
    int      ymask;
    int      dx;
    int      dy;
    int      mvoffset0;
    int      mvoffset1;
    site=OC_SQUARE_SITES[0][sitei];
    dx=OC_SQUARE_DX[site];
    dy=OC_SQUARE_DY[site];
    /*The following code SHOULD be equivalent to
        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
      However, it should also be much faster, as it involves no multiplies and
       doesn't have to handle chroma vectors.
      The doubling is written as a multiply by 2 because _vec may be negative,
       and left shifting a negative value is undefined behavior in C.*/
    xmask=OC_SIGNMASK((_vec[0]*2+dx)^dx);
    ymask=OC_SIGNMASK((_vec[1]*2+dy)^dy);
    mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
    mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
    err=oc_enc_frag_sad2_thresh(_enc,_src,
     _ref+mvoffset0,_ref+mvoffset1,_ystride,_best_err);
    if(err<_best_err){
      _best_err=err;
      best_site=site;
    }
  }
  _vec[0]=_vec[0]*2+OC_SQUARE_DX[best_site];
  _vec[1]=_vec[1]*2+OC_SQUARE_DY[best_site];
  return _best_err;
}
#endif
719 | | |
720 | | static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc, |
721 | | int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride, |
722 | 149k | int _offset_y[9],unsigned _best_err){ |
723 | 149k | int mvoffset_base; |
724 | 149k | int best_site; |
725 | 149k | int sitei; |
726 | 149k | mvoffset_base=_vec[0]+_vec[1]*_ystride; |
727 | 149k | best_site=4; |
728 | 1.34M | for(sitei=0;sitei<8;sitei++){ |
729 | 1.19M | unsigned err; |
730 | 1.19M | int dc; |
731 | 1.19M | int site; |
732 | 1.19M | int xmask; |
733 | 1.19M | int ymask; |
734 | 1.19M | int dx; |
735 | 1.19M | int dy; |
736 | 1.19M | int mvoffset0; |
737 | 1.19M | int mvoffset1; |
738 | 1.19M | site=OC_SQUARE_SITES[0][sitei]; |
739 | 1.19M | dx=OC_SQUARE_DX[site]; |
740 | 1.19M | dy=OC_SQUARE_DY[site]; |
741 | | /*The following code SHOULD be equivalent to |
742 | | oc_state_get_mv_offsets(&_enc->state,&mvoffsets,0, |
743 | | (_vec[0]<<1)+dx,(_vec[1]<<1)+dy); |
744 | | However, it should also be much faster, as it involves no multiplies and |
745 | | doesn't have to handle chroma vectors.*/ |
746 | 1.19M | xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx); |
747 | 1.19M | ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy); |
748 | 1.19M | mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask); |
749 | 1.19M | mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask); |
750 | 1.19M | err=oc_enc_frag_satd2(_enc,&dc,_src, |
751 | 1.19M | _ref+mvoffset0,_ref+mvoffset1,_ystride); |
752 | 1.19M | err+=abs(dc); |
753 | 1.19M | if(err<_best_err){ |
754 | 298k | _best_err=err; |
755 | 298k | best_site=site; |
756 | 298k | } |
757 | 1.19M | } |
758 | 149k | _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site]; |
759 | 149k | _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site]; |
760 | 149k | return _best_err; |
761 | 149k | } |
762 | | |
763 | 37.2k | void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){ |
764 | 37.2k | oc_mb_enc_info *embs; |
765 | 37.2k | const ptrdiff_t *frag_buf_offs; |
766 | 37.2k | const ptrdiff_t *fragis; |
767 | 37.2k | const unsigned char *src; |
768 | 37.2k | const unsigned char *ref; |
769 | 37.2k | int offset_y[9]; |
770 | 37.2k | int ystride; |
771 | 37.2k | int bi; |
772 | 37.2k | ystride=_enc->state.ref_ystride[0]; |
773 | 37.2k | frag_buf_offs=_enc->state.frag_buf_offs; |
774 | 37.2k | fragis=_enc->state.mb_maps[_mbi][0]; |
775 | 37.2k | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
776 | 37.2k | ref=_enc->state.ref_frame_data[OC_FRAME_PREV]; |
777 | 37.2k | offset_y[0]=offset_y[1]=offset_y[2]=-ystride; |
778 | 37.2k | offset_y[3]=offset_y[5]=0; |
779 | 37.2k | offset_y[6]=offset_y[7]=offset_y[8]=ystride; |
780 | 37.2k | embs=_enc->mb_info; |
781 | 186k | for(bi=0;bi<4;bi++){ |
782 | 149k | ptrdiff_t frag_offs; |
783 | 149k | int vec[2]; |
784 | 149k | frag_offs=frag_buf_offs[fragis[bi]]; |
785 | 149k | vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].block_mv[bi])); |
786 | 149k | vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].block_mv[bi])); |
787 | 149k | embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_halfpel_brefine(_enc,vec, |
788 | 149k | src+frag_offs,ref+frag_offs,ystride,offset_y,embs[_mbi].block_satd[bi]); |
789 | 149k | embs[_mbi].ref_mv[bi]=OC_MV(vec[0],vec[1]); |
790 | 149k | } |
791 | 37.2k | } |