Line | Count | Source |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | | * by the Xiph.Org Foundation https://www.xiph.org/ * |
10 | | * * |
11 | | ******************************************************************** |
12 | | |
13 | | function: |
14 | | |
15 | | ********************************************************************/ |
16 | | #include <stdlib.h> |
17 | | #include <limits.h> |
18 | | #include <string.h> |
19 | | #include "encint.h" |
20 | | |
21 | | |
22 | | |
/*Scratch state handed between the stages of the motion search for a single
   macro block.*/
typedef struct oc_mcenc_ctx{
  /*The candidate motion vectors, stored as (x,y) pairs.
    Slot 0 is reserved for the median predictor; the Set A vectors follow it,
     and the Set B vectors are appended starting at index setb0.*/
  int candidates[13][2];
  /*Index of the first Set B candidate (one past the last Set A entry).*/
  int setb0;
  /*Total number of candidates once Set B has been appended.*/
  int ncandidates;
}oc_mcenc_ctx;
36 | | |
37 | | |
38 | | |
/*Largest Y plane SAD at which the median predictor is accepted outright,
   without examining any further candidates.*/
#define OC_YSAD_THRESH1 (256)
/*Right shift applied to the reference error to obtain the proportional part
   of the margin added when forming the second Y plane SAD threshold.*/
#define OC_YSAD_THRESH2_SCALE_BITS (4)
/*Constant part of the margin added when forming the second Y plane SAD
   threshold.*/
#define OC_YSAD_THRESH2_OFFSET (64)
47 | | |
/*Square search pattern.
  The nine sites are numbered in raster order over a 3x3 grid, so site 4 is
   the center.*/

/*X offset of each site in the square pattern.*/
static const int OC_SQUARE_DX[9]={
  -1, 0, 1,
  -1, 0, 1,
  -1, 0, 1
};
/*Y offset of each site in the square pattern.*/
static const int OC_SQUARE_DY[9]={
  -1,-1,-1,
   0, 0, 0,
   1, 1, 1
};
/*Number of sites to visit for each boundary condition.
  The table index is a bit mask describing which edges of the search range the
   current vector sits on:
    1: dx is at its minimum (no sites with an X offset of -1 are visited).
    2: dx is at its maximum (no sites with an X offset of +1 are visited).
    4: dy is at its minimum (no sites with a Y offset of -1 are visited).
    8: dy is at its maximum (no sites with a Y offset of +1 are visited).
  Masks 3 and 7 would need dx at both limits at once, so they have no sites.*/
static const int OC_SQUARE_NSITES[11]={
  8,5,5,0,
  5,3,3,0,
  5,3,3
};
/*The sites to visit for each boundary condition, indexed by the same bit mask
   as OC_SQUARE_NSITES.
  Only the first OC_SQUARE_NSITES[b] entries of row b are meaningful; the rest
   is padding.*/
static const int OC_SQUARE_SITES[11][8]={
  /* 0: dx and dy both strictly inside the search range.*/
  {0,1,2,3,5,6,7,8},
  /* 1: dx at its minimum, dy inside.*/
  {1,2,5,7,8,0,0,0},
  /* 2: dx at its maximum, dy inside.*/
  {0,1,3,6,7,0,0,0},
  /* 3: dx at both limits (impossible), dy inside.*/
  {-1,0,0,0,0,0,0,0},
  /* 4: dx inside, dy at its minimum.*/
  {3,5,6,7,8,0,0,0},
  /* 5: dx at its minimum, dy at its minimum.*/
  {5,7,8,0,0,0,0,0},
  /* 6: dx at its maximum, dy at its minimum.*/
  {3,6,7,0,0,0,0,0},
  /* 7: dx at both limits (impossible), dy at its minimum.*/
  {-1,0,0,0,0,0,0,0},
  /* 8: dx inside, dy at its maximum.*/
  {0,1,2,3,5,0,0,0},
  /* 9: dx at its minimum, dy at its maximum.*/
  {1,2,5,0,0,0,0,0},
  /*10: dx at its maximum, dy at its maximum.*/
  {0,1,3,0,0,0,0,0}
};
88 | | |
89 | | |
90 | | static void oc_mcenc_find_candidates_a(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc, |
91 | 445k | oc_mv _accum,int _mbi,int _frame){ |
92 | 445k | oc_mb_enc_info *embs; |
93 | 445k | int accum_x; |
94 | 445k | int accum_y; |
95 | 445k | int a[3][2]; |
96 | 445k | int ncandidates; |
97 | 445k | unsigned nmbi; |
98 | 445k | int i; |
99 | 445k | embs=_enc->mb_info; |
100 | | /*Skip a position to store the median predictor in.*/ |
101 | 445k | ncandidates=1; |
102 | 445k | if(embs[_mbi].ncneighbors>0){ |
103 | | /*Fill in the first part of set A: the vectors from adjacent blocks.*/ |
104 | 1.11M | for(i=0;i<embs[_mbi].ncneighbors;i++){ |
105 | 745k | nmbi=embs[_mbi].cneighbors[i]; |
106 | 745k | _mcenc->candidates[ncandidates][0]= |
107 | 745k | OC_MV_X(embs[nmbi].analysis_mv[0][_frame]); |
108 | 745k | _mcenc->candidates[ncandidates][1]= |
109 | 745k | OC_MV_Y(embs[nmbi].analysis_mv[0][_frame]); |
110 | 745k | ncandidates++; |
111 | 745k | } |
112 | 369k | } |
113 | 445k | accum_x=OC_MV_X(_accum); |
114 | 445k | accum_y=OC_MV_Y(_accum); |
115 | | /*Add a few additional vectors to set A: the vectors used in the previous |
116 | | frames and the (0,0) vector.*/ |
117 | 445k | _mcenc->candidates[ncandidates][0]=accum_x; |
118 | 445k | _mcenc->candidates[ncandidates][1]=accum_y; |
119 | 445k | ncandidates++; |
120 | 445k | _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31, |
121 | 445k | OC_MV_X(embs[_mbi].analysis_mv[1][_frame])+accum_x,31); |
122 | 445k | _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31, |
123 | 445k | OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])+accum_y,31); |
124 | 445k | ncandidates++; |
125 | 445k | _mcenc->candidates[ncandidates][0]=0; |
126 | 445k | _mcenc->candidates[ncandidates][1]=0; |
127 | 445k | ncandidates++; |
128 | | /*Use the first three vectors of set A to find our best predictor: their |
129 | | median.*/ |
130 | 445k | memcpy(a,_mcenc->candidates+1,sizeof(a)); |
131 | 445k | OC_SORT2I(a[0][0],a[1][0]); |
132 | 445k | OC_SORT2I(a[0][1],a[1][1]); |
133 | 445k | OC_SORT2I(a[1][0],a[2][0]); |
134 | 445k | OC_SORT2I(a[1][1],a[2][1]); |
135 | 445k | OC_SORT2I(a[0][0],a[1][0]); |
136 | 445k | OC_SORT2I(a[0][1],a[1][1]); |
137 | 445k | _mcenc->candidates[0][0]=a[1][0]; |
138 | 445k | _mcenc->candidates[0][1]=a[1][1]; |
139 | 445k | _mcenc->setb0=ncandidates; |
140 | 445k | } |
141 | | |
142 | | static void oc_mcenc_find_candidates_b(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc, |
143 | 116k | oc_mv _accum,int _mbi,int _frame){ |
144 | 116k | oc_mb_enc_info *embs; |
145 | 116k | int accum_x; |
146 | 116k | int accum_y; |
147 | 116k | int ncandidates; |
148 | 116k | embs=_enc->mb_info; |
149 | 116k | accum_x=OC_MV_X(_accum); |
150 | 116k | accum_y=OC_MV_Y(_accum); |
151 | | /*Fill in set B: accelerated predictors for this and adjacent macro blocks.*/ |
152 | 116k | ncandidates=_mcenc->setb0; |
153 | | /*Use only the current block. Using more did not appear to be helpful |
154 | | with the current selection logic due to escaping the local search too |
155 | | quickly.*/ |
156 | 116k | _mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31, |
157 | 116k | 2*OC_MV_X(embs[_mbi].analysis_mv[1][_frame]) |
158 | 116k | -OC_MV_X(embs[_mbi].analysis_mv[2][_frame])+accum_x,31); |
159 | 116k | _mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31, |
160 | 116k | 2*OC_MV_Y(embs[_mbi].analysis_mv[1][_frame]) |
161 | 116k | -OC_MV_Y(embs[_mbi].analysis_mv[2][_frame])+accum_y,31); |
162 | 116k | ncandidates++; |
163 | 116k | _mcenc->ncandidates=ncandidates; |
164 | 116k | } |
165 | | |
166 | | static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc, |
167 | | const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4], |
168 | | int _mvoffset0,int _mvoffset1,const unsigned char *_src, |
169 | 0 | const unsigned char *_ref,int _ystride,unsigned _best_err){ |
170 | 0 | unsigned err; |
171 | 0 | int bi; |
172 | 0 | err=0; |
173 | 0 | for(bi=0;bi<4;bi++){ |
174 | 0 | ptrdiff_t frag_offs; |
175 | 0 | frag_offs=_frag_buf_offs[_fragis[bi]]; |
176 | 0 | err+=oc_enc_frag_sad2_thresh(_enc,_src+frag_offs,_ref+frag_offs+_mvoffset0, |
177 | 0 | _ref+frag_offs+_mvoffset1,_ystride,_best_err-err); |
178 | 0 | } |
179 | 0 | return err; |
180 | 0 | } |
181 | | |
182 | | static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc, |
183 | | const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4], |
184 | | int _mvoffset0,int _mvoffset1,const unsigned char *_src, |
185 | 2.20M | const unsigned char *_ref,int _ystride,unsigned _best_err){ |
186 | 2.20M | unsigned err; |
187 | 2.20M | int dc; |
188 | 2.20M | int bi; |
189 | 2.20M | err=0; |
190 | 11.0M | for(bi=0;bi<4;bi++){ |
191 | 8.83M | ptrdiff_t frag_offs; |
192 | 8.83M | frag_offs=_frag_buf_offs[_fragis[bi]]; |
193 | 8.83M | err+=oc_enc_frag_satd2(_enc,&dc,_src+frag_offs, |
194 | 8.83M | _ref+frag_offs+_mvoffset0,_ref+frag_offs+_mvoffset1,_ystride); |
195 | 8.83M | err+=abs(dc); |
196 | 8.83M | } |
197 | 2.20M | return err; |
198 | 2.20M | } |
199 | | |
200 | | static unsigned oc_mcenc_ysad_check_mbcandidate_fullpel(const oc_enc_ctx *_enc, |
201 | | const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy, |
202 | | const unsigned char *_src,const unsigned char *_ref,int _ystride, |
203 | 4.08M | unsigned _block_err[4]){ |
204 | 4.08M | unsigned err; |
205 | 4.08M | int mvoffset; |
206 | 4.08M | int bi; |
207 | 4.08M | mvoffset=_dx+_dy*_ystride; |
208 | 4.08M | err=0; |
209 | 20.4M | for(bi=0;bi<4;bi++){ |
210 | 16.3M | ptrdiff_t frag_offs; |
211 | 16.3M | unsigned block_err; |
212 | 16.3M | frag_offs=_frag_buf_offs[_fragis[bi]]; |
213 | 16.3M | block_err=oc_enc_frag_sad(_enc, |
214 | 16.3M | _src+frag_offs,_ref+frag_offs+mvoffset,_ystride); |
215 | 16.3M | _block_err[bi]=block_err; |
216 | 16.3M | err+=block_err; |
217 | 16.3M | } |
218 | 4.08M | return err; |
219 | 4.08M | } |
220 | | |
221 | | static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc, |
222 | | const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy, |
223 | 445k | const unsigned char *_src,const unsigned char *_ref,int _ystride){ |
224 | 445k | int mvoffset; |
225 | 445k | int err; |
226 | 445k | int bi; |
227 | 445k | mvoffset=_dx+_dy*_ystride; |
228 | 445k | err=0; |
229 | 2.22M | for(bi=0;bi<4;bi++){ |
230 | 1.78M | ptrdiff_t frag_offs; |
231 | 1.78M | int dc; |
232 | 1.78M | frag_offs=_frag_buf_offs[_fragis[bi]]; |
233 | 1.78M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
234 | 1.78M | err+=oc_enc_frag_satd(_enc,&dc, |
235 | 1.78M | _src+frag_offs,_ref+frag_offs+mvoffset,_ystride); |
236 | 1.78M | err+=abs(dc); |
237 | 1.78M | } |
238 | 0 | else{ |
239 | 0 | err+=oc_enc_frag_sad(_enc, |
240 | 0 | _src+frag_offs,_ref+frag_offs+mvoffset,_ystride); |
241 | 0 | } |
242 | 1.78M | } |
243 | 445k | return err; |
244 | 445k | } |
245 | | |
246 | | static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc, |
247 | | ptrdiff_t _frag_offs,int _dx,int _dy, |
248 | 890k | const unsigned char *_src,const unsigned char *_ref,int _ystride){ |
249 | 890k | unsigned err; |
250 | 890k | int dc; |
251 | 890k | err=oc_enc_frag_satd(_enc,&dc, |
252 | 890k | _src+_frag_offs,_ref+_frag_offs+_dx+_dy*_ystride,_ystride); |
253 | 890k | return err+abs(dc); |
254 | 890k | } |
255 | | |
256 | | /*Perform a motion vector search for this macro block against a single |
257 | | reference frame. |
258 | | As a bonus, individual block motion vectors are computed as well, as much of |
259 | | the work can be shared. |
260 | | The actual motion vector is stored in the appropriate place in the |
261 | | oc_mb_enc_info structure. |
262 | | _accum: Drop frame/golden MV accumulators. |
263 | | _mbi: The macro block index. |
264 | | _frame: The frame to use for SATD calculations and refinement, |
265 | | either OC_FRAME_PREV or OC_FRAME_GOLD. |
266 | | _frame_full: The frame to perform the 1px search on, one of OC_FRAME_PREV, |
267 | | OC_FRAME_GOLD, OC_FRAME_PREV_ORIG, or OC_FRAME_GOLD_ORIG.*/ |
268 | | void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame, |
269 | 445k | int _frame_full){ |
270 | | /*Note: Traditionally this search is done using a rate-distortion objective |
271 | | function of the form D+lambda*R. |
272 | | However, xiphmont tested this and found it produced a small degradation, |
273 | | while requiring extra computation. |
274 | | This is most likely due to Theora's peculiar MV encoding scheme: MVs are |
275 | | not coded relative to a predictor, and the only truly cheap way to use a |
276 | | MV is in the LAST or LAST2 MB modes, which are not being considered here. |
277 | | Therefore if we use the MV found here, it's only because both LAST and |
278 | | LAST2 performed poorly, and therefore the MB is not likely to be uniform |
279 | | or suffer from the aperture problem. |
280 | | Furthermore we would like to reuse the MV found here for as many MBs as |
281 | | possible, so picking a slightly sub-optimal vector to save a bit or two |
282 | | may cause increased degradation in many blocks to come. |
283 | | We could artificially reduce lambda to compensate, but it's faster to just |
284 | | disable it entirely, and use D (the distortion) as the sole criterion.*/ |
285 | 445k | oc_mcenc_ctx mcenc; |
286 | 445k | const ptrdiff_t *frag_buf_offs; |
287 | 445k | const ptrdiff_t *fragis; |
288 | 445k | const unsigned char *src; |
289 | 445k | const unsigned char *ref; |
290 | 445k | const unsigned char *satd_ref; |
291 | 445k | int ystride; |
292 | 445k | oc_mb_enc_info *embs; |
293 | 445k | ogg_int32_t hit_cache[31]; |
294 | 445k | ogg_int32_t hitbit; |
295 | 445k | unsigned best_block_err[4]; |
296 | 445k | unsigned block_err[4]; |
297 | 445k | unsigned best_err; |
298 | 445k | int best_vec[2]; |
299 | 445k | int best_block_vec[4][2]; |
300 | 445k | int candx; |
301 | 445k | int candy; |
302 | 445k | int bi; |
303 | 445k | embs=_enc->mb_info; |
304 | | /*Find some candidate motion vectors.*/ |
305 | 445k | oc_mcenc_find_candidates_a(_enc,&mcenc,_accum,_mbi,_frame); |
306 | | /*Clear the cache of locations we've examined.*/ |
307 | 445k | memset(hit_cache,0,sizeof(hit_cache)); |
308 | | /*Start with the median predictor.*/ |
309 | 445k | candx=OC_DIV2(mcenc.candidates[0][0]); |
310 | 445k | candy=OC_DIV2(mcenc.candidates[0][1]); |
311 | 445k | hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15; |
312 | 445k | frag_buf_offs=_enc->state.frag_buf_offs; |
313 | 445k | fragis=_enc->state.mb_maps[_mbi][0]; |
314 | 445k | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
315 | 445k | ref=_enc->state.ref_frame_data[_frame_full]; |
316 | 445k | satd_ref=_enc->state.ref_frame_data[_frame]; |
317 | 445k | ystride=_enc->state.ref_ystride[0]; |
318 | | /*TODO: customize error function for speed/(quality+size) tradeoff.*/ |
319 | 445k | best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
320 | 445k | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
321 | 445k | best_vec[0]=candx; |
322 | 445k | best_vec[1]=candy; |
323 | 445k | if(_frame==OC_FRAME_PREV){ |
324 | 1.11M | for(bi=0;bi<4;bi++){ |
325 | 890k | best_block_err[bi]=block_err[bi]; |
326 | 890k | best_block_vec[bi][0]=candx; |
327 | 890k | best_block_vec[bi][1]=candy; |
328 | 890k | } |
329 | 222k | } |
330 | | /*If this predictor fails, move on to set A.*/ |
331 | 445k | if(best_err>OC_YSAD_THRESH1){ |
332 | 389k | unsigned err; |
333 | 389k | unsigned t2; |
334 | 389k | int ncs; |
335 | 389k | int ci; |
336 | | /*Compute the early termination threshold for set A.*/ |
337 | 389k | t2=embs[_mbi].error[_frame]; |
338 | 389k | ncs=OC_MINI(3,embs[_mbi].ncneighbors); |
339 | 1.03M | for(ci=0;ci<ncs;ci++){ |
340 | 649k | t2=OC_MAXI(t2,embs[embs[_mbi].cneighbors[ci]].error[_frame]); |
341 | 649k | } |
342 | 389k | t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET; |
343 | | /*Examine the candidates in set A.*/ |
344 | 2.23M | for(ci=1;ci<mcenc.setb0;ci++){ |
345 | 1.84M | candx=OC_DIV2(mcenc.candidates[ci][0]); |
346 | 1.84M | candy=OC_DIV2(mcenc.candidates[ci][1]); |
347 | | /*If we've already examined this vector, then we would be using it if it |
348 | | was better than what we are using.*/ |
349 | 1.84M | hitbit=(ogg_int32_t)1<<candx+15; |
350 | 1.84M | if(hit_cache[candy+15]&hitbit)continue; |
351 | 691k | hit_cache[candy+15]|=hitbit; |
352 | 691k | err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
353 | 691k | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
354 | 691k | if(err<best_err){ |
355 | 267k | best_err=err; |
356 | 267k | best_vec[0]=candx; |
357 | 267k | best_vec[1]=candy; |
358 | 267k | } |
359 | 691k | if(_frame==OC_FRAME_PREV){ |
360 | 1.54M | for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ |
361 | 486k | best_block_err[bi]=block_err[bi]; |
362 | 486k | best_block_vec[bi][0]=candx; |
363 | 486k | best_block_vec[bi][1]=candy; |
364 | 486k | } |
365 | 308k | } |
366 | 691k | } |
367 | 389k | if(best_err>t2){ |
368 | 116k | oc_mcenc_find_candidates_b(_enc,&mcenc,_accum,_mbi,_frame); |
369 | | /*Examine the candidates in set B.*/ |
370 | 233k | for(;ci<mcenc.ncandidates;ci++){ |
371 | 116k | candx=OC_DIV2(mcenc.candidates[ci][0]); |
372 | 116k | candy=OC_DIV2(mcenc.candidates[ci][1]); |
373 | 116k | hitbit=(ogg_int32_t)1<<candx+15; |
374 | 116k | if(hit_cache[candy+15]&hitbit)continue; |
375 | 43.9k | hit_cache[candy+15]|=hitbit; |
376 | 43.9k | err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
377 | 43.9k | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
378 | 43.9k | if(err<best_err){ |
379 | 13.2k | best_err=err; |
380 | 13.2k | best_vec[0]=candx; |
381 | 13.2k | best_vec[1]=candy; |
382 | 13.2k | } |
383 | 43.9k | if(_frame==OC_FRAME_PREV){ |
384 | 117k | for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ |
385 | 28.2k | best_block_err[bi]=block_err[bi]; |
386 | 28.2k | best_block_vec[bi][0]=candx; |
387 | 28.2k | best_block_vec[bi][1]=candy; |
388 | 28.2k | } |
389 | 23.5k | } |
390 | 43.9k | } |
391 | | /*Use the same threshold for set B as in set A.*/ |
392 | 116k | if(best_err>t2){ |
393 | 113k | int best_site; |
394 | 113k | int nsites; |
395 | 113k | int sitei; |
396 | 113k | int site; |
397 | 113k | int b; |
398 | | /*Square pattern search.*/ |
399 | 542k | for(;;){ |
400 | 542k | best_site=4; |
401 | | /*Compose the bit flags for boundary conditions.*/ |
402 | 542k | b=OC_DIV16(-best_vec[0]+1)|OC_DIV16(best_vec[0]+1)<<1| |
403 | 542k | OC_DIV16(-best_vec[1]+1)<<2|OC_DIV16(best_vec[1]+1)<<3; |
404 | 542k | nsites=OC_SQUARE_NSITES[b]; |
405 | 4.59M | for(sitei=0;sitei<nsites;sitei++){ |
406 | 4.05M | site=OC_SQUARE_SITES[b][sitei]; |
407 | 4.05M | candx=best_vec[0]+OC_SQUARE_DX[site]; |
408 | 4.05M | candy=best_vec[1]+OC_SQUARE_DY[site]; |
409 | 4.05M | hitbit=(ogg_int32_t)1<<candx+15; |
410 | 4.05M | if(hit_cache[candy+15]&hitbit)continue; |
411 | 2.29M | hit_cache[candy+15]|=hitbit; |
412 | 2.29M | err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
413 | 2.29M | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
414 | 2.29M | if(err<best_err){ |
415 | 698k | best_err=err; |
416 | 698k | best_site=site; |
417 | 698k | } |
418 | 2.29M | if(_frame==OC_FRAME_PREV){ |
419 | 6.30M | for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){ |
420 | 865k | best_block_err[bi]=block_err[bi]; |
421 | 865k | best_block_vec[bi][0]=candx; |
422 | 865k | best_block_vec[bi][1]=candy; |
423 | 865k | } |
424 | 1.26M | } |
425 | 2.29M | } |
426 | 542k | if(best_site==4)break; |
427 | 429k | best_vec[0]+=OC_SQUARE_DX[best_site]; |
428 | 429k | best_vec[1]+=OC_SQUARE_DY[best_site]; |
429 | 429k | } |
430 | | /*Final 4-MV search.*/ |
431 | | /*Simply use 1/4 of the macro block set A and B threshold as the |
432 | | individual block threshold.*/ |
433 | 113k | if(_frame==OC_FRAME_PREV){ |
434 | 61.0k | t2>>=2; |
435 | 305k | for(bi=0;bi<4;bi++){ |
436 | 244k | if(best_block_err[bi]>t2){ |
437 | | /*Square pattern search. |
438 | | We do this in a slightly interesting manner. |
439 | | We continue to check the SAD of all four blocks in the |
440 | | macro block. |
441 | | This gives us two things: |
442 | | 1) We can continue to use the hit_cache to avoid duplicate |
443 | | checks. |
444 | | Otherwise we could continue to read it, but not write to it |
445 | | without saving and restoring it for each block. |
446 | | Note that we could still eliminate a large number of |
447 | | duplicate checks by taking into account the site we came |
448 | | from when choosing the site list. |
449 | | We can still do that to avoid extra hit_cache queries, and |
450 | | it might even be a speed win. |
451 | | 2) It gives us a slightly better chance of escaping local |
452 | | minima. |
453 | | We would not be here if we weren't doing a fairly bad job |
454 | | in finding a good vector, and checking these vectors can |
455 | | save us from 100 to several thousand points off our SAD 1 |
456 | | in 15 times. |
457 | | TODO: Is this a good idea? |
458 | | Who knows. |
459 | | It needs more testing.*/ |
460 | 216k | for(;;){ |
461 | 216k | int bestx; |
462 | 216k | int besty; |
463 | 216k | int bj; |
464 | 216k | bestx=best_block_vec[bi][0]; |
465 | 216k | besty=best_block_vec[bi][1]; |
466 | | /*Compose the bit flags for boundary conditions.*/ |
467 | 216k | b=OC_DIV16(-bestx+1)|OC_DIV16(bestx+1)<<1| |
468 | 216k | OC_DIV16(-besty+1)<<2|OC_DIV16(besty+1)<<3; |
469 | 216k | nsites=OC_SQUARE_NSITES[b]; |
470 | 1.83M | for(sitei=0;sitei<nsites;sitei++){ |
471 | 1.62M | site=OC_SQUARE_SITES[b][sitei]; |
472 | 1.62M | candx=bestx+OC_SQUARE_DX[site]; |
473 | 1.62M | candy=besty+OC_SQUARE_DY[site]; |
474 | 1.62M | hitbit=(ogg_int32_t)1<<candx+15; |
475 | 1.62M | if(hit_cache[candy+15]&hitbit)continue; |
476 | 607k | hit_cache[candy+15]|=hitbit; |
477 | 607k | err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc, |
478 | 607k | frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err); |
479 | 607k | if(err<best_err){ |
480 | 49.8k | best_err=err; |
481 | 49.8k | best_vec[0]=candx; |
482 | 49.8k | best_vec[1]=candy; |
483 | 49.8k | } |
484 | 3.03M | for(bj=0;bj<4;bj++)if(block_err[bj]<best_block_err[bj]){ |
485 | 254k | best_block_err[bj]=block_err[bj]; |
486 | 254k | best_block_vec[bj][0]=candx; |
487 | 254k | best_block_vec[bj][1]=candy; |
488 | 254k | } |
489 | 607k | } |
490 | 216k | if(best_block_vec[bi][0]==bestx&&best_block_vec[bi][1]==besty){ |
491 | 103k | break; |
492 | 103k | } |
493 | 216k | } |
494 | 103k | } |
495 | 244k | } |
496 | 61.0k | } |
497 | 113k | } |
498 | 116k | } |
499 | 389k | } |
500 | 445k | embs[_mbi].error[_frame]=(ogg_uint16_t)best_err; |
501 | 445k | candx=best_vec[0]; |
502 | 445k | candy=best_vec[1]; |
503 | 445k | embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc, |
504 | 445k | frag_buf_offs,fragis,candx,candy,src,satd_ref,ystride); |
505 | 445k | embs[_mbi].analysis_mv[0][_frame]=OC_MV(candx<<1,candy<<1); |
506 | 445k | if(_frame==OC_FRAME_PREV&&_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){ |
507 | 1.11M | for(bi=0;bi<4;bi++){ |
508 | 890k | candx=best_block_vec[bi][0]; |
509 | 890k | candy=best_block_vec[bi][1]; |
510 | 890k | embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(_enc, |
511 | 890k | frag_buf_offs[fragis[bi]],candx,candy,src,satd_ref,ystride); |
512 | 890k | embs[_mbi].block_mv[bi]=OC_MV(candx<<1,candy<<1); |
513 | 890k | } |
514 | 222k | } |
515 | 445k | } |
516 | | |
517 | 222k | void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi){ |
518 | 222k | oc_mv2 *mvs; |
519 | 222k | oc_mv accum_p; |
520 | 222k | oc_mv accum_g; |
521 | 222k | oc_mv mv2_p; |
522 | 222k | mvs=_enc->mb_info[_mbi].analysis_mv; |
523 | 222k | if(_enc->prevframe_dropped)accum_p=mvs[0][OC_FRAME_PREV]; |
524 | 200k | else accum_p=0; |
525 | 222k | accum_g=mvs[2][OC_FRAME_GOLD]; |
526 | | /*Move the motion vector predictors back a frame.*/ |
527 | 222k | mv2_p=mvs[2][OC_FRAME_PREV]; |
528 | 222k | mvs[2][OC_FRAME_GOLD]=mvs[1][OC_FRAME_GOLD]; |
529 | 222k | mvs[2][OC_FRAME_PREV]=mvs[1][OC_FRAME_PREV]; |
530 | 222k | mvs[1][OC_FRAME_GOLD]=mvs[0][OC_FRAME_GOLD]; |
531 | 222k | mvs[1][OC_FRAME_PREV]=OC_MV_SUB(mvs[0][OC_FRAME_PREV],mv2_p); |
532 | | /*Search the last frame.*/ |
533 | 222k | oc_mcenc_search_frame(_enc,accum_p,_mbi,OC_FRAME_PREV,OC_FRAME_PREV_ORIG); |
534 | 222k | mvs[2][OC_FRAME_PREV]=accum_p; |
535 | | /*GOLDEN MVs are different from PREV MVs in that they're each absolute |
536 | | offsets from some frame in the past rather than relative offsets from the |
537 | | frame before. |
538 | | For predictor calculation to make sense, we need them to be in the same |
539 | | form as PREV MVs.*/ |
540 | 222k | mvs[1][OC_FRAME_GOLD]=OC_MV_SUB(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]); |
541 | 222k | mvs[2][OC_FRAME_GOLD]=OC_MV_SUB(mvs[2][OC_FRAME_GOLD],accum_g); |
542 | | /*Search the golden frame.*/ |
543 | 222k | oc_mcenc_search_frame(_enc,accum_g,_mbi,OC_FRAME_GOLD,OC_FRAME_GOLD_ORIG); |
544 | | /*Put GOLDEN MVs back into absolute offset form. |
545 | | The newest MV is already an absolute offset.*/ |
546 | 222k | mvs[2][OC_FRAME_GOLD]=OC_MV_ADD(mvs[2][OC_FRAME_GOLD],accum_g); |
547 | 222k | mvs[1][OC_FRAME_GOLD]=OC_MV_ADD(mvs[1][OC_FRAME_GOLD],mvs[2][OC_FRAME_GOLD]); |
548 | 222k | } |
549 | | |
#if 0
/*Disabled SAD-based variant of oc_mcenc_ysatd_halfpel_mbrefine().
  Tries the eight half-pel positions around a full-pel macro block vector and
   keeps the best one.
  _enc:      The encoder context.
  _mbi:      The macro block index.
  _vec:      In: the full-pel vector.
             Out: the refined vector in half-pel units.
  _best_err: The error of the full-pel vector.
  _frame:    The reference frame to refine against.
  Return: The error of the refined vector.*/
static int oc_mcenc_ysad_halfpel_mbrefine(const oc_enc_ctx *_enc,int _mbi,
 int _vec[2],int _best_err,int _frame){
  const unsigned char *src;
  const unsigned char *ref;
  const ptrdiff_t     *frag_buf_offs;
  const ptrdiff_t     *fragis;
  int                  offset_y[9];
  int                  ystride;
  int                  mvoffset_base;
  int                  best_site;
  int                  sitei;
  int                  err;
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  /*This used to index with an undeclared _framei, which would not compile if
     the block were ever re-enabled.*/
  ref=_enc->state.ref_frame_data[_frame];
  frag_buf_offs=_enc->state.frag_buf_offs;
  fragis=_enc->state.mb_maps[_mbi][0];
  ystride=_enc->state.ref_ystride[0];
  mvoffset_base=_vec[0]+_vec[1]*ystride;
  /*Entry 4 (the center) is never read: the site list used below omits it.*/
  offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
  offset_y[3]=offset_y[5]=0;
  offset_y[6]=offset_y[7]=offset_y[8]=ystride;
  best_site=4;
  for(sitei=0;sitei<8;sitei++){
    int site;
    int xmask;
    int ymask;
    int dx;
    int dy;
    int mvoffset0;
    int mvoffset1;
    site=OC_SQUARE_SITES[0][sitei];
    dx=OC_SQUARE_DX[site];
    dy=OC_SQUARE_DY[site];
    /*The following code SHOULD be equivalent to
        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
      However, it should also be much faster, as it involves no multiplies and
       doesn't have to handle chroma vectors.
      The doubling is written as a multiply because _vec may be negative, and
       left shifting a negative value is undefined behavior.*/
    xmask=OC_SIGNMASK((_vec[0]*2+dx)^dx);
    ymask=OC_SIGNMASK((_vec[1]*2+dy)^dy);
    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
    err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis,
     mvoffset0,mvoffset1,src,ref,ystride,_best_err);
    if(err<_best_err){
      _best_err=err;
      best_site=site;
    }
  }
  _vec[0]=_vec[0]*2+OC_SQUARE_DX[best_site];
  _vec[1]=_vec[1]*2+OC_SQUARE_DY[best_site];
  return _best_err;
}
#endif
605 | | |
606 | | static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc, |
607 | 275k | int _mbi,int _vec[2],unsigned _best_err,int _frame){ |
608 | 275k | const unsigned char *src; |
609 | 275k | const unsigned char *ref; |
610 | 275k | const ptrdiff_t *frag_buf_offs; |
611 | 275k | const ptrdiff_t *fragis; |
612 | 275k | int offset_y[9]; |
613 | 275k | int ystride; |
614 | 275k | int mvoffset_base; |
615 | 275k | int best_site; |
616 | 275k | int sitei; |
617 | 275k | int err; |
618 | 275k | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
619 | 275k | ref=_enc->state.ref_frame_data[_frame]; |
620 | 275k | frag_buf_offs=_enc->state.frag_buf_offs; |
621 | 275k | fragis=_enc->state.mb_maps[_mbi][0]; |
622 | 275k | ystride=_enc->state.ref_ystride[0]; |
623 | 275k | mvoffset_base=_vec[0]+_vec[1]*ystride; |
624 | 275k | offset_y[0]=offset_y[1]=offset_y[2]=-ystride; |
625 | 275k | offset_y[3]=offset_y[5]=0; |
626 | 275k | offset_y[6]=offset_y[7]=offset_y[8]=ystride; |
627 | 275k | best_site=4; |
628 | 2.48M | for(sitei=0;sitei<8;sitei++){ |
629 | 2.20M | int site; |
630 | 2.20M | int xmask; |
631 | 2.20M | int ymask; |
632 | 2.20M | int dx; |
633 | 2.20M | int dy; |
634 | 2.20M | int mvoffset0; |
635 | 2.20M | int mvoffset1; |
636 | 2.20M | site=OC_SQUARE_SITES[0][sitei]; |
637 | 2.20M | dx=OC_SQUARE_DX[site]; |
638 | 2.20M | dy=OC_SQUARE_DY[site]; |
639 | | /*The following code SHOULD be equivalent to |
640 | | oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1, |
641 | | (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0); |
642 | | However, it should also be much faster, as it involves no multiplies and |
643 | | doesn't have to handle chroma vectors.*/ |
644 | 2.20M | xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx); |
645 | 2.20M | ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy); |
646 | 2.20M | mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask); |
647 | 2.20M | mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask); |
648 | 2.20M | if(_enc->sp_level<OC_SP_LEVEL_NOSATD){ |
649 | 2.20M | err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis, |
650 | 2.20M | mvoffset0,mvoffset1,src,ref,ystride,_best_err); |
651 | 2.20M | } |
652 | 0 | else{ |
653 | 0 | err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis, |
654 | 0 | mvoffset0,mvoffset1,src,ref,ystride,_best_err); |
655 | 0 | } |
656 | 2.20M | if(err<_best_err){ |
657 | 518k | _best_err=err; |
658 | 518k | best_site=site; |
659 | 518k | } |
660 | 2.20M | } |
661 | 275k | _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site]; |
662 | 275k | _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site]; |
663 | 275k | return _best_err; |
664 | 275k | } |
665 | | |
666 | 275k | void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame){ |
667 | 275k | oc_mb_enc_info *embs; |
668 | 275k | int vec[2]; |
669 | 275k | embs=_enc->mb_info; |
670 | 275k | vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].analysis_mv[0][_frame])); |
671 | 275k | vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].analysis_mv[0][_frame])); |
672 | 275k | embs[_mbi].satd[_frame]=oc_mcenc_ysatd_halfpel_mbrefine(_enc, |
673 | 275k | _mbi,vec,embs[_mbi].satd[_frame],_frame); |
674 | 275k | embs[_mbi].analysis_mv[0][_frame]=OC_MV(vec[0],vec[1]); |
675 | 275k | } |
676 | | |
#if 0
/*Disabled SAD-based variant of oc_mcenc_ysatd_halfpel_brefine().
  Tries the eight half-pel positions around a full-pel block vector and keeps
   the best one.
  _enc:      The encoder context.
  _vec:      In: the full-pel vector.
             Out: the refined vector in half-pel units.
  _src:      The source block.
  _ref:      The reference block at zero displacement.
  _ystride:  The stride used to turn the Y component into a buffer offset.
  _offset_y: Per-site row offsets (-stride, 0, or +stride).
  _best_err: The error of the full-pel vector.
  Return: The error of the refined vector.*/
static int oc_mcenc_ysad_halfpel_brefine(const oc_enc_ctx *_enc,
 int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride,
 int _offset_y[9],unsigned _best_err){
  int mvoffset_base;
  int best_site;
  int sitei;
  mvoffset_base=_vec[0]+_vec[1]*_ystride;
  best_site=4;
  for(sitei=0;sitei<8;sitei++){
    unsigned err;
    int      site;
    int      xmask;
    int      ymask;
    int      dx;
    int      dy;
    int      mvoffset0;
    int      mvoffset1;
    site=OC_SQUARE_SITES[0][sitei];
    dx=OC_SQUARE_DX[site];
    dy=OC_SQUARE_DY[site];
    /*The following code SHOULD be equivalent to
        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
      However, it should also be much faster, as it involves no multiplies and
       doesn't have to handle chroma vectors.
      The doubling is written as a multiply because _vec may be negative, and
       left shifting a negative value is undefined behavior.*/
    xmask=OC_SIGNMASK((_vec[0]*2+dx)^dx);
    ymask=OC_SIGNMASK((_vec[1]*2+dy)^dy);
    mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
    mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
    /*This used to pass an undeclared ystride; the parameter is _ystride.*/
    err=oc_enc_frag_sad2_thresh(_enc,_src,
     _ref+mvoffset0,_ref+mvoffset1,_ystride,_best_err);
    if(err<_best_err){
      _best_err=err;
      best_site=site;
    }
  }
  _vec[0]=_vec[0]*2+OC_SQUARE_DX[best_site];
  _vec[1]=_vec[1]*2+OC_SQUARE_DY[best_site];
  return _best_err;
}
#endif
719 | | |
720 | | static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc, |
721 | | int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride, |
722 | 139k | int _offset_y[9],unsigned _best_err){ |
723 | 139k | int mvoffset_base; |
724 | 139k | int best_site; |
725 | 139k | int sitei; |
726 | 139k | mvoffset_base=_vec[0]+_vec[1]*_ystride; |
727 | 139k | best_site=4; |
728 | 1.25M | for(sitei=0;sitei<8;sitei++){ |
729 | 1.11M | unsigned err; |
730 | 1.11M | int dc; |
731 | 1.11M | int site; |
732 | 1.11M | int xmask; |
733 | 1.11M | int ymask; |
734 | 1.11M | int dx; |
735 | 1.11M | int dy; |
736 | 1.11M | int mvoffset0; |
737 | 1.11M | int mvoffset1; |
738 | 1.11M | site=OC_SQUARE_SITES[0][sitei]; |
739 | 1.11M | dx=OC_SQUARE_DX[site]; |
740 | 1.11M | dy=OC_SQUARE_DY[site]; |
741 | | /*The following code SHOULD be equivalent to |
742 | | oc_state_get_mv_offsets(&_enc->state,&mvoffsets,0, |
743 | | (_vec[0]<<1)+dx,(_vec[1]<<1)+dy); |
744 | | However, it should also be much faster, as it involves no multiplies and |
745 | | doesn't have to handle chroma vectors.*/ |
746 | 1.11M | xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx); |
747 | 1.11M | ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy); |
748 | 1.11M | mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask); |
749 | 1.11M | mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask); |
750 | 1.11M | err=oc_enc_frag_satd2(_enc,&dc,_src, |
751 | 1.11M | _ref+mvoffset0,_ref+mvoffset1,_ystride); |
752 | 1.11M | err+=abs(dc); |
753 | 1.11M | if(err<_best_err){ |
754 | 270k | _best_err=err; |
755 | 270k | best_site=site; |
756 | 270k | } |
757 | 1.11M | } |
758 | 139k | _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site]; |
759 | 139k | _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site]; |
760 | 139k | return _best_err; |
761 | 139k | } |
762 | | |
763 | 34.7k | void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){ |
764 | 34.7k | oc_mb_enc_info *embs; |
765 | 34.7k | const ptrdiff_t *frag_buf_offs; |
766 | 34.7k | const ptrdiff_t *fragis; |
767 | 34.7k | const unsigned char *src; |
768 | 34.7k | const unsigned char *ref; |
769 | 34.7k | int offset_y[9]; |
770 | 34.7k | int ystride; |
771 | 34.7k | int bi; |
772 | 34.7k | ystride=_enc->state.ref_ystride[0]; |
773 | 34.7k | frag_buf_offs=_enc->state.frag_buf_offs; |
774 | 34.7k | fragis=_enc->state.mb_maps[_mbi][0]; |
775 | 34.7k | src=_enc->state.ref_frame_data[OC_FRAME_IO]; |
776 | 34.7k | ref=_enc->state.ref_frame_data[OC_FRAME_PREV]; |
777 | 34.7k | offset_y[0]=offset_y[1]=offset_y[2]=-ystride; |
778 | 34.7k | offset_y[3]=offset_y[5]=0; |
779 | 34.7k | offset_y[6]=offset_y[7]=offset_y[8]=ystride; |
780 | 34.7k | embs=_enc->mb_info; |
781 | 173k | for(bi=0;bi<4;bi++){ |
782 | 139k | ptrdiff_t frag_offs; |
783 | 139k | int vec[2]; |
784 | 139k | frag_offs=frag_buf_offs[fragis[bi]]; |
785 | 139k | vec[0]=OC_DIV2(OC_MV_X(embs[_mbi].block_mv[bi])); |
786 | 139k | vec[1]=OC_DIV2(OC_MV_Y(embs[_mbi].block_mv[bi])); |
787 | 139k | embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_halfpel_brefine(_enc,vec, |
788 | 139k | src+frag_offs,ref+frag_offs,ystride,offset_y,embs[_mbi].block_satd[bi]); |
789 | 139k | embs[_mbi].ref_mv[bi]=OC_MV(vec[0],vec[1]); |
790 | 139k | } |
791 | 34.7k | } |