/src/libtheora/lib/x86/mmxfrag.c
Line | Count | Source (jump to first uncovered line) |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
10 | | * * |
11 | | ******************************************************************** |
12 | | |
13 | | function: |
14 | | last mod: $Id$ |
15 | | |
16 | | ********************************************************************/ |
17 | | |
18 | | /*MMX acceleration of fragment reconstruction for motion compensation. |
19 | | Originally written by Rudolf Marek. |
20 | | Additional optimization by Nils Pipenbrinck. |
21 | | Note: Loops are unrolled for best performance. |
22 | | The iteration each instruction belongs to is marked in the comments as #i.*/ |
23 | | #include <stddef.h> |
24 | | #include "x86int.h" |
25 | | |
26 | | #if defined(OC_X86_ASM) |
27 | | |
28 | | /*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes |
29 | | between rows.*/ |
/*Expands to a single do{}while(0) statement.
  Each macro argument is evaluated exactly once: _src and _dst are copied into
   locals, and _ystride is cast to ptrdiff_t for use as an index register.
  The block is moved as two groups of four rows, each staged through mm0-mm3;
   src and dst are advanced by 4*_ystride between the two groups.
  The asm declares only a "memory" clobber and issues no emms, so the MMX
   registers are left modified when the statement completes.*/
30 | | # define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \ |
31 | 1.87M | do{ \ |
32 | 1.87M | const unsigned char *src; \ |
33 | 1.87M | unsigned char *dst; \ |
34 | 1.87M | ptrdiff_t ystride3; \ |
35 | 1.87M | src=(_src); \ |
36 | 1.87M | dst=(_dst); \ |
37 | 1.87M | __asm__ __volatile__( \ |
38 | 1.87M | /*Row 0: src+0*ystride*/ \ |
39 | 1.87M | "movq (%[src]),%%mm0\n\t" \ |
40 | 1.87M | /*Row 1: src+1*ystride*/ \ |
41 | 1.87M | "movq (%[src],%[ystride]),%%mm1\n\t" \ |
42 | 1.87M | /*ystride3=ystride*3*/ \ |
43 | 1.87M | "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ |
44 | 1.87M | /*Row 2: src+2*ystride*/ \ |
45 | 1.87M | "movq (%[src],%[ystride],2),%%mm2\n\t" \ |
46 | 1.87M | /*Row 3: src+3*ystride*/ \ |
47 | 1.87M | "movq (%[src],%[ystride3]),%%mm3\n\t" \ |
48 | 1.87M | /*Row 0: dst+0*ystride*/ \ |
49 | 1.87M | "movq %%mm0,(%[dst])\n\t" \ |
50 | 1.87M | /*Row 1: dst+1*ystride*/ \ |
51 | 1.87M | "movq %%mm1,(%[dst],%[ystride])\n\t" \ |
52 | 1.87M | /*Advance src to the next 4 rows.*/ \ |
53 | 1.87M | "lea (%[src],%[ystride],4),%[src]\n\t" \ |
54 | 1.87M | /*Row 2: dst+2*ystride*/ \ |
55 | 1.87M | "movq %%mm2,(%[dst],%[ystride],2)\n\t" \ |
56 | 1.87M | /*Row 3: dst+3*ystride*/ \ |
57 | 1.87M | "movq %%mm3,(%[dst],%[ystride3])\n\t" \ |
58 | 1.87M | /*Advance dst to the next 4 rows.*/ \ |
59 | 1.87M | "lea (%[dst],%[ystride],4),%[dst]\n\t" \ |
60 | 1.87M | /*Row 4: src+0*ystride (src was advanced above)*/ \ |
61 | 1.87M | "movq (%[src]),%%mm0\n\t" \ |
62 | 1.87M | /*Row 5: src+1*ystride*/ \ |
63 | 1.87M | "movq (%[src],%[ystride]),%%mm1\n\t" \ |
64 | 1.87M | /*Row 6: src+2*ystride*/ \ |
65 | 1.87M | "movq (%[src],%[ystride],2),%%mm2\n\t" \ |
66 | 1.87M | /*Row 7: src+3*ystride*/ \ |
67 | 1.87M | "movq (%[src],%[ystride3]),%%mm3\n\t" \ |
68 | 1.87M | /*Row 4: dst+0*ystride (dst was advanced above)*/ \ |
69 | 1.87M | "movq %%mm0,(%[dst])\n\t" \ |
70 | 1.87M | /*Row 5: dst+1*ystride*/ \ |
71 | 1.87M | "movq %%mm1,(%[dst],%[ystride])\n\t" \ |
72 | 1.87M | /*Row 6: dst+2*ystride*/ \ |
73 | 1.87M | "movq %%mm2,(%[dst],%[ystride],2)\n\t" \ |
74 | 1.87M | /*Row 7: dst+3*ystride*/ \ |
75 | 1.87M | "movq %%mm3,(%[dst],%[ystride3])\n\t" \ |
76 | 1.87M | :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \ |
77 | 1.87M | :[ystride]"r"((ptrdiff_t)(_ystride)) \ |
78 | 1.87M | :"memory" \ |
79 | 1.87M | ); \ |
80 | 1.87M | } \ |
81 | 1.87M | while(0) |
82 | | |
83 | | /*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes |
84 | | between rows.*/ |
/*This is a function wrapper around OC_FRAG_COPY_MMX.
  _dst:     Where to store the 8x8 block.
  _src:     The 8x8 block to copy.
  _ystride: The number of bytes between rows, used for both _dst and _src.*/
85 | | void oc_frag_copy_mmx(unsigned char *_dst, |
86 | 0 | const unsigned char *_src,int _ystride){ |
87 | 0 | OC_FRAG_COPY_MMX(_dst,_src,_ystride); |
88 | 0 | } |
89 | | |
90 | | /*Copies the fragments specified by the lists of fragment indices from one |
91 | | frame to another. |
92 | | _dst_frame: The reference frame to copy to. |
93 | | _src_frame: The reference frame to copy from. |
94 | | _ystride: The row stride of the reference frames. |
95 | | _fragis: A pointer to a list of fragment indices. |
96 | | _nfragis: The number of fragment indices to copy. |
97 | | _frag_buf_offs: The offsets of fragments in the reference frames.*/ |
/*Each listed fragment is copied as one 8x8 block with OC_FRAG_COPY_MMX.
  The offset looked up in _frag_buf_offs is applied to both _dst_frame and
   _src_frame, so a fragment occupies the same position in each frame.
  Nothing is copied when _nfragis is zero or negative.*/
98 | | void oc_frag_copy_list_mmx(unsigned char *_dst_frame, |
99 | | const unsigned char *_src_frame,int _ystride, |
100 | 12.0k | const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){ |
101 | 12.0k | ptrdiff_t fragii; |
102 | 1.88M | for(fragii=0;fragii<_nfragis;fragii++){ |
103 | 1.87M | ptrdiff_t frag_buf_off; |
104 | 1.87M | frag_buf_off=_frag_buf_offs[_fragis[fragii]]; |
105 | 1.87M | OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off, |
106 | 1.87M | _src_frame+frag_buf_off,_ystride); |
107 | 1.87M | } |
108 | 12.0k | } |
109 | | |
110 | | |
/*Reconstructs an 8x8 block of pixels directly from its residue; no predictor
   block is read.
  Each of the 64 16-bit residue values has 128 added to it with signed
   saturation and is then packed to a byte with unsigned saturation, which
   clamps the result to 0...255.
  _dst:     Where to store the 8x8 block of reconstructed pixels.
  _ystride: The number of bytes between rows of _dst.
  _residue: The 64 residue values in row-major order (16 bytes per row).
  mm0-mm6 are modified and no emms is issued here.*/
111 | | void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride, |
112 | 7.47M | const ogg_int16_t *_residue){ |
113 | 7.47M | __asm__ __volatile__( |
114 | | /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/ |
115 | 7.47M | "pcmpeqw %%mm0,%%mm0\n\t" |
116 | | /*#0 Load low residue.*/ |
117 | 7.47M | "movq 0*8(%[residue]),%%mm1\n\t" |
118 | | /*#0 Load high residue.*/ |
119 | 7.47M | "movq 1*8(%[residue]),%%mm2\n\t" |
120 | | /*Set mm0 to 0x8000800080008000.*/ |
121 | 7.47M | "psllw $15,%%mm0\n\t" |
122 | | /*#1 Load low residue.*/ |
123 | 7.47M | "movq 2*8(%[residue]),%%mm3\n\t" |
124 | | /*#1 Load high residue.*/ |
125 | 7.47M | "movq 3*8(%[residue]),%%mm4\n\t" |
126 | | /*Set mm0 to 0x0080008000800080 (a bias of 128 in each 16-bit word).*/ |
127 | 7.47M | "psrlw $8,%%mm0\n\t" |
128 | | /*#2 Load low residue.*/ |
129 | 7.47M | "movq 4*8(%[residue]),%%mm5\n\t" |
130 | | /*#2 Load high residue.*/ |
131 | 7.47M | "movq 5*8(%[residue]),%%mm6\n\t" |
132 | | /*#0 Bias low residue.*/ |
133 | 7.47M | "paddsw %%mm0,%%mm1\n\t" |
134 | | /*#0 Bias high residue.*/ |
135 | 7.47M | "paddsw %%mm0,%%mm2\n\t" |
136 | | /*#0 Pack to byte.*/ |
137 | 7.47M | "packuswb %%mm2,%%mm1\n\t" |
138 | | /*#1 Bias low residue.*/ |
139 | 7.47M | "paddsw %%mm0,%%mm3\n\t" |
140 | | /*#1 Bias high residue.*/ |
141 | 7.47M | "paddsw %%mm0,%%mm4\n\t" |
142 | | /*#1 Pack to byte.*/ |
143 | 7.47M | "packuswb %%mm4,%%mm3\n\t" |
144 | | /*#2 Bias low residue.*/ |
145 | 7.47M | "paddsw %%mm0,%%mm5\n\t" |
146 | | /*#2 Bias high residue.*/ |
147 | 7.47M | "paddsw %%mm0,%%mm6\n\t" |
148 | | /*#2 Pack to byte.*/ |
149 | 7.47M | "packuswb %%mm6,%%mm5\n\t" |
150 | | /*#0 Write row.*/ |
151 | 7.47M | "movq %%mm1,(%[dst])\n\t" |
152 | | /*#1 Write row.*/ |
153 | 7.47M | "movq %%mm3,(%[dst],%[ystride])\n\t" |
154 | | /*#2 Write row.*/ |
155 | 7.47M | "movq %%mm5,(%[dst],%[ystride],2)\n\t" |
156 | | /*#3 Load low residue.*/ |
157 | 7.47M | "movq 6*8(%[residue]),%%mm1\n\t" |
158 | | /*#3 Load high residue.*/ |
159 | 7.47M | "movq 7*8(%[residue]),%%mm2\n\t" |
160 | | /*#4 Load low residue.*/ |
161 | 7.47M | "movq 8*8(%[residue]),%%mm3\n\t" |
162 | | /*#4 Load high residue.*/ |
163 | 7.47M | "movq 9*8(%[residue]),%%mm4\n\t" |
164 | | /*#5 Load low residue.*/ |
165 | 7.47M | "movq 10*8(%[residue]),%%mm5\n\t" |
166 | | /*#5 Load high residue.*/ |
167 | 7.47M | "movq 11*8(%[residue]),%%mm6\n\t" |
168 | | /*#3 Bias low residue.*/ |
169 | 7.47M | "paddsw %%mm0,%%mm1\n\t" |
170 | | /*#3 Bias high residue.*/ |
171 | 7.47M | "paddsw %%mm0,%%mm2\n\t" |
172 | | /*#3 Pack to byte.*/ |
173 | 7.47M | "packuswb %%mm2,%%mm1\n\t" |
174 | | /*#4 Bias low residue.*/ |
175 | 7.47M | "paddsw %%mm0,%%mm3\n\t" |
176 | | /*#4 Bias high residue.*/ |
177 | 7.47M | "paddsw %%mm0,%%mm4\n\t" |
178 | | /*#4 Pack to byte.*/ |
179 | 7.47M | "packuswb %%mm4,%%mm3\n\t" |
180 | | /*#5 Bias low residue.*/ |
181 | 7.47M | "paddsw %%mm0,%%mm5\n\t" |
182 | | /*#5 Bias high residue.*/ |
183 | 7.47M | "paddsw %%mm0,%%mm6\n\t" |
184 | | /*#5 Pack to byte.*/ |
185 | 7.47M | "packuswb %%mm6,%%mm5\n\t" |
186 | | /*#3 Write row.*/ |
187 | 7.47M | "movq %%mm1,(%[dst],%[ystride3])\n\t" |
188 | | /*#4 Write row.*/ |
189 | 7.47M | "movq %%mm3,(%[dst4])\n\t" |
190 | | /*#5 Write row.*/ |
191 | 7.47M | "movq %%mm5,(%[dst4],%[ystride])\n\t" |
192 | | /*#6 Load low residue.*/ |
193 | 7.47M | "movq 12*8(%[residue]),%%mm1\n\t" |
194 | | /*#6 Load high residue.*/ |
195 | 7.47M | "movq 13*8(%[residue]),%%mm2\n\t" |
196 | | /*#7 Load low residue.*/ |
197 | 7.47M | "movq 14*8(%[residue]),%%mm3\n\t" |
198 | | /*#7 Load high residue.*/ |
199 | 7.47M | "movq 15*8(%[residue]),%%mm4\n\t" |
200 | | /*#6 Bias low residue.*/ |
201 | 7.47M | "paddsw %%mm0,%%mm1\n\t" |
202 | | /*#6 Bias high residue.*/ |
203 | 7.47M | "paddsw %%mm0,%%mm2\n\t" |
204 | | /*#6 Pack to byte.*/ |
205 | 7.47M | "packuswb %%mm2,%%mm1\n\t" |
206 | | /*#7 Bias low residue.*/ |
207 | 7.47M | "paddsw %%mm0,%%mm3\n\t" |
208 | | /*#7 Bias high residue.*/ |
209 | 7.47M | "paddsw %%mm0,%%mm4\n\t" |
210 | | /*#7 Pack to byte.*/ |
211 | 7.47M | "packuswb %%mm4,%%mm3\n\t" |
212 | | /*#6 Write row.*/ |
213 | 7.47M | "movq %%mm1,(%[dst4],%[ystride],2)\n\t" |
214 | | /*#7 Write row.*/ |
215 | 7.47M | "movq %%mm3,(%[dst4],%[ystride3])\n\t" |
216 | 7.47M | : |
217 | 7.47M | :[residue]"r"(_residue), |
218 | 7.47M | [dst]"r"(_dst), |
/*dst4 points at row 4.
  The stride is scaled by multiplication in ptrdiff_t, matching ystride3 below:
   left-shifting the int would be undefined behavior for a negative stride.*/
219 | 7.47M | [dst4]"r"(_dst+(ptrdiff_t)_ystride*4), |
220 | 7.47M | [ystride]"r"((ptrdiff_t)_ystride), |
221 | 7.47M | [ystride3]"r"((ptrdiff_t)_ystride*3) |
222 | 7.47M | :"memory" |
223 | 7.47M | ); |
224 | 7.47M | } |
225 | | |
/*Reconstructs an 8x8 block of pixels by adding a residue to a predictor block.
  Each source byte is zero-extended to 16 bits, the matching residue value is
   added with signed saturation, and the sum is packed back to a byte with
   unsigned saturation, which clamps the result to 0...255.
  Each of the four loop iterations handles two rows and advances _dst, _src and
   _residue in place.
  _dst:     Where to store the 8x8 block of reconstructed pixels.
  _src:     The 8x8 predictor block.
  _ystride: The number of bytes between rows, used for both _dst and _src.
  _residue: The 64 16-bit residue values in row-major order.
  mm0, mm2, mm3, mm4 and mm7 are modified and no emms is issued here.
  NOTE(review): mm0 is zeroed in its own asm statement and is assumed to still
   be zero inside the loop's asm statement; no operand or clobber expresses
   that dependency to the compiler.*/
226 | | void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src, |
227 | 2.49M | int _ystride,const ogg_int16_t *_residue){ |
228 | 2.49M | int i; |
229 | | /*Zero mm0.*/ |
230 | 2.49M | __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::); |
231 | 12.4M | for(i=4;i-->0;){ |
232 | 9.96M | __asm__ __volatile__( |
233 | | /*#0 Load source.*/ |
234 | 9.96M | "movq (%[src]),%%mm3\n\t" |
235 | | /*#1 Load source.*/ |
236 | 9.96M | "movq (%[src],%[ystride]),%%mm7\n\t" |
237 | | /*#0 Get copy of src.*/ |
238 | 9.96M | "movq %%mm3,%%mm4\n\t" |
239 | | /*#0 Expand high source.*/ |
240 | 9.96M | "punpckhbw %%mm0,%%mm4\n\t" |
241 | | /*#0 Expand low source.*/ |
242 | 9.96M | "punpcklbw %%mm0,%%mm3\n\t" |
243 | | /*#0 Add residue high.*/ |
244 | 9.96M | "paddsw 8(%[residue]),%%mm4\n\t" |
245 | | /*#1 Get copy of src.*/ |
246 | 9.96M | "movq %%mm7,%%mm2\n\t" |
247 | | /*#0 Add residue low.*/ |
248 | 9.96M | "paddsw (%[residue]), %%mm3\n\t" |
249 | | /*#1 Expand high source.*/ |
250 | 9.96M | "punpckhbw %%mm0,%%mm2\n\t" |
251 | | /*#0 Pack final row pixels.*/ |
252 | 9.96M | "packuswb %%mm4,%%mm3\n\t" |
253 | | /*#1 Expand low source.*/ |
254 | 9.96M | "punpcklbw %%mm0,%%mm7\n\t" |
255 | | /*#1 Add residue low.*/ |
256 | 9.96M | "paddsw 16(%[residue]),%%mm7\n\t" |
257 | | /*#1 Add residue high.*/ |
258 | 9.96M | "paddsw 24(%[residue]),%%mm2\n\t" |
259 | | /*Advance residue by two rows (32 bytes).*/ |
260 | 9.96M | "lea 32(%[residue]),%[residue]\n\t" |
261 | | /*#1 Pack final row pixels.*/ |
262 | 9.96M | "packuswb %%mm2,%%mm7\n\t" |
263 | | /*Advance src by two rows.*/ |
264 | 9.96M | "lea (%[src],%[ystride],2),%[src]\n\t" |
265 | | /*#0 Write row.*/ |
266 | 9.96M | "movq %%mm3,(%[dst])\n\t" |
267 | | /*#1 Write row.*/ |
268 | 9.96M | "movq %%mm7,(%[dst],%[ystride])\n\t" |
269 | | /*Advance dst by two rows.*/ |
270 | 9.96M | "lea (%[dst],%[ystride],2),%[dst]\n\t" |
271 | 9.96M | :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src) |
272 | 9.96M | :[ystride]"r"((ptrdiff_t)_ystride) |
273 | 9.96M | :"memory" |
274 | 9.96M | ); |
275 | 9.96M | } |
276 | 2.49M | } |
277 | | |
/*Reconstructs an 8x8 block of pixels by adding a residue to the average of two
   predictor blocks.
  Each pair of source bytes is zero-extended to 16 bits, summed, and shifted
   right by one (so the average rounds down); the matching residue value is
   then added with signed saturation and the result is packed back to a byte
   with unsigned saturation, which clamps it to 0...255.
  Each of the four loop iterations handles two rows and advances _dst, _src1,
   _src2 and _residue in place.
  _dst:     Where to store the 8x8 block of reconstructed pixels.
  _src1:    The first 8x8 predictor block.
  _src2:    The second 8x8 predictor block.
  _ystride: The number of bytes between rows, used for _dst, _src1 and _src2.
  _residue: The 64 16-bit residue values in row-major order.
  mm0-mm5 and mm7 are modified and no emms is issued here.
  NOTE(review): mm7 is zeroed in its own asm statement and is assumed to still
   be zero inside the loop's asm statement; no operand or clobber expresses
   that dependency to the compiler.*/
278 | | void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1, |
279 | 8.19k | const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){ |
280 | 8.19k | int i; |
281 | | /*Zero mm7.*/ |
282 | 8.19k | __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::); |
283 | 40.9k | for(i=4;i-->0;){ |
284 | 32.7k | __asm__ __volatile__( |
285 | | /*#0 Load src1.*/ |
286 | 32.7k | "movq (%[src1]),%%mm0\n\t" |
287 | | /*#0 Load src2.*/ |
288 | 32.7k | "movq (%[src2]),%%mm2\n\t" |
289 | | /*#0 Copy src1.*/ |
290 | 32.7k | "movq %%mm0,%%mm1\n\t" |
291 | | /*#0 Copy src2.*/ |
292 | 32.7k | "movq %%mm2,%%mm3\n\t" |
293 | | /*#1 Load src1.*/ |
294 | 32.7k | "movq (%[src1],%[ystride]),%%mm4\n\t" |
295 | | /*#0 Unpack lower src1.*/ |
296 | 32.7k | "punpcklbw %%mm7,%%mm0\n\t" |
297 | | /*#1 Load src2.*/ |
298 | 32.7k | "movq (%[src2],%[ystride]),%%mm5\n\t" |
299 | | /*#0 Unpack higher src1.*/ |
300 | 32.7k | "punpckhbw %%mm7,%%mm1\n\t" |
301 | | /*#0 Unpack lower src2.*/ |
302 | 32.7k | "punpcklbw %%mm7,%%mm2\n\t" |
303 | | /*#0 Unpack higher src2.*/ |
304 | 32.7k | "punpckhbw %%mm7,%%mm3\n\t" |
305 | | /*Advance src1 ptr.*/ |
306 | 32.7k | "lea (%[src1],%[ystride],2),%[src1]\n\t" |
307 | | /*Advance src2 ptr.*/ |
308 | 32.7k | "lea (%[src2],%[ystride],2),%[src2]\n\t" |
309 | | /*#0 Lower src1+src2.*/ |
310 | 32.7k | "paddsw %%mm2,%%mm0\n\t" |
311 | | /*#0 Higher src1+src2.*/ |
312 | 32.7k | "paddsw %%mm3,%%mm1\n\t" |
313 | | /*#1 Copy src1.*/ |
314 | 32.7k | "movq %%mm4,%%mm2\n\t" |
315 | | /*#0 Build lo average.*/ |
316 | 32.7k | "psraw $1,%%mm0\n\t" |
317 | | /*#1 Copy src2.*/ |
318 | 32.7k | "movq %%mm5,%%mm3\n\t" |
319 | | /*#1 Unpack lower src1.*/ |
320 | 32.7k | "punpcklbw %%mm7,%%mm4\n\t" |
321 | | /*#0 Build hi average.*/ |
322 | 32.7k | "psraw $1,%%mm1\n\t" |
323 | | /*#1 Unpack higher src1.*/ |
324 | 32.7k | "punpckhbw %%mm7,%%mm2\n\t" |
325 | | /*#0 low+=residue.*/ |
326 | 32.7k | "paddsw (%[residue]),%%mm0\n\t" |
327 | | /*#1 Unpack lower src2.*/ |
328 | 32.7k | "punpcklbw %%mm7,%%mm5\n\t" |
329 | | /*#0 high+=residue.*/ |
330 | 32.7k | "paddsw 8(%[residue]),%%mm1\n\t" |
331 | | /*#1 Unpack higher src2.*/ |
332 | 32.7k | "punpckhbw %%mm7,%%mm3\n\t" |
333 | | /*#1 Lower src1+src2.*/ |
334 | 32.7k | "paddsw %%mm4,%%mm5\n\t" |
335 | | /*#0 Pack and saturate.*/ |
336 | 32.7k | "packuswb %%mm1,%%mm0\n\t" |
337 | | /*#1 Higher src1+src2.*/ |
338 | 32.7k | "paddsw %%mm2,%%mm3\n\t" |
339 | | /*#0 Write row.*/ |
340 | 32.7k | "movq %%mm0,(%[dst])\n\t" |
341 | | /*#1 Build lo average.*/ |
342 | 32.7k | "psraw $1,%%mm5\n\t" |
343 | | /*#1 Build hi average.*/ |
344 | 32.7k | "psraw $1,%%mm3\n\t" |
345 | | /*#1 low+=residue.*/ |
346 | 32.7k | "paddsw 16(%[residue]),%%mm5\n\t" |
347 | | /*#1 high+=residue.*/ |
348 | 32.7k | "paddsw 24(%[residue]),%%mm3\n\t" |
349 | | /*#1 Pack and saturate.*/ |
350 | 32.7k | "packuswb %%mm3,%%mm5\n\t" |
351 | | /*#1 Write row.*/ |
352 | 32.7k | "movq %%mm5,(%[dst],%[ystride])\n\t" |
353 | | /*Advance residue ptr by two rows (32 bytes).*/ |
354 | 32.7k | "add $32,%[residue]\n\t" |
355 | | /*Advance dest ptr.*/ |
356 | 32.7k | "lea (%[dst],%[ystride],2),%[dst]\n\t" |
357 | 32.7k | :[dst]"+r"(_dst),[residue]"+r"(_residue), |
358 | 32.7k | [src1]"+r"(_src1),[src2]"+r"(_src2) |
359 | 32.7k | :[ystride]"r"((ptrdiff_t)_ystride) |
360 | 32.7k | :"memory" |
361 | 32.7k | ); |
362 | 32.7k | } |
363 | 8.19k | } |
364 | | |
/*Executes emms, which marks the MMX register state as empty.
  NOTE(review): presumably meant to be called after the MMX routines in this
   file, none of which issue emms themselves, and before any x87
   floating-point code runs; confirm against the callers.*/
365 | 2.21k | void oc_restore_fpu_mmx(void){ |
366 | 2.21k | __asm__ __volatile__("emms\n\t"); |
367 | 2.21k | } |
368 | | #endif |