/src/theora/lib/x86/mmxencfrag.c
Line | Count | Source (jump to first uncovered line) |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | | * by the Xiph.Org Foundation http://www.xiph.org/ * |
10 | | * * |
11 | | ******************************************************************** |
12 | | |
13 | | function: |
14 | | last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $ |
15 | | |
16 | | ********************************************************************/ |
17 | | #include <stddef.h> |
18 | | #include "x86enc.h" |
19 | | |
20 | | #if defined(OC_X86_ASM) |
21 | | |
22 | | /*Returns the sum of absolute differences (SAD) between the 8 rows of 8 bytes at _src and the 8 rows of 8 bytes at _ref, with both blocks stepped by _ystride bytes per row.*/ unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src, |
23 | 19.3M | const unsigned char *_ref,int _ystride){ |
24 | 19.3M | ptrdiff_t ystride3; |
25 | 19.3M | ptrdiff_t ret; |
26 | 19.3M | __asm__ __volatile__( |
27 | | /*Load the first 4 rows of each block.*/ |
28 | 19.3M | "movq (%[src]),%%mm0\n\t" |
29 | 19.3M | "movq (%[ref]),%%mm1\n\t" |
30 | 19.3M | "movq (%[src],%[ystride]),%%mm2\n\t" |
31 | 19.3M | "movq (%[ref],%[ystride]),%%mm3\n\t" |
32 | 19.3M | "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" /*ystride3=3*ystride, used to address row 3.*/ |
33 | 19.3M | "movq (%[src],%[ystride],2),%%mm4\n\t" |
34 | 19.3M | "movq (%[ref],%[ystride],2),%%mm5\n\t" |
35 | 19.3M | "movq (%[src],%[ystride3]),%%mm6\n\t" |
36 | 19.3M | "movq (%[ref],%[ystride3]),%%mm7\n\t" |
37 | | /*Compute their SADs and add them in %%mm0*/ |
38 | 19.3M | "psadbw %%mm1,%%mm0\n\t" |
39 | 19.3M | "psadbw %%mm3,%%mm2\n\t" |
40 | 19.3M | "lea (%[src],%[ystride],4),%[src]\n\t" /*Advance src by 4 rows.*/ |
41 | 19.3M | "paddw %%mm2,%%mm0\n\t" |
42 | 19.3M | "lea (%[ref],%[ystride],4),%[ref]\n\t" /*Advance ref by 4 rows.*/ |
43 | | /*Load the next 3 rows as registers become available.*/ |
44 | 19.3M | "movq (%[src]),%%mm2\n\t" |
45 | 19.3M | "movq (%[ref]),%%mm3\n\t" |
46 | 19.3M | "psadbw %%mm5,%%mm4\n\t" |
47 | 19.3M | "psadbw %%mm7,%%mm6\n\t" |
48 | 19.3M | "paddw %%mm4,%%mm0\n\t" |
49 | 19.3M | "movq (%[ref],%[ystride]),%%mm5\n\t" |
50 | 19.3M | "movq (%[src],%[ystride]),%%mm4\n\t" |
51 | 19.3M | "paddw %%mm6,%%mm0\n\t" |
52 | 19.3M | "movq (%[ref],%[ystride],2),%%mm7\n\t" |
53 | 19.3M | "movq (%[src],%[ystride],2),%%mm6\n\t" |
54 | | /*Start adding their SADs to %%mm0*/ |
55 | 19.3M | "psadbw %%mm3,%%mm2\n\t" |
56 | 19.3M | "psadbw %%mm5,%%mm4\n\t" |
57 | 19.3M | "paddw %%mm2,%%mm0\n\t" |
58 | 19.3M | "psadbw %%mm7,%%mm6\n\t" |
59 | | /*Load last row as registers become available.*/ |
60 | 19.3M | "movq (%[src],%[ystride3]),%%mm2\n\t" |
61 | 19.3M | "movq (%[ref],%[ystride3]),%%mm3\n\t" |
62 | | /*And finish adding up their SADs.*/ |
63 | 19.3M | "paddw %%mm4,%%mm0\n\t" |
64 | 19.3M | "psadbw %%mm3,%%mm2\n\t" |
65 | 19.3M | "paddw %%mm6,%%mm0\n\t" |
66 | 19.3M | "paddw %%mm2,%%mm0\n\t" |
67 | 19.3M | "movd %%mm0,%[ret]\n\t" /*Move the accumulated total out of the MMX register.*/ |
68 | 19.3M | /*src and ref are read-write operands because the lea instructions above advance them; ystride3 is an early-clobber output because it is written while the inputs are still in use.*/ :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3) |
69 | 19.3M | /*The stride is cast to ptrdiff_t before being used as an index register in the addressing modes above.*/ :[ystride]"r"((ptrdiff_t)_ystride) |
70 | 19.3M | ); |
71 | 19.3M | return (unsigned)ret; |
72 | 19.3M | } |
73 | | |
74 | | /*Thresholded SAD entry point; see the note in the body about _thresh.*/ unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src, |
75 | 0 | const unsigned char *_ref,int _ystride,unsigned _thresh){ |
76 | | /*Early termination is not implemented: _thresh is ignored and the full SAD from oc_enc_frag_sad_mmxext() is always returned.*/ |
77 | 0 | return oc_enc_frag_sad_mmxext(_src,_ref,_ystride); |
78 | 0 | } |
79 | | |
80 | | /*Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the |
81 | | first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7. |
82 | | We pre-load the next two rows of data as registers become available. Each expansion adds the SAD of the two current rows to %[ret] and advances %[src], %[ref1], and %[ref2] by two rows.*/ |
83 | | #define OC_SAD2_LOOP \ |
84 | | "#OC_SAD2_LOOP\n\t" \ |
85 | | /*We want to compute (%%mm0+%%mm1>>1) on unsigned bytes without overflow, but \ |
86 | | pavgb computes (%%mm0+%%mm1+1>>1). \ |
87 | | The latter is exactly 1 too large when the low bit of two corresponding \ |
88 | | bytes is only set in one of them. \ |
89 | | Therefore we pxor the operands, pand to mask out the low bits, and psubb to \ |
90 | | correct the output of pavgb. \ |
91 | | TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \ |
92 | | schedules better; currently, however, this function is unused.*/ \ |
93 | | "movq %%mm0,%%mm6\n\t" \ |
94 | | "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \ |
95 | | "pxor %%mm1,%%mm0\n\t" \ |
96 | | "pavgb %%mm1,%%mm6\n\t" \ |
97 | | "lea (%[ref2],%[ystride],2),%[ref2]\n\t" \ |
98 | | "movq %%mm2,%%mm1\n\t" \ |
99 | | "pand %%mm7,%%mm0\n\t" \ |
100 | | "pavgb %%mm3,%%mm2\n\t" \ |
101 | | "pxor %%mm3,%%mm1\n\t" \ |
102 | | "movq (%[ref2],%[ystride]),%%mm3\n\t" \ |
103 | | "psubb %%mm0,%%mm6\n\t" \ |
104 | | "movq (%[ref1]),%%mm0\n\t" \ |
105 | | "pand %%mm7,%%mm1\n\t" \ |
106 | | "psadbw %%mm6,%%mm4\n\t" \ |
107 | | "movd %[ret],%%mm6\n\t" /*Fetch the running total.*/ \ |
108 | | "psubb %%mm1,%%mm2\n\t" \ |
109 | | "movq (%[ref2]),%%mm1\n\t" \ |
110 | | "lea (%[src],%[ystride],2),%[src]\n\t" \ |
111 | | "psadbw %%mm2,%%mm5\n\t" \ |
112 | | "movq (%[ref1],%[ystride]),%%mm2\n\t" \ |
113 | | "paddw %%mm4,%%mm5\n\t" \ |
114 | | "movq (%[src]),%%mm4\n\t" \ |
115 | | "paddw %%mm5,%%mm6\n\t" \ |
116 | | "movq (%[src],%[ystride]),%%mm5\n\t" \ |
117 | | "movd %%mm6,%[ret]\n\t" /*Store the updated total.*/ \ |
118 | | |
119 | | /*Same as above, but does not pre-load the next two rows and does not advance %[src], %[ref1], or %[ref2]. The SAD of the two current rows is still added to %[ret].*/ |
120 | | #define OC_SAD2_TAIL \ |
121 | | "#OC_SAD2_TAIL\n\t" \ |
122 | | "movq %%mm0,%%mm6\n\t" \ |
123 | | "pavgb %%mm1,%%mm0\n\t" \ |
124 | | "pxor %%mm1,%%mm6\n\t" \ |
125 | | "movq %%mm2,%%mm1\n\t" \ |
126 | | "pand %%mm7,%%mm6\n\t" \ |
127 | | "pavgb %%mm3,%%mm2\n\t" \ |
128 | | "pxor %%mm3,%%mm1\n\t" \ |
129 | | "psubb %%mm6,%%mm0\n\t" \ |
130 | | "pand %%mm7,%%mm1\n\t" \ |
131 | | "psadbw %%mm0,%%mm4\n\t" \ |
132 | | "psubb %%mm1,%%mm2\n\t" \ |
133 | | "movd %[ret],%%mm6\n\t" /*Fetch the running total.*/ \ |
134 | | "psadbw %%mm2,%%mm5\n\t" \ |
135 | | "paddw %%mm4,%%mm5\n\t" \ |
136 | | "paddw %%mm5,%%mm6\n\t" \ |
137 | | "movd %%mm6,%[ret]\n\t" /*Store the final total.*/ \ |
138 | | |
139 | | /*Returns the SAD between the 8 rows of 8 bytes at _src and the per-byte average, (a+b)>>1, of the corresponding rows at _ref1 and _ref2; all three are stepped by _ystride bytes per row. _thresh is not referenced, so no early termination takes place.*/ unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, |
140 | | const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, |
141 | 0 | unsigned _thresh){ |
142 | 0 | ptrdiff_t ret; |
143 | 0 | __asm__ __volatile__( |
144 | 0 | "movq (%[ref1]),%%mm0\n\t" |
145 | 0 | "movq (%[ref2]),%%mm1\n\t" |
146 | 0 | "movq (%[ref1],%[ystride]),%%mm2\n\t" |
147 | 0 | "movq (%[ref2],%[ystride]),%%mm3\n\t" |
148 | 0 | "xor %[ret],%[ret]\n\t" /*ret=0; the macros below accumulate into it.*/ |
149 | 0 | "movq (%[src]),%%mm4\n\t" |
150 | 0 | "pxor %%mm7,%%mm7\n\t" |
151 | 0 | "pcmpeqb %%mm6,%%mm6\n\t" |
152 | 0 | "movq (%[src],%[ystride]),%%mm5\n\t" |
153 | 0 | "psubb %%mm6,%%mm7\n\t" /*mm7={1}x8, computed as 0-(-1) in every byte.*/ |
154 | 0 | /*Each macro expansion below handles two rows, for 8 rows in total.*/ OC_SAD2_LOOP |
155 | 0 | OC_SAD2_LOOP |
156 | 0 | OC_SAD2_LOOP |
157 | 0 | OC_SAD2_TAIL |
158 | 0 | /*ret is an early-clobber output because it is zeroed before the inputs have all been consumed; the pointers are read-write because the macros advance them.*/ :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2) |
159 | 0 | :[ystride]"r"((ptrdiff_t)_ystride) |
160 | 0 | ); |
161 | 0 | return (unsigned)ret; |
162 | 0 | } |
163 | | |
164 | | /*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their |
165 | | 16-bit difference in %%mm0...%%mm7. Each register receives one row: 4 bytes are read at byte offset _off from each of 8 consecutive rows. OC_MEM_OFFS(_off*2,buf) is used as scratch to hold the row 0 result while %%mm0 is reused. %[src], %[ref], %[src_ystride], and %[ref_ystride] are all modified along the way (the strides are temporarily negated) but hold their original values again on exit.*/ |
166 | | #define OC_LOAD_SUB_8x4(_off) \ |
167 | | "#OC_LOAD_SUB_8x4\n\t" \ |
168 | | "movd "#_off"(%[src]),%%mm0\n\t" \ |
169 | | "movd "#_off"(%[ref]),%%mm4\n\t" \ |
170 | | "movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \ |
171 | | "lea (%[src],%[src_ystride],2),%[src]\n\t" \ |
172 | | "movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \ |
173 | | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ |
174 | | "movd "#_off"(%[src]),%%mm2\n\t" \ |
175 | | "movd "#_off"(%[ref]),%%mm7\n\t" \ |
176 | | "movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \ |
177 | | "movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \ |
178 | | "punpcklbw %%mm4,%%mm0\n\t" /*Each word of mm0 is now src+256*ref.*/ \ |
179 | | "lea (%[src],%[src_ystride],2),%[src]\n\t" \ |
180 | | "punpcklbw %%mm4,%%mm4\n\t" /*Each word of mm4 is now ref+256*ref.*/ \ |
181 | | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ |
182 | | "psubw %%mm4,%%mm0\n\t" /*(src+256*ref)-(ref+256*ref)=src-ref in each word; the same pattern is repeated for every row below.*/ \ |
183 | | "movd "#_off"(%[src]),%%mm4\n\t" \ |
184 | | "movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" /*Spill row 0 so mm0 can be used for loads.*/ \ |
185 | | "movd "#_off"(%[ref]),%%mm0\n\t" \ |
186 | | "punpcklbw %%mm5,%%mm1\n\t" \ |
187 | | "punpcklbw %%mm5,%%mm5\n\t" \ |
188 | | "psubw %%mm5,%%mm1\n\t" \ |
189 | | "movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \ |
190 | | "punpcklbw %%mm7,%%mm2\n\t" \ |
191 | | "punpcklbw %%mm7,%%mm7\n\t" \ |
192 | | "psubw %%mm7,%%mm2\n\t" \ |
193 | | "movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \ |
194 | | "punpcklbw %%mm6,%%mm3\n\t" \ |
195 | | "lea (%[src],%[src_ystride],2),%[src]\n\t" \ |
196 | | "punpcklbw %%mm6,%%mm6\n\t" \ |
197 | | "psubw %%mm6,%%mm3\n\t" \ |
198 | | "movd "#_off"(%[src]),%%mm6\n\t" \ |
199 | | "punpcklbw %%mm0,%%mm4\n\t" \ |
200 | | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ |
201 | | "punpcklbw %%mm0,%%mm0\n\t" \ |
202 | | "lea (%[src],%[src_ystride],2),%[src]\n\t" /*src is now 8 rows past its starting value.*/ \ |
203 | | "psubw %%mm0,%%mm4\n\t" \ |
204 | | "movd "#_off"(%[ref]),%%mm0\n\t" \ |
205 | | "punpcklbw %%mm7,%%mm5\n\t" \ |
206 | | "neg %[src_ystride]\n\t" /*Negate the stride so row 7 can be addressed as src-stride.*/ \ |
207 | | "punpcklbw %%mm7,%%mm7\n\t" \ |
208 | | "psubw %%mm7,%%mm5\n\t" \ |
209 | | "movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \ |
210 | | "punpcklbw %%mm0,%%mm6\n\t" \ |
211 | | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" /*ref is now 8 rows past its starting value.*/ \ |
212 | | "punpcklbw %%mm0,%%mm0\n\t" \ |
213 | | "neg %[ref_ystride]\n\t" \ |
214 | | "psubw %%mm0,%%mm6\n\t" \ |
215 | | "movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \ |
216 | | "lea (%[src],%[src_ystride],8),%[src]\n\t" /*Step back 8 rows (stride is negated) to restore src.*/ \ |
217 | | "punpcklbw %%mm0,%%mm7\n\t" \ |
218 | | "neg %[src_ystride]\n\t" /*Restore the stride.*/ \ |
219 | | "punpcklbw %%mm0,%%mm0\n\t" \ |
220 | | "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" /*Step back 8 rows to restore ref.*/ \ |
221 | | "psubw %%mm0,%%mm7\n\t" \ |
222 | | "neg %[ref_ystride]\n\t" /*Restore the stride.*/ \ |
223 | | "movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" /*Reload the spilled row 0 result.*/ \ |
224 | | |
225 | | /*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7. Each register receives one row of 4 bytes read at byte offset _off, with every byte zero-extended to a 16-bit word. Rows 0-3 are addressed from %[src] and rows 4-7 from %[src4]; rows 3 and 7 use %[ystride3] (presumably 3*%[ystride]; that operand is supplied by the enclosing asm statement).*/ |
226 | | #define OC_LOAD_8x4(_off) \ |
227 | | "#OC_LOAD_8x4\n\t" \ |
228 | | "movd "#_off"(%[src]),%%mm0\n\t" \ |
229 | | "movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \ |
230 | | "movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \ |
231 | | "pxor %%mm7,%%mm7\n\t" /*mm7=0, used to zero-extend rows 0-3.*/ \ |
232 | | "movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \ |
233 | | "punpcklbw %%mm7,%%mm0\n\t" \ |
234 | | "movd "#_off"(%[src4]),%%mm4\n\t" \ |
235 | | "punpcklbw %%mm7,%%mm1\n\t" \ |
236 | | "movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \ |
237 | | "punpcklbw %%mm7,%%mm2\n\t" \ |
238 | | "movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \ |
239 | | "punpcklbw %%mm7,%%mm3\n\t" \ |
240 | | "movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" /*mm7 is no longer zero from here on.*/ \ |
241 | | "punpcklbw %%mm4,%%mm4\n\t" /*Rows 4-7 are zero-extended by duplicating each byte into a word and shifting right by 8.*/ \ |
242 | | "punpcklbw %%mm5,%%mm5\n\t" \ |
243 | | "psrlw $8,%%mm4\n\t" \ |
244 | | "psrlw $8,%%mm5\n\t" \ |
245 | | "punpcklbw %%mm6,%%mm6\n\t" \ |
246 | | "punpcklbw %%mm7,%%mm7\n\t" \ |
247 | | "psrlw $8,%%mm6\n\t" \ |
248 | | "psrlw $8,%%mm7\n\t" \ |
249 | | |
250 | | /*Performs the first two stages of an 8-point 1-D Hadamard transform. |
251 | | The transform is performed in place, except that outputs 0-3 are swapped with |
252 | | outputs 4-7. |
253 | | Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to |
254 | | perform this stage in place with no temporary registers). Operates on the 16-bit words of %%mm0...%%mm7 only; no memory or general-purpose registers are touched.*/ |
255 | | #define OC_HADAMARD_AB_8x4 \ |
256 | | "#OC_HADAMARD_AB_8x4\n\t" \ |
257 | | /*Stage A: \ |
258 | | Outputs 0-3 are swapped with 4-7 here. Each butterfly leaves the sum in one register, then doubles the other register and subtracts the sum from it to form the difference.*/ \ |
259 | | "paddw %%mm1,%%mm5\n\t" \ |
260 | | "paddw %%mm2,%%mm6\n\t" \ |
261 | | "paddw %%mm1,%%mm1\n\t" \ |
262 | | "paddw %%mm2,%%mm2\n\t" \ |
263 | | "psubw %%mm5,%%mm1\n\t" \ |
264 | | "psubw %%mm6,%%mm2\n\t" \ |
265 | | "paddw %%mm3,%%mm7\n\t" \ |
266 | | "paddw %%mm0,%%mm4\n\t" \ |
267 | | "paddw %%mm3,%%mm3\n\t" \ |
268 | | "paddw %%mm0,%%mm0\n\t" \ |
269 | | "psubw %%mm7,%%mm3\n\t" \ |
270 | | "psubw %%mm4,%%mm0\n\t" \ |
271 | | /*Stage B: the same sum/double/subtract pattern, pairing mm0 with mm2, mm1 with mm3, mm4 with mm6, and mm5 with mm7.*/ \ |
272 | | "paddw %%mm2,%%mm0\n\t" \ |
273 | | "paddw %%mm3,%%mm1\n\t" \ |
274 | | "paddw %%mm6,%%mm4\n\t" \ |
275 | | "paddw %%mm7,%%mm5\n\t" \ |
276 | | "paddw %%mm2,%%mm2\n\t" \ |
277 | | "paddw %%mm3,%%mm3\n\t" \ |
278 | | "paddw %%mm6,%%mm6\n\t" \ |
279 | | "paddw %%mm7,%%mm7\n\t" \ |
280 | | "psubw %%mm0,%%mm2\n\t" \ |
281 | | "psubw %%mm1,%%mm3\n\t" \ |
282 | | "psubw %%mm4,%%mm6\n\t" \ |
283 | | "psubw %%mm5,%%mm7\n\t" \ |
284 | | |
285 | | /*Performs the last stage of an 8-point 1-D Hadamard transform in place. |
286 | | Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in |
287 | | place with no temporary registers). Operates on the 16-bit words of %%mm0...%%mm7 only.*/ |
288 | | #define OC_HADAMARD_C_8x4 \ |
289 | | "#OC_HADAMARD_C_8x4\n\t" \ |
290 | | /*Stage C: adjacent register pairs (mm0/mm1, mm2/mm3, mm4/mm5, mm6/mm7) are combined; the even register receives the sum and the odd register is doubled and has the sum subtracted from it.*/ \ |
291 | | "paddw %%mm1,%%mm0\n\t" \ |
292 | | "paddw %%mm3,%%mm2\n\t" \ |
293 | | "paddw %%mm5,%%mm4\n\t" \ |
294 | | "paddw %%mm7,%%mm6\n\t" \ |
295 | | "paddw %%mm1,%%mm1\n\t" \ |
296 | | "paddw %%mm3,%%mm3\n\t" \ |
297 | | "paddw %%mm5,%%mm5\n\t" \ |
298 | | "paddw %%mm7,%%mm7\n\t" \ |
299 | | "psubw %%mm0,%%mm1\n\t" \ |
300 | | "psubw %%mm2,%%mm3\n\t" \ |
301 | | "psubw %%mm4,%%mm5\n\t" \ |
302 | | "psubw %%mm6,%%mm7\n\t" \ |
303 | | |
304 | | /*Performs an 8-point 1-D Hadamard transform. |
305 | | The transform is performed in place, except that outputs 0-3 are swapped with |
306 | | outputs 4-7. |
307 | | Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform |
308 | | in place with no temporary registers). Expands to OC_HADAMARD_AB_8x4 followed immediately by OC_HADAMARD_C_8x4.*/ |
309 | | #define OC_HADAMARD_8x4 \ |
310 | | OC_HADAMARD_AB_8x4 \ |
311 | | OC_HADAMARD_C_8x4 \ |
312 | | |
313 | | /*Performs the first part of the final stage of the Hadamard transform and |
314 | | summing of absolute values. |
315 | | At the end of this part, %%mm1 will contain the DC coefficient of the |
316 | | transform. %%mm6 and %%mm7 are spilled to OC_MEM_OFFS(_r6,buf) and OC_MEM_OFFS(_r7,buf) on entry; on exit %%mm3 holds the value reloaded from the _r7 slot and %%mm7 holds {0x7FFF}x4, both of which OC_HADAMARD_C_ABS_ACCUM_B_8x4 goes on to use.*/ |
317 | | #define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \ |
318 | | /*We use the fact that \ |
319 | | (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \ |
320 | | to merge the final butterfly with the abs and the first stage of \ |
321 | | accumulation. \ |
322 | | Thus we can avoid using pabsw, which is not available until SSSE3. \ |
323 | | Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \ |
324 | | implementation would be (3+3)*8+7=55 instructions (+4 for spilling \ |
325 | | registers). \ |
326 | | Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \ |
327 | | This implementation is only 26 (+4 for spilling registers).*/ \ |
328 | | "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \ |
329 | | "movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \ |
330 | | "movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \ |
331 | | /*mm7={0x7FFF}x4 \ |
332 | | mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \ |
333 | | "pcmpeqb %%mm7,%%mm7\n\t" /*All ones; the shift by 1 below turns each word into 0x7FFF.*/ \ |
334 | | "movq %%mm0,%%mm6\n\t" \ |
335 | | "psrlw $1,%%mm7\n\t" \ |
336 | | "paddw %%mm1,%%mm6\n\t" \ |
337 | | "pmaxsw %%mm1,%%mm0\n\t" \ |
338 | | "paddsw %%mm7,%%mm6\n\t" \ |
339 | | "psubw %%mm6,%%mm0\n\t" \ |
340 | | /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \ |
341 | | mm4=max(abs(mm4),abs(mm5))-0x7FFF \ |
342 | | (only started here; the saturating add and subtract that finish them are in OC_HADAMARD_C_ABS_ACCUM_B_8x4)*/ \ |
343 | | "movq %%mm2,%%mm6\n\t" \ |
344 | | "movq %%mm4,%%mm1\n\t" \ |
345 | | "pmaxsw %%mm3,%%mm2\n\t" \ |
346 | | "pmaxsw %%mm5,%%mm4\n\t" \ |
347 | | "paddw %%mm3,%%mm6\n\t" \ |
348 | | "paddw %%mm5,%%mm1\n\t" \ |
349 | | "movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" /*Reload the value spilled from mm7 on entry.*/ \ |
349 | | |
350 | | /*Performs the second part of the final stage of the Hadamard transform and |
351 | | summing of absolute values. Must follow OC_HADAMARD_C_ABS_ACCUM_A_8x4 with the same _r6 and _r7: it relies on the register state that macro leaves behind and reloads the value spilled at OC_MEM_OFFS(_r6,buf) into %%mm5 (_r7 is not referenced here). On exit %%mm0 holds the accumulated sums and %%mm7 holds {1}x4.*/ |
352 | | #define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \ |
353 | | "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \ |
354 | | "paddsw %%mm7,%%mm6\n\t" \ |
355 | | "movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \ |
356 | | "paddsw %%mm7,%%mm1\n\t" \ |
357 | | "psubw %%mm6,%%mm2\n\t" \ |
358 | | "psubw %%mm1,%%mm4\n\t" \ |
359 | | /*mm7={1}x4 (needed for the horizontal add that follows) \ |
360 | | mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \ |
361 | | "movq %%mm3,%%mm6\n\t" \ |
362 | | "pmaxsw %%mm5,%%mm3\n\t" \ |
363 | | "paddw %%mm2,%%mm0\n\t" \ |
364 | | "paddw %%mm5,%%mm6\n\t" \ |
365 | | "paddw %%mm4,%%mm0\n\t" \ |
366 | | "paddsw %%mm7,%%mm6\n\t" \ |
367 | | "paddw %%mm3,%%mm0\n\t" \ |
368 | | "psrlw $14,%%mm7\n\t" /*0x7FFF>>14=1 in every word.*/ \ |
369 | | "psubw %%mm6,%%mm0\n\t" \ |
370 | | |
371 | | /*Performs the last stage of an 8-point 1-D Hadamard transform, takes the |
372 | | absolute value of each component, and accumulates everything into mm0. |
373 | | This is the only portion of SATD which requires MMXEXT (we could use plain |
374 | | MMX, but it takes 4 instructions and an extra register to work around the |
375 | | lack of a pmaxsw, which is a pretty serious penalty). Expands to the _A_ and _B_ halves back to back with the same spill offsets _r6 and _r7.*/ |
376 | | #define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \ |
377 | | OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \ |
378 | | OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \ |
379 | | |
380 | | /*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each |
381 | | component, and accumulates everything into mm0. |
382 | | Note that mm0 will have an extra 4 added to each column, and that after |
383 | | removing this value, the remainder will be half the conventional value. Expands to OC_HADAMARD_AB_8x4 followed by OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7).*/ |
384 | | #define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \ |
385 | | OC_HADAMARD_AB_8x4 \ |
386 | | OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) |
387 | | |
388 | | /*Performs two 4x4 transposes (mostly) in place. |
389 | | On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7} |
390 | | contains rows {a,b,c,d}. |
391 | | On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and |
392 | | {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T. The slot at 0x10+_off(%[buf]) is used as scratch to hold %%mm5 during the first transpose, so its previous contents are overwritten.*/ |
393 | | #define OC_TRANSPOSE_4x4x2(_off) \ |
394 | | "#OC_TRANSPOSE_4x4x2\n\t" \ |
395 | | /*First 4x4 transpose:*/ \ |
396 | | "movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" /*Spill mm5 to free a work register.*/ \ |
397 | | /*mm0 = e3 e2 e1 e0 \ |
398 | | mm1 = f3 f2 f1 f0 \ |
399 | | mm2 = g3 g2 g1 g0 \ |
400 | | mm3 = h3 h2 h1 h0*/ \ |
401 | | "movq %%mm2,%%mm5\n\t" \ |
402 | | "punpcklwd %%mm3,%%mm2\n\t" \ |
403 | | "punpckhwd %%mm3,%%mm5\n\t" \ |
404 | | "movq %%mm0,%%mm3\n\t" \ |
405 | | "punpcklwd %%mm1,%%mm0\n\t" \ |
406 | | "punpckhwd %%mm1,%%mm3\n\t" \ |
407 | | /*mm0 = f1 e1 f0 e0 \ |
408 | | mm3 = f3 e3 f2 e2 \ |
409 | | mm2 = h1 g1 h0 g0 \ |
410 | | mm5 = h3 g3 h2 g2*/ \ |
411 | | "movq %%mm0,%%mm1\n\t" \ |
412 | | "punpckldq %%mm2,%%mm0\n\t" \ |
413 | | "punpckhdq %%mm2,%%mm1\n\t" \ |
414 | | "movq %%mm3,%%mm2\n\t" \ |
415 | | "punpckhdq %%mm5,%%mm3\n\t" \ |
416 | | "movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \ |
417 | | "punpckldq %%mm5,%%mm2\n\t" \ |
418 | | /*mm0 = h0 g0 f0 e0 \ |
419 | | mm1 = h1 g1 f1 e1 \ |
420 | | mm2 = h2 g2 f2 e2 \ |
421 | | mm3 = h3 g3 f3 e3*/ \ |
422 | | "movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" /*Reload the spilled mm5 (row b).*/ \ |
423 | | /*Second 4x4 transpose (the stores of mm1...mm3 from the first transpose are interleaved with it):*/ \ |
424 | | /*mm4 = a3 a2 a1 a0 \ |
425 | | mm5 = b3 b2 b1 b0 \ |
426 | | mm6 = c3 c2 c1 c0 \ |
427 | | mm7 = d3 d2 d1 d0*/ \ |
428 | | "movq %%mm6,%%mm0\n\t" \ |
429 | | "punpcklwd %%mm7,%%mm6\n\t" \ |
430 | | "movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \ |
431 | | "punpckhwd %%mm7,%%mm0\n\t" \ |
432 | | "movq %%mm4,%%mm7\n\t" \ |
433 | | "punpcklwd %%mm5,%%mm4\n\t" \ |
434 | | "movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \ |
435 | | "punpckhwd %%mm5,%%mm7\n\t" \ |
436 | | /*mm4 = b1 a1 b0 a0 \ |
437 | | mm7 = b3 a3 b2 a2 \ |
438 | | mm6 = d1 c1 d0 c0 \ |
439 | | mm0 = d3 c3 d2 c2*/ \ |
440 | | "movq %%mm4,%%mm5\n\t" \ |
441 | | "punpckldq %%mm6,%%mm4\n\t" \ |
442 | | "movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \ |
443 | | "punpckhdq %%mm6,%%mm5\n\t" \ |
444 | | "movq %%mm7,%%mm6\n\t" \ |
445 | | "punpckhdq %%mm0,%%mm7\n\t" \ |
446 | | "punpckldq %%mm0,%%mm6\n\t" \ |
447 | | /*mm4 = d0 c0 b0 a0 \ |
448 | | mm5 = d1 c1 b1 a1 \ |
449 | | mm6 = d2 c2 b2 a2 \ |
450 | | mm7 = d3 c3 b3 a3*/ \ |
451 | | |
452 | | /*Computes the SATD of an 8x8 block: the difference between _src (row stride _src_ystride) and _ref (row stride _ref_ystride) is Hadamard transformed and the absolute values of the resulting coefficients are summed. The DC coefficient is stored in *_dc and, per the comments near the end of the asm, its absolute value is removed from the returned sum.*/ static unsigned oc_int_frag_satd_mmxext(int *_dc, |
453 | | const unsigned char *_src,int _src_ystride, |
454 | 0 | const unsigned char *_ref,int _ref_ystride){ |
455 | 0 | OC_ALIGN8(ogg_int16_t buf[64]); |
456 | 0 | unsigned ret; |
457 | 0 | unsigned ret2; |
458 | 0 | int dc; |
459 | 0 | __asm__ __volatile__( |
460 | 0 | OC_LOAD_SUB_8x4(0x00) |
461 | 0 | OC_HADAMARD_8x4 |
462 | 0 | OC_TRANSPOSE_4x4x2(0x00) |
463 | | /*Finish swapping out this 8x4 block to make room for the next one. |
464 | | mm0...mm3 have been swapped out already.*/ |
465 | 0 | "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t" |
466 | 0 | "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t" |
467 | 0 | "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t" |
468 | 0 | "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t" |
469 | 0 | OC_LOAD_SUB_8x4(0x04) |
470 | 0 | OC_HADAMARD_8x4 |
471 | 0 | OC_TRANSPOSE_4x4x2(0x08) |
472 | | /*Here the first 4x4 block of output from the last transpose is the second |
473 | | 4x4 block of input for the next transform. |
474 | | We have cleverly arranged that it already be in the appropriate place, so |
475 | | we only have to do half the loads.*/ |
476 | 0 | "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t" |
477 | 0 | "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t" |
478 | 0 | "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t" |
479 | 0 | "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t" |
480 | | /*We split out the stages here so we can save the DC coefficient in the |
481 | | middle.*/ |
482 | 0 | OC_HADAMARD_AB_8x4 |
483 | 0 | OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38) |
484 | 0 | "movd %%mm1,%[dc]\n\t" |
485 | 0 | OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38) |
486 | | /*Up to this point, everything fit in 16 bits (8 input + 1 for the |
487 | | difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 |
488 | | for the factor of two we dropped + 3 for the vertical accumulation). |
489 | | Now we finally have to promote things to dwords. |
490 | | We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long |
491 | | latency of pmaddwd by starting the next series of loads now.*/ |
492 | 0 | "pmaddwd %%mm7,%%mm0\n\t" |
493 | 0 | "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t" |
494 | 0 | "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t" |
495 | 0 | "movq %%mm0,%%mm4\n\t" |
496 | 0 | "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t" |
497 | 0 | "punpckhdq %%mm0,%%mm0\n\t" |
498 | 0 | "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t" |
499 | 0 | "paddd %%mm0,%%mm4\n\t" |
500 | 0 | "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t" |
501 | 0 | "movd %%mm4,%[ret2]\n\t" /*ret2=sum for the first group of four columns.*/ |
502 | 0 | "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t" |
503 | 0 | "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t" |
504 | 0 | "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t" |
505 | 0 | OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) |
506 | 0 | "pmaddwd %%mm7,%%mm0\n\t" |
507 | | /*Subtract abs(dc) from 2*ret2.*/ |
508 | 0 | "movsx %w[dc],%[dc]\n\t" /*Sign-extend the low 16 bits of dc.*/ |
509 | 0 | "cdq\n\t" /*dc is in the a register and ret in the d register, so this fills ret with the sign of dc (0 or -1).*/ |
510 | 0 | "lea (%[ret],%[ret2],2),%[ret2]\n\t" /*ret2=2*ret2+sign.*/ |
511 | 0 | "movq %%mm0,%%mm4\n\t" |
512 | 0 | "punpckhdq %%mm0,%%mm0\n\t" |
513 | 0 | "xor %[dc],%[ret]\n\t" /*ret=dc^sign.*/ |
514 | 0 | "paddd %%mm0,%%mm4\n\t" |
515 | | /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4 |
516 | | added to them, a factor of two removed, and the DC value included; |
517 | | correct the final sum here.*/ |
518 | 0 | "sub %[ret],%[ret2]\n\t" /*ret2=2*ret2+sign-(dc^sign)=2*ret2-abs(dc).*/ |
519 | 0 | "movd %%mm4,%[ret]\n\t" /*ret=sum for the second group of four columns.*/ |
520 | 0 | "lea -64(%[ret2],%[ret],2),%[ret]\n\t" /*ret=ret2+2*ret-64.*/ |
521 | | /*Although it looks like we're using 8 registers here, gcc can alias %[ret] |
522 | | and %[ret2] with some of the inputs, since for once we don't write to |
523 | | them until after we're done using everything but %[buf].*/ |
524 | | /*Note that _src_ystride and _ref_ystride must be given non-overlapping |
525 | | constraints, otherwise if gcc can prove they're equal it will allocate |
526 | | them to the same register (which is bad); _src and _ref face a similar |
527 | | problem, though those are never actually the same.*/ |
528 | 0 | :[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc), |
529 | 0 | [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64)) |
530 | 0 | /*OC_LOAD_SUB_8x4 modifies %[src], %[ref], and both strides but restores each of them before it finishes, which is why they are listed as plain inputs.*/ :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride), |
531 | 0 | [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride) |
532 | | /*We have to use neg, so we actually clobber the condition codes for once |
533 | | (not to mention cmp, sub, and add).*/ |
534 | 0 | :"cc" |
535 | 0 | ); |
536 | 0 | *_dc=dc; |
537 | 0 | return ret; |
538 | 0 | } |
539 | | |
540 | | /*SATD of _src against _ref where both blocks share one row stride: forwards to oc_int_frag_satd_mmxext(), passing _ystride as both the source and the reference stride.*/ unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src, |
541 | 0 | const unsigned char *_ref,int _ystride){ |
542 | 0 | return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride); |
543 | 0 | } |
544 | | |
545 | | /*Our internal implementation of frag_copy2 takes an extra stride parameter so |
546 | | we can share code with oc_enc_frag_satd2_mmxext(). Writes 8 rows of 8 bytes to _dst (row stride _dst_ystride); each output byte is the average of the corresponding bytes of _src1 and _src2 (both read with row stride _src_ystride). pavgb rounds up, so (a^b)&1 is subtracted from each of its results, giving (a+b)>>1.*/ |
547 | | void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride, |
548 | 20.1M | const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){ |
549 | 20.1M | __asm__ __volatile__( |
550 | | /*Load the first 3 rows.*/ |
551 | 20.1M | "movq (%[src1]),%%mm0\n\t" |
552 | 20.1M | "movq (%[src2]),%%mm1\n\t" |
553 | 20.1M | "movq (%[src1],%[src_ystride]),%%mm2\n\t" |
554 | 20.1M | "lea (%[src1],%[src_ystride],2),%[src1]\n\t" |
555 | 20.1M | "movq (%[src2],%[src_ystride]),%%mm3\n\t" |
556 | 20.1M | "lea (%[src2],%[src_ystride],2),%[src2]\n\t" |
557 | 20.1M | "pxor %%mm7,%%mm7\n\t" |
558 | 20.1M | "movq (%[src1]),%%mm4\n\t" |
559 | 20.1M | "pcmpeqb %%mm6,%%mm6\n\t" |
560 | 20.1M | "movq (%[src2]),%%mm5\n\t" |
561 | | /*mm7={1}x8, computed as 0-(-1) in every byte.*/ |
562 | 20.1M | "psubb %%mm6,%%mm7\n\t" |
563 | | /*Start averaging %%mm0 and %%mm1 into %%mm6.*/ |
564 | 20.1M | "movq %%mm0,%%mm6\n\t" |
565 | 20.1M | "pxor %%mm1,%%mm0\n\t" |
566 | 20.1M | "pavgb %%mm1,%%mm6\n\t" |
567 | | /*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/ |
568 | 20.1M | "movq %%mm2,%%mm1\n\t" |
569 | 20.1M | "pand %%mm7,%%mm0\n\t" |
570 | 20.1M | "pavgb %%mm3,%%mm2\n\t" |
571 | 20.1M | "pxor %%mm3,%%mm1\n\t" |
572 | | /*%%mm3 is free.*/ |
573 | 20.1M | "psubb %%mm0,%%mm6\n\t" |
574 | | /*%%mm0 is free, start loading the next row.*/ |
575 | 20.1M | "movq (%[src1],%[src_ystride]),%%mm0\n\t" |
576 | | /*Start averaging %%mm5 and %%mm4 using %%mm3.*/ |
577 | 20.1M | "movq %%mm4,%%mm3\n\t" |
578 | | /*%%mm6 (row 0) is done; write it out.*/ |
579 | 20.1M | "movq %%mm6,(%[dst])\n\t" |
580 | 20.1M | "pand %%mm7,%%mm1\n\t" |
581 | 20.1M | "pavgb %%mm5,%%mm4\n\t" |
582 | 20.1M | "psubb %%mm1,%%mm2\n\t" |
583 | | /*%%mm1 is free, continue loading the next row.*/ |
584 | 20.1M | "movq (%[src2],%[src_ystride]),%%mm1\n\t" |
585 | 20.1M | "pxor %%mm5,%%mm3\n\t" |
586 | 20.1M | "lea (%[src1],%[src_ystride],2),%[src1]\n\t" |
587 | | /*%%mm2 (row 1) is done; write it out.*/ |
588 | 20.1M | "movq %%mm2,(%[dst],%[dst_ystride])\n\t" |
589 | 20.1M | "pand %%mm7,%%mm3\n\t" |
590 | | /*Start loading the next row.*/ |
591 | 20.1M | "movq (%[src1]),%%mm2\n\t" |
592 | 20.1M | "lea (%[dst],%[dst_ystride],2),%[dst]\n\t" |
593 | 20.1M | "psubb %%mm3,%%mm4\n\t" |
594 | 20.1M | "lea (%[src2],%[src_ystride],2),%[src2]\n\t" |
595 | | /*%%mm4 (row 2) is done; write it out.*/ |
596 | 20.1M | "movq %%mm4,(%[dst])\n\t" |
597 | | /*Continue loading the next row.*/ |
598 | 20.1M | "movq (%[src2]),%%mm3\n\t" |
599 | | /*Start averaging %%mm0 and %%mm1 into %%mm6.*/ |
600 | 20.1M | "movq %%mm0,%%mm6\n\t" |
601 | 20.1M | "pxor %%mm1,%%mm0\n\t" |
602 | | /*Start loading the next row.*/ |
603 | 20.1M | "movq (%[src1],%[src_ystride]),%%mm4\n\t" |
604 | 20.1M | "pavgb %%mm1,%%mm6\n\t" |
605 | | /*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/ |
606 | 20.1M | "movq %%mm2,%%mm1\n\t" |
607 | 20.1M | "pand %%mm7,%%mm0\n\t" |
608 | | /*Continue loading the next row.*/ |
609 | 20.1M | "movq (%[src2],%[src_ystride]),%%mm5\n\t" |
610 | 20.1M | "pavgb %%mm3,%%mm2\n\t" |
611 | 20.1M | "lea (%[src1],%[src_ystride],2),%[src1]\n\t" |
612 | 20.1M | "pxor %%mm3,%%mm1\n\t" |
613 | | /*%%mm3 is free.*/ |
614 | 20.1M | "psubb %%mm0,%%mm6\n\t" |
615 | | /*%%mm0 is free, start loading the next row.*/ |
616 | 20.1M | "movq (%[src1]),%%mm0\n\t" |
617 | | /*Start averaging %%mm5 into %%mm4 using %%mm3.*/ |
618 | 20.1M | "movq %%mm4,%%mm3\n\t" |
619 | | /*%%mm6 (row 3) is done; write it out.*/ |
620 | 20.1M | "movq %%mm6,(%[dst],%[dst_ystride])\n\t" |
621 | 20.1M | "pand %%mm7,%%mm1\n\t" |
622 | 20.1M | "lea (%[src2],%[src_ystride],2),%[src2]\n\t" |
623 | 20.1M | "pavgb %%mm5,%%mm4\n\t" |
624 | 20.1M | "lea (%[dst],%[dst_ystride],2),%[dst]\n\t" |
625 | 20.1M | "psubb %%mm1,%%mm2\n\t" |
626 | | /*%%mm1 is free; continue loading the next row.*/ |
627 | 20.1M | "movq (%[src2]),%%mm1\n\t" |
628 | 20.1M | "pxor %%mm5,%%mm3\n\t" |
629 | | /*%%mm2 (row 4) is done; write it out.*/ |
630 | 20.1M | "movq %%mm2,(%[dst])\n\t" |
631 | 20.1M | "pand %%mm7,%%mm3\n\t" |
632 | | /*Start loading the next row.*/ |
633 | 20.1M | "movq (%[src1],%[src_ystride]),%%mm2\n\t" |
634 | 20.1M | "psubb %%mm3,%%mm4\n\t" |
635 | | /*Start averaging %%mm0 and %%mm1 into %%mm6.*/ |
636 | 20.1M | "movq %%mm0,%%mm6\n\t" |
637 | | /*Continue loading the next row.*/ |
638 | 20.1M | "movq (%[src2],%[src_ystride]),%%mm3\n\t" |
639 | | /*%%mm4 (row 5) is done; write it out.*/ |
640 | 20.1M | "movq %%mm4,(%[dst],%[dst_ystride])\n\t" |
641 | 20.1M | "pxor %%mm1,%%mm0\n\t" |
642 | 20.1M | "pavgb %%mm1,%%mm6\n\t" |
643 | | /*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/ |
644 | 20.1M | "movq %%mm2,%%mm4\n\t" |
645 | 20.1M | "pand %%mm7,%%mm0\n\t" |
646 | 20.1M | "pavgb %%mm3,%%mm2\n\t" |
647 | 20.1M | "pxor %%mm3,%%mm4\n\t" |
648 | 20.1M | "lea (%[dst],%[dst_ystride],2),%[dst]\n\t" |
649 | 20.1M | "psubb %%mm0,%%mm6\n\t" |
650 | 20.1M | "pand %%mm7,%%mm4\n\t" |
651 | | /*%%mm6 (row 6) is done, write it out.*/ |
652 | 20.1M | "movq %%mm6,(%[dst])\n\t" |
653 | 20.1M | "psubb %%mm4,%%mm2\n\t" |
654 | | /*%%mm2 (row 7) is done, write it out.*/ |
655 | 20.1M | "movq %%mm2,(%[dst],%[dst_ystride])\n\t" |
656 | 20.1M | /*All three pointers are read-write operands because the lea instructions above advance them.*/ :[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2) |
657 | 20.1M | :[dst_ystride]"r"((ptrdiff_t)_dst_ystride), |
658 | 20.1M | [src_ystride]"r"((ptrdiff_t)_src_ystride) |
659 | 20.1M | /*The asm stores through %[dst], which is passed as a pointer value rather than as a memory operand, so the stores are declared with a "memory" clobber.*/ :"memory" |
660 | 20.1M | ); |
661 | 20.1M | } |
662 | | |
663 | | /*SATD of _src against the average of _ref1 and _ref2: oc_int_frag_copy2_mmxext() writes the averaged block into a local 64-byte buffer with a row stride of 8, and that buffer is then passed to oc_int_frag_satd_mmxext() as the reference (with _ystride still used for _src).*/ unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src, |
664 | 0 | const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){ |
665 | 0 | OC_ALIGN8(unsigned char ref[64]); |
666 | 0 | oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride); |
667 | 0 | return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8); |
668 | 0 | } |
669 | | /*Intra SATD: the transform is applied to the source pixels themselves rather |
    | | than to a difference against a reference (see the DC comment in the body). |
    | | The OC_LOAD_*, OC_HADAMARD_* and OC_TRANSPOSE_* macros that do the bulk of |
    | | the work are defined elsewhere; only what is visible here is described. |
    | | _dc: Receives the low word of mm1, zero-extended, as captured between |
    | | OC_HADAMARD_C_ABS_ACCUM_A_8x4 and OC_HADAMARD_C_ABS_ACCUM_B_8x4. |
    | | _src: The source block; the asm is handed both _src and |
    | | _src+4*_ystride. |
    | | _ystride: The distance in bytes between rows of _src. |
    | | Return: 2*(ret+ret2)-dc-64, where ret and ret2 are the dword sums produced |
    | | by pmaddwd after each of the two accumulation passes. |
    | | NOTE(review): the doubling and the -64 bias presumably compensate for scaling |
    | | and rounding applied inside the accumulation macros; confirm against their |
    | | definitions.*/ |
670 | | unsigned oc_enc_frag_intra_satd_mmxext(int *_dc, |
671 | 0 | const unsigned char *_src,int _ystride){ |
672 | 0 | OC_ALIGN8(ogg_int16_t buf[64]); |
673 | 0 | unsigned ret; |
674 | 0 | unsigned ret2; |
675 | 0 | int dc; |
676 | 0 | __asm__ __volatile__( |
677 | 0 | OC_LOAD_8x4(0x00) |
678 | 0 | OC_HADAMARD_8x4 |
679 | 0 | OC_TRANSPOSE_4x4x2(0x00) |
680 | | /*Finish swapping out this 8x4 block to make room for the next one. |
681 | | mm0...mm3 have been swapped out already.*/ |
682 | 0 | "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t" |
683 | 0 | "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t" |
684 | 0 | "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t" |
685 | 0 | "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t" |
686 | 0 | OC_LOAD_8x4(0x04) |
687 | 0 | OC_HADAMARD_8x4 |
688 | 0 | OC_TRANSPOSE_4x4x2(0x08) |
689 | | /*Here the first 4x4 block of output from the last transpose is the second |
690 | | 4x4 block of input for the next transform. |
691 | | We have cleverly arranged that it already be in the appropriate place, so |
692 | | we only have to do half the loads.*/ |
693 | 0 | "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t" |
694 | 0 | "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t" |
695 | 0 | "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t" |
696 | 0 | "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t" |
697 | | /*We split out the stages here so we can save the DC coefficient in the |
698 | | middle. |
    | | The movd below copies the low dword of mm1 into %[dc]; only its low word |
    | | is kept later on.*/ |
699 | 0 | OC_HADAMARD_AB_8x4 |
700 | 0 | OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38) |
701 | 0 | "movd %%mm1,%[dc]\n\t" |
702 | 0 | OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38) |
703 | | /*Up to this point, everything fit in 16 bits (8 input + 1 for the |
704 | | difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 |
705 | | for the factor of two we dropped + 3 for the vertical accumulation). |
706 | | Now we finally have to promote things to dwords. |
707 | | We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long |
708 | | latency of pmaddwd by starting the next series of loads now. |
    | | NOTE(review): the "+ 1 for the difference" term looks inherited from a |
    | | difference-based SATD; no difference is taken in this routine (see the DC |
    | | comment further down), so the 16-bit bound is presumably conservative |
    | | here -- confirm. |
    | | The movq/punpckhdq/paddd/movd sequence interleaved with the loads adds the |
    | | two dwords of the pmaddwd result and stores the sum in %[ret].*/ |
709 | 0 | "pmaddwd %%mm7,%%mm0\n\t" |
710 | 0 | "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t" |
711 | 0 | "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t" |
712 | 0 | "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t" |
713 | 0 | "movq %%mm0,%%mm4\n\t" |
714 | 0 | "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t" |
715 | 0 | "punpckhdq %%mm0,%%mm0\n\t" |
716 | 0 | "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t" |
717 | 0 | "paddd %%mm0,%%mm4\n\t" |
718 | 0 | "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t" |
719 | 0 | "movd %%mm4,%[ret]\n\t" |
720 | 0 | "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t" |
721 | 0 | "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t" |
722 | 0 | OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) |
723 | 0 | "pmaddwd %%mm7,%%mm0\n\t" |
724 | | /*We assume that the DC coefficient is always positive (which is true, |
725 | | because the input to the INTRA transform was not a difference). |
    | | That is why zero-extending the low word of %[dc] is sufficient. |
    | | The scalar instructions that follow form ret=2*ret-dc while the MMX ones |
    | | sum the two dwords of the second pmaddwd result into %[ret2]; the final |
    | | lea then yields ret=ret+2*ret2-64.*/ |
726 | 0 | "movzx %w[dc],%[dc]\n\t" |
727 | 0 | "add %[ret],%[ret]\n\t" |
728 | 0 | "sub %[dc],%[ret]\n\t" |
729 | 0 | "movq %%mm0,%%mm4\n\t" |
730 | 0 | "punpckhdq %%mm0,%%mm0\n\t" |
731 | 0 | "paddd %%mm0,%%mm4\n\t" |
732 | 0 | "movd %%mm4,%[ret2]\n\t" |
733 | 0 | "lea -64(%[ret],%[ret2],2),%[ret]\n\t" |
734 | | /*Although it looks like we're using 8 registers here, gcc can alias %[ret] |
735 | | and %[ret2] with some of the inputs, since for once we don't write to |
736 | | them until after we're done using everything but %[buf] (which is also |
737 | | listed as an output to ensure gcc _doesn't_ alias them against it).*/ |
738 | 0 | :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc), |
739 | 0 | [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64)) |
740 | 0 | :[src]"r"(_src),[src4]"r"(_src+4*_ystride), |
741 | 0 | [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride) |
742 | | /*We have to use sub, so we actually clobber the condition codes for once |
743 | | (not to mention add).*/ |
744 | 0 | :"cc" |
745 | 0 | ); |
746 | 0 | *_dc=dc; |
747 | 0 | return ret; |
748 | 0 | } |
749 | | /*Writes the 8x8 difference between a source block and a reference block to |
    | | _residue as 16-bit values. |
    | | _residue: The 64 output values, stored row by row, 8 values per row. |
    | | _src: The source block; 8 bytes are read from each of 8 rows that are |
    | | _ystride bytes apart. |
    | | _ref: The reference block, read the same way and with the same stride. |
    | | _ystride: The row stride shared by _src and _ref. |
    | | mm7 is zeroed by a separate asm statement ahead of the loop, and each of the |
    | | 4 loop iterations (2 rows apiece) relies on it still being zero. |
    | | NOTE(review): nothing here tells the compiler that mm7 must survive between |
    | | asm statements, and no emms is issued; presumably both are covered by |
    | | conventions established elsewhere in the encoder -- confirm.*/ |
750 | | void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64], |
751 | 834k | const unsigned char *_src,const unsigned char *_ref,int _ystride){ |
752 | 834k | int i; |
753 | 834k | __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::); |
754 | 4.17M | for(i=4;i-->0;){ |
755 | 3.33M | __asm__ __volatile__( |
756 | | /*mm0=[src]*/ |
757 | 3.33M | "movq (%[src]),%%mm0\n\t" |
758 | | /*mm1=[ref]*/ |
759 | 3.33M | "movq (%[ref]),%%mm1\n\t" |
760 | | /*mm4=[src+ystride]*/ |
761 | 3.33M | "movq (%[src],%[ystride]),%%mm4\n\t" |
762 | | /*mm5=[ref+ystride]*/ |
763 | 3.33M | "movq (%[ref],%[ystride]),%%mm5\n\t" |
764 | | /*Compute [src]-[ref]. |
    | | Unpacking against the zero in mm7 widens each byte to a word first; the |
    | | low four pixels end up in mm0 and the high four in mm2.*/ |
765 | 3.33M | "movq %%mm0,%%mm2\n\t" |
766 | 3.33M | "punpcklbw %%mm7,%%mm0\n\t" |
767 | 3.33M | "movq %%mm1,%%mm3\n\t" |
768 | 3.33M | "punpckhbw %%mm7,%%mm2\n\t" |
769 | 3.33M | "punpcklbw %%mm7,%%mm1\n\t" |
770 | 3.33M | "punpckhbw %%mm7,%%mm3\n\t" |
771 | 3.33M | "psubw %%mm1,%%mm0\n\t" |
772 | 3.33M | "psubw %%mm3,%%mm2\n\t" |
773 | | /*Compute [src+ystride]-[ref+ystride]. |
    | | The two lea instructions mixed in here advance [src] and [ref] by two |
    | | rows for the next iteration; the low four pixels end up in mm4 and the |
    | | high four in mm1.*/ |
774 | 3.33M | "movq %%mm4,%%mm1\n\t" |
775 | 3.33M | "punpcklbw %%mm7,%%mm4\n\t" |
776 | 3.33M | "movq %%mm5,%%mm3\n\t" |
777 | 3.33M | "punpckhbw %%mm7,%%mm1\n\t" |
778 | 3.33M | "lea (%[src],%[ystride],2),%[src]\n\t" |
779 | 3.33M | "punpcklbw %%mm7,%%mm5\n\t" |
780 | 3.33M | "lea (%[ref],%[ystride],2),%[ref]\n\t" |
781 | 3.33M | "punpckhbw %%mm7,%%mm3\n\t" |
782 | 3.33M | "psubw %%mm5,%%mm4\n\t" |
783 | 3.33M | "psubw %%mm3,%%mm1\n\t" |
784 | | /*Write the answer out. |
    | | Two rows of 8 words (0x20 bytes) are stored, then [residue] is advanced |
    | | past them.*/ |
785 | 3.33M | "movq %%mm0,0x00(%[residue])\n\t" |
786 | 3.33M | "movq %%mm2,0x08(%[residue])\n\t" |
787 | 3.33M | "movq %%mm4,0x10(%[residue])\n\t" |
788 | 3.33M | "movq %%mm1,0x18(%[residue])\n\t" |
789 | 3.33M | "lea 0x20(%[residue]),%[residue]\n\t" |
790 | 3.33M | :[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref) |
791 | 3.33M | :[ystride]"r"((ptrdiff_t)_ystride) |
792 | 3.33M | :"memory" |
793 | 3.33M | ); |
794 | 3.33M | } |
795 | 834k | } |
796 | | /*Writes an 8x8 source block minus the constant 128 to _residue as 16-bit |
    | | values. |
    | | _residue: The 64 output values, stored row by row, 8 values per row. |
    | | _src: The source block; 8 bytes are read from each of 8 rows that are |
    | | _ystride bytes apart. |
    | | _ystride: The distance in bytes between rows of _src. |
    | | All 8 rows are handled by a single asm statement, two rows at a time. |
    | | The row offsets quoted in the comments below are relative to the value of |
    | | _src on entry, even after [src] itself has been advanced by 4 rows.*/ |
797 | | void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64], |
798 | 23.7M | const unsigned char *_src,int _ystride){ |
799 | 23.7M | ptrdiff_t ystride3; |
800 | 23.7M | __asm__ __volatile__( |
801 | | /*mm0=[src]*/ |
802 | 23.7M | "movq (%[src]),%%mm0\n\t" |
803 | | /*mm1=[src+ystride]*/ |
804 | 23.7M | "movq (%[src],%[ystride]),%%mm1\n\t" |
805 | | /*mm6={-1}x4*/ |
806 | 23.7M | "pcmpeqw %%mm6,%%mm6\n\t" |
807 | | /*mm2=[src+2*ystride]*/ |
808 | 23.7M | "movq (%[src],%[ystride],2),%%mm2\n\t" |
809 | | /*[ystride3]=3*[ystride]*/ |
810 | 23.7M | "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" |
811 | | /*mm6={0x8000}x4*/ |
812 | 23.7M | "psllw $15,%%mm6\n\t" |
813 | | /*mm3=[src+3*ystride]*/ |
814 | 23.7M | "movq (%[src],%[ystride3]),%%mm3\n\t" |
815 | | /*mm6={128}x4*/ |
816 | 23.7M | "psrlw $8,%%mm6\n\t" |
817 | | /*mm7=0*/ |
818 | 23.7M | "pxor %%mm7,%%mm7\n\t" |
819 | | /*[src]=[src]+4*[ystride]*/ |
820 | 23.7M | "lea (%[src],%[ystride],4),%[src]\n\t" |
821 | | /*Compute [src]-128 and [src+ystride]-128 |
    | | Unpacking against the zero in mm7 widens each byte to a word before the |
    | | 128 held in mm6 is subtracted.*/ |
822 | 23.7M | "movq %%mm0,%%mm4\n\t" |
823 | 23.7M | "punpcklbw %%mm7,%%mm0\n\t" |
824 | 23.7M | "movq %%mm1,%%mm5\n\t" |
825 | 23.7M | "punpckhbw %%mm7,%%mm4\n\t" |
826 | 23.7M | "psubw %%mm6,%%mm0\n\t" |
827 | 23.7M | "punpcklbw %%mm7,%%mm1\n\t" |
828 | 23.7M | "psubw %%mm6,%%mm4\n\t" |
829 | 23.7M | "punpckhbw %%mm7,%%mm5\n\t" |
830 | 23.7M | "psubw %%mm6,%%mm1\n\t" |
831 | 23.7M | "psubw %%mm6,%%mm5\n\t" |
832 | | /*Write the answer out.*/ |
833 | 23.7M | "movq %%mm0,0x00(%[residue])\n\t" |
834 | 23.7M | "movq %%mm4,0x08(%[residue])\n\t" |
835 | 23.7M | "movq %%mm1,0x10(%[residue])\n\t" |
836 | 23.7M | "movq %%mm5,0x18(%[residue])\n\t" |
837 | | /*mm0=[src+4*ystride]*/ |
838 | 23.7M | "movq (%[src]),%%mm0\n\t" |
839 | | /*mm1=[src+5*ystride]*/ |
840 | 23.7M | "movq (%[src],%[ystride]),%%mm1\n\t" |
841 | | /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/ |
842 | 23.7M | "movq %%mm2,%%mm4\n\t" |
843 | 23.7M | "punpcklbw %%mm7,%%mm2\n\t" |
844 | 23.7M | "movq %%mm3,%%mm5\n\t" |
845 | 23.7M | "punpckhbw %%mm7,%%mm4\n\t" |
846 | 23.7M | "psubw %%mm6,%%mm2\n\t" |
847 | 23.7M | "punpcklbw %%mm7,%%mm3\n\t" |
848 | 23.7M | "psubw %%mm6,%%mm4\n\t" |
849 | 23.7M | "punpckhbw %%mm7,%%mm5\n\t" |
850 | 23.7M | "psubw %%mm6,%%mm3\n\t" |
851 | 23.7M | "psubw %%mm6,%%mm5\n\t" |
852 | | /*Write the answer out.*/ |
853 | 23.7M | "movq %%mm2,0x20(%[residue])\n\t" |
854 | 23.7M | "movq %%mm4,0x28(%[residue])\n\t" |
855 | 23.7M | "movq %%mm3,0x30(%[residue])\n\t" |
856 | 23.7M | "movq %%mm5,0x38(%[residue])\n\t" |
857 | | /*mm2=[src+6*ystride]*/ |
858 | 23.7M | "movq (%[src],%[ystride],2),%%mm2\n\t" |
859 | | /*mm3=[src+7*ystride]*/ |
860 | 23.7M | "movq (%[src],%[ystride3]),%%mm3\n\t" |
861 | | /*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/ |
862 | 23.7M | "movq %%mm0,%%mm4\n\t" |
863 | 23.7M | "punpcklbw %%mm7,%%mm0\n\t" |
864 | 23.7M | "movq %%mm1,%%mm5\n\t" |
865 | 23.7M | "punpckhbw %%mm7,%%mm4\n\t" |
866 | 23.7M | "psubw %%mm6,%%mm0\n\t" |
867 | 23.7M | "punpcklbw %%mm7,%%mm1\n\t" |
868 | 23.7M | "psubw %%mm6,%%mm4\n\t" |
869 | 23.7M | "punpckhbw %%mm7,%%mm5\n\t" |
870 | 23.7M | "psubw %%mm6,%%mm1\n\t" |
871 | 23.7M | "psubw %%mm6,%%mm5\n\t" |
872 | | /*Write the answer out.*/ |
873 | 23.7M | "movq %%mm0,0x40(%[residue])\n\t" |
874 | 23.7M | "movq %%mm4,0x48(%[residue])\n\t" |
875 | 23.7M | "movq %%mm1,0x50(%[residue])\n\t" |
876 | 23.7M | "movq %%mm5,0x58(%[residue])\n\t" |
877 | | /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/ |
878 | 23.7M | "movq %%mm2,%%mm4\n\t" |
879 | 23.7M | "punpcklbw %%mm7,%%mm2\n\t" |
880 | 23.7M | "movq %%mm3,%%mm5\n\t" |
881 | 23.7M | "punpckhbw %%mm7,%%mm4\n\t" |
882 | 23.7M | "psubw %%mm6,%%mm2\n\t" |
883 | 23.7M | "punpcklbw %%mm7,%%mm3\n\t" |
884 | 23.7M | "psubw %%mm6,%%mm4\n\t" |
885 | 23.7M | "punpckhbw %%mm7,%%mm5\n\t" |
886 | 23.7M | "psubw %%mm6,%%mm3\n\t" |
887 | 23.7M | "psubw %%mm6,%%mm5\n\t" |
888 | | /*Write the answer out.*/ |
889 | 23.7M | "movq %%mm2,0x60(%[residue])\n\t" |
890 | 23.7M | "movq %%mm4,0x68(%[residue])\n\t" |
891 | 23.7M | "movq %%mm3,0x70(%[residue])\n\t" |
892 | 23.7M | "movq %%mm5,0x78(%[residue])\n\t" |
893 | 23.7M | :[src]"+r"(_src),[ystride3]"=&r"(ystride3) |
894 | 23.7M | :[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride) |
895 | 23.7M | :"memory" |
896 | 23.7M | ); |
897 | 23.7M | } |
898 | | /*Public wrapper around oc_int_frag_copy2_mmxext() for the case where the |
    | | destination uses the same row stride as the two sources. |
    | | What the helper writes to _dst is determined by its own definition |
    | | (presumably a combination of the two sources; confirm there). |
    | | _dst: The destination block, with rows _ystride bytes apart. |
    | | _src1: The first source block, with rows _ystride bytes apart. |
    | | _src2: The second source block, with rows _ystride bytes apart. |
    | | _ystride: The row stride shared by _dst, _src1 and _src2.*/ |
899 | | void oc_enc_frag_copy2_mmxext(unsigned char *_dst, |
900 | 0 | const unsigned char *_src1,const unsigned char *_src2,int _ystride){ |
901 | 0 | oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride); |
902 | 0 | } |
903 | | |
904 | | #endif |