/src/theora/lib/x86/mmxencfrag.c
Line | Count | Source |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | | * by the Xiph.Org Foundation https://www.xiph.org/ * |
10 | | * * |
11 | | ******************************************************************** |
12 | | |
13 | | function: |
14 | | |
15 | | ********************************************************************/ |
16 | | #include <stddef.h> |
17 | | #include "x86enc.h" |
18 | | |
19 | | #if defined(OC_X86_ASM) |
20 | | |
21 | | unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src, |
22 | 16.7M | const unsigned char *_ref,int _ystride){ |
23 | 16.7M | ptrdiff_t ystride3; |
24 | 16.7M | ptrdiff_t ret; |
25 | 16.7M | __asm__ __volatile__( |
26 | | /*Load the first 4 rows of each block.*/ |
27 | 16.7M | "movq (%[src]),%%mm0\n\t" |
28 | 16.7M | "movq (%[ref]),%%mm1\n\t" |
29 | 16.7M | "movq (%[src],%[ystride]),%%mm2\n\t" |
30 | 16.7M | "movq (%[ref],%[ystride]),%%mm3\n\t" |
31 | 16.7M | "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" |
32 | 16.7M | "movq (%[src],%[ystride],2),%%mm4\n\t" |
33 | 16.7M | "movq (%[ref],%[ystride],2),%%mm5\n\t" |
34 | 16.7M | "movq (%[src],%[ystride3]),%%mm6\n\t" |
35 | 16.7M | "movq (%[ref],%[ystride3]),%%mm7\n\t" |
36 | | /*Compute their SADs and add them in %%mm0*/ |
37 | 16.7M | "psadbw %%mm1,%%mm0\n\t" |
38 | 16.7M | "psadbw %%mm3,%%mm2\n\t" |
39 | 16.7M | "lea (%[src],%[ystride],4),%[src]\n\t" |
40 | 16.7M | "paddw %%mm2,%%mm0\n\t" |
41 | 16.7M | "lea (%[ref],%[ystride],4),%[ref]\n\t" |
42 | | /*Load the next 3 rows as registers become available.*/ |
43 | 16.7M | "movq (%[src]),%%mm2\n\t" |
44 | 16.7M | "movq (%[ref]),%%mm3\n\t" |
45 | 16.7M | "psadbw %%mm5,%%mm4\n\t" |
46 | 16.7M | "psadbw %%mm7,%%mm6\n\t" |
47 | 16.7M | "paddw %%mm4,%%mm0\n\t" |
48 | 16.7M | "movq (%[ref],%[ystride]),%%mm5\n\t" |
49 | 16.7M | "movq (%[src],%[ystride]),%%mm4\n\t" |
50 | 16.7M | "paddw %%mm6,%%mm0\n\t" |
51 | 16.7M | "movq (%[ref],%[ystride],2),%%mm7\n\t" |
52 | 16.7M | "movq (%[src],%[ystride],2),%%mm6\n\t" |
53 | | /*Start adding their SADs to %%mm0*/ |
54 | 16.7M | "psadbw %%mm3,%%mm2\n\t" |
55 | 16.7M | "psadbw %%mm5,%%mm4\n\t" |
56 | 16.7M | "paddw %%mm2,%%mm0\n\t" |
57 | 16.7M | "psadbw %%mm7,%%mm6\n\t" |
58 | | /*Load last row as registers become available.*/ |
59 | 16.7M | "movq (%[src],%[ystride3]),%%mm2\n\t" |
60 | 16.7M | "movq (%[ref],%[ystride3]),%%mm3\n\t" |
61 | | /*And finish adding up their SADs.*/ |
62 | 16.7M | "paddw %%mm4,%%mm0\n\t" |
63 | 16.7M | "psadbw %%mm3,%%mm2\n\t" |
64 | 16.7M | "paddw %%mm6,%%mm0\n\t" |
65 | 16.7M | "paddw %%mm2,%%mm0\n\t" |
66 | 16.7M | "movd %%mm0,%[ret]\n\t" |
67 | 16.7M | :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3) |
68 | 16.7M | :[ystride]"r"((ptrdiff_t)_ystride) |
69 | 16.7M | ); |
70 | 16.7M | return (unsigned)ret; |
71 | 16.7M | } |
72 | | |
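For reference, here is a plain-C sketch of the 8x8 sum of absolute differences that the MMXEXT routine above computes with psadbw. The helper name is ours and illustrative only; it is not part of the library.

#include <stdlib.h>

/*Scalar reference for the 8x8 SAD computed above (illustrative sketch).*/
static unsigned frag_sad_c(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned sad;
  int      i;
  int      j;
  sad=0;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)sad+=abs(_src[j]-_ref[j]);
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}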
73 | | unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src, |
74 | 0 | const unsigned char *_ref,int _ystride,unsigned _thresh){ |
75 | | /*Early termination is for suckers.*/ |
76 | 0 | return oc_enc_frag_sad_mmxext(_src,_ref,_ystride); |
77 | 0 | } |
78 | | |
79 | | /*Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the |
80 | | first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7. |
81 | | We pre-load the next two rows of data as registers become available.*/ |
82 | | #define OC_SAD2_LOOP \ |
83 | | "#OC_SAD2_LOOP\n\t" \ |
84 | | /*We want to compute ((%%mm0+%%mm1)>>1) on unsigned bytes without overflow, \
85 | | but pavgb computes ((%%mm0+%%mm1+1)>>1). \
86 | | The latter is exactly 1 too large whenever the low bit is set in exactly \
87 | | one of the two corresponding bytes (see the scalar sketch that follows \
88 | | OC_SAD2_TAIL below). \
89 | | Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
90 | | correct the output of pavgb. \
90 | | TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \ |
91 | | schedules better; currently, however, this function is unused.*/ \ |
92 | | "movq %%mm0,%%mm6\n\t" \ |
93 | | "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \ |
94 | | "pxor %%mm1,%%mm0\n\t" \ |
95 | | "pavgb %%mm1,%%mm6\n\t" \ |
96 | | "lea (%[ref2],%[ystride],2),%[ref2]\n\t" \ |
97 | | "movq %%mm2,%%mm1\n\t" \ |
98 | | "pand %%mm7,%%mm0\n\t" \ |
99 | | "pavgb %%mm3,%%mm2\n\t" \ |
100 | | "pxor %%mm3,%%mm1\n\t" \ |
101 | | "movq (%[ref2],%[ystride]),%%mm3\n\t" \ |
102 | | "psubb %%mm0,%%mm6\n\t" \ |
103 | | "movq (%[ref1]),%%mm0\n\t" \ |
104 | | "pand %%mm7,%%mm1\n\t" \ |
105 | | "psadbw %%mm6,%%mm4\n\t" \ |
106 | | "movd %[ret],%%mm6\n\t" \ |
107 | | "psubb %%mm1,%%mm2\n\t" \ |
108 | | "movq (%[ref2]),%%mm1\n\t" \ |
109 | | "lea (%[src],%[ystride],2),%[src]\n\t" \ |
110 | | "psadbw %%mm2,%%mm5\n\t" \ |
111 | | "movq (%[ref1],%[ystride]),%%mm2\n\t" \ |
112 | | "paddw %%mm4,%%mm5\n\t" \ |
113 | | "movq (%[src]),%%mm4\n\t" \ |
114 | | "paddw %%mm5,%%mm6\n\t" \ |
115 | | "movq (%[src],%[ystride]),%%mm5\n\t" \ |
116 | | "movd %%mm6,%[ret]\n\t" \ |
117 | | |
118 | | /*Same as above, but does not pre-load the next two rows.*/ |
119 | | #define OC_SAD2_TAIL \ |
120 | | "#OC_SAD2_TAIL\n\t" \ |
121 | | "movq %%mm0,%%mm6\n\t" \ |
122 | | "pavgb %%mm1,%%mm0\n\t" \ |
123 | | "pxor %%mm1,%%mm6\n\t" \ |
124 | | "movq %%mm2,%%mm1\n\t" \ |
125 | | "pand %%mm7,%%mm6\n\t" \ |
126 | | "pavgb %%mm3,%%mm2\n\t" \ |
127 | | "pxor %%mm3,%%mm1\n\t" \ |
128 | | "psubb %%mm6,%%mm0\n\t" \ |
129 | | "pand %%mm7,%%mm1\n\t" \ |
130 | | "psadbw %%mm0,%%mm4\n\t" \ |
131 | | "psubb %%mm1,%%mm2\n\t" \ |
132 | | "movd %[ret],%%mm6\n\t" \ |
133 | | "psadbw %%mm2,%%mm5\n\t" \ |
134 | | "paddw %%mm4,%%mm5\n\t" \ |
135 | | "paddw %%mm5,%%mm6\n\t" \ |
136 | | "movd %%mm6,%[ret]\n\t" \ |
137 | | |
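A scalar sketch of the pavgb correction used by OC_SAD2_LOOP and OC_SAD2_TAIL above. The helper name is ours and illustrative only.

/*pavgb rounds up: it computes (a+b+1)>>1 in each byte lane.
  Subtracting ((a^b)&1) restores the truncating average (a+b)>>1 without ever
   forming the 9-bit sum a+b in a byte register.*/
static unsigned char oc_avg_down(unsigned char _a,unsigned char _b){
  unsigned char avg_up;
  avg_up=(unsigned char)((_a+_b+1)>>1);
  return (unsigned char)(avg_up-((_a^_b)&1));
}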
138 | | unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src, |
139 | | const unsigned char *_ref1,const unsigned char *_ref2,int _ystride, |
140 | 0 | unsigned _thresh){ |
141 | 0 | ptrdiff_t ret; |
142 | 0 | __asm__ __volatile__( |
143 | 0 | "movq (%[ref1]),%%mm0\n\t" |
144 | 0 | "movq (%[ref2]),%%mm1\n\t" |
145 | 0 | "movq (%[ref1],%[ystride]),%%mm2\n\t" |
146 | 0 | "movq (%[ref2],%[ystride]),%%mm3\n\t" |
147 | 0 | "xor %[ret],%[ret]\n\t" |
148 | 0 | "movq (%[src]),%%mm4\n\t" |
149 | 0 | "pxor %%mm7,%%mm7\n\t" |
150 | 0 | "pcmpeqb %%mm6,%%mm6\n\t" |
151 | 0 | "movq (%[src],%[ystride]),%%mm5\n\t" |
152 | 0 | "psubb %%mm6,%%mm7\n\t" |
153 | 0 | OC_SAD2_LOOP |
154 | 0 | OC_SAD2_LOOP |
155 | 0 | OC_SAD2_LOOP |
156 | 0 | OC_SAD2_TAIL |
157 | 0 | :[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2) |
158 | 0 | :[ystride]"r"((ptrdiff_t)_ystride) |
159 | 0 | ); |
160 | 0 | return (unsigned)ret; |
161 | 0 | } |
162 | | |
163 | | /*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their |
164 | | 16-bit difference in %%mm0...%%mm7.*/ |
165 | | #define OC_LOAD_SUB_8x4(_off) \ |
166 | | "#OC_LOAD_SUB_8x4\n\t" \ |
167 | | "movd "#_off"(%[src]),%%mm0\n\t" \ |
168 | | "movd "#_off"(%[ref]),%%mm4\n\t" \ |
169 | | "movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \ |
170 | | "lea (%[src],%[src_ystride],2),%[src]\n\t" \ |
171 | | "movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \ |
172 | | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ |
173 | | "movd "#_off"(%[src]),%%mm2\n\t" \ |
174 | | "movd "#_off"(%[ref]),%%mm7\n\t" \ |
175 | | "movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \ |
176 | | "movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \ |
177 | | "punpcklbw %%mm4,%%mm0\n\t" \ |
178 | | "lea (%[src],%[src_ystride],2),%[src]\n\t" \ |
179 | | "punpcklbw %%mm4,%%mm4\n\t" \ |
180 | | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ |
181 | | "psubw %%mm4,%%mm0\n\t" \ |
182 | | "movd "#_off"(%[src]),%%mm4\n\t" \ |
183 | | "movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \ |
184 | | "movd "#_off"(%[ref]),%%mm0\n\t" \ |
185 | | "punpcklbw %%mm5,%%mm1\n\t" \ |
186 | | "punpcklbw %%mm5,%%mm5\n\t" \ |
187 | | "psubw %%mm5,%%mm1\n\t" \ |
188 | | "movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \ |
189 | | "punpcklbw %%mm7,%%mm2\n\t" \ |
190 | | "punpcklbw %%mm7,%%mm7\n\t" \ |
191 | | "psubw %%mm7,%%mm2\n\t" \ |
192 | | "movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \ |
193 | | "punpcklbw %%mm6,%%mm3\n\t" \ |
194 | | "lea (%[src],%[src_ystride],2),%[src]\n\t" \ |
195 | | "punpcklbw %%mm6,%%mm6\n\t" \ |
196 | | "psubw %%mm6,%%mm3\n\t" \ |
197 | | "movd "#_off"(%[src]),%%mm6\n\t" \ |
198 | | "punpcklbw %%mm0,%%mm4\n\t" \ |
199 | | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ |
200 | | "punpcklbw %%mm0,%%mm0\n\t" \ |
201 | | "lea (%[src],%[src_ystride],2),%[src]\n\t" \ |
202 | | "psubw %%mm0,%%mm4\n\t" \ |
203 | | "movd "#_off"(%[ref]),%%mm0\n\t" \ |
204 | | "punpcklbw %%mm7,%%mm5\n\t" \ |
205 | | "neg %[src_ystride]\n\t" \ |
206 | | "punpcklbw %%mm7,%%mm7\n\t" \ |
207 | | "psubw %%mm7,%%mm5\n\t" \ |
208 | | "movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \ |
209 | | "punpcklbw %%mm0,%%mm6\n\t" \ |
210 | | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ |
211 | | "punpcklbw %%mm0,%%mm0\n\t" \ |
212 | | "neg %[ref_ystride]\n\t" \ |
213 | | "psubw %%mm0,%%mm6\n\t" \ |
214 | | "movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \ |
215 | | "lea (%[src],%[src_ystride],8),%[src]\n\t" \ |
216 | | "punpcklbw %%mm0,%%mm7\n\t" \ |
217 | | "neg %[src_ystride]\n\t" \ |
218 | | "punpcklbw %%mm0,%%mm0\n\t" \ |
219 | | "lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \ |
220 | | "psubw %%mm0,%%mm7\n\t" \ |
221 | | "neg %[ref_ystride]\n\t" \ |
222 | | "movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \ |
223 | | |
224 | | /*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/ |
225 | | #define OC_LOAD_8x4(_off) \ |
226 | | "#OC_LOAD_8x4\n\t" \ |
227 | | "movd "#_off"(%[src]),%%mm0\n\t" \ |
228 | | "movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \ |
229 | | "movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \ |
230 | | "pxor %%mm7,%%mm7\n\t" \ |
231 | | "movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \ |
232 | | "punpcklbw %%mm7,%%mm0\n\t" \ |
233 | | "movd "#_off"(%[src4]),%%mm4\n\t" \ |
234 | | "punpcklbw %%mm7,%%mm1\n\t" \ |
235 | | "movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \ |
236 | | "punpcklbw %%mm7,%%mm2\n\t" \ |
237 | | "movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \ |
238 | | "punpcklbw %%mm7,%%mm3\n\t" \ |
239 | | "movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \ |
240 | | "punpcklbw %%mm4,%%mm4\n\t" \ |
241 | | "punpcklbw %%mm5,%%mm5\n\t" \ |
242 | | "psrlw $8,%%mm4\n\t" \ |
243 | | "psrlw $8,%%mm5\n\t" \ |
244 | | "punpcklbw %%mm6,%%mm6\n\t" \ |
245 | | "punpcklbw %%mm7,%%mm7\n\t" \ |
246 | | "psrlw $8,%%mm6\n\t" \ |
247 | | "psrlw $8,%%mm7\n\t" \ |
248 | | |
249 | | /*Performs the first two stages of an 8-point 1-D Hadamard transform. |
250 | | The transform is performed in place, except that outputs 0-3 are swapped with |
251 | | outputs 4-7. |
252 | | Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to |
253 | | perform this stage in place with no temporary registers).*/ |
254 | | #define OC_HADAMARD_AB_8x4 \ |
255 | | "#OC_HADAMARD_AB_8x4\n\t" \ |
256 | | /*Stage A: \ |
257 | | Outputs 0-3 are swapped with 4-7 here.*/ \ |
258 | | "paddw %%mm1,%%mm5\n\t" \ |
259 | | "paddw %%mm2,%%mm6\n\t" \ |
260 | | "paddw %%mm1,%%mm1\n\t" \ |
261 | | "paddw %%mm2,%%mm2\n\t" \ |
262 | | "psubw %%mm5,%%mm1\n\t" \ |
263 | | "psubw %%mm6,%%mm2\n\t" \ |
264 | | "paddw %%mm3,%%mm7\n\t" \ |
265 | | "paddw %%mm0,%%mm4\n\t" \ |
266 | | "paddw %%mm3,%%mm3\n\t" \ |
267 | | "paddw %%mm0,%%mm0\n\t" \ |
268 | | "psubw %%mm7,%%mm3\n\t" \ |
269 | | "psubw %%mm4,%%mm0\n\t" \ |
270 | | /*Stage B:*/ \ |
271 | | "paddw %%mm2,%%mm0\n\t" \ |
272 | | "paddw %%mm3,%%mm1\n\t" \ |
273 | | "paddw %%mm6,%%mm4\n\t" \ |
274 | | "paddw %%mm7,%%mm5\n\t" \ |
275 | | "paddw %%mm2,%%mm2\n\t" \ |
276 | | "paddw %%mm3,%%mm3\n\t" \ |
277 | | "paddw %%mm6,%%mm6\n\t" \ |
278 | | "paddw %%mm7,%%mm7\n\t" \ |
279 | | "psubw %%mm0,%%mm2\n\t" \ |
280 | | "psubw %%mm1,%%mm3\n\t" \ |
281 | | "psubw %%mm4,%%mm6\n\t" \ |
282 | | "psubw %%mm5,%%mm7\n\t" \ |
283 | | |
284 | | /*Performs the last stage of an 8-point 1-D Hadamard transform in place. |
285 | | Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in |
286 | | place with no temporary registers).*/ |
287 | | #define OC_HADAMARD_C_8x4 \ |
288 | | "#OC_HADAMARD_C_8x4\n\t" \ |
289 | | /*Stage C:*/ \ |
290 | | "paddw %%mm1,%%mm0\n\t" \ |
291 | | "paddw %%mm3,%%mm2\n\t" \ |
292 | | "paddw %%mm5,%%mm4\n\t" \ |
293 | | "paddw %%mm7,%%mm6\n\t" \ |
294 | | "paddw %%mm1,%%mm1\n\t" \ |
295 | | "paddw %%mm3,%%mm3\n\t" \ |
296 | | "paddw %%mm5,%%mm5\n\t" \ |
297 | | "paddw %%mm7,%%mm7\n\t" \ |
298 | | "psubw %%mm0,%%mm1\n\t" \ |
299 | | "psubw %%mm2,%%mm3\n\t" \ |
300 | | "psubw %%mm4,%%mm5\n\t" \ |
301 | | "psubw %%mm6,%%mm7\n\t" \ |
302 | | |
303 | | /*Performs an 8-point 1-D Hadamard transform. |
304 | | The transform is performed in place, except that outputs 0-3 are swapped with |
305 | | outputs 4-7. |
306 | | Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform |
307 | | in place with no temporary registers).*/ |
308 | | #define OC_HADAMARD_8x4 \ |
309 | | OC_HADAMARD_AB_8x4 \ |
310 | | OC_HADAMARD_C_8x4 \ |
311 | | |
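In scalar terms, the three stages above are Hadamard butterflies at strides 4, 2, and 1. A minimal sketch follows (illustrative only); it ignores the half-swapping and output negations that the MMX macros use to stay in place without temporary registers.

/*Scalar sketch of an 8-point Hadamard: butterflies at strides 4, 2, and 1.*/
static void oc_hadamard8_c(int _t[8]){
  int i;
  int a;
  int b;
  /*Stage A: stride-4 pairs (0,4), (1,5), (2,6), (3,7).*/
  for(i=0;i<4;i++){a=_t[i];b=_t[i+4];_t[i]=a+b;_t[i+4]=a-b;}
  /*Stage B: stride-2 pairs (0,2), (1,3), (4,6), (5,7).*/
  for(i=0;i<8;i++)if(!(i&2)){a=_t[i];b=_t[i+2];_t[i]=a+b;_t[i+2]=a-b;}
  /*Stage C: stride-1 pairs (0,1), (2,3), (4,5), (6,7).*/
  for(i=0;i<8;i+=2){a=_t[i];b=_t[i+1];_t[i]=a+b;_t[i+1]=a-b;}
}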
312 | | /*Performs the first part of the final stage of the Hadamard transform and |
313 | | summing of absolute values. |
314 | | At the end of this part, %%mm1 will contain the DC coefficient of the |
315 | | transform.*/ |
316 | | #define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \ |
317 | | /*We use the fact that \ |
318 | | (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \ |
319 | | to merge the final butterfly with the abs and the first stage of \ |
320 | | accumulation. \ |
321 | | Thus we can avoid using pabsw, which is not available until SSSE3. \ |
322 | | Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \ |
323 | | implementation would be (3+3)*8+7=55 instructions (+4 for spilling \ |
324 | | registers). \ |
325 | | Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \ |
326 | | This implementation is only 26 (+4 for spilling registers).*/ \ |
327 | | "#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \ |
328 | | "movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \ |
329 | | "movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \ |
330 | | /*mm7={0x7FFF}x4 \ |
331 | | mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \ |
332 | | "pcmpeqb %%mm7,%%mm7\n\t" \ |
333 | | "movq %%mm0,%%mm6\n\t" \ |
334 | | "psrlw $1,%%mm7\n\t" \ |
335 | | "paddw %%mm1,%%mm6\n\t" \ |
336 | | "pmaxsw %%mm1,%%mm0\n\t" \ |
337 | | "paddsw %%mm7,%%mm6\n\t" \ |
338 | | "psubw %%mm6,%%mm0\n\t" \ |
339 | | /*mm2=max(abs(mm2),abs(mm3))-0x7FFF \ |
340 | | mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \ |
341 | | "movq %%mm2,%%mm6\n\t" \ |
342 | | "movq %%mm4,%%mm1\n\t" \ |
343 | | "pmaxsw %%mm3,%%mm2\n\t" \ |
344 | | "pmaxsw %%mm5,%%mm4\n\t" \ |
345 | | "paddw %%mm3,%%mm6\n\t" \ |
346 | | "paddw %%mm5,%%mm1\n\t" \ |
347 | | "movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \ |
348 | | |
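The identity used above holds because |a+b| and |a-b| are, in some order, |a|+|b| and ||a|-|b||, whose average is max(|a|,|b|). A brute-force scalar check (illustrative only, not part of the library):

#include <assert.h>
#include <stdlib.h>

/*Checks (abs(a+b)+abs(a-b))/2==max(abs(a),abs(b)) over a small range.*/
int main(void){
  int a;
  int b;
  for(a=-1024;a<=1024;a++)for(b=-1024;b<=1024;b++){
    int lhs=(abs(a+b)+abs(a-b))/2;
    int rhs=abs(a)>abs(b)?abs(a):abs(b);
    assert(lhs==rhs);
  }
  return 0;
}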
349 | | /*Performs the second part of the final stage of the Hadamard transform and |
350 | | summing of absolute values.*/ |
351 | | #define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \ |
352 | | "#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \ |
353 | | "paddsw %%mm7,%%mm6\n\t" \ |
354 | | "movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \ |
355 | | "paddsw %%mm7,%%mm1\n\t" \ |
356 | | "psubw %%mm6,%%mm2\n\t" \ |
357 | | "psubw %%mm1,%%mm4\n\t" \ |
358 | | /*mm7={1}x4 (needed for the horizontal add that follows) \ |
359 | | mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \ |
360 | | "movq %%mm3,%%mm6\n\t" \ |
361 | | "pmaxsw %%mm5,%%mm3\n\t" \ |
362 | | "paddw %%mm2,%%mm0\n\t" \ |
363 | | "paddw %%mm5,%%mm6\n\t" \ |
364 | | "paddw %%mm4,%%mm0\n\t" \ |
365 | | "paddsw %%mm7,%%mm6\n\t" \ |
366 | | "paddw %%mm3,%%mm0\n\t" \ |
367 | | "psrlw $14,%%mm7\n\t" \ |
368 | | "psubw %%mm6,%%mm0\n\t" \ |
369 | | |
370 | | /*Performs the last stage of an 8-point 1-D Hadamard transform, takes the |
371 | | absolute value of each component, and accumulates everything into mm0. |
372 | | This is the only portion of SATD which requires MMXEXT (we could use plain |
373 | | MMX, but it takes 4 instructions and an extra register to work around the |
374 | | lack of a pmaxsw, which is a pretty serious penalty).*/ |
375 | | #define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \ |
376 | | OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \ |
377 | | OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \ |
378 | | |
379 | | /*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each |
380 | | component, and accumulates everything into mm0. |
381 | | Note that mm0 will have an extra 4 added to each column, and that after |
382 | | removing this value, the remainder will be half the conventional value.*/ |
383 | | #define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \ |
384 | | OC_HADAMARD_AB_8x4 \ |
385 | | OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) |
386 | | |
387 | | /*Performs two 4x4 transposes (mostly) in place. |
388 | | On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7} |
389 | | contains rows {a,b,c,d}. |
390 | | On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and |
391 | | {mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/ |
392 | | #define OC_TRANSPOSE_4x4x2(_off) \ |
393 | | "#OC_TRANSPOSE_4x4x2\n\t" \ |
394 | | /*First 4x4 transpose:*/ \ |
395 | | "movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \ |
396 | | /*mm0 = e3 e2 e1 e0 \ |
397 | | mm1 = f3 f2 f1 f0 \ |
398 | | mm2 = g3 g2 g1 g0 \ |
399 | | mm3 = h3 h2 h1 h0*/ \ |
400 | | "movq %%mm2,%%mm5\n\t" \ |
401 | | "punpcklwd %%mm3,%%mm2\n\t" \ |
402 | | "punpckhwd %%mm3,%%mm5\n\t" \ |
403 | | "movq %%mm0,%%mm3\n\t" \ |
404 | | "punpcklwd %%mm1,%%mm0\n\t" \ |
405 | | "punpckhwd %%mm1,%%mm3\n\t" \ |
406 | | /*mm0 = f1 e1 f0 e0 \ |
407 | | mm3 = f3 e3 f2 e2 \ |
408 | | mm2 = h1 g1 h0 g0 \ |
409 | | mm5 = h3 g3 h2 g2*/ \ |
410 | | "movq %%mm0,%%mm1\n\t" \ |
411 | | "punpckldq %%mm2,%%mm0\n\t" \ |
412 | | "punpckhdq %%mm2,%%mm1\n\t" \ |
413 | | "movq %%mm3,%%mm2\n\t" \ |
414 | | "punpckhdq %%mm5,%%mm3\n\t" \ |
415 | | "movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \ |
416 | | "punpckldq %%mm5,%%mm2\n\t" \ |
417 | | /*mm0 = h0 g0 f0 e0 \ |
418 | | mm1 = h1 g1 f1 e1 \ |
419 | | mm2 = h2 g2 f2 e2 \ |
420 | | mm3 = h3 g3 f3 e3*/ \ |
421 | | "movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \ |
422 | | /*Second 4x4 transpose:*/ \ |
423 | | /*mm4 = a3 a2 a1 a0 \ |
424 | | mm5 = b3 b2 b1 b0 \ |
425 | | mm6 = c3 c2 c1 c0 \ |
426 | | mm7 = d3 d2 d1 d0*/ \ |
427 | | "movq %%mm6,%%mm0\n\t" \ |
428 | | "punpcklwd %%mm7,%%mm6\n\t" \ |
429 | | "movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \ |
430 | | "punpckhwd %%mm7,%%mm0\n\t" \ |
431 | | "movq %%mm4,%%mm7\n\t" \ |
432 | | "punpcklwd %%mm5,%%mm4\n\t" \ |
433 | | "movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \ |
434 | | "punpckhwd %%mm5,%%mm7\n\t" \ |
435 | | /*mm4 = b1 a1 b0 a0 \ |
436 | | mm7 = b3 a3 b2 a2 \ |
437 | | mm6 = d1 c1 d0 c0 \ |
438 | | mm0 = d3 c3 d2 c2*/ \ |
439 | | "movq %%mm4,%%mm5\n\t" \ |
440 | | "punpckldq %%mm6,%%mm4\n\t" \ |
441 | | "movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \ |
442 | | "punpckhdq %%mm6,%%mm5\n\t" \ |
443 | | "movq %%mm7,%%mm6\n\t" \ |
444 | | "punpckhdq %%mm0,%%mm7\n\t" \ |
445 | | "punpckldq %%mm0,%%mm6\n\t" \ |
446 | | /*mm4 = d0 c0 b0 a0 \ |
447 | | mm5 = d1 c1 b1 a1 \ |
448 | | mm6 = d2 c2 b2 a2 \ |
449 | | mm7 = d3 c3 b3 a3*/ \ |
450 | | |
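Each half of OC_TRANSPOSE_4x4x2 is an ordinary 4x4 word transpose built from punpcklwd/punpckhwd and punpckldq/punpckhdq pairs. A scalar equivalent of one such transpose (illustrative only; the name is ours):

/*Scalar equivalent of one of the 4x4 word transposes performed above.*/
static void oc_transpose4x4_c(ogg_int16_t _m[4][4]){
  int i;
  int j;
  for(i=0;i<4;i++)for(j=i+1;j<4;j++){
    ogg_int16_t t;
    t=_m[i][j];
    _m[i][j]=_m[j][i];
    _m[j][i]=t;
  }
}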
451 | | static unsigned oc_int_frag_satd_mmxext(int *_dc, |
452 | | const unsigned char *_src,int _src_ystride, |
453 | 0 | const unsigned char *_ref,int _ref_ystride){ |
454 | 0 | OC_ALIGN8(ogg_int16_t buf[64]); |
455 | 0 | unsigned ret; |
456 | 0 | unsigned ret2; |
457 | 0 | int dc; |
458 | 0 | __asm__ __volatile__( |
459 | 0 | OC_LOAD_SUB_8x4(0x00) |
460 | 0 | OC_HADAMARD_8x4 |
461 | 0 | OC_TRANSPOSE_4x4x2(0x00) |
462 | | /*Finish swapping out this 8x4 block to make room for the next one. |
463 | | mm0...mm3 have been swapped out already.*/ |
464 | 0 | "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t" |
465 | 0 | "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t" |
466 | 0 | "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t" |
467 | 0 | "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t" |
468 | 0 | OC_LOAD_SUB_8x4(0x04) |
469 | 0 | OC_HADAMARD_8x4 |
470 | 0 | OC_TRANSPOSE_4x4x2(0x08) |
471 | | /*Here the first 4x4 block of output from the last transpose is the second |
472 | | 4x4 block of input for the next transform. |
473 | | We have cleverly arranged that it already be in the appropriate place, so |
474 | | we only have to do half the loads.*/ |
475 | 0 | "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t" |
476 | 0 | "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t" |
477 | 0 | "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t" |
478 | 0 | "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t" |
479 | | /*We split out the stages here so we can save the DC coefficient in the |
480 | | middle.*/ |
481 | 0 | OC_HADAMARD_AB_8x4 |
482 | 0 | OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38) |
483 | 0 | "movd %%mm1,%[dc]\n\t" |
484 | 0 | OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38) |
485 | | /*Up to this point, everything fit in 16 bits (8 input + 1 for the |
486 | | difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 |
487 | | for the factor of two we dropped + 3 for the vertical accumulation). |
488 | | Now we finally have to promote things to dwords. |
489 | | We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long |
490 | | latency of pmaddwd by starting the next series of loads now.*/ |
491 | 0 | "pmaddwd %%mm7,%%mm0\n\t" |
492 | 0 | "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t" |
493 | 0 | "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t" |
494 | 0 | "movq %%mm0,%%mm4\n\t" |
495 | 0 | "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t" |
496 | 0 | "punpckhdq %%mm0,%%mm0\n\t" |
497 | 0 | "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t" |
498 | 0 | "paddd %%mm0,%%mm4\n\t" |
499 | 0 | "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t" |
500 | 0 | "movd %%mm4,%[ret2]\n\t" |
501 | 0 | "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t" |
502 | 0 | "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t" |
503 | 0 | "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t" |
504 | 0 | OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) |
505 | 0 | "pmaddwd %%mm7,%%mm0\n\t" |
506 | | /*Subtract abs(dc) from 2*ret2.*/ |
507 | 0 | "movsx %w[dc],%[dc]\n\t" |
508 | 0 | "cdq\n\t" |
509 | 0 | "lea (%[ret],%[ret2],2),%[ret2]\n\t" |
510 | 0 | "movq %%mm0,%%mm4\n\t" |
511 | 0 | "punpckhdq %%mm0,%%mm0\n\t" |
512 | 0 | "xor %[dc],%[ret]\n\t" |
513 | 0 | "paddd %%mm0,%%mm4\n\t" |
514 | | /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4 |
515 | | added to them, a factor of two removed, and the DC value included; |
516 | | correct the final sum here.*/ |
517 | 0 | "sub %[ret],%[ret2]\n\t" |
518 | 0 | "movd %%mm4,%[ret]\n\t" |
519 | 0 | "lea -64(%[ret2],%[ret],2),%[ret]\n\t" |
520 | | /*Although it looks like we're using 8 registers here, gcc can alias %[ret] |
521 | | and %[ret2] with some of the inputs, since for once we don't write to |
522 | | them until after we're done using everything but %[buf].*/ |
523 | | /*Note that _src_ystride and _ref_ystride must be given non-overlapping |
524 | | constraints, otherwise if gcc can prove they're equal it will allocate |
525 | | them to the same register (which is bad); _src and _ref face a similar |
526 | | problem, though those are never actually the same.*/ |
527 | 0 | :[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc), |
528 | 0 | [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64)) |
529 | 0 | :[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride), |
530 | 0 | [ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride) |
531 | | /*We have to use neg, so we actually clobber the condition codes for once |
532 | | (not to mention cmp, sub, and add).*/ |
533 | 0 | :"cc" |
534 | 0 | ); |
535 | 0 | *_dc=dc; |
536 | 0 | return ret; |
537 | 0 | } |
538 | | |
539 | | unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src, |
540 | 0 | const unsigned char *_ref,int _ystride){ |
541 | 0 | return oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride); |
542 | 0 | } |
543 | | |
544 | | /*Our internal implementation of frag_copy2 takes an extra stride parameter so |
545 | | we can share code with oc_enc_frag_satd2_mmxext().*/ |
546 | | void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride, |
547 | 15.0M | const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){ |
548 | 15.0M | __asm__ __volatile__( |
549 | | /*Load the first 3 rows.*/ |
550 | 15.0M | "movq (%[src1]),%%mm0\n\t" |
551 | 15.0M | "movq (%[src2]),%%mm1\n\t" |
552 | 15.0M | "movq (%[src1],%[src_ystride]),%%mm2\n\t" |
553 | 15.0M | "lea (%[src1],%[src_ystride],2),%[src1]\n\t" |
554 | 15.0M | "movq (%[src2],%[src_ystride]),%%mm3\n\t" |
555 | 15.0M | "lea (%[src2],%[src_ystride],2),%[src2]\n\t" |
556 | 15.0M | "pxor %%mm7,%%mm7\n\t" |
557 | 15.0M | "movq (%[src1]),%%mm4\n\t" |
558 | 15.0M | "pcmpeqb %%mm6,%%mm6\n\t" |
559 | 15.0M | "movq (%[src2]),%%mm5\n\t" |
560 | | /*mm7={1}x8.*/ |
561 | 15.0M | "psubb %%mm6,%%mm7\n\t" |
562 | | /*Start averaging %%mm0 and %%mm1 into %%mm6.*/ |
563 | 15.0M | "movq %%mm0,%%mm6\n\t" |
564 | 15.0M | "pxor %%mm1,%%mm0\n\t" |
565 | 15.0M | "pavgb %%mm1,%%mm6\n\t" |
566 | | /*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/ |
567 | 15.0M | "movq %%mm2,%%mm1\n\t" |
568 | 15.0M | "pand %%mm7,%%mm0\n\t" |
569 | 15.0M | "pavgb %%mm3,%%mm2\n\t" |
570 | 15.0M | "pxor %%mm3,%%mm1\n\t" |
571 | | /*%%mm3 is free.*/ |
572 | 15.0M | "psubb %%mm0,%%mm6\n\t" |
573 | | /*%%mm0 is free, start loading the next row.*/ |
574 | 15.0M | "movq (%[src1],%[src_ystride]),%%mm0\n\t" |
575 | | /*Start averaging %%mm5 and %%mm4 using %%mm3.*/ |
576 | 15.0M | "movq %%mm4,%%mm3\n\t" |
577 | | /*%%mm6 (row 0) is done; write it out.*/ |
578 | 15.0M | "movq %%mm6,(%[dst])\n\t" |
579 | 15.0M | "pand %%mm7,%%mm1\n\t" |
580 | 15.0M | "pavgb %%mm5,%%mm4\n\t" |
581 | 15.0M | "psubb %%mm1,%%mm2\n\t" |
582 | | /*%%mm1 is free, continue loading the next row.*/ |
583 | 15.0M | "movq (%[src2],%[src_ystride]),%%mm1\n\t" |
584 | 15.0M | "pxor %%mm5,%%mm3\n\t" |
585 | 15.0M | "lea (%[src1],%[src_ystride],2),%[src1]\n\t" |
586 | | /*%%mm2 (row 1) is done; write it out.*/ |
587 | 15.0M | "movq %%mm2,(%[dst],%[dst_ystride])\n\t" |
588 | 15.0M | "pand %%mm7,%%mm3\n\t" |
589 | | /*Start loading the next row.*/ |
590 | 15.0M | "movq (%[src1]),%%mm2\n\t" |
591 | 15.0M | "lea (%[dst],%[dst_ystride],2),%[dst]\n\t" |
592 | 15.0M | "psubb %%mm3,%%mm4\n\t" |
593 | 15.0M | "lea (%[src2],%[src_ystride],2),%[src2]\n\t" |
594 | | /*%%mm4 (row 2) is done; write it out.*/ |
595 | 15.0M | "movq %%mm4,(%[dst])\n\t" |
596 | | /*Continue loading the next row.*/ |
597 | 15.0M | "movq (%[src2]),%%mm3\n\t" |
598 | | /*Start averaging %%mm0 and %%mm1 into %%mm6.*/ |
599 | 15.0M | "movq %%mm0,%%mm6\n\t" |
600 | 15.0M | "pxor %%mm1,%%mm0\n\t" |
601 | | /*Start loading the next row.*/ |
602 | 15.0M | "movq (%[src1],%[src_ystride]),%%mm4\n\t" |
603 | 15.0M | "pavgb %%mm1,%%mm6\n\t" |
604 | | /*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/ |
605 | 15.0M | "movq %%mm2,%%mm1\n\t" |
606 | 15.0M | "pand %%mm7,%%mm0\n\t" |
607 | | /*Continue loading the next row.*/ |
608 | 15.0M | "movq (%[src2],%[src_ystride]),%%mm5\n\t" |
609 | 15.0M | "pavgb %%mm3,%%mm2\n\t" |
610 | 15.0M | "lea (%[src1],%[src_ystride],2),%[src1]\n\t" |
611 | 15.0M | "pxor %%mm3,%%mm1\n\t" |
612 | | /*%%mm3 is free.*/ |
613 | 15.0M | "psubb %%mm0,%%mm6\n\t" |
614 | | /*%%mm0 is free, start loading the next row.*/ |
615 | 15.0M | "movq (%[src1]),%%mm0\n\t" |
616 | | /*Start averaging %%mm5 into %%mm4 using %%mm3.*/ |
617 | 15.0M | "movq %%mm4,%%mm3\n\t" |
618 | | /*%%mm6 (row 3) is done; write it out.*/ |
619 | 15.0M | "movq %%mm6,(%[dst],%[dst_ystride])\n\t" |
620 | 15.0M | "pand %%mm7,%%mm1\n\t" |
621 | 15.0M | "lea (%[src2],%[src_ystride],2),%[src2]\n\t" |
622 | 15.0M | "pavgb %%mm5,%%mm4\n\t" |
623 | 15.0M | "lea (%[dst],%[dst_ystride],2),%[dst]\n\t" |
624 | 15.0M | "psubb %%mm1,%%mm2\n\t" |
625 | | /*%%mm1 is free; continue loading the next row.*/ |
626 | 15.0M | "movq (%[src2]),%%mm1\n\t" |
627 | 15.0M | "pxor %%mm5,%%mm3\n\t" |
628 | | /*%%mm2 (row 4) is done; write it out.*/ |
629 | 15.0M | "movq %%mm2,(%[dst])\n\t" |
630 | 15.0M | "pand %%mm7,%%mm3\n\t" |
631 | | /*Start loading the next row.*/ |
632 | 15.0M | "movq (%[src1],%[src_ystride]),%%mm2\n\t" |
633 | 15.0M | "psubb %%mm3,%%mm4\n\t" |
634 | | /*Start averaging %%mm0 and %%mm1 into %%mm6.*/ |
635 | 15.0M | "movq %%mm0,%%mm6\n\t" |
636 | | /*Continue loading the next row.*/ |
637 | 15.0M | "movq (%[src2],%[src_ystride]),%%mm3\n\t" |
638 | | /*%%mm4 (row 5) is done; write it out.*/ |
639 | 15.0M | "movq %%mm4,(%[dst],%[dst_ystride])\n\t" |
640 | 15.0M | "pxor %%mm1,%%mm0\n\t" |
641 | 15.0M | "pavgb %%mm1,%%mm6\n\t" |
642 | | /*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/ |
643 | 15.0M | "movq %%mm2,%%mm4\n\t" |
644 | 15.0M | "pand %%mm7,%%mm0\n\t" |
645 | 15.0M | "pavgb %%mm3,%%mm2\n\t" |
646 | 15.0M | "pxor %%mm3,%%mm4\n\t" |
647 | 15.0M | "lea (%[dst],%[dst_ystride],2),%[dst]\n\t" |
648 | 15.0M | "psubb %%mm0,%%mm6\n\t" |
649 | 15.0M | "pand %%mm7,%%mm4\n\t" |
650 | | /*%%mm6 (row 6) is done, write it out.*/ |
651 | 15.0M | "movq %%mm6,(%[dst])\n\t" |
652 | 15.0M | "psubb %%mm4,%%mm2\n\t" |
653 | | /*%%mm2 (row 7) is done, write it out.*/ |
654 | 15.0M | "movq %%mm2,(%[dst],%[dst_ystride])\n\t" |
655 | 15.0M | :[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2) |
656 | 15.0M | :[dst_ystride]"r"((ptrdiff_t)_dst_ystride), |
657 | 15.0M | [src_ystride]"r"((ptrdiff_t)_src_ystride) |
658 | 15.0M | :"memory" |
659 | 15.0M | ); |
660 | 15.0M | } |
661 | | |
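For reference, a scalar sketch of what the routine above computes: the truncating average of two 8x8 predictors (the same rounding that the pavgb correction produces). The helper name is ours and illustrative only.

/*Scalar reference for the 8x8 truncating average computed above.*/
static void oc_frag_copy2_c(unsigned char *_dst,int _dst_ystride,
 const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[j]=(unsigned char)((_src1[j]+_src2[j])>>1);
    _dst+=_dst_ystride;
    _src1+=_src_ystride;
    _src2+=_src_ystride;
  }
}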
662 | | unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src, |
663 | 0 | const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){ |
664 | 0 | OC_ALIGN8(unsigned char ref[64]); |
665 | 0 | oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride); |
666 | 0 | return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8); |
667 | 0 | } |
668 | | |
669 | | unsigned oc_enc_frag_intra_satd_mmxext(int *_dc, |
670 | 0 | const unsigned char *_src,int _ystride){ |
671 | 0 | OC_ALIGN8(ogg_int16_t buf[64]); |
672 | 0 | unsigned ret; |
673 | 0 | unsigned ret2; |
674 | 0 | int dc; |
675 | 0 | __asm__ __volatile__( |
676 | 0 | OC_LOAD_8x4(0x00) |
677 | 0 | OC_HADAMARD_8x4 |
678 | 0 | OC_TRANSPOSE_4x4x2(0x00) |
679 | | /*Finish swapping out this 8x4 block to make room for the next one. |
680 | | mm0...mm3 have been swapped out already.*/ |
681 | 0 | "movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t" |
682 | 0 | "movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t" |
683 | 0 | "movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t" |
684 | 0 | "movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t" |
685 | 0 | OC_LOAD_8x4(0x04) |
686 | 0 | OC_HADAMARD_8x4 |
687 | 0 | OC_TRANSPOSE_4x4x2(0x08) |
688 | | /*Here the first 4x4 block of output from the last transpose is the second |
689 | | 4x4 block of input for the next transform. |
690 | | We have cleverly arranged that it already be in the appropriate place, so |
691 | | we only have to do half the loads.*/ |
692 | 0 | "movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t" |
693 | 0 | "movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t" |
694 | 0 | "movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t" |
695 | 0 | "movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t" |
696 | | /*We split out the stages here so we can save the DC coefficient in the |
697 | | middle.*/ |
698 | 0 | OC_HADAMARD_AB_8x4 |
699 | 0 | OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38) |
700 | 0 | "movd %%mm1,%[dc]\n\t" |
701 | 0 | OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38) |
702 | | /*Up to this point, everything fit in 16 bits (8 input + 1 for the |
703 | | difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 |
704 | | for the factor of two we dropped + 3 for the vertical accumulation). |
705 | | Now we finally have to promote things to dwords. |
706 | | We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long |
707 | | latency of pmaddwd by starting the next series of loads now.*/ |
708 | 0 | "pmaddwd %%mm7,%%mm0\n\t" |
709 | 0 | "movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t" |
710 | 0 | "movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t" |
711 | 0 | "movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t" |
712 | 0 | "movq %%mm0,%%mm4\n\t" |
713 | 0 | "movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t" |
714 | 0 | "punpckhdq %%mm0,%%mm0\n\t" |
715 | 0 | "movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t" |
716 | 0 | "paddd %%mm0,%%mm4\n\t" |
717 | 0 | "movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t" |
718 | 0 | "movd %%mm4,%[ret]\n\t" |
719 | 0 | "movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t" |
720 | 0 | "movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t" |
721 | 0 | OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78) |
722 | 0 | "pmaddwd %%mm7,%%mm0\n\t" |
723 | | /*We assume that the DC coefficient is always positive (which is true, |
724 | | because the input to the INTRA transform was not a difference).*/ |
725 | 0 | "movzx %w[dc],%[dc]\n\t" |
726 | 0 | "add %[ret],%[ret]\n\t" |
727 | 0 | "sub %[dc],%[ret]\n\t" |
728 | 0 | "movq %%mm0,%%mm4\n\t" |
729 | 0 | "punpckhdq %%mm0,%%mm0\n\t" |
730 | 0 | "paddd %%mm0,%%mm4\n\t" |
731 | 0 | "movd %%mm4,%[ret2]\n\t" |
732 | 0 | "lea -64(%[ret],%[ret2],2),%[ret]\n\t" |
733 | | /*Although it looks like we're using 8 registers here, gcc can alias %[ret] |
734 | | and %[ret2] with some of the inputs, since for once we don't write to |
735 | | them until after we're done using everything but %[buf] (which is also |
736 | | listed as an output to ensure gcc _doesn't_ alias them against it).*/ |
737 | 0 | :[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc), |
738 | 0 | [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64)) |
739 | 0 | :[src]"r"(_src),[src4]"r"(_src+4*_ystride), |
740 | 0 | [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride) |
741 | | /*We have to use sub, so we actually clobber the condition codes for once |
742 | | (not to mention add).*/ |
743 | 0 | :"cc" |
744 | 0 | ); |
745 | 0 | *_dc=dc; |
746 | 0 | return ret; |
747 | 0 | } |
748 | | |
749 | | void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64], |
750 | 434k | const unsigned char *_src,const unsigned char *_ref,int _ystride){ |
751 | 434k | int i; |
752 | 434k | __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::); |
753 | 2.17M | for(i=4;i-->0;){ |
754 | 1.73M | __asm__ __volatile__( |
755 | | /*mm0=[src]*/ |
756 | 1.73M | "movq (%[src]),%%mm0\n\t" |
757 | | /*mm1=[ref]*/ |
758 | 1.73M | "movq (%[ref]),%%mm1\n\t" |
759 | | /*mm4=[src+ystride]*/ |
760 | 1.73M | "movq (%[src],%[ystride]),%%mm4\n\t" |
761 | | /*mm5=[ref+ystride]*/ |
762 | 1.73M | "movq (%[ref],%[ystride]),%%mm5\n\t" |
763 | | /*Compute [src]-[ref].*/ |
764 | 1.73M | "movq %%mm0,%%mm2\n\t" |
765 | 1.73M | "punpcklbw %%mm7,%%mm0\n\t" |
766 | 1.73M | "movq %%mm1,%%mm3\n\t" |
767 | 1.73M | "punpckhbw %%mm7,%%mm2\n\t" |
768 | 1.73M | "punpcklbw %%mm7,%%mm1\n\t" |
769 | 1.73M | "punpckhbw %%mm7,%%mm3\n\t" |
770 | 1.73M | "psubw %%mm1,%%mm0\n\t" |
771 | 1.73M | "psubw %%mm3,%%mm2\n\t" |
772 | | /*Compute [src+ystride]-[ref+ystride].*/ |
773 | 1.73M | "movq %%mm4,%%mm1\n\t" |
774 | 1.73M | "punpcklbw %%mm7,%%mm4\n\t" |
775 | 1.73M | "movq %%mm5,%%mm3\n\t" |
776 | 1.73M | "punpckhbw %%mm7,%%mm1\n\t" |
777 | 1.73M | "lea (%[src],%[ystride],2),%[src]\n\t" |
778 | 1.73M | "punpcklbw %%mm7,%%mm5\n\t" |
779 | 1.73M | "lea (%[ref],%[ystride],2),%[ref]\n\t" |
780 | 1.73M | "punpckhbw %%mm7,%%mm3\n\t" |
781 | 1.73M | "psubw %%mm5,%%mm4\n\t" |
782 | 1.73M | "psubw %%mm3,%%mm1\n\t" |
783 | | /*Write the answer out.*/ |
784 | 1.73M | "movq %%mm0,0x00(%[residue])\n\t" |
785 | 1.73M | "movq %%mm2,0x08(%[residue])\n\t" |
786 | 1.73M | "movq %%mm4,0x10(%[residue])\n\t" |
787 | 1.73M | "movq %%mm1,0x18(%[residue])\n\t" |
788 | 1.73M | "lea 0x20(%[residue]),%[residue]\n\t" |
789 | 1.73M | :[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref) |
790 | 1.73M | :[ystride]"r"((ptrdiff_t)_ystride) |
791 | 1.73M | :"memory" |
792 | 1.73M | ); |
793 | 1.73M | } |
794 | 434k | } |
795 | | |
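A scalar sketch of the residue computation above (illustrative only; the name is ours):

/*Scalar reference for the 8x8 residue computed above.*/
static void oc_frag_sub_c(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_residue[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
    _src+=_ystride;
    _ref+=_ystride;
  }
}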
796 | | void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64], |
797 | 22.3M | const unsigned char *_src,int _ystride){ |
798 | 22.3M | ptrdiff_t ystride3; |
799 | 22.3M | __asm__ __volatile__( |
800 | | /*mm0=[src]*/ |
801 | 22.3M | "movq (%[src]),%%mm0\n\t" |
802 | | /*mm1=[src+ystride]*/ |
803 | 22.3M | "movq (%[src],%[ystride]),%%mm1\n\t" |
804 | | /*mm6={-1}x4*/ |
805 | 22.3M | "pcmpeqw %%mm6,%%mm6\n\t" |
806 | | /*mm2=[src+2*ystride]*/ |
807 | 22.3M | "movq (%[src],%[ystride],2),%%mm2\n\t" |
808 | | /*[ystride3]=3*[ystride]*/ |
809 | 22.3M | "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" |
810 | | /*mm6={0x8000}x4*/
811 | 22.3M | "psllw $15,%%mm6\n\t" |
812 | | /*mm3=[src+3*ystride]*/ |
813 | 22.3M | "movq (%[src],%[ystride3]),%%mm3\n\t" |
814 | | /*mm6={128}x4*/ |
815 | 22.3M | "psrlw $8,%%mm6\n\t" |
816 | | /*mm7=0*/ |
817 | 22.3M | "pxor %%mm7,%%mm7\n\t" |
818 | | /*[src]=[src]+4*[ystride]*/ |
819 | 22.3M | "lea (%[src],%[ystride],4),%[src]\n\t" |
820 | | /*Compute [src]-128 and [src+ystride]-128*/ |
821 | 22.3M | "movq %%mm0,%%mm4\n\t" |
822 | 22.3M | "punpcklbw %%mm7,%%mm0\n\t" |
823 | 22.3M | "movq %%mm1,%%mm5\n\t" |
824 | 22.3M | "punpckhbw %%mm7,%%mm4\n\t" |
825 | 22.3M | "psubw %%mm6,%%mm0\n\t" |
826 | 22.3M | "punpcklbw %%mm7,%%mm1\n\t" |
827 | 22.3M | "psubw %%mm6,%%mm4\n\t" |
828 | 22.3M | "punpckhbw %%mm7,%%mm5\n\t" |
829 | 22.3M | "psubw %%mm6,%%mm1\n\t" |
830 | 22.3M | "psubw %%mm6,%%mm5\n\t" |
831 | | /*Write the answer out.*/ |
832 | 22.3M | "movq %%mm0,0x00(%[residue])\n\t" |
833 | 22.3M | "movq %%mm4,0x08(%[residue])\n\t" |
834 | 22.3M | "movq %%mm1,0x10(%[residue])\n\t" |
835 | 22.3M | "movq %%mm5,0x18(%[residue])\n\t" |
836 | | /*mm0=[src+4*ystride]*/ |
837 | 22.3M | "movq (%[src]),%%mm0\n\t" |
838 | | /*mm1=[src+5*ystride]*/ |
839 | 22.3M | "movq (%[src],%[ystride]),%%mm1\n\t" |
840 | | /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/ |
841 | 22.3M | "movq %%mm2,%%mm4\n\t" |
842 | 22.3M | "punpcklbw %%mm7,%%mm2\n\t" |
843 | 22.3M | "movq %%mm3,%%mm5\n\t" |
844 | 22.3M | "punpckhbw %%mm7,%%mm4\n\t" |
845 | 22.3M | "psubw %%mm6,%%mm2\n\t" |
846 | 22.3M | "punpcklbw %%mm7,%%mm3\n\t" |
847 | 22.3M | "psubw %%mm6,%%mm4\n\t" |
848 | 22.3M | "punpckhbw %%mm7,%%mm5\n\t" |
849 | 22.3M | "psubw %%mm6,%%mm3\n\t" |
850 | 22.3M | "psubw %%mm6,%%mm5\n\t" |
851 | | /*Write the answer out.*/ |
852 | 22.3M | "movq %%mm2,0x20(%[residue])\n\t" |
853 | 22.3M | "movq %%mm4,0x28(%[residue])\n\t" |
854 | 22.3M | "movq %%mm3,0x30(%[residue])\n\t" |
855 | 22.3M | "movq %%mm5,0x38(%[residue])\n\t" |
856 | | /*mm2=[src+6*ystride]*/ |
857 | 22.3M | "movq (%[src],%[ystride],2),%%mm2\n\t" |
858 | | /*mm3=[src+7*ystride]*/ |
859 | 22.3M | "movq (%[src],%[ystride3]),%%mm3\n\t" |
860 | | /*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/ |
861 | 22.3M | "movq %%mm0,%%mm4\n\t" |
862 | 22.3M | "punpcklbw %%mm7,%%mm0\n\t" |
863 | 22.3M | "movq %%mm1,%%mm5\n\t" |
864 | 22.3M | "punpckhbw %%mm7,%%mm4\n\t" |
865 | 22.3M | "psubw %%mm6,%%mm0\n\t" |
866 | 22.3M | "punpcklbw %%mm7,%%mm1\n\t" |
867 | 22.3M | "psubw %%mm6,%%mm4\n\t" |
868 | 22.3M | "punpckhbw %%mm7,%%mm5\n\t" |
869 | 22.3M | "psubw %%mm6,%%mm1\n\t" |
870 | 22.3M | "psubw %%mm6,%%mm5\n\t" |
871 | | /*Write the answer out.*/ |
872 | 22.3M | "movq %%mm0,0x40(%[residue])\n\t" |
873 | 22.3M | "movq %%mm4,0x48(%[residue])\n\t" |
874 | 22.3M | "movq %%mm1,0x50(%[residue])\n\t" |
875 | 22.3M | "movq %%mm5,0x58(%[residue])\n\t" |
876 | | /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/ |
877 | 22.3M | "movq %%mm2,%%mm4\n\t" |
878 | 22.3M | "punpcklbw %%mm7,%%mm2\n\t" |
879 | 22.3M | "movq %%mm3,%%mm5\n\t" |
880 | 22.3M | "punpckhbw %%mm7,%%mm4\n\t" |
881 | 22.3M | "psubw %%mm6,%%mm2\n\t" |
882 | 22.3M | "punpcklbw %%mm7,%%mm3\n\t" |
883 | 22.3M | "psubw %%mm6,%%mm4\n\t" |
884 | 22.3M | "punpckhbw %%mm7,%%mm5\n\t" |
885 | 22.3M | "psubw %%mm6,%%mm3\n\t" |
886 | 22.3M | "psubw %%mm6,%%mm5\n\t" |
887 | | /*Write the answer out.*/ |
888 | 22.3M | "movq %%mm2,0x60(%[residue])\n\t" |
889 | 22.3M | "movq %%mm4,0x68(%[residue])\n\t" |
890 | 22.3M | "movq %%mm3,0x70(%[residue])\n\t" |
891 | 22.3M | "movq %%mm5,0x78(%[residue])\n\t" |
892 | 22.3M | :[src]"+r"(_src),[ystride3]"=&r"(ystride3) |
893 | 22.3M | :[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride) |
894 | 22.3M | :"memory" |
895 | 22.3M | ); |
896 | 22.3M | } |
897 | | |
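The {128}x4 constant above is built without any loads: pcmpeqw gives 0xFFFF in each word, psllw $15 leaves 0x8000, and psrlw $8 gives 0x0080. A scalar trace (illustrative only; the name is ours):

/*Scalar trace of the register-only construction of the 128 constant above.*/
static unsigned short oc_build_128_c(void){
  unsigned short w;
  w=0xFFFF;                  /*pcmpeqw %%mm6,%%mm6*/
  w=(unsigned short)(w<<15); /*psllw $15: 0x8000*/
  w>>=8;                     /*psrlw $8: 0x0080=128*/
  return w;
}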
898 | | void oc_enc_frag_copy2_mmxext(unsigned char *_dst, |
899 | 0 | const unsigned char *_src1,const unsigned char *_src2,int _ystride){ |
900 | 0 | oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride); |
901 | 0 | } |
902 | | |
903 | | #endif |