/src/theora/lib/x86/sse2fdct.c
Line | Count | Source |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 * |
9 | | * by the Xiph.Org Foundation http://www.xiph.org/ * |
10 | | * * |
11 | | ********************************************************************/ |
12 | | /*SSE2 fDCT implementation for x86_64.*/ |
13 | | /*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/ |
14 | | #include <stddef.h> |
15 | | #include "x86enc.h" |
16 | | #include "x86zigzag.h" |
17 | | #include "sse2trans.h" |
18 | | |
19 | | #if defined(OC_X86_64_ASM) |
20 | | |
/*Computes an 8-point forward DCT on each of the eight 8x16-bit vectors held
   one per register in xmm0...xmm7, leaving the results _y[0]..._y[7] in
   xmm0...xmm7, respectively (see the _y[k] annotations below).
  Entry conditions (set up by the caller): xmm15={0}x8 and xmm14={-1}x8.
  Clobbers xmm8...xmm13 and the general-purpose register bound to %[a], which
   is repeatedly used to materialize packed 16-bit constants for
   pmaddwd/pmulhw/pmullw via movd+pshufd.
  The (t!=0) corrections below are computed as pcmpeqw against xmm15 followed
   by psubw of xmm14 (i.e., subtracting -1 turns the 0/-1 mask into a 0/1
   addend).*/
# define OC_FDCT_8x8 \
 /*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
 "#OC_FDCT_8x8\n\t" \
 /*Stage 1:*/ \
 "movdqa %%xmm0,%%xmm11\n\t" \
 "movdqa %%xmm1,%%xmm10\n\t" \
 "movdqa %%xmm2,%%xmm9\n\t" \
 "movdqa %%xmm3,%%xmm8\n\t" \
 /*xmm11=t7'=t0-t7*/ \
 "psubw %%xmm7,%%xmm11\n\t" \
 /*xmm10=t6'=t1-t6*/ \
 "psubw %%xmm6,%%xmm10\n\t" \
 /*xmm9=t5'=t2-t5*/ \
 "psubw %%xmm5,%%xmm9\n\t" \
 /*xmm8=t4'=t3-t4*/ \
 "psubw %%xmm4,%%xmm8\n\t" \
 /*xmm0=t0'=t0+t7*/ \
 "paddw %%xmm7,%%xmm0\n\t" \
 /*xmm1=t1'=t1+t6*/ \
 "paddw %%xmm6,%%xmm1\n\t" \
 /*xmm5=t2'=t2+t5*/ \
 "paddw %%xmm2,%%xmm5\n\t" \
 /*xmm4=t3'=t3+t4*/ \
 "paddw %%xmm3,%%xmm4\n\t" \
 /*xmm2,3,6,7 are now free.*/ \
 /*Stage 2:*/ \
 "movdqa %%xmm0,%%xmm3\n\t" \
 /*xmm13={0x5A80,0x6A0A}x4: pmaddwd of (t,2) word pairs against this gives \
    t*27146+0xB500, i.e., the scaled rotation plus rounding bias used in \
    Stage 3 below; xmm12 is built up to {2}x8 by the pxor/psubw/paddw.*/ \
 "mov $0x5A806A0A,%[a]\n\t" \
 "movdqa %%xmm1,%%xmm2\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "movdqa %%xmm10,%%xmm6\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 /*xmm2=t2''=t1'-t2'*/ \
 "psubw %%xmm5,%%xmm2\n\t" \
 "pxor %%xmm12,%%xmm12\n\t" \
 /*xmm3=t3''=t0'-t3'*/ \
 "psubw %%xmm4,%%xmm3\n\t" \
 "psubw %%xmm14,%%xmm12\n\t" \
 /*xmm10=t5''=t6'-t5'*/ \
 "psubw %%xmm9,%%xmm10\n\t" \
 "paddw %%xmm12,%%xmm12\n\t" \
 /*xmm4=t0''=t0'+t3'*/ \
 "paddw %%xmm0,%%xmm4\n\t" \
 /*xmm1=t1''=t1'+t2'*/ \
 "paddw %%xmm5,%%xmm1\n\t" \
 /*xmm6=t6''=t6'+t5'*/ \
 "paddw %%xmm9,%%xmm6\n\t" \
 /*xmm0,xmm5,xmm9 are now free.*/ \
 /*Stage 3:*/ \
 /*xmm10:xmm5=t5''*27146+0xB500 \
   xmm0=t5''*/ \
 "movdqa %%xmm10,%%xmm5\n\t" \
 "movdqa %%xmm10,%%xmm0\n\t" \
 "punpckhwd %%xmm12,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "punpcklwd %%xmm12,%%xmm5\n\t" \
 "pmaddwd %%xmm13,%%xmm5\n\t" \
 /*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
 "psrad $16,%%xmm10\n\t" \
 "psrad $16,%%xmm5\n\t" \
 "packssdw %%xmm10,%%xmm5\n\t" \
 "paddw %%xmm0,%%xmm5\n\t" \
 /*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
 "pcmpeqw %%xmm15,%%xmm0\n\t" \
 "psubw %%xmm14,%%xmm0\n\t" \
 "paddw %%xmm5,%%xmm0\n\t" \
 "movdqa %%xmm8,%%xmm5\n\t" \
 "psraw $1,%%xmm0\n\t" \
 /*xmm5=t5'''=t4'-s*/ \
 "psubw %%xmm0,%%xmm5\n\t" \
 /*xmm8=t4''=t4'+s*/ \
 "paddw %%xmm0,%%xmm8\n\t" \
 /*xmm0,xmm7,xmm9,xmm10 are free.*/ \
 /*xmm7:xmm9=t6''*27146+0xB500*/ \
 "movdqa %%xmm6,%%xmm7\n\t" \
 "movdqa %%xmm6,%%xmm9\n\t" \
 "punpckhwd %%xmm12,%%xmm7\n\t" \
 "pmaddwd %%xmm13,%%xmm7\n\t" \
 "punpcklwd %%xmm12,%%xmm9\n\t" \
 "pmaddwd %%xmm13,%%xmm9\n\t" \
 /*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
 "psrad $16,%%xmm7\n\t" \
 "psrad $16,%%xmm9\n\t" \
 "packssdw %%xmm7,%%xmm9\n\t" \
 "paddw %%xmm6,%%xmm9\n\t" \
 /*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
 "pcmpeqw %%xmm15,%%xmm6\n\t" \
 "psubw %%xmm14,%%xmm6\n\t" \
 "paddw %%xmm6,%%xmm9\n\t" \
 "movdqa %%xmm11,%%xmm7\n\t" \
 "psraw $1,%%xmm9\n\t" \
 /*xmm7=t6'''=t7'-s*/ \
 "psubw %%xmm9,%%xmm7\n\t" \
 /*xmm9=t7''=t7'+s*/ \
 "paddw %%xmm11,%%xmm9\n\t" \
 /*xmm0,xmm6,xmm10,xmm11 are free.*/ \
 /*Stage 4:*/ \
 /*xmm10:xmm0=t1''*27146+0xB500*/ \
 "movdqa %%xmm1,%%xmm0\n\t" \
 "movdqa %%xmm1,%%xmm10\n\t" \
 "punpcklwd %%xmm12,%%xmm0\n\t" \
 "pmaddwd %%xmm13,%%xmm0\n\t" \
 "punpckhwd %%xmm12,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 /*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
 "psrad $16,%%xmm0\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "mov $0x20006A0A,%[a]\n\t" \
 "packssdw %%xmm10,%%xmm0\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddw %%xmm1,%%xmm0\n\t" \
 /*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "psubw %%xmm14,%%xmm1\n\t" \
 "paddw %%xmm1,%%xmm0\n\t" \
 /*xmm10:xmm4=t0''*27146+0x4000*/ \
 "movdqa %%xmm4,%%xmm1\n\t" \
 "movdqa %%xmm4,%%xmm10\n\t" \
 "punpcklwd %%xmm12,%%xmm4\n\t" \
 "pmaddwd %%xmm13,%%xmm4\n\t" \
 "punpckhwd %%xmm12,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 /*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
 "psrad $16,%%xmm4\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "mov $0x6CB7,%[a]\n\t" \
 "packssdw %%xmm10,%%xmm4\n\t" \
 "movd %[a],%%xmm12\n\t" \
 "paddw %%xmm1,%%xmm4\n\t" \
 /*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
 "psubw %%xmm14,%%xmm1\n\t" \
 "mov $0x7FFF6C84,%[a]\n\t" \
 "paddw %%xmm1,%%xmm4\n\t" \
 /*xmm0=_y[0]=u=r+s>>1 \
   The naive implementation could cause overflow, so we use \
   u=(r&s)+((r^s)>>1).*/ \
 "movdqa %%xmm0,%%xmm6\n\t" \
 "pxor %%xmm4,%%xmm0\n\t" \
 "pand %%xmm4,%%xmm6\n\t" \
 "psraw $1,%%xmm0\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddw %%xmm6,%%xmm0\n\t" \
 /*xmm4=_y[4]=v=r-u*/ \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "psubw %%xmm0,%%xmm4\n\t" \
 /*xmm1,xmm6,xmm10,xmm11 are free.*/ \
 /*xmm6:xmm10=60547*t3''+0x6CB7*/ \
 "movdqa %%xmm3,%%xmm10\n\t" \
 "movdqa %%xmm3,%%xmm6\n\t" \
 "punpcklwd %%xmm3,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "mov $0x61F861F8,%[a]\n\t" \
 "punpckhwd %%xmm3,%%xmm6\n\t" \
 "pmaddwd %%xmm13,%%xmm6\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm6\n\t" \
 /*xmm1:xmm2=25080*t2'' \
   xmm12=t2''*/ \
 "movdqa %%xmm2,%%xmm11\n\t" \
 "movdqa %%xmm2,%%xmm12\n\t" \
 "pmullw %%xmm13,%%xmm2\n\t" \
 "pmulhw %%xmm13,%%xmm11\n\t" \
 "movdqa %%xmm2,%%xmm1\n\t" \
 "punpcklwd %%xmm11,%%xmm2\n\t" \
 "punpckhwd %%xmm11,%%xmm1\n\t" \
 /*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
 "paddd %%xmm2,%%xmm10\n\t" \
 "paddd %%xmm1,%%xmm6\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "pcmpeqw %%xmm15,%%xmm3\n\t" \
 "psrad $16,%%xmm6\n\t" \
 "psubw %%xmm14,%%xmm3\n\t" \
 "packssdw %%xmm6,%%xmm10\n\t" \
 "paddw %%xmm3,%%xmm10\n\t" \
 /*xmm2=_y[2]=u \
   xmm10=s=(25080*u>>16)-t2''*/ \
 "movdqa %%xmm10,%%xmm2\n\t" \
 "pmulhw %%xmm13,%%xmm10\n\t" \
 "psubw %%xmm12,%%xmm10\n\t" \
 /*xmm1:xmm6=s*21600+0x2800*/ \
 "pxor %%xmm12,%%xmm12\n\t" \
 "psubw %%xmm14,%%xmm12\n\t" \
 "mov $0x28005460,%[a]\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "movdqa %%xmm10,%%xmm6\n\t" \
 "movdqa %%xmm10,%%xmm1\n\t" \
 "punpcklwd %%xmm12,%%xmm6\n\t" \
 "pmaddwd %%xmm13,%%xmm6\n\t" \
 "mov $0x0E3D,%[a]\n\t" \
 "punpckhwd %%xmm12,%%xmm1\n\t" \
 "pmaddwd %%xmm13,%%xmm1\n\t" \
 /*xmm6=(s*21600+0x2800>>18)+s*/ \
 "psrad $18,%%xmm6\n\t" \
 "psrad $18,%%xmm1\n\t" \
 "movd %[a],%%xmm12\n\t" \
 "packssdw %%xmm1,%%xmm6\n\t" \
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
 "paddw %%xmm10,%%xmm6\n\t" \
 /*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
 "mov $0x7FFF54DC,%[a]\n\t" \
 "pcmpeqw %%xmm15,%%xmm10\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "psubw %%xmm14,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddw %%xmm10,%%xmm6\n\t " \
 /*xmm1,xmm3,xmm10,xmm11 are free.*/ \
 /*xmm11:xmm10=54491*t5'''+0x0E3D*/ \
 "movdqa %%xmm5,%%xmm10\n\t" \
 "movdqa %%xmm5,%%xmm11\n\t" \
 "punpcklwd %%xmm5,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "mov $0x8E3A8E3A,%[a]\n\t" \
 "punpckhwd %%xmm5,%%xmm11\n\t" \
 "pmaddwd %%xmm13,%%xmm11\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm11\n\t" \
 /*xmm7:xmm12=36410*t6''' \
   xmm1=t6'''*/ \
 "movdqa %%xmm7,%%xmm3\n\t" \
 "movdqa %%xmm7,%%xmm1\n\t" \
 "pmulhw %%xmm13,%%xmm3\n\t" \
 "pmullw %%xmm13,%%xmm7\n\t" \
 "paddw %%xmm1,%%xmm3\n\t" \
 "movdqa %%xmm7,%%xmm12\n\t" \
 "punpckhwd %%xmm3,%%xmm7\n\t" \
 "punpcklwd %%xmm3,%%xmm12\n\t" \
 /*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
 "paddd %%xmm12,%%xmm10\n\t" \
 "paddd %%xmm7,%%xmm11\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "pcmpeqw %%xmm15,%%xmm5\n\t" \
 "psrad $16,%%xmm11\n\t" \
 "psubw %%xmm14,%%xmm5\n\t" \
 "packssdw %%xmm11,%%xmm10\n\t" \
 "pxor %%xmm12,%%xmm12\n\t" \
 "paddw %%xmm5,%%xmm10\n\t" \
 /*xmm5=_y[5]=u \
   xmm1=s=t6'''-(36410*u>>16)*/ \
 "psubw %%xmm14,%%xmm12\n\t" \
 "movdqa %%xmm10,%%xmm5\n\t" \
 "mov $0x340067C8,%[a]\n\t" \
 "pmulhw %%xmm13,%%xmm10\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddw %%xmm5,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "psubw %%xmm10,%%xmm1\n\t" \
 /*xmm11:xmm3=s*26568+0x3400*/ \
 "movdqa %%xmm1,%%xmm3\n\t" \
 "movdqa %%xmm1,%%xmm11\n\t" \
 "punpcklwd %%xmm12,%%xmm3\n\t" \
 "pmaddwd %%xmm13,%%xmm3\n\t" \
 "mov $0x7B1B,%[a]\n\t" \
 "punpckhwd %%xmm12,%%xmm11\n\t" \
 "pmaddwd %%xmm13,%%xmm11\n\t" \
 /*xmm3=(s*26568+0x3400>>17)+s*/ \
 "psrad $17,%%xmm3\n\t" \
 "psrad $17,%%xmm11\n\t" \
 "movd %[a],%%xmm12\n\t" \
 "packssdw %%xmm11,%%xmm3\n\t" \
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
 "paddw %%xmm1,%%xmm3\n\t" \
 /*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
 "mov $0x7FFF7B16,%[a]\n\t" \
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "psubw %%xmm14,%%xmm1\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddw %%xmm1,%%xmm3\n\t " \
 /*xmm1,xmm7,xmm10,xmm11 are free.*/ \
 /*xmm11:xmm10=64277*t7''+0x7B1B*/ \
 "movdqa %%xmm9,%%xmm10\n\t" \
 "movdqa %%xmm9,%%xmm11\n\t" \
 "punpcklwd %%xmm9,%%xmm10\n\t" \
 "pmaddwd %%xmm13,%%xmm10\n\t" \
 "mov $0x31F131F1,%[a]\n\t" \
 "punpckhwd %%xmm9,%%xmm11\n\t" \
 "pmaddwd %%xmm13,%%xmm11\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 "paddd %%xmm12,%%xmm11\n\t" \
 /*xmm12:xmm7=12785*t4''*/ \
 "movdqa %%xmm8,%%xmm7\n\t" \
 "movdqa %%xmm8,%%xmm1\n\t" \
 "pmullw %%xmm13,%%xmm7\n\t" \
 "pmulhw %%xmm13,%%xmm1\n\t" \
 "movdqa %%xmm7,%%xmm12\n\t" \
 "punpcklwd %%xmm1,%%xmm7\n\t" \
 "punpckhwd %%xmm1,%%xmm12\n\t" \
 /*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
 "paddd %%xmm7,%%xmm10\n\t" \
 "paddd %%xmm12,%%xmm11\n\t" \
 "psrad $16,%%xmm10\n\t" \
 "pcmpeqw %%xmm15,%%xmm9\n\t" \
 "psrad $16,%%xmm11\n\t" \
 "psubw %%xmm14,%%xmm9\n\t" \
 "packssdw %%xmm11,%%xmm10\n\t" \
 "pxor %%xmm12,%%xmm12\n\t" \
 "paddw %%xmm9,%%xmm10\n\t" \
 /*xmm1=_y[1]=u \
   xmm10=s=(12785*u>>16)-t4''*/ \
 "psubw %%xmm14,%%xmm12\n\t" \
 "movdqa %%xmm10,%%xmm1\n\t" \
 "mov $0x3000503B,%[a]\n\t" \
 "pmulhw %%xmm13,%%xmm10\n\t" \
 "movd %[a],%%xmm13\n\t" \
 "psubw %%xmm8,%%xmm10\n\t" \
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
 /*xmm8:xmm7=s*20539+0x3000*/ \
 "movdqa %%xmm10,%%xmm7\n\t" \
 "movdqa %%xmm10,%%xmm8\n\t" \
 "punpcklwd %%xmm12,%%xmm7\n\t" \
 "pmaddwd %%xmm13,%%xmm7\n\t" \
 "punpckhwd %%xmm12,%%xmm8\n\t" \
 "pmaddwd %%xmm13,%%xmm8\n\t" \
 /*xmm7=(s*20539+0x3000>>20)+s*/ \
 "psrad $20,%%xmm7\n\t" \
 "psrad $20,%%xmm8\n\t" \
 "packssdw %%xmm8,%%xmm7\n\t" \
 "paddw %%xmm10,%%xmm7\n\t" \
 /*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
 "pcmpeqw %%xmm15,%%xmm10\n\t" \
 "psubw %%xmm14,%%xmm10\n\t" \
 "paddw %%xmm10,%%xmm7\n\t " \

/*SSE2 implementation of the fDCT for x86-64 only.
  Because of the 8 extra XMM registers on x86-64, this version can operate
   without any temporary stack access at all.
  _y: Output coefficient buffer (64 16-bit values); filled by the final
       transpose/zig-zag/store macro, so per that macro's name the
       coefficients are written in zig-zag order.  Presumably it must be
       16-byte aligned like _x -- the stores live in
       OC_TRANSPOSE_ZIG_ZAG_MMXEXT; confirm there.
  _x: Input block (64 16-bit values); MUST be 16-byte aligned, since it is
       loaded with movdqa, which faults on unaligned addresses.*/
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
  /*Scratch GP register: the asm uses %[a] purely to build the packed
     constants it splats into XMM registers, and to form the small +1 biases
     below; declared early-clobber ("=&r") since it is written before the
     inputs are consumed.*/
  ptrdiff_t a;
  __asm__ __volatile__(
    /*Load the input.*/
    "movdqa 0x00(%[x]),%%xmm0\n\t"
    "movdqa 0x10(%[x]),%%xmm1\n\t"
    "movdqa 0x20(%[x]),%%xmm2\n\t"
    "movdqa 0x30(%[x]),%%xmm3\n\t"
    "movdqa 0x40(%[x]),%%xmm4\n\t"
    "movdqa 0x50(%[x]),%%xmm5\n\t"
    "movdqa 0x60(%[x]),%%xmm6\n\t"
    "movdqa 0x70(%[x]),%%xmm7\n\t"
    /*Add two extra bits of working precision to improve accuracy; any more and
       we could overflow.*/
    /*We also add a few biases to correct for some systematic error that
       remains in the full fDCT->iDCT round trip.*/
    /*xmm15={0}x8*/
    "pxor %%xmm15,%%xmm15\n\t"
    /*xmm14={-1}x8*/
    "pcmpeqb %%xmm14,%%xmm14\n\t"
    /*The psllw $2 on each row below provides the two extra precision bits;
       interleaved with them, xmm8/xmm9/xmm10 build the bias vector applied
       to rows 0 and 1 only.*/
    "psllw $2,%%xmm0\n\t"
    /*xmm8=xmm0*/
    "movdqa %%xmm0,%%xmm8\n\t"
    "psllw $2,%%xmm1\n\t"
    /*xmm8={_x[7...0]==0}*/
    "pcmpeqw %%xmm15,%%xmm8\n\t"
    "psllw $2,%%xmm2\n\t"
    /*xmm8={_x[7...0]!=0}*/
    "psubw %%xmm14,%%xmm8\n\t"
    "psllw $2,%%xmm3\n\t"
    /*%[a]=1*/
    "mov $1,%[a]\n\t"
    /*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
    "pslld $16,%%xmm8\n\t"
    "psllw $2,%%xmm4\n\t"
    /*xmm9={0,0,0,0,0,0,0,1}*/
    "movd %[a],%%xmm9\n\t"
    /*xmm8={0,0,_x[2]!=0,0,_x[0]!=0,0}*/
    "pshufhw $0x00,%%xmm8,%%xmm8\n\t"
    "psllw $2,%%xmm5\n\t"
    /*%[a]={1}x2*/
    "mov $0x10001,%[a]\n\t"
    /*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
    "pshuflw $0x01,%%xmm8,%%xmm8\n\t"
    "psllw $2,%%xmm6\n\t"
    /*xmm10={0,0,0,0,0,0,1,1}*/
    "movd %[a],%%xmm10\n\t"
    /*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
    "paddw %%xmm8,%%xmm0\n\t"
    "psllw $2,%%xmm7\n\t"
    /*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
    "paddw %%xmm10,%%xmm0\n\t"
    /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
    "psubw %%xmm9,%%xmm1\n\t"
    /*Transform columns.*/
    OC_FDCT_8x8
    /*Transform rows.*/
    OC_TRANSPOSE_8x8
    OC_FDCT_8x8
    /*Undo the two extra precision bits with rounding: subtracting {-2}x8
       before the arithmetic shift right by 2 rounds each value up.*/
    /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
    "paddw %%xmm14,%%xmm14\n\t"
    "psubw %%xmm14,%%xmm0\n\t"
    "psubw %%xmm14,%%xmm1\n\t"
    "psraw $2,%%xmm0\n\t"
    "psubw %%xmm14,%%xmm2\n\t"
    "psraw $2,%%xmm1\n\t"
    "psubw %%xmm14,%%xmm3\n\t"
    "psraw $2,%%xmm2\n\t"
    "psubw %%xmm14,%%xmm4\n\t"
    "psraw $2,%%xmm3\n\t"
    "psubw %%xmm14,%%xmm5\n\t"
    "psraw $2,%%xmm4\n\t"
    "psubw %%xmm14,%%xmm6\n\t"
    "psraw $2,%%xmm5\n\t"
    "psubw %%xmm14,%%xmm7\n\t"
    "psraw $2,%%xmm6\n\t"
    "psraw $2,%%xmm7\n\t"
    /*Transpose, zig-zag, and store the result.*/
    /*We could probably do better using SSSE3's palignr, but re-using MMXEXT
       version will do for now.*/
    /*These two hooks feed the MMXEXT zig-zag macro: each moves the low (or,
       after the punpckhqdq swap, high) 64 bits of an XMM row into an MMX
       register.
      NOTE(review): movdq2q puts the FPU into MMX state; an emms is
       presumably issued inside OC_TRANSPOSE_ZIG_ZAG_MMXEXT -- confirm
       there.*/
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
    "movdq2q %%xmm"#_row","_reg"\n\t" \

#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
    "punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \
    "movdq2q %%xmm"#_row","_reg"\n\t" \

    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
#undef OC_ZZ_LOAD_ROW_LO
#undef OC_ZZ_LOAD_ROW_HI
    /*_y is only passed in as an address in a register; the "memory" clobber
       is what tells the compiler the asm writes through it.*/
    :[a]"=&r"(a)
    :[y]"r"(_y),[x]"r"(_x)
    :"memory"
  );
}
452 | | #endif |