Coverage Report

Created: 2024-09-06 07:53

/src/theora/lib/x86/sse2fdct.c
Line
Count
Source
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
9
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10
 *                                                                  *
11
 ********************************************************************/
12
/*SSE2 fDCT implementation for x86_64.*/
13
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
14
#include <stddef.h>
15
#include "x86enc.h"
16
#include "x86zigzag.h"
17
#include "sse2trans.h"
18
19
#if defined(OC_X86_64_ASM)
20
21
# define OC_FDCT_8x8 \
22
 /*One 8-point fDCT pass over eight vectors of 16-bit lanes. \
   Input: xmm0...xmm7=t0...t7. \
   Output: xmm0...xmm7=_y[0]..._y[7]. \
   Scratch: xmm8...xmm13 and %[a] are overwritten. \
   Note: xmm15={0}x8 and xmm14={-1}x8 must already be set up; they are only \
    read here.*/ \
23
 "#OC_FDCT_8x8\n\t" \
24
 /*Stage 1:*/ \
25
 "movdqa %%xmm0,%%xmm11\n\t" \
26
 "movdqa %%xmm1,%%xmm10\n\t" \
27
 "movdqa %%xmm2,%%xmm9\n\t" \
28
 "movdqa %%xmm3,%%xmm8\n\t" \
29
 /*xmm11=t7'=t0-t7*/ \
30
 "psubw %%xmm7,%%xmm11\n\t" \
31
 /*xmm10=t6'=t1-t6*/ \
32
 "psubw %%xmm6,%%xmm10\n\t" \
33
 /*xmm9=t5'=t2-t5*/ \
34
 "psubw %%xmm5,%%xmm9\n\t" \
35
 /*xmm8=t4'=t3-t4*/ \
36
 "psubw %%xmm4,%%xmm8\n\t" \
37
 /*xmm0=t0'=t0+t7*/ \
38
 "paddw %%xmm7,%%xmm0\n\t" \
39
 /*xmm1=t1'=t1+t6*/ \
40
 "paddw %%xmm6,%%xmm1\n\t" \
41
 /*xmm5=t2'=t2+t5*/ \
42
 "paddw %%xmm2,%%xmm5\n\t" \
43
 /*xmm4=t3'=t3+t4*/ \
44
 "paddw %%xmm3,%%xmm4\n\t" \
45
 /*xmm2,3,6,7 are now free.*/ \
46
 /*Stage 2:*/ \
 /*Interleaved with the butterflies below, the constants for Stage 3 are \
    built: xmm13={0x6A0A,0x5A80}x4 (the multiplier 27146 and half of the \
    0xB500 bias, as a pmaddwd operand) and xmm12={2}x8 (0-(-1), doubled).*/ \
47
 "movdqa %%xmm0,%%xmm3\n\t" \
48
 "mov $0x5A806A0A,%[a]\n\t" \
49
 "movdqa %%xmm1,%%xmm2\n\t" \
50
 "movd %[a],%%xmm13\n\t" \
51
 "movdqa %%xmm10,%%xmm6\n\t" \
52
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
53
 /*xmm2=t2''=t1'-t2'*/ \
54
 "psubw %%xmm5,%%xmm2\n\t" \
55
 "pxor %%xmm12,%%xmm12\n\t" \
56
 /*xmm3=t3''=t0'-t3'*/ \
57
 "psubw %%xmm4,%%xmm3\n\t" \
58
 "psubw %%xmm14,%%xmm12\n\t" \
59
 /*xmm10=t5''=t6'-t5'*/ \
60
 "psubw %%xmm9,%%xmm10\n\t" \
61
 "paddw %%xmm12,%%xmm12\n\t" \
62
 /*xmm4=t0''=t0'+t3'*/ \
63
 "paddw %%xmm0,%%xmm4\n\t" \
64
 /*xmm1=t1''=t1'+t2'*/ \
65
 "paddw %%xmm5,%%xmm1\n\t" \
66
 /*xmm6=t6''=t6'+t5'*/ \
67
 "paddw %%xmm9,%%xmm6\n\t" \
68
 /*xmm0,xmm5,xmm9 are now free.*/ \
69
 /*Stage 3:*/ \
70
 /*xmm10:xmm5=t5''*27146+0xB500 \
71
   xmm0=t5''*/ \
72
 "movdqa %%xmm10,%%xmm5\n\t" \
73
 "movdqa %%xmm10,%%xmm0\n\t" \
74
 "punpckhwd %%xmm12,%%xmm10\n\t" \
75
 "pmaddwd %%xmm13,%%xmm10\n\t" \
76
 "punpcklwd %%xmm12,%%xmm5\n\t" \
77
 "pmaddwd %%xmm13,%%xmm5\n\t" \
78
 /*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
79
 "psrad $16,%%xmm10\n\t" \
80
 "psrad $16,%%xmm5\n\t" \
81
 "packssdw %%xmm10,%%xmm5\n\t" \
82
 "paddw %%xmm0,%%xmm5\n\t" \
83
 /*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
84
 "pcmpeqw %%xmm15,%%xmm0\n\t" \
85
 "psubw %%xmm14,%%xmm0\n\t" \
86
 "paddw %%xmm5,%%xmm0\n\t" \
87
 "movdqa %%xmm8,%%xmm5\n\t" \
88
 "psraw $1,%%xmm0\n\t" \
89
 /*xmm5=t5'''=t4'-s*/ \
90
 "psubw %%xmm0,%%xmm5\n\t" \
91
 /*xmm8=t4''=t4'+s*/ \
92
 "paddw %%xmm0,%%xmm8\n\t" \
93
 /*xmm0,xmm7,xmm9,xmm10 are free.*/ \
94
 /*xmm7:xmm9=t6''*27146+0xB500*/ \
95
 "movdqa %%xmm6,%%xmm7\n\t" \
96
 "movdqa %%xmm6,%%xmm9\n\t" \
97
 "punpckhwd %%xmm12,%%xmm7\n\t" \
98
 "pmaddwd %%xmm13,%%xmm7\n\t" \
99
 "punpcklwd %%xmm12,%%xmm9\n\t" \
100
 "pmaddwd %%xmm13,%%xmm9\n\t" \
101
 /*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
102
 "psrad $16,%%xmm7\n\t" \
103
 "psrad $16,%%xmm9\n\t" \
104
 "packssdw %%xmm7,%%xmm9\n\t" \
105
 "paddw %%xmm6,%%xmm9\n\t" \
106
 /*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
107
 "pcmpeqw %%xmm15,%%xmm6\n\t" \
108
 "psubw %%xmm14,%%xmm6\n\t" \
109
 "paddw %%xmm6,%%xmm9\n\t" \
110
 "movdqa %%xmm11,%%xmm7\n\t" \
111
 "psraw $1,%%xmm9\n\t" \
112
 /*xmm7=t6'''=t7'-s*/ \
113
 "psubw %%xmm9,%%xmm7\n\t" \
114
 /*xmm9=t7''=t7'+s*/ \
115
 "paddw %%xmm11,%%xmm9\n\t" \
116
 /*xmm0,xmm6,xmm10,xmm11 are free.*/ \
117
 /*Stage 4:*/ \
118
 /*xmm10:xmm0=t1''*27146+0xB500*/ \
119
 "movdqa %%xmm1,%%xmm0\n\t" \
120
 "movdqa %%xmm1,%%xmm10\n\t" \
121
 "punpcklwd %%xmm12,%%xmm0\n\t" \
122
 "pmaddwd %%xmm13,%%xmm0\n\t" \
123
 "punpckhwd %%xmm12,%%xmm10\n\t" \
124
 "pmaddwd %%xmm13,%%xmm10\n\t" \
125
 /*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
126
 "psrad $16,%%xmm0\n\t" \
127
 "psrad $16,%%xmm10\n\t" \
128
 "mov $0x20006A0A,%[a]\n\t" \
129
 "packssdw %%xmm10,%%xmm0\n\t" \
130
 "movd %[a],%%xmm13\n\t" \
131
 "paddw %%xmm1,%%xmm0\n\t" \
132
 /*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
133
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
134
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
135
 "psubw %%xmm14,%%xmm1\n\t" \
136
 "paddw %%xmm1,%%xmm0\n\t" \
137
 /*xmm10:xmm4=t0''*27146+0x4000*/ \
138
 "movdqa %%xmm4,%%xmm1\n\t" \
139
 "movdqa %%xmm4,%%xmm10\n\t" \
140
 "punpcklwd %%xmm12,%%xmm4\n\t" \
141
 "pmaddwd %%xmm13,%%xmm4\n\t" \
142
 "punpckhwd %%xmm12,%%xmm10\n\t" \
143
 "pmaddwd %%xmm13,%%xmm10\n\t" \
144
 /*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
145
 "psrad $16,%%xmm4\n\t" \
146
 "psrad $16,%%xmm10\n\t" \
147
 "mov $0x6CB7,%[a]\n\t" \
148
 "packssdw %%xmm10,%%xmm4\n\t" \
149
 "movd %[a],%%xmm12\n\t" \
150
 "paddw %%xmm1,%%xmm4\n\t" \
151
 /*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
152
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
153
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
154
 "psubw %%xmm14,%%xmm1\n\t" \
155
 "mov $0x7FFF6C84,%[a]\n\t" \
156
 "paddw %%xmm1,%%xmm4\n\t" \
157
 /*xmm0=_y[0]=u=r+s>>1 \
158
   The naive implementation could cause overflow, so we use \
159
    u=(r&s)+((r^s)>>1).*/ \
160
 "movdqa %%xmm0,%%xmm6\n\t" \
161
 "pxor %%xmm4,%%xmm0\n\t" \
162
 "pand %%xmm4,%%xmm6\n\t" \
163
 "psraw $1,%%xmm0\n\t" \
164
 "movd %[a],%%xmm13\n\t" \
165
 "paddw %%xmm6,%%xmm0\n\t" \
166
 /*xmm4=_y[4]=v=r-u*/ \
167
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
168
 "psubw %%xmm0,%%xmm4\n\t" \
169
 /*xmm1,xmm6,xmm10,xmm11 are free.*/ \
170
 /*xmm6:xmm10=60547*t3''+0x6CB7*/ \
171
 "movdqa %%xmm3,%%xmm10\n\t" \
172
 "movdqa %%xmm3,%%xmm6\n\t" \
173
 "punpcklwd %%xmm3,%%xmm10\n\t" \
174
 "pmaddwd %%xmm13,%%xmm10\n\t" \
175
 "mov $0x61F861F8,%[a]\n\t" \
176
 "punpckhwd %%xmm3,%%xmm6\n\t" \
177
 "pmaddwd %%xmm13,%%xmm6\n\t" \
178
 "movd %[a],%%xmm13\n\t" \
179
 "paddd %%xmm12,%%xmm10\n\t" \
180
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
181
 "paddd %%xmm12,%%xmm6\n\t" \
182
 /*xmm1:xmm2=25080*t2'' \
183
   xmm12=t2''*/ \
184
 "movdqa %%xmm2,%%xmm11\n\t" \
185
 "movdqa %%xmm2,%%xmm12\n\t" \
186
 "pmullw %%xmm13,%%xmm2\n\t" \
187
 "pmulhw %%xmm13,%%xmm11\n\t" \
188
 "movdqa %%xmm2,%%xmm1\n\t" \
189
 "punpcklwd %%xmm11,%%xmm2\n\t" \
190
 "punpckhwd %%xmm11,%%xmm1\n\t" \
191
 /*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
192
 "paddd %%xmm2,%%xmm10\n\t" \
193
 "paddd %%xmm1,%%xmm6\n\t" \
194
 "psrad $16,%%xmm10\n\t" \
195
 "pcmpeqw %%xmm15,%%xmm3\n\t" \
196
 "psrad $16,%%xmm6\n\t" \
197
 "psubw %%xmm14,%%xmm3\n\t" \
198
 "packssdw %%xmm6,%%xmm10\n\t" \
199
 "paddw %%xmm3,%%xmm10\n\t" \
200
 /*xmm2=_y[2]=u \
201
   xmm10=s=(25080*u>>16)-t2''*/ \
202
 "movdqa %%xmm10,%%xmm2\n\t" \
203
 "pmulhw %%xmm13,%%xmm10\n\t" \
204
 "psubw %%xmm12,%%xmm10\n\t" \
205
 /*xmm1:xmm6=s*21600+0x2800*/ \
 /*xmm12 is rebuilt as {1}x8 here, so the bias word of xmm13 is added \
    exactly once by pmaddwd.*/ \
206
 "pxor %%xmm12,%%xmm12\n\t" \
207
 "psubw %%xmm14,%%xmm12\n\t" \
208
 "mov $0x28005460,%[a]\n\t" \
209
 "movd %[a],%%xmm13\n\t" \
210
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
211
 "movdqa %%xmm10,%%xmm6\n\t" \
212
 "movdqa %%xmm10,%%xmm1\n\t" \
213
 "punpcklwd %%xmm12,%%xmm6\n\t" \
214
 "pmaddwd %%xmm13,%%xmm6\n\t" \
215
 "mov $0x0E3D,%[a]\n\t" \
216
 "punpckhwd %%xmm12,%%xmm1\n\t" \
217
 "pmaddwd %%xmm13,%%xmm1\n\t" \
218
 /*xmm6=(s*21600+0x2800>>18)+s*/ \
219
 "psrad $18,%%xmm6\n\t" \
220
 "psrad $18,%%xmm1\n\t" \
221
 "movd %[a],%%xmm12\n\t" \
222
 "packssdw %%xmm1,%%xmm6\n\t" \
223
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
224
 "paddw %%xmm10,%%xmm6\n\t" \
225
 /*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
226
 "mov $0x7FFF54DC,%[a]\n\t" \
227
 "pcmpeqw %%xmm15,%%xmm10\n\t" \
228
 "movd %[a],%%xmm13\n\t" \
229
 "psubw %%xmm14,%%xmm10\n\t" \
230
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
231
 "paddw %%xmm10,%%xmm6\n\t " \
232
 /*xmm1,xmm3,xmm10,xmm11 are free.*/ \
233
 /*xmm11:xmm10=54491*t5'''+0x0E3D*/ \
234
 "movdqa %%xmm5,%%xmm10\n\t" \
235
 "movdqa %%xmm5,%%xmm11\n\t" \
236
 "punpcklwd %%xmm5,%%xmm10\n\t" \
237
 "pmaddwd %%xmm13,%%xmm10\n\t" \
238
 "mov $0x8E3A8E3A,%[a]\n\t" \
239
 "punpckhwd %%xmm5,%%xmm11\n\t" \
240
 "pmaddwd %%xmm13,%%xmm11\n\t" \
241
 "movd %[a],%%xmm13\n\t" \
242
 "paddd %%xmm12,%%xmm10\n\t" \
243
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
244
 "paddd %%xmm12,%%xmm11\n\t" \
245
 /*xmm7:xmm12=36410*t6''' \
246
   xmm1=t6'''*/ \
247
 "movdqa %%xmm7,%%xmm3\n\t" \
248
 "movdqa %%xmm7,%%xmm1\n\t" \
249
 "pmulhw %%xmm13,%%xmm3\n\t" \
250
 "pmullw %%xmm13,%%xmm7\n\t" \
251
 /*0x8E3A (36410) is negative as a signed word, so the signed high half \
    from pmulhw is short by t6'''; add it back.*/ \
 "paddw %%xmm1,%%xmm3\n\t" \
252
 "movdqa %%xmm7,%%xmm12\n\t" \
253
 "punpckhwd %%xmm3,%%xmm7\n\t" \
254
 "punpcklwd %%xmm3,%%xmm12\n\t" \
255
 /*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
256
 "paddd %%xmm12,%%xmm10\n\t" \
257
 "paddd %%xmm7,%%xmm11\n\t" \
258
 "psrad $16,%%xmm10\n\t" \
259
 "pcmpeqw %%xmm15,%%xmm5\n\t" \
260
 "psrad $16,%%xmm11\n\t" \
261
 "psubw %%xmm14,%%xmm5\n\t" \
262
 "packssdw %%xmm11,%%xmm10\n\t" \
263
 "pxor %%xmm12,%%xmm12\n\t" \
264
 "paddw %%xmm5,%%xmm10\n\t" \
265
 /*xmm5=_y[5]=u \
266
   xmm1=s=t6'''-(36410*u>>16)*/ \
267
 "psubw %%xmm14,%%xmm12\n\t" \
268
 "movdqa %%xmm10,%%xmm5\n\t" \
269
 "mov $0x340067C8,%[a]\n\t" \
270
 "pmulhw %%xmm13,%%xmm10\n\t" \
271
 "movd %[a],%%xmm13\n\t" \
272
 /*Same signed-multiplier correction as above: add u to the pmulhw result.*/ \
 "paddw %%xmm5,%%xmm10\n\t" \
273
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
274
 "psubw %%xmm10,%%xmm1\n\t" \
275
 /*xmm11:xmm3=s*26568+0x3400*/ \
276
 "movdqa %%xmm1,%%xmm3\n\t" \
277
 "movdqa %%xmm1,%%xmm11\n\t" \
278
 "punpcklwd %%xmm12,%%xmm3\n\t" \
279
 "pmaddwd %%xmm13,%%xmm3\n\t" \
280
 "mov $0x7B1B,%[a]\n\t" \
281
 "punpckhwd %%xmm12,%%xmm11\n\t" \
282
 "pmaddwd %%xmm13,%%xmm11\n\t" \
283
 /*xmm3=(s*26568+0x3400>>17)+s*/ \
284
 "psrad $17,%%xmm3\n\t" \
285
 "psrad $17,%%xmm11\n\t" \
286
 "movd %[a],%%xmm12\n\t" \
287
 "packssdw %%xmm11,%%xmm3\n\t" \
288
 "pshufd $00,%%xmm12,%%xmm12\n\t" \
289
 "paddw %%xmm1,%%xmm3\n\t" \
290
 /*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
291
 "mov $0x7FFF7B16,%[a]\n\t" \
292
 "pcmpeqw %%xmm15,%%xmm1\n\t" \
293
 "movd %[a],%%xmm13\n\t" \
294
 "psubw %%xmm14,%%xmm1\n\t" \
295
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
296
 "paddw %%xmm1,%%xmm3\n\t " \
297
 /*xmm1,xmm7,xmm10,xmm11 are free.*/ \
298
 /*xmm11:xmm10=64277*t7''+0x7B1B*/ \
299
 "movdqa %%xmm9,%%xmm10\n\t" \
300
 "movdqa %%xmm9,%%xmm11\n\t" \
301
 "punpcklwd %%xmm9,%%xmm10\n\t" \
302
 "pmaddwd %%xmm13,%%xmm10\n\t" \
303
 "mov $0x31F131F1,%[a]\n\t" \
304
 "punpckhwd %%xmm9,%%xmm11\n\t" \
305
 "pmaddwd %%xmm13,%%xmm11\n\t" \
306
 "movd %[a],%%xmm13\n\t" \
307
 "paddd %%xmm12,%%xmm10\n\t" \
308
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
309
 "paddd %%xmm12,%%xmm11\n\t" \
310
 /*xmm12:xmm7=12785*t4''*/ \
311
 "movdqa %%xmm8,%%xmm7\n\t" \
312
 "movdqa %%xmm8,%%xmm1\n\t" \
313
 "pmullw %%xmm13,%%xmm7\n\t" \
314
 "pmulhw %%xmm13,%%xmm1\n\t" \
315
 "movdqa %%xmm7,%%xmm12\n\t" \
316
 "punpcklwd %%xmm1,%%xmm7\n\t" \
317
 "punpckhwd %%xmm1,%%xmm12\n\t" \
318
 /*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
319
 "paddd %%xmm7,%%xmm10\n\t" \
320
 "paddd %%xmm12,%%xmm11\n\t" \
321
 "psrad $16,%%xmm10\n\t" \
322
 "pcmpeqw %%xmm15,%%xmm9\n\t" \
323
 "psrad $16,%%xmm11\n\t" \
324
 "psubw %%xmm14,%%xmm9\n\t" \
325
 "packssdw %%xmm11,%%xmm10\n\t" \
326
 "pxor %%xmm12,%%xmm12\n\t" \
327
 "paddw %%xmm9,%%xmm10\n\t" \
328
 /*xmm1=_y[1]=u \
329
   xmm10=s=(12785*u>>16)-t4''*/ \
330
 "psubw %%xmm14,%%xmm12\n\t" \
331
 "movdqa %%xmm10,%%xmm1\n\t" \
332
 "mov $0x3000503B,%[a]\n\t" \
333
 "pmulhw %%xmm13,%%xmm10\n\t" \
334
 "movd %[a],%%xmm13\n\t" \
335
 "psubw %%xmm8,%%xmm10\n\t" \
336
 "pshufd $00,%%xmm13,%%xmm13\n\t" \
337
 /*xmm8:xmm7=s*20539+0x3000*/ \
338
 "movdqa %%xmm10,%%xmm7\n\t" \
339
 "movdqa %%xmm10,%%xmm8\n\t" \
340
 "punpcklwd %%xmm12,%%xmm7\n\t" \
341
 "pmaddwd %%xmm13,%%xmm7\n\t" \
342
 "punpckhwd %%xmm12,%%xmm8\n\t" \
343
 "pmaddwd %%xmm13,%%xmm8\n\t" \
344
 /*xmm7=(s*20539+0x3000>>20)+s*/ \
345
 "psrad $20,%%xmm7\n\t" \
346
 "psrad $20,%%xmm8\n\t" \
347
 "packssdw %%xmm8,%%xmm7\n\t" \
348
 "paddw %%xmm10,%%xmm7\n\t" \
349
 /*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
350
 "pcmpeqw %%xmm15,%%xmm10\n\t" \
351
 "psubw %%xmm14,%%xmm10\n\t" \
352
 "paddw %%xmm10,%%xmm7\n\t " \
353
354
/*SSE2 implementation of the fDCT for x86-64 only.
355
  Because of the 8 extra XMM registers on x86-64, this version can operate
356
   without any temporary stack access at all.
  _y: The destination for the 64 output coefficients; presumably written
       through %[y] by OC_TRANSPOSE_ZIG_ZAG_MMXEXT, which is defined outside
       this file.
  _x: The 64 input values.
      They are read with movdqa as eight 16-byte rows, so the pointer must be
       16-byte aligned.
  NOTE(review): The asm statement writes xmm0...xmm15 (and MMX registers, via
   movdq2q) but lists only "memory" as clobbered; confirm that this is safe
   for every caller and ABI this file is built for.*/
357
24.6M
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
358
24.6M
  ptrdiff_t a;
359
24.6M
  __asm__ __volatile__(
360
    /*Load the input, one row of eight 16-bit values per register.
      movdqa requires %[x] to be 16-byte aligned.*/
361
24.6M
    "movdqa 0x00(%[x]),%%xmm0\n\t"
362
24.6M
    "movdqa 0x10(%[x]),%%xmm1\n\t"
363
24.6M
    "movdqa 0x20(%[x]),%%xmm2\n\t"
364
24.6M
    "movdqa 0x30(%[x]),%%xmm3\n\t"
365
24.6M
    "movdqa 0x40(%[x]),%%xmm4\n\t"
366
24.6M
    "movdqa 0x50(%[x]),%%xmm5\n\t"
367
24.6M
    "movdqa 0x60(%[x]),%%xmm6\n\t"
368
24.6M
    "movdqa 0x70(%[x]),%%xmm7\n\t"
369
    /*Add two extra bits of working precision to improve accuracy; any more and
370
       we could overflow.*/
371
    /*We also add a few biases to correct for some systematic error that
372
       remains in the full fDCT->iDCT round trip.*/
373
    /*xmm15={0}x8*/
374
24.6M
    "pxor %%xmm15,%%xmm15\n\t"
375
    /*xmm14={-1}x8*/
376
24.6M
    "pcmpeqb %%xmm14,%%xmm14\n\t"
377
24.6M
    "psllw $2,%%xmm0\n\t"
378
    /*xmm8=xmm0*/
379
24.6M
    "movdqa %%xmm0,%%xmm8\n\t"
380
24.6M
    "psllw $2,%%xmm1\n\t"
381
    /*xmm8={_x[7...0]==0}*/
382
24.6M
    "pcmpeqw %%xmm15,%%xmm8\n\t"
383
24.6M
    "psllw $2,%%xmm2\n\t"
384
    /*xmm8={_x[7...0]!=0}*/
385
24.6M
    "psubw %%xmm14,%%xmm8\n\t"
386
24.6M
    "psllw $2,%%xmm3\n\t"
387
    /*%[a]=1*/
388
24.6M
    "mov $1,%[a]\n\t"
389
    /*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
390
24.6M
    "pslld $16,%%xmm8\n\t"
391
24.6M
    "psllw $2,%%xmm4\n\t"
392
    /*xmm9={0,0,0,0,0,0,0,1}*/
393
24.6M
    "movd %[a],%%xmm9\n\t"
394
    /*xmm8={0,0,0,0,_x[2]!=0,0,_x[0]!=0,0}*/
395
24.6M
    "pshufhw $0x00,%%xmm8,%%xmm8\n\t"
396
24.6M
    "psllw $2,%%xmm5\n\t"
397
    /*%[a]={1}x2*/
398
24.6M
    "mov $0x10001,%[a]\n\t"
399
    /*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
400
24.6M
    "pshuflw $0x01,%%xmm8,%%xmm8\n\t"
401
24.6M
    "psllw $2,%%xmm6\n\t"
402
    /*xmm10={0,0,0,0,0,0,1,1}*/
403
24.6M
    "movd %[a],%%xmm10\n\t"
404
    /*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
405
24.6M
    "paddw %%xmm8,%%xmm0\n\t"
406
24.6M
    "psllw $2,%%xmm7\n\t"
407
    /*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
408
24.6M
    "paddw %%xmm10,%%xmm0\n\t"
409
    /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
410
24.6M
    "psubw %%xmm9,%%xmm1\n\t"
411
    /*Transform columns.*/
412
24.6M
    OC_FDCT_8x8
413
    /*Transform rows.*/
414
24.6M
    OC_TRANSPOSE_8x8
415
24.6M
    OC_FDCT_8x8
416
    /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
    /*Drop the two extra bits of precision with rounding: v=(v+2)>>2, where
       the +2 is done by subtracting xmm14.*/
417
24.6M
    "paddw %%xmm14,%%xmm14\n\t"
418
24.6M
    "psubw %%xmm14,%%xmm0\n\t"
419
24.6M
    "psubw %%xmm14,%%xmm1\n\t"
420
24.6M
    "psraw $2,%%xmm0\n\t"
421
24.6M
    "psubw %%xmm14,%%xmm2\n\t"
422
24.6M
    "psraw $2,%%xmm1\n\t"
423
24.6M
    "psubw %%xmm14,%%xmm3\n\t"
424
24.6M
    "psraw $2,%%xmm2\n\t"
425
24.6M
    "psubw %%xmm14,%%xmm4\n\t"
426
24.6M
    "psraw $2,%%xmm3\n\t"
427
24.6M
    "psubw %%xmm14,%%xmm5\n\t"
428
24.6M
    "psraw $2,%%xmm4\n\t"
429
24.6M
    "psubw %%xmm14,%%xmm6\n\t"
430
24.6M
    "psraw $2,%%xmm5\n\t"
431
24.6M
    "psubw %%xmm14,%%xmm7\n\t"
432
24.6M
    "psraw $2,%%xmm6\n\t"
433
24.6M
    "psraw $2,%%xmm7\n\t"
434
    /*Transpose, zig-zag, and store the result.*/
435
    /*We could probably do better using SSSE3's palignr, but re-using the
436
       MMXEXT version will do for now.*/
437
24.6M
    /*Copy the low four words of xmm<_row> into the MMX register _reg.*/
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
438
24.6M
    "movdq2q %%xmm"#_row","_reg"\n\t" \
439
24.6M
440
24.6M
    /*Copy the high four words of xmm<_row> into the MMX register _reg.
      punpckhqdq first replaces the low half of xmm<_row> with its high half,
       so the original low four words are no longer available afterwards.*/
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
441
24.6M
    "punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \
442
24.6M
    "movdq2q %%xmm"#_row","_reg"\n\t" \
443
24.6M
444
24.6M
    OC_TRANSPOSE_ZIG_ZAG_MMXEXT
445
24.6M
#undef OC_ZZ_LOAD_ROW_LO
446
24.6M
#undef OC_ZZ_LOAD_ROW_HI
447
24.6M
    :[a]"=&r"(a)
448
24.6M
    :[y]"r"(_y),[x]"r"(_x)
449
24.6M
    :"memory"
450
24.6M
  );
451
24.6M
}
452
#endif