/src/theora/lib/x86/sse2idct.c
Line | Count | Source |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
10 | | * * |
11 | | ******************************************************************** |
12 | | |
13 | | function: |
14 | | last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ |
15 | | |
16 | | ********************************************************************/ |
17 | | |
18 | | /*SSE2 acceleration of Theora's iDCT.*/ |
19 | | #include "x86int.h" |
20 | | #include "sse2trans.h" |
21 | | #include "../dct.h" |
22 | | |
23 | | #if defined(OC_X86_ASM) |
24 | | |
25 | | /*A table of constants used by the MMX and SSE2 routines below.*/
26 | | const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={ |
27 | | 8, 8, 8, 8, 8, 8, 8, 8, |
28 | | OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7, |
29 | | OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6, |
30 | | OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5, |
31 | | OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4, |
32 | | OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3, |
33 | | OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2, |
34 | | OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1 |
35 | | }; |
36 | | |
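 | | /*The rows of OC_IDCT_CONSTS hold a rounding bias of 8 followed by the Q16
 | |    cosine constants OC_C1S7..OC_C7S1 from dct.h.
 | |   Illustrative scalar sketch (for exposition only, not part of the original
 | |    routines): the rotations below multiply by these constants with pmulhw,
 | |    which treats both operands as signed, so a constant of 0x8000 or more
 | |    (e.g., OC_C1S7) is read as its value minus 0x10000; the assembly
 | |    compensates with an extra paddw of the input afterwards.
 | |   The helper name below is hypothetical.*/
 | | static ogg_int16_t oc_mulhw_q16_scalar(ogg_int16_t _x,unsigned short _c){
 | |   /*High word of the signed 16x16 product, exactly what pmulhw computes.*/
 | |   ogg_int32_t hi=(ogg_int32_t)_x*(ogg_int16_t)_c>>16;
 | |   /*Constants of 1/2 or more do not fit in a signed word; add the input back
 | |      to recover the unsigned product, matching the paddw in the macros.*/
 | |   return (ogg_int16_t)(_c&0x8000?hi+_x:hi);
 | | }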
37 | | |
38 | | /*Performs the first three stages of the iDCT. |
39 | | xmm2, xmm6, xmm3, and xmm5 must contain rows 2, 6, 3, and 5 of the input,
40 | | respectively (they are accessed in that order).
41 | | The remaining rows must be in _x at their corresponding locations. |
42 | | On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 |
43 | | contain rows 4 through 7.*/ |
44 | | #define OC_IDCT_8x8_ABC(_x) \ |
45 | | "#OC_IDCT_8x8_ABC\n\t" \ |
46 | | /*Stage 1:*/ \ |
47 | | /*2-3 rotation by 6pi/16. \ |
48 | | xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \ |
49 | | "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \ |
50 | | "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \ |
51 | | "movdqa %%xmm1,%%xmm0\n\t" \ |
52 | | "pmulhw %%xmm2,%%xmm1\n\t" \ |
53 | | "movdqa %%xmm4,%%xmm7\n\t" \ |
54 | | "pmulhw %%xmm6,%%xmm0\n\t" \ |
55 | | "pmulhw %%xmm2,%%xmm7\n\t" \ |
56 | | "pmulhw %%xmm6,%%xmm4\n\t" \ |
57 | | "paddw %%xmm6,%%xmm0\n\t" \ |
58 | | "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \ |
59 | | "paddw %%xmm1,%%xmm2\n\t" \ |
60 | | "psubw %%xmm0,%%xmm7\n\t" \ |
61 | | "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
62 | | "paddw %%xmm4,%%xmm2\n\t" \ |
63 | | "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \ |
64 | | "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
65 | | /*5-6 rotation by 3pi/16. \ |
66 | | xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \ |
67 | | "movdqa %%xmm4,%%xmm2\n\t" \ |
68 | | "movdqa %%xmm6,%%xmm1\n\t" \ |
69 | | "pmulhw %%xmm3,%%xmm4\n\t" \ |
70 | | "pmulhw %%xmm5,%%xmm1\n\t" \ |
71 | | "pmulhw %%xmm3,%%xmm6\n\t" \ |
72 | | "pmulhw %%xmm5,%%xmm2\n\t" \ |
73 | | "paddw %%xmm3,%%xmm4\n\t" \ |
74 | | "paddw %%xmm5,%%xmm3\n\t" \ |
75 | | "paddw %%xmm6,%%xmm3\n\t" \ |
76 | | "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \ |
77 | | "paddw %%xmm5,%%xmm1\n\t" \ |
78 | | "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \ |
79 | | "paddw %%xmm3,%%xmm2\n\t" \ |
80 | | "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ |
81 | | "psubw %%xmm4,%%xmm1\n\t" \ |
82 | | "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \ |
83 | | /*4-7 rotation by 7pi/16. \ |
84 | | xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \ |
85 | | "movdqa %%xmm3,%%xmm0\n\t" \ |
86 | | "movdqa %%xmm4,%%xmm7\n\t" \ |
87 | | "pmulhw %%xmm5,%%xmm3\n\t" \ |
88 | | "pmulhw %%xmm5,%%xmm7\n\t" \ |
89 | | "pmulhw %%xmm6,%%xmm4\n\t" \ |
90 | | "pmulhw %%xmm6,%%xmm0\n\t" \ |
91 | | "paddw %%xmm6,%%xmm4\n\t" \ |
92 | | "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \ |
93 | | "paddw %%xmm5,%%xmm7\n\t" \ |
94 | | "psubw %%xmm4,%%xmm3\n\t" \ |
95 | | "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ |
96 | | "paddw %%xmm7,%%xmm0\n\t" \ |
97 | | "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \ |
98 | | /*0-1 butterfly. \ |
99 | | xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \ |
100 | | "paddw %%xmm7,%%xmm6\n\t" \ |
101 | | "movdqa %%xmm4,%%xmm5\n\t" \ |
102 | | "pmulhw %%xmm6,%%xmm4\n\t" \ |
103 | | "paddw %%xmm7,%%xmm7\n\t" \ |
104 | | "psubw %%xmm6,%%xmm7\n\t" \ |
105 | | "paddw %%xmm6,%%xmm4\n\t" \ |
106 | | /*Stage 2:*/ \ |
107 | | /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \ |
108 | | 7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \ |
109 | | "movdqa %%xmm3,%%xmm6\n\t" \ |
110 | | "paddw %%xmm1,%%xmm3\n\t" \ |
111 | | "psubw %%xmm1,%%xmm6\n\t" \ |
112 | | "movdqa %%xmm5,%%xmm1\n\t" \ |
113 | | "pmulhw %%xmm7,%%xmm5\n\t" \ |
114 | | "paddw %%xmm7,%%xmm5\n\t" \ |
115 | | "movdqa %%xmm0,%%xmm7\n\t" \ |
116 | | "paddw %%xmm2,%%xmm0\n\t" \ |
117 | | "psubw %%xmm2,%%xmm7\n\t" \ |
118 | | "movdqa %%xmm1,%%xmm2\n\t" \ |
119 | | "pmulhw %%xmm6,%%xmm1\n\t" \ |
120 | | "pmulhw %%xmm7,%%xmm2\n\t" \ |
121 | | "paddw %%xmm6,%%xmm1\n\t" \ |
122 | | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ |
123 | | "paddw %%xmm7,%%xmm2\n\t" \ |
124 | | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ |
125 | | /*Stage 3: \ |
126 | | 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ |
127 | | 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ |
128 | | 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ |
129 | | "paddw %%xmm2,%%xmm1\n\t" \ |
130 | | "paddw %%xmm5,%%xmm6\n\t" \ |
131 | | "paddw %%xmm4,%%xmm7\n\t" \ |
132 | | "paddw %%xmm2,%%xmm2\n\t" \ |
133 | | "paddw %%xmm4,%%xmm4\n\t" \ |
134 | | "paddw %%xmm5,%%xmm5\n\t" \ |
135 | | "psubw %%xmm1,%%xmm2\n\t" \ |
136 | | "psubw %%xmm7,%%xmm4\n\t" \ |
137 | | "psubw %%xmm6,%%xmm5\n\t" \ |
138 | | |
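 | | /*For reference only (hypothetical helper, not used by the library): a scalar
 | |    rendering of the 2-3 rotation at the top of OC_IDCT_8x8_ABC, written with
 | |    oc_mulhw_q16_scalar() from above; the macro performs the same arithmetic
 | |    on eight lanes at once and spills the results to buf.*/
 | | static void oc_rotate23_scalar(ogg_int16_t *_t2,ogg_int16_t *_t3,
 | |  ogg_int16_t _x2,ogg_int16_t _x6){
 | |   *_t2=(ogg_int16_t)(oc_mulhw_q16_scalar(_x2,OC_C6S2)
 | |    -oc_mulhw_q16_scalar(_x6,OC_C2S6));
 | |   *_t3=(ogg_int16_t)(oc_mulhw_q16_scalar(_x2,OC_C2S6)
 | |    +oc_mulhw_q16_scalar(_x6,OC_C6S2));
 | | }
 | | 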
139 | | /*Performs the last stage of the iDCT. |
140 | | On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 |
141 | | contain rows 4 through 7. |
142 | | On output, xmm0 through xmm7 contain the corresponding rows.*/ |
143 | | #define OC_IDCT_8x8_D \ |
144 | | "#OC_IDCT_8x8_D\n\t" \ |
145 | | /*Stage 4: \ |
146 | | 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ |
147 | | 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ |
148 | | 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ |
149 | | 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ |
150 | | "psubw %%xmm0,%%xmm7\n\t" \ |
151 | | "psubw %%xmm1,%%xmm6\n\t" \ |
152 | | "psubw %%xmm2,%%xmm5\n\t" \ |
153 | | "psubw %%xmm3,%%xmm4\n\t" \ |
154 | | "paddw %%xmm0,%%xmm0\n\t" \ |
155 | | "paddw %%xmm1,%%xmm1\n\t" \ |
156 | | "paddw %%xmm2,%%xmm2\n\t" \ |
157 | | "paddw %%xmm3,%%xmm3\n\t" \ |
158 | | "paddw %%xmm7,%%xmm0\n\t" \ |
159 | | "paddw %%xmm6,%%xmm1\n\t" \ |
160 | | "paddw %%xmm5,%%xmm2\n\t" \ |
161 | | "paddw %%xmm4,%%xmm3\n\t" \ |
162 | | |
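 | | /*The stage-4 butterflies above are computed in place, with no scratch
 | |    register: a-=b, then b+=b, then b+=a leaves a=a0-b0 and b=a0+b0 (modulo
 | |    2^16, like the packed-word arithmetic).
 | |   A scalar sketch of one such butterfly (the function name is ours):*/
 | | static void oc_butterfly_inplace_scalar(ogg_int16_t *_a,ogg_int16_t *_b){
 | |   *_a=(ogg_int16_t)(*_a-*_b);
 | |   *_b=(ogg_int16_t)(*_b+*_b);
 | |   *_b=(ogg_int16_t)(*_b+*_a);
 | | }
 | | 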
163 | | /*Performs the last stage of the iDCT, adds the rounding bias of 8, scales
164 | | each result down by a factor of 16, and stores the rows to y.
165 | | On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
166 | | contain rows 4 through 7.*/
167 | | #define OC_IDCT_8x8_D_STORE \ |
168 | | "#OC_IDCT_8x8_D_STORE\n\t" \ |
169 | | /*Stage 4: \ |
170 | | 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ |
171 | | 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ |
172 | | 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ |
173 | | 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ |
174 | | "psubw %%xmm3,%%xmm4\n\t" \ |
175 | | "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ |
176 | | "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \ |
177 | | "psubw %%xmm0,%%xmm7\n\t" \ |
178 | | "psubw %%xmm1,%%xmm6\n\t" \ |
179 | | "psubw %%xmm2,%%xmm5\n\t" \ |
180 | | "paddw %%xmm4,%%xmm7\n\t" \ |
181 | | "paddw %%xmm4,%%xmm6\n\t" \ |
182 | | "paddw %%xmm4,%%xmm5\n\t" \ |
183 | | "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \ |
184 | | "paddw %%xmm0,%%xmm0\n\t" \ |
185 | | "paddw %%xmm1,%%xmm1\n\t" \ |
186 | | "paddw %%xmm2,%%xmm2\n\t" \ |
187 | | "paddw %%xmm3,%%xmm3\n\t" \ |
188 | | "paddw %%xmm7,%%xmm0\n\t" \ |
189 | | "paddw %%xmm6,%%xmm1\n\t" \ |
190 | | "psraw $4,%%xmm0\n\t" \ |
191 | | "paddw %%xmm5,%%xmm2\n\t" \ |
192 | | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \ |
193 | | "psraw $4,%%xmm1\n\t" \ |
194 | | "paddw %%xmm4,%%xmm3\n\t" \ |
195 | | "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \ |
196 | | "psraw $4,%%xmm2\n\t" \ |
197 | | "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \ |
198 | | "psraw $4,%%xmm3\n\t" \ |
199 | | "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \ |
200 | | "psraw $4,%%xmm4\n\t" \ |
201 | | "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ |
202 | | "psraw $4,%%xmm5\n\t" \ |
203 | | "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \ |
204 | | "psraw $4,%%xmm6\n\t" \ |
205 | | "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \ |
206 | | "psraw $4,%%xmm7\n\t" \ |
207 | | "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \ |
208 | | |
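 | | /*OC_IDCT_8x8_D_STORE differs from OC_IDCT_8x8_D only in that it also adds
 | |    the bias of 8 (the first row of OC_IDCT_CONSTS) and arithmetic-shifts each
 | |    row right by 4 before writing it to y, i.e., a rounded division by 16 that
 | |    removes the scale factor accumulated over the two passes.
 | |   In scalar terms (hypothetical helper, for illustration only):*/
 | | static ogg_int16_t oc_descale_scalar(ogg_int16_t _t){
 | |   return (ogg_int16_t)((_t+8)>>4);
 | | }
 | | 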
209 | 4.78M | static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
210 | 4.78M | OC_ALIGN16(ogg_int16_t buf[16]); |
211 | 4.78M | int i; |
212 | | /*This routine accepts an 8x8 matrix pre-transposed.*/ |
213 | 4.78M | __asm__ __volatile__( |
214 | | /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/ |
215 | 4.78M | "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t" |
216 | 4.78M | "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t" |
217 | 4.78M | "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t" |
218 | 4.78M | "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t" |
219 | 4.78M | OC_IDCT_8x8_ABC(x) |
220 | 4.78M | OC_IDCT_8x8_D |
221 | 4.78M | OC_TRANSPOSE_8x8 |
222 | | /*Spill rows 0, 1, 4, and 7 to y; the first stage of the second pass reloads them.*/
223 | 4.78M | "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" |
224 | 4.78M | "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" |
225 | 4.78M | "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" |
226 | 4.78M | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" |
227 | 4.78M | OC_IDCT_8x8_ABC(y) |
228 | 4.78M | OC_IDCT_8x8_D_STORE |
229 | 4.78M | :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)), |
230 | 4.78M | [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) |
231 | 4.78M | :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)), |
232 | 4.78M | [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) |
233 | 4.78M | ); |
234 | 4.78M | __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::); |
235 | | /*Clear input data for next block (decoder only).*/ |
236 | 14.3M | for(i=0;i<2;i++){ |
237 | 9.57M | __asm__ __volatile__( |
238 | 9.57M | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t" |
239 | 9.57M | "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t" |
240 | 9.57M | "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t" |
241 | 9.57M | "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t" |
242 | 9.57M | :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32)) |
243 | 9.57M | ); |
244 | 9.57M | } |
245 | 4.78M | } |
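 | | 
 | | /*For reference, the two-iteration loop above simply zeroes all 64 input
 | |    coefficients (four 16-byte stores, i.e. 64 bytes, per iteration); a plain C
 | |    equivalent, not what the library uses, would be
 | |    memset(_x,0,sizeof(_x[0])*64).*/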
246 | | |
247 | | /*With at most 10 zigzag-ordered coefficients, only the upper-left 4x4 corner
248 | | of the block is non-zero, so the first step of the 10-coefficient version of
 | | the 8x8 iDCT only needs to work with four columns at a time (see the sparsity
 | | sketch after this macro).
249 | | Doing this in MMX is faster on processors with a 64-bit data path.*/
250 | | #define OC_IDCT_8x8_10_MMX \ |
251 | | "#OC_IDCT_8x8_10_MMX\n\t" \ |
252 | | /*Stage 1:*/ \ |
253 | | /*2-3 rotation by 6pi/16. \ |
254 | | mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \ |
255 | | "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ |
256 | | "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \ |
257 | | "pmulhw %%mm2,%%mm6\n\t" \ |
258 | | "pmulhw %%mm2,%%mm7\n\t" \ |
259 | | "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \ |
260 | | "paddw %%mm6,%%mm2\n\t" \ |
261 | | "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
262 | | "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \ |
263 | | "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
264 | | /*5-6 rotation by 3pi/16. \ |
265 | | mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \ |
266 | | "pmulhw %%mm3,%%mm5\n\t" \ |
267 | | "pmulhw %%mm3,%%mm2\n\t" \ |
268 | | "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \ |
269 | | "paddw %%mm3,%%mm5\n\t" \ |
270 | | "paddw %%mm3,%%mm2\n\t" \ |
271 | | "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ |
272 | | /*4-7 rotation by 7pi/16. \ |
273 | | mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \ |
274 | | "pmulhw %%mm1,%%mm3\n\t" \ |
275 | | "pmulhw %%mm1,%%mm7\n\t" \ |
276 | | "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ |
277 | | "movq %%mm3,%%mm6\n\t" \ |
278 | | "paddw %%mm1,%%mm7\n\t" \ |
279 | | /*0-1 butterfly. \ |
280 | | mm4=C4, mm0=X0, X4=0.*/ \ |
281 | | /*Stage 2:*/ \ |
282 | | /*4-5 butterfly: mm3=t[4], mm5=t[5] \ |
283 | | 7-6 butterfly: mm2=t[6], mm7=t[7]*/ \ |
284 | | "psubw %%mm5,%%mm3\n\t" \ |
285 | | "paddw %%mm5,%%mm6\n\t" \ |
286 | | "movq %%mm4,%%mm1\n\t" \ |
287 | | "pmulhw %%mm0,%%mm4\n\t" \ |
288 | | "paddw %%mm0,%%mm4\n\t" \ |
289 | | "movq %%mm7,%%mm0\n\t" \ |
290 | | "movq %%mm4,%%mm5\n\t" \ |
291 | | "paddw %%mm2,%%mm0\n\t" \ |
292 | | "psubw %%mm2,%%mm7\n\t" \ |
293 | | "movq %%mm1,%%mm2\n\t" \ |
294 | | "pmulhw %%mm6,%%mm1\n\t" \ |
295 | | "pmulhw %%mm7,%%mm2\n\t" \ |
296 | | "paddw %%mm6,%%mm1\n\t" \ |
297 | | "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \ |
298 | | "paddw %%mm7,%%mm2\n\t" \ |
299 | | "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \ |
300 | | /*Stage 3: \ |
301 | | 6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \ |
302 | | 0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \ |
303 | | 1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \ |
304 | | "paddw %%mm2,%%mm1\n\t" \ |
305 | | "paddw %%mm5,%%mm6\n\t" \ |
306 | | "paddw %%mm4,%%mm7\n\t" \ |
307 | | "paddw %%mm2,%%mm2\n\t" \ |
308 | | "paddw %%mm4,%%mm4\n\t" \ |
309 | | "paddw %%mm5,%%mm5\n\t" \ |
310 | | "psubw %%mm1,%%mm2\n\t" \ |
311 | | "psubw %%mm7,%%mm4\n\t" \ |
312 | | "psubw %%mm6,%%mm5\n\t" \ |
313 | | /*Stage 4: \ |
314 | | 0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \ |
315 | | 1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \ |
316 | | 2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \ |
317 | | 3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \ |
318 | | "psubw %%mm0,%%mm7\n\t" \ |
319 | | "psubw %%mm1,%%mm6\n\t" \ |
320 | | "psubw %%mm2,%%mm5\n\t" \ |
321 | | "psubw %%mm3,%%mm4\n\t" \ |
322 | | "paddw %%mm0,%%mm0\n\t" \ |
323 | | "paddw %%mm1,%%mm1\n\t" \ |
324 | | "paddw %%mm2,%%mm2\n\t" \ |
325 | | "paddw %%mm3,%%mm3\n\t" \ |
326 | | "paddw %%mm7,%%mm0\n\t" \ |
327 | | "paddw %%mm6,%%mm1\n\t" \ |
328 | | "paddw %%mm5,%%mm2\n\t" \ |
329 | | "paddw %%mm4,%%mm3\n\t" \ |
330 | | |
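 | | /*The 10-coefficient path reads only the first four entries of each of the
 | |    first four rows of the pre-transposed input: the movq loads in
 | |    oc_idct8x8_10_sse2 below touch bytes 0x00-0x07, 0x10-0x17, 0x20-0x27, and
 | |    0x30-0x37 of x, and everything else is assumed to be zero.
 | |   A hypothetical debug-only check of that assumption (not part of the
 | |    library) might look like:*/
 | | static int oc_idct8x8_10_input_is_sparse(const ogg_int16_t _x[64]){
 | |   int i;
 | |   for(i=0;i<64;i++){
 | |     /*The only slots allowed to be non-zero are rows 0..3, columns 0..3.*/
 | |     if((i>=32||(i&7)>=4)&&_x[i]!=0)return 0;
 | |   }
 | |   return 1;
 | | }
 | | 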
331 | | #define OC_IDCT_8x8_10_ABC \ |
332 | | "#OC_IDCT_8x8_10_ABC\n\t" \ |
333 | | /*Stage 1:*/ \ |
334 | | /*2-3 rotation by 6pi/16. \ |
335 | | xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \ |
336 | | "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \ |
337 | | "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \ |
338 | | "pmulhw %%xmm2,%%xmm6\n\t" \ |
339 | | "pmulhw %%xmm2,%%xmm7\n\t" \ |
340 | | "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \ |
341 | | "paddw %%xmm6,%%xmm2\n\t" \ |
342 | | "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
343 | | "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \ |
344 | | "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
345 | | /*5-6 rotation by 3pi/16. \ |
346 | | xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \ |
347 | | "pmulhw %%xmm3,%%xmm5\n\t" \ |
348 | | "pmulhw %%xmm3,%%xmm2\n\t" \ |
349 | | "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \ |
350 | | "paddw %%xmm3,%%xmm5\n\t" \ |
351 | | "paddw %%xmm3,%%xmm2\n\t" \ |
352 | | "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ |
353 | | /*4-7 rotation by 7pi/16. \ |
354 | | xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \ |
355 | | "pmulhw %%xmm1,%%xmm3\n\t" \ |
356 | | "pmulhw %%xmm1,%%xmm7\n\t" \ |
357 | | "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ |
358 | | "movdqa %%xmm3,%%xmm6\n\t" \ |
359 | | "paddw %%xmm1,%%xmm7\n\t" \ |
360 | | /*0-1 butterfly. \ |
361 | | xmm4=C4, xmm0=X0, X4=0.*/ \ |
362 | | /*Stage 2:*/ \ |
363 | | /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \ |
364 | | 7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \ |
365 | | "psubw %%xmm5,%%xmm3\n\t" \ |
366 | | "paddw %%xmm5,%%xmm6\n\t" \ |
367 | | "movdqa %%xmm4,%%xmm1\n\t" \ |
368 | | "pmulhw %%xmm0,%%xmm4\n\t" \ |
369 | | "paddw %%xmm0,%%xmm4\n\t" \ |
370 | | "movdqa %%xmm7,%%xmm0\n\t" \ |
371 | | "movdqa %%xmm4,%%xmm5\n\t" \ |
372 | | "paddw %%xmm2,%%xmm0\n\t" \ |
373 | | "psubw %%xmm2,%%xmm7\n\t" \ |
374 | | "movdqa %%xmm1,%%xmm2\n\t" \ |
375 | | "pmulhw %%xmm6,%%xmm1\n\t" \ |
376 | | "pmulhw %%xmm7,%%xmm2\n\t" \ |
377 | | "paddw %%xmm6,%%xmm1\n\t" \ |
378 | | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ |
379 | | "paddw %%xmm7,%%xmm2\n\t" \ |
380 | | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ |
381 | | /*Stage 3: \ |
382 | | 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ |
383 | | 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ |
384 | | 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ |
385 | | "paddw %%xmm2,%%xmm1\n\t" \ |
386 | | "paddw %%xmm5,%%xmm6\n\t" \ |
387 | | "paddw %%xmm4,%%xmm7\n\t" \ |
388 | | "paddw %%xmm2,%%xmm2\n\t" \ |
389 | | "paddw %%xmm4,%%xmm4\n\t" \ |
390 | | "paddw %%xmm5,%%xmm5\n\t" \ |
391 | | "psubw %%xmm1,%%xmm2\n\t" \ |
392 | | "psubw %%xmm7,%%xmm4\n\t" \ |
393 | | "psubw %%xmm6,%%xmm5\n\t" \ |
394 | | |
395 | 273k | static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
396 | 273k | OC_ALIGN16(ogg_int16_t buf[16]); |
397 | | /*This routine accepts an 8x8 matrix pre-transposed.*/ |
398 | 273k | __asm__ __volatile__( |
399 | 273k | "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t" |
400 | 273k | "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t" |
401 | 273k | "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t" |
402 | 273k | "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t" |
403 | 273k | OC_IDCT_8x8_10_MMX |
404 | 273k | OC_TRANSPOSE_8x4_MMX2SSE |
405 | 273k | OC_IDCT_8x8_10_ABC |
406 | 273k | OC_IDCT_8x8_D_STORE |
407 | 273k | :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
408 | 273k | [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) |
409 | 273k | :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
410 | 273k | [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) |
411 | 273k | ); |
412 | | /*Clear input data for next block (decoder only).*/ |
413 | 273k | __asm__ __volatile__( |
414 | 273k | "pxor %%mm0,%%mm0\n\t" |
415 | 273k | "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" |
416 | 273k | "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" |
417 | 273k | "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" |
418 | 273k | "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" |
419 | 273k | :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28)) |
420 | 273k | ); |
421 | 273k | } |
422 | | |
423 | | /*Performs an inverse 8x8 Type-II DCT transform. |
424 | | The input is assumed to be scaled by a factor of 4 relative to the orthonormal
425 | | version of the transform.*/ |
426 | 5.06M | void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ |
427 | | /*_last_zzi is subtly different from an actual count of the number of |
428 | | coefficients we decoded for this block. |
429 | | It contains the value of zzi BEFORE the final token in the block was |
430 | | decoded. |
431 | | In most cases this is an EOB token (the continuation of an EOB run from a |
432 | | previous block counts), and so this is the same as the coefficient count. |
433 | | However, in the case that the last token was NOT an EOB token, but filled |
434 | | the block up with exactly 64 coefficients, _last_zzi will be less than 64. |
435 | | Provided the last token was not a pure zero run, the minimum value it can |
436 | | be is 46, and so that doesn't affect any of the cases in this routine. |
437 | | However, if the last token WAS a pure zero run of length 63, then _last_zzi |
438 | | will be 1 while the number of coefficients decoded is 64. |
439 | | Thus, we will trigger the following special case, where the real |
440 | | coefficient count would not. |
441 | | Note also that a zero run of length 64 will give _last_zzi a value of 0, |
442 | | but we still process the DC coefficient, which might have a non-zero value |
443 | | due to DC prediction. |
444 | | Although convoluted, this is arguably the correct behavior: it allows us to |
445 | | use a smaller transform when the block ends with a long zero run instead |
446 | | of a normal EOB token. |
447 | | It could be smarter... multiple separate zero runs at the end of a block |
448 | | will fool it, but an encoder that generates these really deserves what it |
449 | | gets. |
450 | | Needless to say we inherited this approach from VP3.*/ |
451 | | /*Then perform the iDCT.*/ |
452 | 5.06M | if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x); |
453 | 4.78M | else oc_idct8x8_slow_sse2(_y,_x); |
454 | 5.06M | } |
455 | | |
456 | | #endif |