/src/mozilla-central/media/libtheora/lib/x86/sse2idct.c
Line | Count | Source (jump to first uncovered line) |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
10 | | * * |
11 | | ******************************************************************** |
12 | | |
13 | | function: |
14 | | last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ |
15 | | |
16 | | ********************************************************************/ |
17 | | |
18 | | /*SSE2 acceleration of Theora's iDCT.*/ |
19 | | #include "x86int.h" |
20 | | #include "sse2trans.h" |
21 | | #include "../dct.h" |
22 | | |
23 | | #if defined(OC_X86_ASM) |
24 | | |
25 | | /*A table of constants used by the MMX and SSE2 iDCT routines in this file. |
 | | Each row holds one 16-bit constant replicated eight times, i.e. one full |
 | | 128-bit vector, so row k starts at byte offset 0x10*k. |
 | | Row 0 is the rounding bias (8) that OC_IDCT_8x8_D_STORE adds before its |
 | | final arithmetic shift right by 4. |
 | | Rows 1 through 7 hold OC_C1S7 through OC_C7S1 (presumably the fixed-point |
 | | iDCT cosine constants declared in ../dct.h; confirm there). |
 | | The MMX code reads only the low 64 bits of each row (via movq). |
 | | The table is declared 16-byte aligned because the SSE2 code loads its rows |
 | | with movdqa.*/ |
26 | | const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={ |
27 | | 8, 8, 8, 8, 8, 8, 8, 8, |
28 | | OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7, |
29 | | OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6, |
30 | | OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5, |
31 | | OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4, |
32 | | OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3, |
33 | | OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2, |
34 | | OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1 |
35 | | }; |
36 | | |
37 | | |
38 | | /*Performs the first three stages of the iDCT. |
39 | | xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input |
40 | | (accessed in that order). |
41 | | The remaining rows (0, 1, 4, and 7) must be in _x at their corresponding |
 | | locations, i.e. row k at byte offset 0x10*k. |
 | | _x names a memory operand of the enclosing asm statement. |
 | | The macro also expects operands named c (the constant table, read at |
 | | offsets 0x10 through 0x70) and buf (scratch space, written and re-read at |
 | | offsets 0x00 and 0x10, so at least 32 bytes). |
 | | All eight xmm registers are overwritten. |
42 | | On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 |
43 | | contain rows 4 through 7.*/ |
44 | | #define OC_IDCT_8x8_ABC(_x) \ |
45 | | "#OC_IDCT_8x8_ABC\n\t" \ |
46 | | /*Stage 1:*/ \ |
47 | | /*2-3 rotation by 6pi/16. \ |
48 | | xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \ |
49 | | "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \ |
50 | | "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \ |
51 | | "movdqa %%xmm1,%%xmm0\n\t" \ |
52 | | "pmulhw %%xmm2,%%xmm1\n\t" \ |
53 | | "movdqa %%xmm4,%%xmm7\n\t" \ |
54 | | "pmulhw %%xmm6,%%xmm0\n\t" \ |
55 | | "pmulhw %%xmm2,%%xmm7\n\t" \ |
56 | | "pmulhw %%xmm6,%%xmm4\n\t" \ |
57 | | "paddw %%xmm6,%%xmm0\n\t" \ |
58 | | "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \ |
59 | | "paddw %%xmm1,%%xmm2\n\t" \ |
60 | | "psubw %%xmm0,%%xmm7\n\t" \ |
61 | | "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
62 | | "paddw %%xmm4,%%xmm2\n\t" \ |
63 | | "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \ |
64 | | "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
65 | | /*5-6 rotation by 3pi/16. \ |
66 | | xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \ |
67 | | "movdqa %%xmm4,%%xmm2\n\t" \ |
68 | | "movdqa %%xmm6,%%xmm1\n\t" \ |
69 | | "pmulhw %%xmm3,%%xmm4\n\t" \ |
70 | | "pmulhw %%xmm5,%%xmm1\n\t" \ |
71 | | "pmulhw %%xmm3,%%xmm6\n\t" \ |
72 | | "pmulhw %%xmm5,%%xmm2\n\t" \ |
73 | | "paddw %%xmm3,%%xmm4\n\t" \ |
74 | | "paddw %%xmm5,%%xmm3\n\t" \ |
75 | | "paddw %%xmm6,%%xmm3\n\t" \ |
76 | | "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \ |
77 | | "paddw %%xmm5,%%xmm1\n\t" \ |
78 | | "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \ |
79 | | "paddw %%xmm3,%%xmm2\n\t" \ |
80 | | "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ |
81 | | "psubw %%xmm4,%%xmm1\n\t" \ |
82 | | "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \ |
83 | | /*4-7 rotation by 7pi/16. \ |
84 | | xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \ |
85 | | "movdqa %%xmm3,%%xmm0\n\t" \ |
86 | | "movdqa %%xmm4,%%xmm7\n\t" \ |
87 | | "pmulhw %%xmm5,%%xmm3\n\t" \ |
88 | | "pmulhw %%xmm5,%%xmm7\n\t" \ |
89 | | "pmulhw %%xmm6,%%xmm4\n\t" \ |
90 | | "pmulhw %%xmm6,%%xmm0\n\t" \ |
91 | | "paddw %%xmm6,%%xmm4\n\t" \ |
92 | | "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \ |
93 | | "paddw %%xmm5,%%xmm7\n\t" \ |
94 | | "psubw %%xmm4,%%xmm3\n\t" \ |
95 | | "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ |
96 | | "paddw %%xmm7,%%xmm0\n\t" \ |
97 | | "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \ |
98 | | /*0-1 butterfly. \ |
99 | | xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \ |
100 | | "paddw %%xmm7,%%xmm6\n\t" \ |
101 | | "movdqa %%xmm4,%%xmm5\n\t" \ |
102 | | "pmulhw %%xmm6,%%xmm4\n\t" \ |
103 | | "paddw %%xmm7,%%xmm7\n\t" \ |
104 | | "psubw %%xmm6,%%xmm7\n\t" \ |
105 | | "paddw %%xmm6,%%xmm4\n\t" \ |
106 | | /*Stage 2:*/ \ |
107 | | /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \ |
108 | | 7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \ |
109 | | "movdqa %%xmm3,%%xmm6\n\t" \ |
110 | | "paddw %%xmm1,%%xmm3\n\t" \ |
111 | | "psubw %%xmm1,%%xmm6\n\t" \ |
112 | | "movdqa %%xmm5,%%xmm1\n\t" \ |
113 | | "pmulhw %%xmm7,%%xmm5\n\t" \ |
114 | | "paddw %%xmm7,%%xmm5\n\t" \ |
115 | | "movdqa %%xmm0,%%xmm7\n\t" \ |
116 | | "paddw %%xmm2,%%xmm0\n\t" \ |
117 | | "psubw %%xmm2,%%xmm7\n\t" \ |
118 | | "movdqa %%xmm1,%%xmm2\n\t" \ |
119 | | "pmulhw %%xmm6,%%xmm1\n\t" \ |
120 | | "pmulhw %%xmm7,%%xmm2\n\t" \ |
121 | | "paddw %%xmm6,%%xmm1\n\t" \ |
122 | | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ |
123 | | "paddw %%xmm7,%%xmm2\n\t" \ |
124 | | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ |
125 | | /*Stage 3: \ |
126 | | 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ |
127 | | 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ |
128 | | 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ |
129 | | "paddw %%xmm2,%%xmm1\n\t" \ |
130 | | "paddw %%xmm5,%%xmm6\n\t" \ |
131 | | "paddw %%xmm4,%%xmm7\n\t" \ |
132 | | "paddw %%xmm2,%%xmm2\n\t" \ |
133 | | "paddw %%xmm4,%%xmm4\n\t" \ |
134 | | "paddw %%xmm5,%%xmm5\n\t" \ |
135 | | "psubw %%xmm1,%%xmm2\n\t" \ |
136 | | "psubw %%xmm7,%%xmm4\n\t" \ |
137 | | "psubw %%xmm6,%%xmm5\n\t" \ |
138 | | |
139 | | /*Performs the last stage of the iDCT. |
140 | | On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 |
141 | | contain rows 4 through 7. |
 | | Only registers are touched: no memory operand is referenced, and no |
 | | rounding or scaling is applied. |
142 | | On output, xmm0 through xmm7 contain the corresponding rows.*/ |
143 | | #define OC_IDCT_8x8_D \ |
144 | | "#OC_IDCT_8x8_D\n\t" \ |
145 | | /*Stage 4: \ |
146 | | 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ |
147 | | 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ |
148 | | 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ |
149 | | 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ |
150 | | "psubw %%xmm0,%%xmm7\n\t" \ |
151 | | "psubw %%xmm1,%%xmm6\n\t" \ |
152 | | "psubw %%xmm2,%%xmm5\n\t" \ |
153 | | "psubw %%xmm3,%%xmm4\n\t" \ |
154 | | "paddw %%xmm0,%%xmm0\n\t" \ |
155 | | "paddw %%xmm1,%%xmm1\n\t" \ |
156 | | "paddw %%xmm2,%%xmm2\n\t" \ |
157 | | "paddw %%xmm3,%%xmm3\n\t" \ |
158 | | "paddw %%xmm7,%%xmm0\n\t" \ |
159 | | "paddw %%xmm6,%%xmm1\n\t" \ |
160 | | "paddw %%xmm5,%%xmm2\n\t" \ |
161 | | "paddw %%xmm4,%%xmm3\n\t" \ |
162 | | |
163 | | /*Performs the last stage of the iDCT, then rounds, scales, and stores the |
 | | result. |
164 | | On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 |
165 | | contain rows 4 through 7. |
 | | The vector at offset 0x00 of the c operand (eight copies of 8 in |
 | | OC_IDCT_CONSTS) is added to every output as a rounding bias, each 16-bit |
 | | lane is then shifted right arithmetically by 4, and row k is written to |
 | | the y operand at byte offset 0x10*k. |
 | | Offset 0x40 of y is used as scratch space for t[3]-t[4] while xmm4 is |
 | | loaded with the bias; it is overwritten with the final row 4 afterwards. |
166 | | On output, xmm0 through xmm7 contain the corresponding (scaled) rows.*/ |
167 | | #define OC_IDCT_8x8_D_STORE \ |
168 | | "#OC_IDCT_8x8_D_STORE\n\t" \ |
169 | | /*Stage 4: \ |
170 | | 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ |
171 | | 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ |
172 | | 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ |
173 | | 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ |
174 | | "psubw %%xmm3,%%xmm4\n\t" \ |
175 | | "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ |
176 | | "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \ |
177 | | "psubw %%xmm0,%%xmm7\n\t" \ |
178 | | "psubw %%xmm1,%%xmm6\n\t" \ |
179 | | "psubw %%xmm2,%%xmm5\n\t" \ |
180 | | "paddw %%xmm4,%%xmm7\n\t" \ |
181 | | "paddw %%xmm4,%%xmm6\n\t" \ |
182 | | "paddw %%xmm4,%%xmm5\n\t" \ |
183 | | "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \ |
184 | | "paddw %%xmm0,%%xmm0\n\t" \ |
185 | | "paddw %%xmm1,%%xmm1\n\t" \ |
186 | | "paddw %%xmm2,%%xmm2\n\t" \ |
187 | | "paddw %%xmm3,%%xmm3\n\t" \ |
188 | | "paddw %%xmm7,%%xmm0\n\t" \ |
189 | | "paddw %%xmm6,%%xmm1\n\t" \ |
190 | | "psraw $4,%%xmm0\n\t" \ |
191 | | "paddw %%xmm5,%%xmm2\n\t" \ |
192 | | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \ |
193 | | "psraw $4,%%xmm1\n\t" \ |
194 | | "paddw %%xmm4,%%xmm3\n\t" \ |
195 | | "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \ |
196 | | "psraw $4,%%xmm2\n\t" \ |
197 | | "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \ |
198 | | "psraw $4,%%xmm3\n\t" \ |
199 | | "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \ |
200 | | "psraw $4,%%xmm4\n\t" \ |
201 | | "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ |
202 | | "psraw $4,%%xmm5\n\t" \ |
203 | | "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \ |
204 | | "psraw $4,%%xmm6\n\t" \ |
205 | | "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \ |
206 | | "psraw $4,%%xmm7\n\t" \ |
207 | | "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \ |
208 | | |
209 | 0 | static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
210 | 0 | OC_ALIGN16(ogg_int16_t buf[16]); |
211 | 0 | /*This routine accepts an 8x8 matrix pre-transposed.*/ |
212 | 0 | __asm__ __volatile__( |
213 | 0 | /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/ |
214 | 0 | "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t" |
215 | 0 | "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t" |
216 | 0 | "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t" |
217 | 0 | "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t" |
218 | 0 | OC_IDCT_8x8_ABC(x) |
219 | 0 | OC_IDCT_8x8_D |
220 | 0 | OC_TRANSPOSE_8x8 |
221 | 0 | /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/ |
222 | 0 | "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" |
223 | 0 | "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" |
224 | 0 | "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" |
225 | 0 | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" |
226 | 0 | OC_IDCT_8x8_ABC(y) |
227 | 0 | OC_IDCT_8x8_D_STORE |
228 | 0 | :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)), |
229 | 0 | [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) |
230 | 0 | :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)), |
231 | 0 | [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) |
232 | 0 | ); |
233 | 0 | if(_x!=_y){ |
234 | 0 | int i; |
235 | 0 | __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::); |
236 | 0 | /*Clear input data for next block (decoder only).*/ |
237 | 0 | for(i=0;i<2;i++){ |
238 | 0 | __asm__ __volatile__( |
239 | 0 | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t" |
240 | 0 | "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t" |
241 | 0 | "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t" |
242 | 0 | "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t" |
243 | 0 | :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32)) |
244 | 0 | ); |
245 | 0 | } |
246 | 0 | } |
247 | 0 | } |
248 | | |
249 | | /*For the first step of the 10-coefficient version of the 8x8 iDCT, we only |
250 | | need to work with four columns at a time. |
251 | | Doing this in MMX is faster on processors with a 64-bit data path. |
 | | On input, mm0 through mm3 must contain the first four values of rows 0 |
 | | through 3; rows 4 through 7 are treated as zero. |
 | | The constants are read with movq from the c operand, so only the low 64 |
 | | bits of each 16-byte table row are used, and offsets 0x00 and 0x10 of the |
 | | buf operand serve as scratch space. |
 | | All four stages are performed: per the stage 4 comments below, mm0 through |
 | | mm3 end up holding t[0]+t[7] through t[3]+t[4], and mm7 down to mm4 |
 | | holding t[0]-t[7] through t[3]-t[4].*/ |
252 | | #define OC_IDCT_8x8_10_MMX \ |
253 | | "#OC_IDCT_8x8_10_MMX\n\t" \ |
254 | | /*Stage 1:*/ \ |
255 | | /*2-3 rotation by 6pi/16. \ |
256 | | mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \ |
257 | | "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ |
258 | | "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \ |
259 | | "pmulhw %%mm2,%%mm6\n\t" \ |
260 | | "pmulhw %%mm2,%%mm7\n\t" \ |
261 | | "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \ |
262 | | "paddw %%mm6,%%mm2\n\t" \ |
263 | | "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
264 | | "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \ |
265 | | "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
266 | | /*5-6 rotation by 3pi/16. \ |
267 | | mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \ |
268 | | "pmulhw %%mm3,%%mm5\n\t" \ |
269 | | "pmulhw %%mm3,%%mm2\n\t" \ |
270 | | "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \ |
271 | | "paddw %%mm3,%%mm5\n\t" \ |
272 | | "paddw %%mm3,%%mm2\n\t" \ |
273 | | "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ |
274 | | /*4-7 rotation by 7pi/16. \ |
275 | | mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \ |
276 | | "pmulhw %%mm1,%%mm3\n\t" \ |
277 | | "pmulhw %%mm1,%%mm7\n\t" \ |
278 | | "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ |
279 | | "movq %%mm3,%%mm6\n\t" \ |
280 | | "paddw %%mm1,%%mm7\n\t" \ |
281 | | /*0-1 butterfly. \ |
282 | | mm4=C4, mm0=X0, X4=0.*/ \ |
283 | | /*Stage 2:*/ \ |
284 | | /*4-5 butterfly: mm3=t[4], mm5=t[5] \ |
285 | | 7-6 butterfly: mm2=t[6], mm7=t[7]*/ \ |
286 | | "psubw %%mm5,%%mm3\n\t" \ |
287 | | "paddw %%mm5,%%mm6\n\t" \ |
288 | | "movq %%mm4,%%mm1\n\t" \ |
289 | | "pmulhw %%mm0,%%mm4\n\t" \ |
290 | | "paddw %%mm0,%%mm4\n\t" \ |
291 | | "movq %%mm7,%%mm0\n\t" \ |
292 | | "movq %%mm4,%%mm5\n\t" \ |
293 | | "paddw %%mm2,%%mm0\n\t" \ |
294 | | "psubw %%mm2,%%mm7\n\t" \ |
295 | | "movq %%mm1,%%mm2\n\t" \ |
296 | | "pmulhw %%mm6,%%mm1\n\t" \ |
297 | | "pmulhw %%mm7,%%mm2\n\t" \ |
298 | | "paddw %%mm6,%%mm1\n\t" \ |
299 | | "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \ |
300 | | "paddw %%mm7,%%mm2\n\t" \ |
301 | | "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \ |
302 | | /*Stage 3: \ |
303 | | 6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \ |
304 | | 0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \ |
305 | | 1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \ |
306 | | "paddw %%mm2,%%mm1\n\t" \ |
307 | | "paddw %%mm5,%%mm6\n\t" \ |
308 | | "paddw %%mm4,%%mm7\n\t" \ |
309 | | "paddw %%mm2,%%mm2\n\t" \ |
310 | | "paddw %%mm4,%%mm4\n\t" \ |
311 | | "paddw %%mm5,%%mm5\n\t" \ |
312 | | "psubw %%mm1,%%mm2\n\t" \ |
313 | | "psubw %%mm7,%%mm4\n\t" \ |
314 | | "psubw %%mm6,%%mm5\n\t" \ |
315 | | /*Stage 4: \ |
316 | | 0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \ |
317 | | 1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \ |
318 | | 2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \ |
319 | | 3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \ |
320 | | "psubw %%mm0,%%mm7\n\t" \ |
321 | | "psubw %%mm1,%%mm6\n\t" \ |
322 | | "psubw %%mm2,%%mm5\n\t" \ |
323 | | "psubw %%mm3,%%mm4\n\t" \ |
324 | | "paddw %%mm0,%%mm0\n\t" \ |
325 | | "paddw %%mm1,%%mm1\n\t" \ |
326 | | "paddw %%mm2,%%mm2\n\t" \ |
327 | | "paddw %%mm3,%%mm3\n\t" \ |
328 | | "paddw %%mm7,%%mm0\n\t" \ |
329 | | "paddw %%mm6,%%mm1\n\t" \ |
330 | | "paddw %%mm5,%%mm2\n\t" \ |
331 | | "paddw %%mm4,%%mm3\n\t" \ |
332 | | |
333 | | #define OC_IDCT_8x8_10_ABC \ |
334 | | "#OC_IDCT_8x8_10_ABC\n\t" \ |
335 | | /*SSE2 version of stages 1 through 3 of the 10-coefficient iDCT. \ |
 | | Per the register notes below, xmm0 through xmm3 must contain rows 0 \ |
 | | through 3 on input; rows 4 through 7 are treated as zero. \ |
 | | Constants are read from the c operand, and offsets 0x00 and 0x10 of the \ |
 | | buf operand serve as scratch space. \ |
 | | Stage 4 is not included: where this macro is used in this file it is \ |
 | | followed by OC_IDCT_8x8_D_STORE. \ |
 | | Stage 1:*/ \ |
336 | | /*2-3 rotation by 6pi/16. \ |
337 | | xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \ |
338 | | "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \ |
339 | | "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \ |
340 | | "pmulhw %%xmm2,%%xmm6\n\t" \ |
341 | | "pmulhw %%xmm2,%%xmm7\n\t" \ |
342 | | "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \ |
343 | | "paddw %%xmm6,%%xmm2\n\t" \ |
344 | | "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
345 | | "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \ |
346 | | "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
347 | | /*5-6 rotation by 3pi/16. \ |
348 | | xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \ |
349 | | "pmulhw %%xmm3,%%xmm5\n\t" \ |
350 | | "pmulhw %%xmm3,%%xmm2\n\t" \ |
351 | | "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \ |
352 | | "paddw %%xmm3,%%xmm5\n\t" \ |
353 | | "paddw %%xmm3,%%xmm2\n\t" \ |
354 | | "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ |
355 | | /*4-7 rotation by 7pi/16. \ |
356 | | xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \ |
357 | | "pmulhw %%xmm1,%%xmm3\n\t" \ |
358 | | "pmulhw %%xmm1,%%xmm7\n\t" \ |
359 | | "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ |
360 | | "movdqa %%xmm3,%%xmm6\n\t" \ |
361 | | "paddw %%xmm1,%%xmm7\n\t" \ |
362 | | /*0-1 butterfly. \ |
363 | | xmm4=C4, xmm0=X0, X4=0.*/ \ |
364 | | /*Stage 2:*/ \ |
365 | | /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \ |
366 | | 7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \ |
367 | | "psubw %%xmm5,%%xmm3\n\t" \ |
368 | | "paddw %%xmm5,%%xmm6\n\t" \ |
369 | | "movdqa %%xmm4,%%xmm1\n\t" \ |
370 | | "pmulhw %%xmm0,%%xmm4\n\t" \ |
371 | | "paddw %%xmm0,%%xmm4\n\t" \ |
372 | | "movdqa %%xmm7,%%xmm0\n\t" \ |
373 | | "movdqa %%xmm4,%%xmm5\n\t" \ |
374 | | "paddw %%xmm2,%%xmm0\n\t" \ |
375 | | "psubw %%xmm2,%%xmm7\n\t" \ |
376 | | "movdqa %%xmm1,%%xmm2\n\t" \ |
377 | | "pmulhw %%xmm6,%%xmm1\n\t" \ |
378 | | "pmulhw %%xmm7,%%xmm2\n\t" \ |
379 | | "paddw %%xmm6,%%xmm1\n\t" \ |
380 | | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ |
381 | | "paddw %%xmm7,%%xmm2\n\t" \ |
382 | | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ |
383 | | /*Stage 3: \ |
384 | | 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ |
385 | | 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ |
386 | | 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ |
387 | | "paddw %%xmm2,%%xmm1\n\t" \ |
388 | | "paddw %%xmm5,%%xmm6\n\t" \ |
389 | | "paddw %%xmm4,%%xmm7\n\t" \ |
390 | | "paddw %%xmm2,%%xmm2\n\t" \ |
391 | | "paddw %%xmm4,%%xmm4\n\t" \ |
392 | | "paddw %%xmm5,%%xmm5\n\t" \ |
393 | | "psubw %%xmm1,%%xmm2\n\t" \ |
394 | | "psubw %%xmm7,%%xmm4\n\t" \ |
395 | | "psubw %%xmm6,%%xmm5\n\t" \ |
396 | | |
397 | 0 | static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
398 | 0 | OC_ALIGN16(ogg_int16_t buf[16]); |
399 | 0 | /*This routine accepts an 8x8 matrix pre-transposed.*/ |
400 | 0 | __asm__ __volatile__( |
401 | 0 | "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t" |
402 | 0 | "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t" |
403 | 0 | "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t" |
404 | 0 | "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t" |
405 | 0 | OC_IDCT_8x8_10_MMX |
406 | 0 | OC_TRANSPOSE_8x4_MMX2SSE |
407 | 0 | OC_IDCT_8x8_10_ABC |
408 | 0 | OC_IDCT_8x8_D_STORE |
409 | 0 | :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)), |
410 | 0 | [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) |
411 | 0 | :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), |
412 | 0 | [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) |
413 | 0 | ); |
414 | 0 | if(_x!=_y){ |
415 | 0 | /*Clear input data for next block (decoder only).*/ |
416 | 0 | __asm__ __volatile__( |
417 | 0 | "pxor %%mm0,%%mm0\n\t" |
418 | 0 | "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" |
419 | 0 | "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" |
420 | 0 | "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" |
421 | 0 | "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" |
422 | 0 | :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28)) |
423 | 0 | ); |
424 | 0 | } |
425 | 0 | } |
426 | | |
427 | | /*Inverse 8x8 Type-II DCT transform, SSE2 entry point. |
428 | | The input is expected to be scaled by a factor of 4 relative to the |
429 | | orthonormal version of the transform.*/ |
430 | 0 | void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ |
431 | 0 | /*_last_zzi is not simply the number of coefficients decoded for this block: |
432 | 0 | it is the value zzi had BEFORE the block's final token was decoded. |
433 | 0 | Usually that final token is an EOB (a continued EOB run from an earlier |
434 | 0 | block counts as well), in which case the two quantities agree. |
435 | 0 | They differ when the final token is NOT an EOB but brings the block to |
436 | 0 | exactly 64 coefficients; _last_zzi is then smaller than 64. |
437 | 0 | As long as that token was not a pure zero run, _last_zzi is at least 46, |
438 | 0 | which selects the same path below as 64 would. |
439 | 0 | If the token WAS a pure zero run of length 63, however, _last_zzi is 1 even |
440 | 0 | though 64 coefficients were decoded. |
441 | 0 | The reduced transform is then chosen although the true coefficient count |
442 | 0 | would not have chosen it. |
443 | 0 | A zero run of length 64 likewise yields _last_zzi equal to 0; the DC |
444 | 0 | coefficient is still processed, since DC prediction may have made it |
445 | 0 | non-zero. |
446 | 0 | Convoluted as it is, this is arguably the right behavior: a block that ends |
447 | 0 | in a long zero run gets the smaller transform just as one that ends in an |
448 | 0 | ordinary EOB token does. |
449 | 0 | It could be smarter: several separate zero runs at the end of a block |
450 | 0 | defeat it, but an encoder that emits those really deserves what it gets. |
451 | 0 | Needless to say we inherited this approach from VP3. |
452 | 0 | Anything past the 10-coefficient limit takes the full transform.*/ |
453 | 0 | if(_last_zzi>10){ |
454 | 0 | oc_idct8x8_slow_sse2(_y,_x); |
455 | 0 | return; |
456 | 0 | } |
457 | 0 | oc_idct8x8_10_sse2(_y,_x); |
458 | 0 | } |
459 | | |
460 | | #endif |