/src/mozilla-central/media/libtheora/lib/x86/mmxidct.c
Line | Count | Source (jump to first uncovered line) |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
10 | | * * |
11 | | ******************************************************************** |
12 | | |
13 | | function: |
14 | | last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $ |
15 | | |
16 | | ********************************************************************/ |
17 | | |
18 | | /*MMX acceleration of Theora's iDCT. |
19 | | Originally written by Rudolf Marek, based on code from On2's VP3.*/ |
20 | | #include "x86int.h" |
21 | | #include "../dct.h" |
22 | | |
23 | | #if defined(OC_X86_ASM) |
24 | | |
25 | | /*These are offsets into the table of constants (OC_IDCT_CONSTS, referenced by the asm below but not defined in this file). NOTE(review): neither macro is used in this file; the asm addresses the table with literal byte offsets instead - confirm these values are still current.*/ |
26 | | /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/ |
27 | | #define OC_COSINE_OFFSET (0) |
28 | | /*A row of 8's. NOTE(review): the column iDCT macros add OC_MEM_OFFS(0x00,c) before their psraw $4, so the rounding row looks like it lives at the start of the table rather than at 56 - confirm.*/ |
29 | | #define OC_EIGHT_OFFSET (56) |
30 | | |
31 | | |
32 | | |
33 | | /*38 cycles. Common first stage shared by OC_ROW_IDCT and OC_COLUMN_IDCT: loads the eight inputs through OC_I(0..3,_x) and OC_J(4..7,_x), scales them with pmulhw against rows of the constant operand [c], and spills intermediates through OC_I(1,_y) and OC_I(2,_y). On exit OC_I(1,_y) has been reloaded into mm0 while OC_I(2,_y) is still in memory (the caller macros reload it as D'); everything else is left in mm1-mm7 for the caller to finish. The instruction order is hand-interleaved, so do not reorder.*/ |
34 | | #define OC_IDCT_BEGIN(_y,_x) \ |
35 | | "#OC_IDCT_BEGIN\n\t" \ |
36 | | "movq "OC_I(3,_x)",%%mm2\n\t" \ |
37 | | "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ |
38 | | "movq %%mm2,%%mm4\n\t" \ |
39 | | "movq "OC_J(5,_x)",%%mm7\n\t" \ |
40 | | "pmulhw %%mm6,%%mm4\n\t" \ |
41 | | "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ |
42 | | "pmulhw %%mm7,%%mm6\n\t" \ |
43 | | "movq %%mm1,%%mm5\n\t" \ |
44 | | "pmulhw %%mm2,%%mm1\n\t" \ |
45 | | "movq "OC_I(1,_x)",%%mm3\n\t" \ |
46 | | "pmulhw %%mm7,%%mm5\n\t" \ |
47 | | "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ |
48 | | "paddw %%mm2,%%mm4\n\t" \ |
49 | | "paddw %%mm7,%%mm6\n\t" \ |
50 | | "paddw %%mm1,%%mm2\n\t" \ |
51 | | "movq "OC_J(7,_x)",%%mm1\n\t" \ |
52 | | "paddw %%mm5,%%mm7\n\t" \ |
53 | | "movq %%mm0,%%mm5\n\t" \ |
54 | | "pmulhw %%mm3,%%mm0\n\t" \ |
55 | | "paddw %%mm7,%%mm4\n\t" \ |
56 | | "pmulhw %%mm1,%%mm5\n\t" \ |
57 | | "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \ |
58 | | "psubw %%mm2,%%mm6\n\t" \ |
59 | | "paddw %%mm3,%%mm0\n\t" \ |
60 | | "pmulhw %%mm7,%%mm3\n\t" \ |
61 | | "movq "OC_I(2,_x)",%%mm2\n\t" \ |
62 | | "pmulhw %%mm1,%%mm7\n\t" \ |
63 | | "paddw %%mm1,%%mm5\n\t" \ |
64 | | "movq %%mm2,%%mm1\n\t" \ |
65 | | "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \ |
66 | | "psubw %%mm5,%%mm3\n\t" \ |
67 | | "movq "OC_J(6,_x)",%%mm5\n\t" \ |
68 | | "paddw %%mm7,%%mm0\n\t" \ |
69 | | "movq %%mm5,%%mm7\n\t" \ |
70 | | "psubw %%mm4,%%mm0\n\t" \ |
71 | | "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ |
72 | | "paddw %%mm1,%%mm2\n\t" \ |
73 | | "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ |
74 | | "paddw %%mm4,%%mm4\n\t" \ |
75 | | "paddw %%mm0,%%mm4\n\t" \ |
76 | | "psubw %%mm6,%%mm3\n\t" \ |
77 | | "paddw %%mm7,%%mm5\n\t" \ |
78 | | "paddw %%mm6,%%mm6\n\t" \ |
79 | | "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ |
80 | | "paddw %%mm3,%%mm6\n\t" \ |
81 | | "movq %%mm4,"OC_I(1,_y)"\n\t" \ |
82 | | "psubw %%mm5,%%mm1\n\t" \ |
83 | | "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ |
84 | | "movq %%mm3,%%mm5\n\t" \ |
85 | | "pmulhw %%mm4,%%mm3\n\t" \ |
86 | | "paddw %%mm2,%%mm7\n\t" \ |
87 | | "movq %%mm6,"OC_I(2,_y)"\n\t" \ |
88 | | "movq %%mm0,%%mm2\n\t" \ |
89 | | "movq "OC_I(0,_x)",%%mm6\n\t" \ |
90 | | "pmulhw %%mm4,%%mm0\n\t" \ |
91 | | "paddw %%mm3,%%mm5\n\t" \ |
92 | | "movq "OC_J(4,_x)",%%mm3\n\t" \ |
93 | | "psubw %%mm1,%%mm5\n\t" \ |
94 | | "paddw %%mm0,%%mm2\n\t" \ |
95 | | "psubw %%mm3,%%mm6\n\t" \ |
96 | | "movq %%mm6,%%mm0\n\t" \ |
97 | | "pmulhw %%mm4,%%mm6\n\t" \ |
98 | | "paddw %%mm3,%%mm3\n\t" \ |
99 | | "paddw %%mm1,%%mm1\n\t" \ |
100 | | "paddw %%mm0,%%mm3\n\t" \ |
101 | | "paddw %%mm5,%%mm1\n\t" \ |
102 | | "pmulhw %%mm3,%%mm4\n\t" \ |
103 | | "paddw %%mm0,%%mm6\n\t" \ |
104 | | "psubw %%mm2,%%mm6\n\t" \ |
105 | | "paddw %%mm2,%%mm2\n\t" \ |
106 | | "movq "OC_I(1,_y)",%%mm0\n\t" \ |
107 | | "paddw %%mm6,%%mm2\n\t" \ |
108 | | "paddw %%mm3,%%mm4\n\t" \ |
109 | | "psubw %%mm1,%%mm2\n\t" \ |
110 | | "#end OC_IDCT_BEGIN\n\t" \ |
111 | | |
112 | | /*38+8=46 cycles. Row pass: runs OC_IDCT_BEGIN(_y,_x) and then the final butterflies, with no rounding or shifting. On exit R0 is in mm0, R1 has been stored to OC_I(1,_y), and the remaining results stay in mm2-mm7, which matches the entry state documented for OC_TRANSPOSE.*/ |
113 | | #define OC_ROW_IDCT(_y,_x) \ |
114 | | "#OC_ROW_IDCT\n" \ |
115 | | OC_IDCT_BEGIN(_y,_x) \ |
116 | | /*r3=D'*/ \ |
117 | | "movq "OC_I(2,_y)",%%mm3\n\t" \ |
118 | | /*r4=E'=E-G*/ \ |
119 | | "psubw %%mm7,%%mm4\n\t" \ |
120 | | /*r1=H'+H'*/ \ |
121 | | "paddw %%mm1,%%mm1\n\t" \ |
122 | | /*r7=G+G*/ \ |
123 | | "paddw %%mm7,%%mm7\n\t" \ |
124 | | /*r1=R1=A''+H'*/ \ |
125 | | "paddw %%mm2,%%mm1\n\t" \ |
126 | | /*r7=G'=E+G*/ \ |
127 | | "paddw %%mm4,%%mm7\n\t" \ |
128 | | /*r4=R4=E'-D'*/ \ |
129 | | "psubw %%mm3,%%mm4\n\t" \ |
130 | | "paddw %%mm3,%%mm3\n\t" \ |
131 | | /*r6=R6=F'-B''*/ \ |
132 | | "psubw %%mm5,%%mm6\n\t" \ |
133 | | "paddw %%mm5,%%mm5\n\t" \ |
134 | | /*r3=R3=E'+D'*/ \ |
135 | | "paddw %%mm4,%%mm3\n\t" \ |
136 | | /*r5=R5=F'+B''*/ \ |
137 | | "paddw %%mm6,%%mm5\n\t" \ |
138 | | /*r7=R7=G'-C'*/ \ |
139 | | "psubw %%mm0,%%mm7\n\t" \ |
140 | | "paddw %%mm0,%%mm0\n\t" \ |
141 | | /*Save R1.*/ \ |
142 | | "movq %%mm1,"OC_I(1,_y)"\n\t" \ |
143 | | /*r0=R0=G'+C'*/ \ |
144 | | "paddw %%mm7,%%mm0\n\t" \ |
145 | | "#end OC_ROW_IDCT\n\t" \ |
146 | | |
147 | | /*The following macro does two 4x4 transposes in place. |
148 | | At entry, we assume: |
149 | | r0 = a3 a2 a1 a0 |
150 | | I(1) = b3 b2 b1 b0 |
151 | | r2 = c3 c2 c1 c0 |
152 | | r3 = d3 d2 d1 d0 |
153 | | |
154 | | r4 = e3 e2 e1 e0 |
155 | | r5 = f3 f2 f1 f0 |
156 | | r6 = g3 g2 g1 g0 |
157 | | r7 = h3 h2 h1 h0 |
158 | | |
159 | | At exit, we have: |
160 | | I(0) = d0 c0 b0 a0 |
161 | | I(1) = d1 c1 b1 a1 |
162 | | I(2) = d2 c2 b2 a2 |
163 | | I(3) = d3 c3 b3 a3 |
164 | | |
165 | | J(4) = h0 g0 f0 e0 |
166 | | J(5) = h1 g1 f1 e1 |
167 | | J(6) = h2 g2 f2 e2 |
168 | | J(7) = h3 g3 f3 e3 |
169 | | |
170 | | I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. |
171 | | J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. |
172 | | |
173 | | Since r1 is free at entry, we calculate the Js first.*/ |
174 | | /*19 cycles. The J half (r4-r7) is transposed first; r0 is parked in OC_I(0,_y) while mm0 is reused as a temporary and is reloaded from there (into mm4) before the I half is transposed. All eight results are written back through OC_I(0..3,_y) and OC_J(4..7,_y).*/ |
175 | | #define OC_TRANSPOSE(_y) \ |
176 | | "#OC_TRANSPOSE\n\t" \ |
177 | | "movq %%mm4,%%mm1\n\t" \ |
178 | | "punpcklwd %%mm5,%%mm4\n\t" \ |
179 | | "movq %%mm0,"OC_I(0,_y)"\n\t" \ |
180 | | "punpckhwd %%mm5,%%mm1\n\t" \ |
181 | | "movq %%mm6,%%mm0\n\t" \ |
182 | | "punpcklwd %%mm7,%%mm6\n\t" \ |
183 | | "movq %%mm4,%%mm5\n\t" \ |
184 | | "punpckldq %%mm6,%%mm4\n\t" \ |
185 | | "punpckhdq %%mm6,%%mm5\n\t" \ |
186 | | "movq %%mm1,%%mm6\n\t" \ |
187 | | "movq %%mm4,"OC_J(4,_y)"\n\t" \ |
188 | | "punpckhwd %%mm7,%%mm0\n\t" \ |
189 | | "movq %%mm5,"OC_J(5,_y)"\n\t" \ |
190 | | "punpckhdq %%mm0,%%mm6\n\t" \ |
191 | | "movq "OC_I(0,_y)",%%mm4\n\t" \ |
192 | | "punpckldq %%mm0,%%mm1\n\t" \ |
193 | | "movq "OC_I(1,_y)",%%mm5\n\t" \ |
194 | | "movq %%mm4,%%mm0\n\t" \ |
195 | | "movq %%mm6,"OC_J(7,_y)"\n\t" \ |
196 | | "punpcklwd %%mm5,%%mm0\n\t" \ |
197 | | "movq %%mm1,"OC_J(6,_y)"\n\t" \ |
198 | | "punpckhwd %%mm5,%%mm4\n\t" \ |
199 | | "movq %%mm2,%%mm5\n\t" \ |
200 | | "punpcklwd %%mm3,%%mm2\n\t" \ |
201 | | "movq %%mm0,%%mm1\n\t" \ |
202 | | "punpckldq %%mm2,%%mm0\n\t" \ |
203 | | "punpckhdq %%mm2,%%mm1\n\t" \ |
204 | | "movq %%mm4,%%mm2\n\t" \ |
205 | | "movq %%mm0,"OC_I(0,_y)"\n\t" \ |
206 | | "punpckhwd %%mm3,%%mm5\n\t" \ |
207 | | "movq %%mm1,"OC_I(1,_y)"\n\t" \ |
208 | | "punpckhdq %%mm5,%%mm4\n\t" \ |
209 | | "punpckldq %%mm5,%%mm2\n\t" \ |
210 | | "movq %%mm4,"OC_I(3,_y)"\n\t" \ |
211 | | "movq %%mm2,"OC_I(2,_y)"\n\t" \ |
212 | | "#end OC_TRANSPOSE\n\t" \ |
213 | | |
214 | | /*38+19=57 cycles. Column pass, in place on _y: runs OC_IDCT_BEGIN(_y,_y), then biases every result by the constant row at OC_MEM_OFFS(0x00,c) (either directly or through the butterfly partner it is summed with), arithmetic-shifts it right by 4 to form NR0..NR7, and stores those to OC_I(0..3,_y) and OC_J(4..7,_y).*/ |
215 | | #define OC_COLUMN_IDCT(_y) \ |
216 | | "#OC_COLUMN_IDCT\n" \ |
217 | | OC_IDCT_BEGIN(_y,_y) \ |
218 | | "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ |
219 | | /*r1=H'+H'*/ \ |
220 | | "paddw %%mm1,%%mm1\n\t" \ |
221 | | /*r1=R1=A''+H'*/ \ |
222 | | "paddw %%mm2,%%mm1\n\t" \ |
223 | | /*r2=NR2*/ \ |
224 | | "psraw $4,%%mm2\n\t" \ |
225 | | /*r4=E'=E-G*/ \ |
226 | | "psubw %%mm7,%%mm4\n\t" \ |
227 | | /*r1=NR1*/ \ |
228 | | "psraw $4,%%mm1\n\t" \ |
229 | | /*r3=D'*/ \ |
230 | | "movq "OC_I(2,_y)",%%mm3\n\t" \ |
231 | | /*r7=G+G*/ \ |
232 | | "paddw %%mm7,%%mm7\n\t" \ |
233 | | /*Store NR2 at I(2).*/ \ |
234 | | "movq %%mm2,"OC_I(2,_y)"\n\t" \ |
235 | | /*r7=G'=E+G*/ \ |
236 | | "paddw %%mm4,%%mm7\n\t" \ |
237 | | /*Store NR1 at I(1).*/ \ |
238 | | "movq %%mm1,"OC_I(1,_y)"\n\t" \ |
239 | | /*r4=R4=E'-D'*/ \ |
240 | | "psubw %%mm3,%%mm4\n\t" \ |
241 | | "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ |
242 | | /*r3=D'+D'*/ \ |
243 | | "paddw %%mm3,%%mm3\n\t" \ |
244 | | /*r3=R3=E'+D'*/ \ |
245 | | "paddw %%mm4,%%mm3\n\t" \ |
246 | | /*r4=NR4*/ \ |
247 | | "psraw $4,%%mm4\n\t" \ |
248 | | /*r6=R6=F'-B''*/ \ |
249 | | "psubw %%mm5,%%mm6\n\t" \ |
250 | | /*r3=NR3*/ \ |
251 | | "psraw $4,%%mm3\n\t" \ |
252 | | "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ |
253 | | /*r5=B''+B''*/ \ |
254 | | "paddw %%mm5,%%mm5\n\t" \ |
255 | | /*r5=R5=F'+B''*/ \ |
256 | | "paddw %%mm6,%%mm5\n\t" \ |
257 | | /*r6=NR6*/ \ |
258 | | "psraw $4,%%mm6\n\t" \ |
259 | | /*Store NR4 at J(4).*/ \ |
260 | | "movq %%mm4,"OC_J(4,_y)"\n\t" \ |
261 | | /*r5=NR5*/ \ |
262 | | "psraw $4,%%mm5\n\t" \ |
263 | | /*Store NR3 at I(3).*/ \ |
264 | | "movq %%mm3,"OC_I(3,_y)"\n\t" \ |
265 | | /*r7=R7=G'-C'*/ \ |
266 | | "psubw %%mm0,%%mm7\n\t" \ |
267 | | "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ |
268 | | /*r0=C'+C'*/ \ |
269 | | "paddw %%mm0,%%mm0\n\t" \ |
270 | | /*r0=R0=G'+C'*/ \ |
271 | | "paddw %%mm7,%%mm0\n\t" \ |
272 | | /*r7=NR7*/ \ |
273 | | "psraw $4,%%mm7\n\t" \ |
274 | | /*Store NR6 at J(6).*/ \ |
275 | | "movq %%mm6,"OC_J(6,_y)"\n\t" \ |
276 | | /*r0=NR0*/ \ |
277 | | "psraw $4,%%mm0\n\t" \ |
278 | | /*Store NR5 at J(5).*/ \ |
279 | | "movq %%mm5,"OC_J(5,_y)"\n\t" \ |
280 | | /*Store NR7 at J(7).*/ \ |
281 | | "movq %%mm7,"OC_J(7,_y)"\n\t" \ |
282 | | /*Store NR0 at I(0).*/ \ |
283 | | "movq %%mm0,"OC_I(0,_y)"\n\t" \ |
284 | | "#end OC_COLUMN_IDCT\n\t" \ |
285 | | |
286 | 0 | static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
287 | 0 | /*This routine accepts an 8x8 matrix, but in partially transposed form. |
288 | 0 | Every 4x4 block is transposed. The single asm statement below redefines OC_I/OC_J between stages: two row passes from _x into _y (the second rebased by 64 bytes), each followed by OC_TRANSPOSE, then two in-place column passes on _y (the second rebased by 8 bytes). Afterwards, unless _x and _y are the same pointer, _x is cleared by four loop iterations of four 8-byte stores, each iteration covering the 16 elements at _x+16*i.*/ |
289 | 0 | __asm__ __volatile__( |
290 | 0 | #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) |
291 | 0 | #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) |
292 | 0 | OC_ROW_IDCT(y,x) |
293 | 0 | OC_TRANSPOSE(y) |
294 | 0 | #undef OC_I |
295 | 0 | #undef OC_J |
296 | 0 | #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y) |
297 | 0 | #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y) |
298 | 0 | OC_ROW_IDCT(y,x) |
299 | 0 | OC_TRANSPOSE(y) |
300 | 0 | #undef OC_I |
301 | 0 | #undef OC_J |
302 | 0 | #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) |
303 | 0 | #define OC_J(_k,_y) OC_I(_k,_y) |
304 | 0 | OC_COLUMN_IDCT(y) |
305 | 0 | #undef OC_I |
306 | 0 | #undef OC_J |
307 | 0 | #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) |
308 | 0 | #define OC_J(_k,_y) OC_I(_k,_y) |
309 | 0 | OC_COLUMN_IDCT(y) |
310 | 0 | #undef OC_I |
311 | 0 | #undef OC_J |
312 | 0 | :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) |
313 | 0 | :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), |
314 | 0 | [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) |
315 | 0 | ); |
316 | 0 | if(_x!=_y){ |
317 | 0 | int i; |
318 | 0 | __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::); /*NOTE(review): mm0 is zeroed in its own asm statement and then read by the asm statements inside the loop without being named as an operand or clobber, so this relies on the compiler leaving mm0 untouched in between.*/ |
319 | 0 | for(i=0;i<4;i++){ |
320 | 0 | __asm__ __volatile__( |
321 | 0 | "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" |
322 | 0 | "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t" |
323 | 0 | "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" |
324 | 0 | "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t" |
325 | 0 | :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16) |
326 | 0 | ); |
327 | 0 | } |
328 | 0 | } |
329 | 0 | } |
330 | | |
331 | | /*25 cycles. Reduced counterpart of OC_IDCT_BEGIN used by the *_10 macros: it reads only OC_I(0..3,_x) and never references OC_J(), presumably because the inputs OC_IDCT_BEGIN would load through OC_J() are taken to be zero here (TODO confirm against the caller's guarantee). Spills through OC_I(1,_y) and OC_I(2,_y) as the full version does, reloading OC_I(1,_y) into mm0 before it ends. The two nop instructions are part of the original hand scheduling.*/ |
332 | | #define OC_IDCT_BEGIN_10(_y,_x) \ |
333 | | "#OC_IDCT_BEGIN_10\n\t" \ |
334 | | "movq "OC_I(3,_x)",%%mm2\n\t" \ |
335 | | "nop\n\t" \ |
336 | | "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ |
337 | | "movq %%mm2,%%mm4\n\t" \ |
338 | | "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ |
339 | | "pmulhw %%mm6,%%mm4\n\t" \ |
340 | | "movq "OC_I(1,_x)",%%mm3\n\t" \ |
341 | | "pmulhw %%mm2,%%mm1\n\t" \ |
342 | | "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ |
343 | | "paddw %%mm2,%%mm4\n\t" \ |
344 | | "pxor %%mm6,%%mm6\n\t" \ |
345 | | "paddw %%mm1,%%mm2\n\t" \ |
346 | | "movq "OC_I(2,_x)",%%mm5\n\t" \ |
347 | | "pmulhw %%mm3,%%mm0\n\t" \ |
348 | | "movq %%mm5,%%mm1\n\t" \ |
349 | | "paddw %%mm3,%%mm0\n\t" \ |
350 | | "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ |
351 | | "psubw %%mm2,%%mm6\n\t" \ |
352 | | "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ |
353 | | "psubw %%mm4,%%mm0\n\t" \ |
354 | | "movq "OC_I(2,_x)",%%mm7\n\t" \ |
355 | | "paddw %%mm4,%%mm4\n\t" \ |
356 | | "paddw %%mm5,%%mm7\n\t" \ |
357 | | "paddw %%mm0,%%mm4\n\t" \ |
358 | | "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ |
359 | | "psubw %%mm6,%%mm3\n\t" \ |
360 | | "movq %%mm4,"OC_I(1,_y)"\n\t" \ |
361 | | "paddw %%mm6,%%mm6\n\t" \ |
362 | | "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ |
363 | | "paddw %%mm3,%%mm6\n\t" \ |
364 | | "movq %%mm3,%%mm5\n\t" \ |
365 | | "pmulhw %%mm4,%%mm3\n\t" \ |
366 | | "movq %%mm6,"OC_I(2,_y)"\n\t" \ |
367 | | "movq %%mm0,%%mm2\n\t" \ |
368 | | "movq "OC_I(0,_x)",%%mm6\n\t" \ |
369 | | "pmulhw %%mm4,%%mm0\n\t" \ |
370 | | "paddw %%mm3,%%mm5\n\t" \ |
371 | | "paddw %%mm0,%%mm2\n\t" \ |
372 | | "psubw %%mm1,%%mm5\n\t" \ |
373 | | "pmulhw %%mm4,%%mm6\n\t" \ |
374 | | "paddw "OC_I(0,_x)",%%mm6\n\t" \ |
375 | | "paddw %%mm1,%%mm1\n\t" \ |
376 | | "movq %%mm6,%%mm4\n\t" \ |
377 | | "paddw %%mm5,%%mm1\n\t" \ |
378 | | "psubw %%mm2,%%mm6\n\t" \ |
379 | | "paddw %%mm2,%%mm2\n\t" \ |
380 | | "movq "OC_I(1,_y)",%%mm0\n\t" \ |
381 | | "paddw %%mm6,%%mm2\n\t" \ |
382 | | "psubw %%mm1,%%mm2\n\t" \ |
383 | | "nop\n\t" \ |
384 | | "#end OC_IDCT_BEGIN_10\n\t" \ |
385 | | |
386 | | /*25+8=33 cycles. Row pass of the reduced transform: OC_IDCT_BEGIN_10(_y,_x) followed by the same final butterfly sequence as OC_ROW_IDCT. On exit R0 is in mm0, R1 has been stored to OC_I(1,_y), and the remaining results stay in mm2-mm7 for OC_TRANSPOSE.*/ |
387 | | #define OC_ROW_IDCT_10(_y,_x) \ |
388 | | "#OC_ROW_IDCT_10\n\t" \ |
389 | | OC_IDCT_BEGIN_10(_y,_x) \ |
390 | | /*r3=D'*/ \ |
391 | | "movq "OC_I(2,_y)",%%mm3\n\t" \ |
392 | | /*r4=E'=E-G*/ \ |
393 | | "psubw %%mm7,%%mm4\n\t" \ |
394 | | /*r1=H'+H'*/ \ |
395 | | "paddw %%mm1,%%mm1\n\t" \ |
396 | | /*r7=G+G*/ \ |
397 | | "paddw %%mm7,%%mm7\n\t" \ |
398 | | /*r1=R1=A''+H'*/ \ |
399 | | "paddw %%mm2,%%mm1\n\t" \ |
400 | | /*r7=G'=E+G*/ \ |
401 | | "paddw %%mm4,%%mm7\n\t" \ |
402 | | /*r4=R4=E'-D'*/ \ |
403 | | "psubw %%mm3,%%mm4\n\t" \ |
404 | | "paddw %%mm3,%%mm3\n\t" \ |
405 | | /*r6=R6=F'-B''*/ \ |
406 | | "psubw %%mm5,%%mm6\n\t" \ |
407 | | "paddw %%mm5,%%mm5\n\t" \ |
408 | | /*r3=R3=E'+D'*/ \ |
409 | | "paddw %%mm4,%%mm3\n\t" \ |
410 | | /*r5=R5=F'+B''*/ \ |
411 | | "paddw %%mm6,%%mm5\n\t" \ |
412 | | /*r7=R7=G'-C'*/ \ |
413 | | "psubw %%mm0,%%mm7\n\t" \ |
414 | | "paddw %%mm0,%%mm0\n\t" \ |
415 | | /*Save R1.*/ \ |
416 | | "movq %%mm1,"OC_I(1,_y)"\n\t" \ |
417 | | /*r0=R0=G'+C'*/ \ |
418 | | "paddw %%mm7,%%mm0\n\t" \ |
419 | | "#end OC_ROW_IDCT_10\n\t" \ |
420 | | |
421 | | /*25+19=44 cycles. Column pass of the reduced transform, in place on _y: OC_IDCT_BEGIN_10(_y,_y) followed by the same bias (constant row at OC_MEM_OFFS(0x00,c)), psraw $4 and store sequence as OC_COLUMN_IDCT, writing NR0..NR7 to OC_I(0..3,_y) and OC_J(4..7,_y).*/ |
422 | | #define OC_COLUMN_IDCT_10(_y) \ |
423 | | "#OC_COLUMN_IDCT_10\n\t" \ |
424 | | OC_IDCT_BEGIN_10(_y,_y) \ |
425 | | "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ |
426 | | /*r1=H'+H'*/ \ |
427 | | "paddw %%mm1,%%mm1\n\t" \ |
428 | | /*r1=R1=A''+H'*/ \ |
429 | | "paddw %%mm2,%%mm1\n\t" \ |
430 | | /*r2=NR2*/ \ |
431 | | "psraw $4,%%mm2\n\t" \ |
432 | | /*r4=E'=E-G*/ \ |
433 | | "psubw %%mm7,%%mm4\n\t" \ |
434 | | /*r1=NR1*/ \ |
435 | | "psraw $4,%%mm1\n\t" \ |
436 | | /*r3=D'*/ \ |
437 | | "movq "OC_I(2,_y)",%%mm3\n\t" \ |
438 | | /*r7=G+G*/ \ |
439 | | "paddw %%mm7,%%mm7\n\t" \ |
440 | | /*Store NR2 at I(2).*/ \ |
441 | | "movq %%mm2,"OC_I(2,_y)"\n\t" \ |
442 | | /*r7=G'=E+G*/ \ |
443 | | "paddw %%mm4,%%mm7\n\t" \ |
444 | | /*Store NR1 at I(1).*/ \ |
445 | | "movq %%mm1,"OC_I(1,_y)"\n\t" \ |
446 | | /*r4=R4=E'-D'*/ \ |
447 | | "psubw %%mm3,%%mm4\n\t" \ |
448 | | "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ |
449 | | /*r3=D'+D'*/ \ |
450 | | "paddw %%mm3,%%mm3\n\t" \ |
451 | | /*r3=R3=E'+D'*/ \ |
452 | | "paddw %%mm4,%%mm3\n\t" \ |
453 | | /*r4=NR4*/ \ |
454 | | "psraw $4,%%mm4\n\t" \ |
455 | | /*r6=R6=F'-B''*/ \ |
456 | | "psubw %%mm5,%%mm6\n\t" \ |
457 | | /*r3=NR3*/ \ |
458 | | "psraw $4,%%mm3\n\t" \ |
459 | | "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ |
460 | | /*r5=B''+B''*/ \ |
461 | | "paddw %%mm5,%%mm5\n\t" \ |
462 | | /*r5=R5=F'+B''*/ \ |
463 | | "paddw %%mm6,%%mm5\n\t" \ |
464 | | /*r6=NR6*/ \ |
465 | | "psraw $4,%%mm6\n\t" \ |
466 | | /*Store NR4 at J(4).*/ \ |
467 | | "movq %%mm4,"OC_J(4,_y)"\n\t" \ |
468 | | /*r5=NR5*/ \ |
469 | | "psraw $4,%%mm5\n\t" \ |
470 | | /*Store NR3 at I(3).*/ \ |
471 | | "movq %%mm3,"OC_I(3,_y)"\n\t" \ |
472 | | /*r7=R7=G'-C'*/ \ |
473 | | "psubw %%mm0,%%mm7\n\t" \ |
474 | | "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ |
475 | | /*r0=C'+C'*/ \ |
476 | | "paddw %%mm0,%%mm0\n\t" \ |
477 | | /*r0=R0=G'+C'*/ \ |
478 | | "paddw %%mm7,%%mm0\n\t" \ |
479 | | /*r7=NR7*/ \ |
480 | | "psraw $4,%%mm7\n\t" \ |
481 | | /*Store NR6 at J(6).*/ \ |
482 | | "movq %%mm6,"OC_J(6,_y)"\n\t" \ |
483 | | /*r0=NR0*/ \ |
484 | | "psraw $4,%%mm0\n\t" \ |
485 | | /*Store NR5 at J(5).*/ \ |
486 | | "movq %%mm5,"OC_J(5,_y)"\n\t" \ |
487 | | /*Store NR7 at J(7).*/ \ |
488 | | "movq %%mm7,"OC_J(7,_y)"\n\t" \ |
489 | | /*Store NR0 at I(0).*/ \ |
490 | | "movq %%mm0,"OC_I(0,_y)"\n\t" \ |
491 | | "#end OC_COLUMN_IDCT_10\n\t" \ |
492 | | |
493 | 0 | static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
494 | 0 | __asm__ __volatile__( |
495 | 0 | #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) |
496 | 0 | #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) |
497 | 0 | /*Done with dequant, descramble, and partial transpose. |
498 | 0 | Now do the iDCT itself. Unlike oc_idct8x8_slow_mmx there is only one row pass plus transpose (no second pass rebased by 64 bytes), followed by both in-place column passes on _y.*/ |
499 | 0 | OC_ROW_IDCT_10(y,x) |
500 | 0 | OC_TRANSPOSE(y) |
501 | 0 | #undef OC_I |
502 | 0 | #undef OC_J |
503 | 0 | #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) |
504 | 0 | #define OC_J(_k,_y) OC_I(_k,_y) |
505 | 0 | OC_COLUMN_IDCT_10(y) |
506 | 0 | #undef OC_I |
507 | 0 | #undef OC_J |
508 | 0 | #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) |
509 | 0 | #define OC_J(_k,_y) OC_I(_k,_y) |
510 | 0 | OC_COLUMN_IDCT_10(y) |
511 | 0 | #undef OC_I |
512 | 0 | #undef OC_J |
513 | 0 | :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) |
514 | 0 | :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), |
515 | 0 | [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) |
516 | 0 | ); |
517 | 0 | if(_x!=_y){ /*Clear only the part of _x the row pass read: one 8-byte store at each of offsets 0x00, 0x10, 0x20 and 0x30. The operand is read-write (+m) because the 28 elements it spans (up to the end of the last store, assuming 2-byte ogg_int16_t) are not all overwritten.*/ |
518 | 0 | __asm__ __volatile__( |
519 | 0 | "pxor %%mm0,%%mm0\n\t" |
520 | 0 | "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" |
521 | 0 | "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" |
522 | 0 | "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" |
523 | 0 | "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" |
524 | 0 | :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28) |
525 | 0 | ); |
526 | 0 | } |
527 | 0 | } |
528 | | |
529 | | /*Performs an inverse 8x8 Type-II DCT transform, writing the result to _y. |
530 | | The input _x is assumed to be scaled by a factor of 4 relative to the orthonormal |
531 | | version of the transform. Both helpers this dispatches to also zero all or part of _x afterwards when _x!=_y; _last_zzi selects between them as explained in the body.*/ |
532 | 0 | void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ |
533 | 0 | /*_last_zzi is subtly different from an actual count of the number of |
534 | 0 | coefficients we decoded for this block. |
535 | 0 | It contains the value of zzi BEFORE the final token in the block was |
536 | 0 | decoded. |
537 | 0 | In most cases this is an EOB token (the continuation of an EOB run from a |
538 | 0 | previous block counts), and so this is the same as the coefficient count. |
539 | 0 | However, in the case that the last token was NOT an EOB token, but filled |
540 | 0 | the block up with exactly 64 coefficients, _last_zzi will be less than 64. |
541 | 0 | Provided the last token was not a pure zero run, the minimum value it can |
542 | 0 | be is 46, and so that doesn't affect any of the cases in this routine. |
543 | 0 | However, if the last token WAS a pure zero run of length 63, then _last_zzi |
544 | 0 | will be 1 while the number of coefficients decoded is 64. |
545 | 0 | Thus, we will trigger the following special case, where the real |
546 | 0 | coefficient count would not. |
547 | 0 | Note also that a zero run of length 64 will give _last_zzi a value of 0, |
548 | 0 | but we still process the DC coefficient, which might have a non-zero value |
549 | 0 | due to DC prediction. |
550 | 0 | Although convoluted, this is arguably the correct behavior: it allows us to |
551 | 0 | use a smaller transform when the block ends with a long zero run instead |
552 | 0 | of a normal EOB token. |
553 | 0 | It could be smarter... multiple separate zero runs at the end of a block |
554 | 0 | will fool it, but an encoder that generates these really deserves what it |
555 | 0 | gets. |
556 | 0 | Needless to say we inherited this approach from VP3.*/ |
557 | 0 | /*Then perform the iDCT: the reduced 10-coefficient routine when _last_zzi<=10, the full routine otherwise.*/ |
558 | 0 | if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x); |
559 | 0 | else oc_idct8x8_slow_mmx(_y,_x); |
560 | 0 | } |
561 | | |
562 | | #endif |