/src/theora/lib/x86/x86enquant.c
Line | Count | Source |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
10 | | * * |
11 | | ******************************************************************** |
12 | | |
13 | | function: |
14 | | last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $ |
15 | | |
16 | | ********************************************************************/ |
17 | | |
18 | | #include "x86enc.h" |
19 | | |
20 | | #if defined(OC_X86_ASM) |
21 | | |
22 | | |
23 | | |
24 | | /*The default enquant table is not quite suitable for SIMD purposes. |
25 | | First, the m and l parameters need to be separated so that an entire row full |
26 | | of m's or l's can be loaded at a time. |
27 | | Second, SSE2 has no arithmetic right-shift with a per-element shift count, so |
28 | | we have to emulate one with a multiply. |
29 | | Therefore we translate the shift count into a scale factor. |
 | | _enquant: Output table, written as 128 ogg_int16_t values: the 64 m values |
 | | at indices 0...63, followed by the 64 scale factors at indices 64...127. |
 | | _dequant: The 64 values handed one at a time to oc_iquant_init() to obtain |
 | | each m and l.*/ |
30 | | void oc_enc_enquant_table_init_x86(void *_enquant, |
31 | 1.32M | const ogg_uint16_t _dequant[64]){ |
32 | 1.32M | ogg_int16_t *m; |
33 | 1.32M | ogg_int16_t *l; |
34 | 1.32M | int zzi; |
35 | 1.32M | m=(ogg_int16_t *)_enquant; |
36 | 1.32M | l=m+64; |
37 | 86.0M | for(zzi=0;zzi<64;zzi++){ |
38 | 84.6M | oc_iquant q; |
39 | 84.6M | oc_iquant_init(&q,_dequant[zzi]); |
40 | 84.6M | m[zzi]=q.m; |
41 | | /*q.l must be at least 2 for this to work; fortunately, once all the scale |
42 | | factors are baked in, the minimum quantizer is much larger than that. |
 | | (With a q.l of 0 or 1 the value below would be 65536 or 32768, which does |
 | | not fit if ogg_int16_t is a signed 16-bit type, as its name suggests.) |
 | | Note that '-' binds tighter than '<<', so this stores 1<<(16-q.l).*/ |
43 | 84.6M | l[zzi]=1<<16-q.l; |
44 | 84.6M | } |
45 | 1.32M | } |
46 | | /*Overwrites entries 0 and 64 of every table with qii>=1 (and qii<_nqis) with |
 | | the corresponding entries of the qii==0 table, for each of the 3 pli and 2 |
 | | qti values. |
 | | In the layout written by oc_enc_enquant_table_init_x86(), entry 0 is the |
 | | first m value and entry 64 is the first scale factor, so afterwards the |
 | | first coefficient uses the same pair in every table of a given pli and qti. |
 | | _enquant: Table pointers indexed as [pli][qii][qti]; each one is treated as |
 | | an array of ogg_int16_t. |
 | | _nqis: The number of qii entries to process; nothing is copied when it |
 | | is 1 or less. |
 | | It is not range-checked here, so a value above 3 would index past |
 | | the declared second dimension.*/ |
47 | 62.0k | void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){ |
48 | 62.0k | int pli; |
49 | 62.0k | int qii; |
50 | 62.0k | int qti; |
51 | 363k | for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){ |
52 | 242k | ((ogg_int16_t *)_enquant[pli][qii][qti])[0]= |
53 | 242k | ((ogg_int16_t *)_enquant[pli][0][qti])[0]; |
54 | 242k | ((ogg_int16_t *)_enquant[pli][qii][qti])[64]= |
55 | 242k | ((ogg_int16_t *)_enquant[pli][0][qti])[64]; |
56 | 242k | } |
57 | 62.0k | } |
58 | | /*Quantizes the 64 coefficients in _dct, stores the results in _qdct, and |
 | | returns the index of the last non-zero quantized value. |
 | | The return value is 0 both when only entry 0 is non-zero and when every |
 | | entry is zero. |
 | | All four buffers are accessed with movdqa, so each of them must be 16-byte |
 | | aligned. |
 | | _qdct: Output: the 64 quantized values. |
 | | _dct: Input: the 64 values to quantize. |
 | | _dequant: 64 values that, with the sign of the input applied, are added to |
 | | twice the input as a rounding offset before the first multiply. |
 | | _enquant: Table in the layout written by oc_enc_enquant_table_init_x86(): |
 | | 64 multipliers at byte offset 0x00 and 64 scale factors at byte |
 | | offset 0x80. |
 | | The registers that hold _enquant and _dequant are reused as scratch once the |
 | | loop is done, hence their "+r" constraints. |
 | | NOTE(review): xmm0...xmm7 are modified but do not appear in the clobber |
 | | list; confirm that this is acceptable for every targeted compiler and ABI.*/ |
59 | | int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64], |
60 | 24.6M | const ogg_uint16_t _dequant[64],const void *_enquant){ |
61 | 24.6M | ptrdiff_t r; |
62 | 24.6M | __asm__ __volatile__( |
63 | 24.6M | "xor %[r],%[r]\n\t" |
64 | | /*Loop through two rows at a time. |
 | | r is the byte offset into each array; one iteration covers 32 bytes (16 |
 | | coefficients), and the loop body runs for r=0, 32, 64 and 96.*/ |
65 | 24.6M | ".p2align 4\n\t" |
66 | 24.6M | "0:\n\t" |
67 | | /*Load the current two rows of the data and the quant matrices (the dequant |
 | | values and the multipliers).*/ |
68 | 24.6M | "movdqa 0x00(%[dct],%[r]),%%xmm0\n\t" |
69 | 24.6M | "movdqa 0x10(%[dct],%[r]),%%xmm1\n\t" |
70 | 24.6M | "movdqa 0x00(%[dq],%[r]),%%xmm2\n\t" |
71 | 24.6M | "movdqa 0x10(%[dq],%[r]),%%xmm3\n\t" |
72 | 24.6M | "movdqa 0x00(%[q],%[r]),%%xmm4\n\t" |
73 | 24.6M | "movdqa 0x10(%[q],%[r]),%%xmm5\n\t" |
74 | | /*Double the input and propagate its sign to the rounding factor. |
75 | | Using SSSE3's psignw would help here, but we need the mask later anyway. |
 | | psraw $15 turns xmm0 and xmm1 into all-ones masks for negative inputs, and |
 | | (dq+mask)^mask negates the dequant value in exactly those lanes.*/ |
76 | 24.6M | "movdqa %%xmm0,%%xmm6\n\t" |
77 | 24.6M | "psraw $15,%%xmm0\n\t" |
78 | 24.6M | "movdqa %%xmm1,%%xmm7\n\t" |
79 | 24.6M | "paddw %%xmm6,%%xmm6\n\t" |
80 | 24.6M | "psraw $15,%%xmm1\n\t" |
81 | 24.6M | "paddw %%xmm7,%%xmm7\n\t" |
82 | 24.6M | "paddw %%xmm0,%%xmm2\n\t" |
83 | 24.6M | "paddw %%xmm1,%%xmm3\n\t" |
84 | 24.6M | "pxor %%xmm0,%%xmm2\n\t" |
85 | 24.6M | "pxor %%xmm1,%%xmm3\n\t" |
86 | | /*Add the rounding factor and perform the first multiply. |
 | | The high half of each product is then added back onto the rounded, doubled |
 | | input; the scale factors for the second multiply are loaded from byte |
 | | offset 0x80 of the table in between.*/ |
87 | 24.6M | "paddw %%xmm2,%%xmm6\n\t" |
88 | 24.6M | "paddw %%xmm3,%%xmm7\n\t" |
89 | 24.6M | "pmulhw %%xmm6,%%xmm4\n\t" |
90 | 24.6M | "pmulhw %%xmm7,%%xmm5\n\t" |
91 | 24.6M | "movdqa 0x80(%[q],%[r]),%%xmm2\n\t" |
92 | 24.6M | "movdqa 0x90(%[q],%[r]),%%xmm3\n\t" |
93 | 24.6M | "paddw %%xmm4,%%xmm6\n\t" |
94 | 24.6M | "paddw %%xmm5,%%xmm7\n\t" |
95 | | /*Emulate an element-wise right-shift via a second multiply.*/ |
96 | 24.6M | "pmulhw %%xmm2,%%xmm6\n\t" |
97 | 24.6M | "pmulhw %%xmm3,%%xmm7\n\t" |
98 | 24.6M | "add $32,%[r]\n\t" |
99 | 24.6M | "cmp $96,%[r]\n\t" |
100 | | /*Correct for the sign. |
 | | Subtracting the all-ones mask adds 1 in the lanes whose input was negative. |
 | | None of the SIMD instructions between the cmp above and the jle below |
 | | modify the flags, so the branch still tests r against 96.*/ |
101 | 24.6M | "psubw %%xmm0,%%xmm6\n\t" |
102 | 24.6M | "psubw %%xmm1,%%xmm7\n\t" |
103 | | /*Save the result. |
 | | r has already been advanced by 32, hence the negative displacements.*/ |
104 | 24.6M | "movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t" |
105 | 24.6M | "movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t" |
106 | 24.6M | "jle 0b\n\t" |
107 | | /*Now find the location of the last non-zero value. |
 | | xmm6 and xmm7 still hold the last two rows; together with the two rows |
 | | before them they are packed to bytes (signed saturation keeps non-zero |
 | | words non-zero) and compared against zero, giving a 32-bit mask in q whose |
 | | bit i is set when value 32+i is zero.*/ |
108 | 24.6M | "movdqa 0x50(%[qdct]),%%xmm5\n\t" |
109 | 24.6M | "movdqa 0x40(%[qdct]),%%xmm4\n\t" |
110 | 24.6M | "packsswb %%xmm7,%%xmm6\n\t" |
111 | 24.6M | "packsswb %%xmm5,%%xmm4\n\t" |
112 | 24.6M | "pxor %%xmm0,%%xmm0\n\t" |
113 | 24.6M | "mov $-1,%k[dq]\n\t" |
114 | 24.6M | "pcmpeqb %%xmm0,%%xmm6\n\t" |
115 | 24.6M | "pcmpeqb %%xmm0,%%xmm4\n\t" |
116 | 24.6M | "pmovmskb %%xmm6,%k[q]\n\t" |
117 | 24.6M | "pmovmskb %%xmm4,%k[r]\n\t" |
118 | 24.6M | "shl $16,%k[q]\n\t" |
119 | 24.6M | "or %k[r],%k[q]\n\t" |
120 | 24.6M | "mov $32,%[r]\n\t" |
121 | | /*We have to use xor here instead of not in order to set the flags. |
 | | dq was loaded with -1 above, so this inverts the mask (set bits now mark |
 | | non-zero values); if any remain, skip ahead with r=32 as the base index. |
 | | Otherwise the same mask is built for values 0...31 with r=0 as the base, |
 | | and bit 0 is forced on so that bsr always has a set bit to find and an |
 | | all-zero block yields 0. |
 | | In both cases bsr picks the highest set bit and its index is added to the |
 | | base in r.*/ |
122 | 24.6M | "xor %k[dq],%k[q]\n\t" |
123 | 24.6M | "jnz 1f\n\t" |
124 | 24.6M | "movdqa 0x30(%[qdct]),%%xmm7\n\t" |
125 | 24.6M | "movdqa 0x20(%[qdct]),%%xmm6\n\t" |
126 | 24.6M | "movdqa 0x10(%[qdct]),%%xmm5\n\t" |
127 | 24.6M | "movdqa 0x00(%[qdct]),%%xmm4\n\t" |
128 | 24.6M | "packsswb %%xmm7,%%xmm6\n\t" |
129 | 24.6M | "packsswb %%xmm5,%%xmm4\n\t" |
130 | 24.6M | "pcmpeqb %%xmm0,%%xmm6\n\t" |
131 | 24.6M | "pcmpeqb %%xmm0,%%xmm4\n\t" |
132 | 24.6M | "pmovmskb %%xmm6,%k[q]\n\t" |
133 | 24.6M | "pmovmskb %%xmm4,%k[r]\n\t" |
134 | 24.6M | "shl $16,%k[q]\n\t" |
135 | 24.6M | "or %k[r],%k[q]\n\t" |
136 | 24.6M | "xor %[r],%[r]\n\t" |
137 | 24.6M | "not %k[q]\n\t" |
138 | 24.6M | "or $1,%k[q]\n\t" |
139 | 24.6M | "1:\n\t" |
140 | 24.6M | "bsr %k[q],%k[q]\n\t" |
141 | 24.6M | "add %k[q],%k[r]\n\t" |
142 | 24.6M | :[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant) |
143 | 24.6M | :[dct]"r"(_dct),[qdct]"r"(_qdct) |
144 | 24.6M | :"cc","memory" |
145 | 24.6M | ); |
146 | 24.6M | return (int)r; |
147 | 24.6M | } |
148 | | |
149 | | #endif |