/src/theora/lib/x86/sse2encfrag.c
Line | Count | Source |
1 | | /******************************************************************** |
2 | | * * |
3 | | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
4 | | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
5 | | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
6 | | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
7 | | * * |
8 | | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
9 | | * by the Xiph.Org Foundation https://www.xiph.org/ * |
10 | | * * |
11 | | ******************************************************************** |
12 | | |
13 | | function: |
14 | | |
15 | | ********************************************************************/ |
16 | | #include <stddef.h> |
17 | | #include "x86enc.h" |
18 | | #include "sse2trans.h" |
19 | | |
20 | | #if defined(OC_X86_ASM) |
21 | | |
22 | | /*Load a 4x8 array of pixel values (4 rows of 8 pixels) from %[src] and %[ref] and compute their |
23 | | 16-bit differences. |
24 | | On output, these are stored in _m0, xmm1, xmm2, and xmm3, and %[src] and %[ref] have both been advanced by 4*%[ystride]. |
25 | | xmm4 and xmm5 are clobbered; %[ystride3] is the offset of the fourth row (the caller in this file passes 3*%[ystride]).*/ |
26 | | #define OC_LOAD_SUB_4x8(_m0) \ |
27 | | "#OC_LOAD_SUB_4x8\n\t" \ |
28 | | /*Load the first three rows.*/ \ |
29 | | "movq (%[src]),"_m0"\n\t" \ |
30 | | "movq (%[ref]),%%xmm4\n\t" \ |
31 | | "movq (%[src],%[ystride]),%%xmm1\n\t" \ |
32 | | "movq (%[ref],%[ystride]),%%xmm3\n\t" \ |
33 | | "movq (%[src],%[ystride],2),%%xmm2\n\t" \ |
34 | | "movq (%[ref],%[ystride],2),%%xmm5\n\t" \ |
35 | | /*Unpack and subtract: interleaving src with ref, and ref with itself, puts the same high byte in both words, so psubw leaves src-ref.*/ \ |
36 | | "punpcklbw %%xmm4,"_m0"\n\t" \ |
37 | | "punpcklbw %%xmm4,%%xmm4\n\t" \ |
38 | | "punpcklbw %%xmm3,%%xmm1\n\t" \ |
39 | | "punpcklbw %%xmm3,%%xmm3\n\t" \ |
40 | | "psubw %%xmm4,"_m0"\n\t" \ |
41 | | "psubw %%xmm3,%%xmm1\n\t" \ |
42 | | /*Load the last row.*/ \ |
43 | | "movq (%[src],%[ystride3]),%%xmm3\n\t" \ |
44 | | "movq (%[ref],%[ystride3]),%%xmm4\n\t" \ |
45 | | /*Unpack, subtract, and advance the pointers.*/ \ |
46 | | "punpcklbw %%xmm5,%%xmm2\n\t" \ |
47 | | "punpcklbw %%xmm5,%%xmm5\n\t" \ |
48 | | "lea (%[src],%[ystride],4),%[src]\n\t" \ |
49 | | "psubw %%xmm5,%%xmm2\n\t" \ |
50 | | "punpcklbw %%xmm4,%%xmm3\n\t" \ |
51 | | "punpcklbw %%xmm4,%%xmm4\n\t" \ |
52 | | "lea (%[ref],%[ystride],4),%[ref]\n\t" \ |
53 | | "psubw %%xmm4,%%xmm3\n\t" \ |
54 | | |
55 | | /*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3. |
56 | | On output, _m0 contains the sum of two of the rows (its own and xmm1's), and the other two are |
57 | | added to xmm7 (so when _m0 is xmm7 itself, all four rows end up in xmm7).*/ |
58 | | #define OC_SSD_4x8(_m0) \ |
59 | | "pmaddwd "_m0","_m0"\n\t" \ |
60 | | "pmaddwd %%xmm1,%%xmm1\n\t" \ |
61 | | "pmaddwd %%xmm2,%%xmm2\n\t" \ |
62 | | "pmaddwd %%xmm3,%%xmm3\n\t" \ |
63 | | "paddd %%xmm1,"_m0"\n\t" \ |
64 | | "paddd %%xmm3,%%xmm2\n\t" \ |
65 | | "paddd %%xmm2,%%xmm7\n\t" \ |
66 | | |
67 | | unsigned __attribute__((target("sse2"))) oc_enc_frag_ssd_sse2(const unsigned char *_src, |
68 | 2.07M | const unsigned char *_ref,int _ystride){ |
69 | 2.07M | unsigned ret; |
70 | 2.07M | __asm__ __volatile__( |
71 | 2.07M | OC_LOAD_SUB_4x8("%%xmm7") |
72 | 2.07M | OC_SSD_4x8("%%xmm7") |
73 | 2.07M | OC_LOAD_SUB_4x8("%%xmm0") |
74 | 2.07M | OC_SSD_4x8("%%xmm0") |
75 | 2.07M | "paddd %%xmm0,%%xmm7\n\t" |
76 | 2.07M | "movdqa %%xmm7,%%xmm6\n\t" |
77 | 2.07M | "punpckhqdq %%xmm7,%%xmm7\n\t" |
78 | 2.07M | "paddd %%xmm6,%%xmm7\n\t" |
79 | 2.07M | "pshufd $1,%%xmm7,%%xmm6\n\t" |
80 | 2.07M | "paddd %%xmm6,%%xmm7\n\t" |
81 | 2.07M | "movd %%xmm7,%[ret]\n\t" |
82 | 2.07M | :[ret]"=a"(ret) |
83 | 2.07M | :[src]"r"(_src),[ref]"r"(_ref),[ystride]"r"((ptrdiff_t)_ystride), |
84 | 2.07M | [ystride3]"r"((ptrdiff_t)_ystride*3) |
85 | 2.07M | :"%xmm0", "%xmm1", "%xmm2", "%xmm3", |
86 | 2.07M | "%xmm4", "%xmm5", "%xmm6", "%xmm7" |
87 | 2.07M | ); |
88 | 2.07M | return ret; |
89 | 2.07M | } |
90 | | |
/*Single-bit selectors: entry i has only bit i set.
  The array is declared with 16-byte alignment.*/
static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
  1<<0,1<<1,1<<2,1<<3,
  1<<4,1<<5,1<<6,1<<7
};
94 | | |
95 | | /*Load a 2x8 array of pixel values from %[src] and %[ref], zero the pixels whose mask bit is clear in both, and compute |
96 | | their 16-bit differences in xmm0 (first row) and xmm1 (second row); xmm2, xmm3, and xmm4 are clobbered. |
97 | | %[cr] must contain OC_MASK_CONSTS[0...7]; %[m] holds the 16 mask bits (low byte for the first row, high byte for the second) and has its low byte overwritten. The macro itself does not touch %[mr].*/ |
98 | | #define OC_LOAD_SUB_MASK_2x8 \ |
99 | | "#OC_LOAD_SUB_MASK_2x8\n\t" \ |
100 | | /*Start the loads and expand the next 8 bits of the mask.*/ \ |
101 | | "shl $8,%[m]\n\t" \ |
102 | | "movq (%[src]),%%xmm0\n\t" \ |
103 | | "mov %h[m],%b[m]\n\t" \ |
104 | | "movq (%[ref]),%%xmm2\n\t" \ |
105 | | "movd %[m],%%xmm4\n\t" \ |
106 | | "shr $8,%[m]\n\t" \ |
107 | | "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \ |
108 | | "mov %h[m],%b[m]\n\t" \ |
109 | | "pand %[cr],%%xmm4\n\t" \ |
110 | | "pcmpeqb %[cr],%%xmm4\n\t" \ |
111 | | /*Perform the masking.*/ \ |
112 | | "pand %%xmm4,%%xmm0\n\t" \ |
113 | | "pand %%xmm4,%%xmm2\n\t" \ |
114 | | /*Finish the loads while unpacking the first set of rows, and expand the next |
115 | | 8 bits of the mask.*/ \ |
116 | | "movd %[m],%%xmm4\n\t" \ |
117 | | "movq (%[src],%[ystride]),%%xmm1\n\t" \ |
118 | | "pshuflw $0x00,%%xmm4,%%xmm4\n\t" \ |
119 | | "movq (%[ref],%[ystride]),%%xmm3\n\t" \ |
120 | | "pand %[cr],%%xmm4\n\t" \ |
121 | | "punpcklbw %%xmm2,%%xmm0\n\t" \ |
122 | | "pcmpeqb %[cr],%%xmm4\n\t" \ |
123 | | "punpcklbw %%xmm2,%%xmm2\n\t" \ |
124 | | /*Mask and unpack the second set of rows.*/ \ |
125 | | "pand %%xmm4,%%xmm1\n\t" \ |
126 | | "pand %%xmm4,%%xmm3\n\t" \ |
127 | | "punpcklbw %%xmm3,%%xmm1\n\t" \ |
128 | | "punpcklbw %%xmm3,%%xmm3\n\t" \ |
129 | | "psubw %%xmm2,%%xmm0\n\t" \ |
130 | | "psubw %%xmm3,%%xmm1\n\t" \ |
131 | | |
132 | | unsigned __attribute__((target("sse2"))) oc_enc_frag_border_ssd_sse2(const unsigned char *_src, |
133 | 1.45M | const unsigned char *_ref,int _ystride,ogg_int64_t _mask){ |
134 | 1.45M | ptrdiff_t ystride; |
135 | 1.45M | unsigned ret; |
136 | 1.45M | int i; |
137 | 1.45M | ystride=_ystride; |
138 | | /*Store intermediate values across __asm__ blocks*/ |
139 | 1.45M | register sse2_reg cr; |
140 | 1.45M | register sse2_reg mr; |
141 | 1.45M | __asm__ __volatile__( |
142 | 1.45M | "pxor %[mr],%[mr]\n\t" |
143 | 1.45M | "movq %[c],%[cr]\n\t" |
144 | 1.45M | :[cr]"=x"(cr), [mr]"=x"(mr) |
145 | 1.45M | :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8)) |
146 | 1.45M | ); |
147 | 7.27M | for(i=0;i<4;i++){ |
148 | 5.82M | unsigned m; |
149 | 5.82M | m=_mask&0xFFFF; |
150 | 5.82M | _mask>>=16; |
151 | 5.82M | if(m){ |
152 | 3.43M | __asm__ __volatile__( |
153 | 3.43M | OC_LOAD_SUB_MASK_2x8 |
154 | 3.43M | "pmaddwd %%xmm0,%%xmm0\n\t" |
155 | 3.43M | "pmaddwd %%xmm1,%%xmm1\n\t" |
156 | 3.43M | "paddd %%xmm0,%[mr]\n\t" |
157 | 3.43M | "paddd %%xmm1,%[mr]\n\t" |
158 | 3.43M | :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(ystride),[m]"+Q"(m),[mr]"+x"(mr) |
159 | 3.43M | :[cr]"x"(cr) |
160 | 3.43M | :"%xmm0", "%xmm1", "%xmm2", "%xmm3", |
161 | 3.43M | "%xmm4"/*, "%xmm5", "%xmm6", "%xmm7"*/ |
162 | 3.43M | ); |
163 | 3.43M | } |
164 | 5.82M | _src+=2*ystride; |
165 | 5.82M | _ref+=2*ystride; |
166 | 5.82M | } |
167 | 1.45M | __asm__ __volatile__( |
168 | 1.45M | "movdqa %[mr],%%xmm6\n\t" |
169 | 1.45M | "punpckhqdq %[mr],%%xmm7\n\t" |
170 | 1.45M | "paddd %%xmm6,%%xmm7\n\t" |
171 | 1.45M | "pshufd $1,%%xmm7,%%xmm6\n\t" |
172 | 1.45M | "paddd %%xmm6,%%xmm7\n\t" |
173 | 1.45M | "movd %%xmm7,%[ret]\n\t" |
174 | 1.45M | :[ret]"=a"(ret) |
175 | 1.45M | :[mr]"x"(mr) |
176 | 1.45M | :"%xmm6", "%xmm7" |
177 | 1.45M | ); |
178 | 1.45M | return ret; |
179 | 1.45M | } |
180 | | |
181 | | |
182 | | /*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their |
183 | | 16-bit difference in %%xmm0...%%xmm7. Side effects: the slot at offset 0x00 of %[buf] is used to spill xmm0, %[src] and %[ref] are advanced, and %[src_ystride] and %[ref_ystride] are left negated.*/ |
184 | | #define OC_LOAD_SUB_8x8 \ |
185 | | "#OC_LOAD_SUB_8x8\n\t" \ |
186 | | "movq (%[src]),%%xmm0\n\t" \ |
187 | | "movq (%[ref]),%%xmm4\n\t" \ |
188 | | "movq (%[src],%[src_ystride]),%%xmm1\n\t" \ |
189 | | "lea (%[src],%[src_ystride],2),%[src]\n\t" \ |
190 | | "movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \ |
191 | | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ |
192 | | "movq (%[src]),%%xmm2\n\t" \ |
193 | | "movq (%[ref]),%%xmm7\n\t" \ |
194 | | "movq (%[src],%[src_ystride]),%%xmm3\n\t" \ |
195 | | "movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \ |
196 | | "punpcklbw %%xmm4,%%xmm0\n\t" \ |
197 | | "lea (%[src],%[src_ystride],2),%[src]\n\t" \ |
198 | | "punpcklbw %%xmm4,%%xmm4\n\t" \ |
199 | | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ |
200 | | "psubw %%xmm4,%%xmm0\n\t" \ |
201 | | "movq (%[src]),%%xmm4\n\t" \ |
202 | | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
203 | | "movq (%[ref]),%%xmm0\n\t" \ |
204 | | "punpcklbw %%xmm5,%%xmm1\n\t" \ |
205 | | "punpcklbw %%xmm5,%%xmm5\n\t" \ |
206 | | "psubw %%xmm5,%%xmm1\n\t" \ |
207 | | "movq (%[src],%[src_ystride]),%%xmm5\n\t" \ |
208 | | "punpcklbw %%xmm7,%%xmm2\n\t" \ |
209 | | "punpcklbw %%xmm7,%%xmm7\n\t" \ |
210 | | "psubw %%xmm7,%%xmm2\n\t" \ |
211 | | "movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \ |
212 | | "punpcklbw %%xmm6,%%xmm3\n\t" \ |
213 | | "lea (%[src],%[src_ystride],2),%[src]\n\t" \ |
214 | | "punpcklbw %%xmm6,%%xmm6\n\t" \ |
215 | | "psubw %%xmm6,%%xmm3\n\t" \ |
216 | | "movq (%[src]),%%xmm6\n\t" \ |
217 | | "punpcklbw %%xmm0,%%xmm4\n\t" \ |
218 | | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ |
219 | | "punpcklbw %%xmm0,%%xmm0\n\t" \ |
220 | | "lea (%[src],%[src_ystride],2),%[src]\n\t" \ |
221 | | "psubw %%xmm0,%%xmm4\n\t" \ |
222 | | "movq (%[ref]),%%xmm0\n\t" \ |
223 | | "punpcklbw %%xmm7,%%xmm5\n\t" \ |
224 | | "neg %[src_ystride]\n\t" \ |
225 | | "punpcklbw %%xmm7,%%xmm7\n\t" \ |
226 | | "psubw %%xmm7,%%xmm5\n\t" \ |
227 | | "movq (%[src],%[src_ystride]),%%xmm7\n\t" \ |
228 | | "punpcklbw %%xmm0,%%xmm6\n\t" \ |
229 | | "lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \ |
230 | | "punpcklbw %%xmm0,%%xmm0\n\t" \ |
231 | | "neg %[ref_ystride]\n\t" \ |
232 | | "psubw %%xmm0,%%xmm6\n\t" \ |
233 | | "movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \ |
234 | | "punpcklbw %%xmm0,%%xmm7\n\t" \ |
235 | | "punpcklbw %%xmm0,%%xmm0\n\t" \ |
236 | | "psubw %%xmm0,%%xmm7\n\t" \ |
237 | | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \ |
238 | | |
239 | | /*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7, zero-extending each pixel to a 16-bit word. Rows 0...3 are read relative to %[src] and rows 4...7 relative to %[src4], using the offsets %[ystride] and %[ystride3]; none of these operands is modified.*/ |
240 | | #define OC_LOAD_8x8 \ |
241 | | "#OC_LOAD_8x8\n\t" \ |
242 | | "movq (%[src]),%%xmm0\n\t" \ |
243 | | "movq (%[src],%[ystride]),%%xmm1\n\t" \ |
244 | | "movq (%[src],%[ystride],2),%%xmm2\n\t" \ |
245 | | "pxor %%xmm7,%%xmm7\n\t" \ |
246 | | "movq (%[src],%[ystride3]),%%xmm3\n\t" \ |
247 | | "punpcklbw %%xmm7,%%xmm0\n\t" \ |
248 | | "movq (%[src4]),%%xmm4\n\t" \ |
249 | | "punpcklbw %%xmm7,%%xmm1\n\t" \ |
250 | | "movq (%[src4],%[ystride]),%%xmm5\n\t" \ |
251 | | "punpcklbw %%xmm7,%%xmm2\n\t" \ |
252 | | "movq (%[src4],%[ystride],2),%%xmm6\n\t" \ |
253 | | "punpcklbw %%xmm7,%%xmm3\n\t" \ |
254 | | "movq (%[src4],%[ystride3]),%%xmm7\n\t" \ |
255 | | "punpcklbw %%xmm4,%%xmm4\n\t" \ |
256 | | "punpcklbw %%xmm5,%%xmm5\n\t" \ |
257 | | "psrlw $8,%%xmm4\n\t" \ |
258 | | "psrlw $8,%%xmm5\n\t" \ |
259 | | "punpcklbw %%xmm6,%%xmm6\n\t" \ |
260 | | "punpcklbw %%xmm7,%%xmm7\n\t" \ |
261 | | "psrlw $8,%%xmm6\n\t" \ |
262 | | "psrlw $8,%%xmm7\n\t" \ |
263 | | |
264 | | /*Performs the first two stages of an 8-point 1-D Hadamard transform in place on the 16-bit words of xmm0...xmm7. |
265 | | Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to |
266 | | perform this stage in place with no temporary registers): each butterfly is computed as a+=b; b+=b; b-=a, which leaves b-a rather than a-b.*/ |
267 | | #define OC_HADAMARD_AB_8x8 \ |
268 | | "#OC_HADAMARD_AB_8x8\n\t" \ |
269 | | /*Stage A:*/ \ |
270 | | "paddw %%xmm5,%%xmm1\n\t" \ |
271 | | "paddw %%xmm6,%%xmm2\n\t" \ |
272 | | "paddw %%xmm5,%%xmm5\n\t" \ |
273 | | "paddw %%xmm6,%%xmm6\n\t" \ |
274 | | "psubw %%xmm1,%%xmm5\n\t" \ |
275 | | "psubw %%xmm2,%%xmm6\n\t" \ |
276 | | "paddw %%xmm7,%%xmm3\n\t" \ |
277 | | "paddw %%xmm4,%%xmm0\n\t" \ |
278 | | "paddw %%xmm7,%%xmm7\n\t" \ |
279 | | "paddw %%xmm4,%%xmm4\n\t" \ |
280 | | "psubw %%xmm3,%%xmm7\n\t" \ |
281 | | "psubw %%xmm0,%%xmm4\n\t" \ |
282 | | /*Stage B:*/ \ |
283 | | "paddw %%xmm2,%%xmm0\n\t" \ |
284 | | "paddw %%xmm3,%%xmm1\n\t" \ |
285 | | "paddw %%xmm6,%%xmm4\n\t" \ |
286 | | "paddw %%xmm7,%%xmm5\n\t" \ |
287 | | "paddw %%xmm2,%%xmm2\n\t" \ |
288 | | "paddw %%xmm3,%%xmm3\n\t" \ |
289 | | "paddw %%xmm6,%%xmm6\n\t" \ |
290 | | "paddw %%xmm7,%%xmm7\n\t" \ |
291 | | "psubw %%xmm0,%%xmm2\n\t" \ |
292 | | "psubw %%xmm1,%%xmm3\n\t" \ |
293 | | "psubw %%xmm4,%%xmm6\n\t" \ |
294 | | "psubw %%xmm5,%%xmm7\n\t" \ |
295 | | |
296 | | /*Performs the last stage of an 8-point 1-D Hadamard transform in place, pairing xmm0/xmm1, xmm2/xmm3, xmm4/xmm5, and xmm6/xmm7. |
297 | | Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in |
298 | | place with no temporary registers).*/ |
299 | | #define OC_HADAMARD_C_8x8 \ |
300 | | "#OC_HADAMARD_C_8x8\n\t" \ |
301 | | /*Stage C:*/ \ |
302 | | "paddw %%xmm1,%%xmm0\n\t" \ |
303 | | "paddw %%xmm3,%%xmm2\n\t" \ |
304 | | "paddw %%xmm5,%%xmm4\n\t" \ |
305 | | "paddw %%xmm7,%%xmm6\n\t" \ |
306 | | "paddw %%xmm1,%%xmm1\n\t" \ |
307 | | "paddw %%xmm3,%%xmm3\n\t" \ |
308 | | "paddw %%xmm5,%%xmm5\n\t" \ |
309 | | "paddw %%xmm7,%%xmm7\n\t" \ |
310 | | "psubw %%xmm0,%%xmm1\n\t" \ |
311 | | "psubw %%xmm2,%%xmm3\n\t" \ |
312 | | "psubw %%xmm4,%%xmm5\n\t" \ |
313 | | "psubw %%xmm6,%%xmm7\n\t" \ |
314 | | |
315 | | /*Performs an 8-point 1-D Hadamard transform in place on xmm0...xmm7, by running stages A and B followed by stage C. |
316 | | Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform |
317 | | in place with no temporary registers).*/ |
318 | | #define OC_HADAMARD_8x8 \ |
319 | | OC_HADAMARD_AB_8x8 \ |
320 | | OC_HADAMARD_C_8x8 \ |
321 | | |
322 | | /*Performs the first part of the final stage of the Hadamard transform and |
323 | | summing of absolute values. |
324 | | At the end of this part, %%xmm1 will contain the DC coefficient of the |
325 | | transform. xmm6 and xmm7 are spilled to offsets 0x00 and 0x10 of %[buf] on entry and reloaded into xmm5 and xmm3 at the end.*/ |
326 | | #define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \ |
327 | | /*We use the fact that \ |
328 | | (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \ |
329 | | to merge the final butterfly with the abs and the first stage of \ |
330 | | accumulation. \ |
331 | | Thus we can avoid using pabsw, which is not available until SSSE3. \ |
332 | | Emulating pabsw takes 3 instructions, so the straightforward SSE2 \ |
333 | | implementation would be (3+3)*8+7=55 instructions (+4 for spilling \ |
334 | | registers). \ |
335 | | Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \ |
336 | | This implementation is only 26 (+4 for spilling registers).*/ \ |
337 | | "#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \ |
338 | | "movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
339 | | "movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
340 | | /*xmm7={0x7FFF}x8 \ |
341 | | xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \ |
342 | | "pcmpeqb %%xmm7,%%xmm7\n\t" \ |
343 | | "movdqa %%xmm4,%%xmm6\n\t" \ |
344 | | "psrlw $1,%%xmm7\n\t" \ |
345 | | "paddw %%xmm5,%%xmm6\n\t" \ |
346 | | "pmaxsw %%xmm5,%%xmm4\n\t" \ |
347 | | "paddsw %%xmm7,%%xmm6\n\t" \ |
348 | | "psubw %%xmm6,%%xmm4\n\t" \ |
349 | | /*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \ |
350 | | xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \ |
351 | | "movdqa %%xmm2,%%xmm6\n\t" \ |
352 | | "movdqa %%xmm0,%%xmm5\n\t" \ |
353 | | "pmaxsw %%xmm3,%%xmm2\n\t" \ |
354 | | "pmaxsw %%xmm1,%%xmm0\n\t" \ |
355 | | "paddw %%xmm3,%%xmm6\n\t" \ |
356 | | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \ |
357 | | "paddw %%xmm5,%%xmm1\n\t" \ |
358 | | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \ |
359 | | |
360 | | /*Performs the second part of the final stage of the Hadamard transform and |
361 | | summing of absolute values. The running total is left in xmm0, and xmm7 is left holding 1 in each 16-bit word.*/ |
362 | | #define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \ |
363 | | "#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \ |
364 | | "paddsw %%xmm7,%%xmm6\n\t" \ |
365 | | "paddsw %%xmm7,%%xmm1\n\t" \ |
366 | | "psubw %%xmm6,%%xmm2\n\t" \ |
367 | | "psubw %%xmm1,%%xmm0\n\t" \ |
368 | | /*xmm7={1}x8 (needed for the horizontal add that follows) \ |
369 | | xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \ |
370 | | "movdqa %%xmm3,%%xmm6\n\t" \ |
371 | | "pmaxsw %%xmm5,%%xmm3\n\t" \ |
372 | | "paddw %%xmm2,%%xmm0\n\t" \ |
373 | | "paddw %%xmm5,%%xmm6\n\t" \ |
374 | | "paddw %%xmm4,%%xmm0\n\t" \ |
375 | | "paddsw %%xmm7,%%xmm6\n\t" \ |
376 | | "paddw %%xmm3,%%xmm0\n\t" \ |
377 | | "psrlw $14,%%xmm7\n\t" \ |
378 | | "psubw %%xmm6,%%xmm0\n\t" \ |
379 | | |
380 | | /*Performs the last stage of an 8-point 1-D Hadamard transform, takes the |
381 | | absolute value of each component, and accumulates everything into xmm0. This is simply part A followed by part B, for callers that do not need to read the DC coefficient in between.*/ |
382 | | #define OC_HADAMARD_C_ABS_ACCUM_8x8 \ |
383 | | OC_HADAMARD_C_ABS_ACCUM_A_8x8 \ |
384 | | OC_HADAMARD_C_ABS_ACCUM_B_8x8 \ |
385 | | |
386 | | /*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each |
387 | | component, and accumulates everything into xmm0. |
388 | | Note that xmm0 will have an extra 4 added to each column, and that after |
389 | | removing this value, the remainder will be half the conventional value. The SATD functions visible in this file do not use this macro; they expand the stages separately so they can read the DC coefficient part-way through.*/ |
390 | | #define OC_HADAMARD_ABS_ACCUM_8x8 \ |
391 | | OC_HADAMARD_AB_8x8 \ |
392 | | OC_HADAMARD_C_ABS_ACCUM_8x8 |
393 | | |
/*Computes the sum of absolute values of the 2-D Hadamard transform of the
   difference between an 8x8 block of _src and an 8x8 block of _ref, with
   the absolute value of the DC coefficient removed from the total.
  _dc:          Receives the DC coefficient, sign-extended from 16 bits.
  _src:         The top-left pixel of the source block.
  _src_ystride: The offset in bytes between rows of _src.
  _ref:         The top-left pixel of the reference block.
  _ref_ystride: The offset in bytes between rows of _ref.
  Return: The corrected sum (see the comment before the final lea).*/
394 | | static unsigned __attribute__((target("sse2"))) oc_int_frag_satd_sse2(int *_dc, |
395 | | const unsigned char *_src,int _src_ystride, |
396 | 29.2M | const unsigned char *_ref,int _ref_ystride){ |
397 | 29.2M | OC_ALIGN16(ogg_int16_t buf[16]); |
398 | 29.2M | unsigned ret; |
399 | 29.2M | unsigned ret2; |
400 | 29.2M | int dc; |
401 | 29.2M | __asm__ __volatile__( |
402 | 29.2M | OC_LOAD_SUB_8x8 |
403 | 29.2M | OC_HADAMARD_8x8 |
404 | 29.2M | OC_TRANSPOSE_8x8 |
405 | | /*We split out the stages here so we can save the DC coefficient in the |
406 | | middle.*/ |
407 | 29.2M | OC_HADAMARD_AB_8x8 |
408 | 29.2M | OC_HADAMARD_C_ABS_ACCUM_A_8x8 |
409 | 29.2M | "movd %%xmm1,%[dc]\n\t" |
410 | 29.2M | OC_HADAMARD_C_ABS_ACCUM_B_8x8 |
411 | | /*Up to this point, everything fit in 16 bits (8 input + 1 for the |
412 | | difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 |
413 | | for the factor of two we dropped + 3 for the vertical accumulation). |
414 | | Now we finally have to promote things to dwords. |
415 | | We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long |
416 | | latency of pmaddwd by starting to compute abs(dc) here: movsx sign-extends the low word of %[dc], and cdq then fills %[ret2] (edx) with its sign, since %[dc] is pinned to eax.*/ |
417 | 29.2M | "pmaddwd %%xmm7,%%xmm0\n\t" |
418 | 29.2M | "movsx %w[dc],%[dc]\n\t" |
419 | 29.2M | "cdq\n\t" |
420 | 29.2M | "movdqa %%xmm0,%%xmm1\n\t" |
421 | 29.2M | "punpckhqdq %%xmm0,%%xmm0\n\t" |
422 | 29.2M | "paddd %%xmm1,%%xmm0\n\t" |
423 | 29.2M | "pshuflw $0xE,%%xmm0,%%xmm1\n\t" |
424 | 29.2M | "paddd %%xmm1,%%xmm0\n\t" |
425 | 29.2M | "movd %%xmm0,%[ret]\n\t" |
426 | | /*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4 |
427 | | added to them, a factor of two removed, and the DC value included; |
428 | | correct the final sum here. With s=%[ret2] being the sign mask of dc, the three instructions compute 2*ret-64+s-(dc^s), i.e. 2*ret-64-abs(dc).*/ |
429 | 29.2M | "lea -64(%[ret2],%[ret],2),%[ret]\n\t" |
430 | 29.2M | "xor %[dc],%[ret2]\n\t" |
431 | 29.2M | "sub %[ret2],%[ret]\n\t" |
432 | | /*Although it looks like we're using 7 registers here, gcc can alias %[ret] |
433 | | and %[dc] with some of the inputs, since for once we don't write to |
434 | | them until after we're done using everything but %[buf].*/ |
435 | | /*Note that _src_ystride and _ref_ystride must be given non-overlapping |
436 | | constraints, otherwise if gcc can prove they're equal it will allocate |
437 | | them to the same register (which is bad); _src and _ref face a similar |
438 | | problem. |
439 | | All four are destructively modified, but if we list them as output |
440 | | constraints, gcc can't alias them with other outputs. |
    NOTE(review): %[ref] and %[ref_ystride] share eax/edx with the %[dc] and
     %[ret2] outputs, but %[src] ("S") and %[src_ystride] ("c") are modified
     while declared input-only, which GCC's extended-asm rules do not allow;
     this looks safe only because neither value is read again after the asm
     in the callers visible here - confirm before reusing this pattern.*/ |
441 | 29.2M | :[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc), |
442 | 29.2M | [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)) |
443 | 29.2M | :[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride), |
444 | 29.2M | [ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride) |
445 | | /*We have to use neg, so we actually clobber the condition codes for once |
446 | | (not to mention sub, and add).*/ |
447 | 29.2M | :"cc", |
448 | 29.2M | "%xmm0", "%xmm1", "%xmm2", "%xmm3", |
449 | 29.2M | "%xmm4", "%xmm5", "%xmm6", "%xmm7" |
450 | 29.2M | ); |
451 | 29.2M | *_dc=dc; |
452 | 29.2M | return ret; |
453 | 29.2M | } |
454 | | |
/*SATD of an 8x8 source block against a single reference block, where both
   blocks use the same row stride.
  _dc:      Receives the DC coefficient reported by oc_int_frag_satd_sse2().
  _src:     The top-left pixel of the source block.
  _ref:     The top-left pixel of the reference block.
  _ystride: The offset in bytes between rows, used for both blocks.
  Return: The value returned by oc_int_frag_satd_sse2().*/
unsigned __attribute__((target("sse2"))) oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned satd;
  /*The shared stride is passed for both the source and the reference.*/
  satd=oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
  return satd;
}
459 | | |
460 | | unsigned __attribute__((target("sse2"))) oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src, |
461 | 14.6M | const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){ |
462 | 14.6M | OC_ALIGN8(unsigned char ref[64]); |
463 | 14.6M | oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride); |
464 | 14.6M | return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8); |
465 | 14.6M | } |
466 | | |
467 | | unsigned __attribute__((target("sse2"))) oc_enc_frag_intra_satd_sse2(int *_dc, |
468 | 21.1M | const unsigned char *_src,int _ystride){ |
469 | 21.1M | OC_ALIGN16(ogg_int16_t buf[16]); |
470 | 21.1M | unsigned ret; |
471 | 21.1M | int dc; |
472 | 21.1M | __asm__ __volatile__( |
473 | 21.1M | OC_LOAD_8x8 |
474 | 21.1M | OC_HADAMARD_8x8 |
475 | 21.1M | OC_TRANSPOSE_8x8 |
476 | | /*We split out the stages here so we can save the DC coefficient in the |
477 | | middle.*/ |
478 | 21.1M | OC_HADAMARD_AB_8x8 |
479 | 21.1M | OC_HADAMARD_C_ABS_ACCUM_A_8x8 |
480 | 21.1M | "movd %%xmm1,%[dc]\n\t" |
481 | 21.1M | OC_HADAMARD_C_ABS_ACCUM_B_8x8 |
482 | | /*Up to this point, everything fit in 16 bits (8 input + 1 for the |
483 | | difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1 |
484 | | for the factor of two we dropped + 3 for the vertical accumulation). |
485 | | Now we finally have to promote things to dwords.*/ |
486 | 21.1M | "pmaddwd %%xmm7,%%xmm0\n\t" |
487 | | /*We assume that the DC coefficient is always positive (which is true, |
488 | | because the input to the INTRA transform was not a difference).*/ |
489 | 21.1M | "movzx %w[dc],%[dc]\n\t" |
490 | 21.1M | "movdqa %%xmm0,%%xmm1\n\t" |
491 | 21.1M | "punpckhqdq %%xmm0,%%xmm0\n\t" |
492 | 21.1M | "paddd %%xmm1,%%xmm0\n\t" |
493 | 21.1M | "pshuflw $0xE,%%xmm0,%%xmm1\n\t" |
494 | 21.1M | "paddd %%xmm1,%%xmm0\n\t" |
495 | 21.1M | "movd %%xmm0,%[ret]\n\t" |
496 | 21.1M | "lea -64(%[ret],%[ret]),%[ret]\n\t" |
497 | 21.1M | "sub %[dc],%[ret]\n\t" |
498 | | /*Although it looks like we're using 7 registers here, gcc can alias %[ret] |
499 | | and %[dc] with some of the inputs, since for once we don't write to |
500 | | them until after we're done using everything but %[buf].*/ |
501 | 21.1M | :[ret]"=a"(ret),[dc]"=r"(dc), |
502 | 21.1M | [buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)) |
503 | 21.1M | :[src]"r"(_src),[src4]"r"(_src+4*_ystride), |
504 | 21.1M | [ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride) |
505 | | /*We have to use sub, so we actually clobber the condition codes for once.*/ |
506 | 21.1M | :"cc", |
507 | 21.1M | "%xmm0", "%xmm1", "%xmm2", "%xmm3", |
508 | 21.1M | "%xmm4", "%xmm5", "%xmm6", "%xmm7" |
509 | 21.1M | ); |
510 | 21.1M | *_dc=dc; |
511 | 21.1M | return ret; |
512 | 21.1M | } |
513 | | |
514 | | #endif |