/src/mozilla-central/gfx/ycbcr/yuv_row_posix.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
2 | | // Use of this source code is governed by a BSD-style license that can be |
3 | | // found in the LICENSE file. |
4 | | |
5 | | #include "yuv_row.h" |
6 | | #include "mozilla/SSE.h" |
7 | | |
8 | | #define DCHECK(a) |
9 | | |
10 | | extern "C" { |
11 | | |
12 | | #if defined(ARCH_CPU_X86_64) |
13 | | |
14 | | // We don't need CPUID guards here, since x86-64 implies SSE2. |
15 | | |
16 | | // AMD64 ABI uses register parameters. |
// x86-64 inline-assembly row converter (no scaling).
//
// Each loop iteration consumes one U byte, one V byte and two Y bytes and
// stores 8 bytes at rgb_buf (4 bytes per output pixel). Every input byte is
// used as an index into kCoefficientsRgbY, whose entries are 8 bytes wide:
//   - Y entries start at byte offset 0,
//   - U entries start at byte offset 2048,
//   - V entries start at byte offset 4096.
// The U and V entries are added with signed saturation (paddsw), that sum is
// added to each Y entry, the 16-bit lanes are arithmetically shifted right by
// 6 and then packed to bytes with unsigned saturation (packuswb).
//
// If one pixel is left over after the paired loop, the tail converts it alone
// and stores 4 bytes.
//
// All five parameters are read-write asm operands, so the pointers and width
// are advanced in place inside the asm statement.
17 | | void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi |
18 | | const uint8* u_buf, // rsi |
19 | | const uint8* v_buf, // rdx |
20 | | uint8* rgb_buf, // rcx |
21 | 0 | int width) { // r8 |
22 | 0 | asm volatile( |
// Jump straight to the loop test at label 1 so that width < 2 skips the
// two-pixel body.
23 | 0 | "jmp 1f\n" |
// Label 0: two-pixel body. r10/r11 hold the U and V bytes first and are then
// reused for the two Y bytes.
24 | 0 | "0:" |
25 | 0 | "movzb (%[u_buf]),%%r10\n" |
26 | 0 | "add $0x1,%[u_buf]\n" |
27 | 0 | "movzb (%[v_buf]),%%r11\n" |
28 | 0 | "add $0x1,%[v_buf]\n" |
29 | 0 | "movq 2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n" |
30 | 0 | "movzb (%[y_buf]),%%r10\n" |
31 | 0 | "movq 4096(%[kCoefficientsRgbY],%%r11,8),%%xmm1\n" |
32 | 0 | "movzb 0x1(%[y_buf]),%%r11\n" |
33 | 0 | "paddsw %%xmm1,%%xmm0\n" |
34 | 0 | "movq (%[kCoefficientsRgbY],%%r10,8),%%xmm2\n" |
35 | 0 | "add $0x2,%[y_buf]\n" |
36 | 0 | "movq (%[kCoefficientsRgbY],%%r11,8),%%xmm3\n" |
37 | 0 | "paddsw %%xmm0,%%xmm2\n" |
38 | 0 | "paddsw %%xmm0,%%xmm3\n" |
// shufps $0x44 places the low 64 bits of xmm3 into the high half of xmm2, so
// both pixels are shifted and packed together.
39 | 0 | "shufps $0x44,%%xmm3,%%xmm2\n" |
40 | 0 | "psraw $0x6,%%xmm2\n" |
41 | 0 | "packuswb %%xmm2,%%xmm2\n" |
42 | 0 | "movq %%xmm2,0x0(%[rgb_buf])\n" |
43 | 0 | "add $0x8,%[rgb_buf]\n" |
// Label 1: loop test. Continue while width - 2 is non-negative.
44 | 0 | "1:" |
45 | 0 | "sub $0x2,%[width]\n" |
46 | 0 | "jns 0b\n" |
47 | 0 |
|
// Label 2: width is now -2 or -1. Adding 1 leaves it non-negative only when
// exactly one pixel remains; otherwise skip to the end (label 3).
48 | 0 | "2:" |
49 | 0 | "add $0x1,%[width]\n" |
50 | 0 | "js 3f\n" |
51 | 0 |
|
// Single trailing pixel: same table lookups, but only 4 bytes are stored.
52 | 0 | "movzb (%[u_buf]),%%r10\n" |
53 | 0 | "movq 2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n" |
54 | 0 | "movzb (%[v_buf]),%%r10\n" |
55 | 0 | "movq 4096(%[kCoefficientsRgbY],%%r10,8),%%xmm1\n" |
56 | 0 | "paddsw %%xmm1,%%xmm0\n" |
57 | 0 | "movzb (%[y_buf]),%%r10\n" |
58 | 0 | "movq (%[kCoefficientsRgbY],%%r10,8),%%xmm1\n" |
59 | 0 | "paddsw %%xmm0,%%xmm1\n" |
60 | 0 | "psraw $0x6,%%xmm1\n" |
61 | 0 | "packuswb %%xmm1,%%xmm1\n" |
62 | 0 | "movd %%xmm1,0x0(%[rgb_buf])\n" |
63 | 0 | "3:" |
// Outputs ("+r"): read and modified by the asm.
64 | 0 | : [y_buf] "+r"(y_buf), |
65 | 0 | [u_buf] "+r"(u_buf), |
66 | 0 | [v_buf] "+r"(v_buf), |
67 | 0 | [rgb_buf] "+r"(rgb_buf), |
68 | 0 | [width] "+r"(width) |
// Input: base address of the coefficient table.
69 | 0 | : [kCoefficientsRgbY] "r" (kCoefficientsRgbY) |
// Clobbers: flags, memory, and every scratch register used above.
70 | 0 | : "cc", "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" |
71 | 0 | ); |
72 | 0 | } |
73 | | |
// x86-64 inline-assembly row converter with point-sampled horizontal scaling.
//
// r11 holds the running source position x, starting at 0 and advanced by
// source_dx once per output pixel. The Y sample index is x >> 16 and the U/V
// sample index is x >> 17, i.e. x is treated as having 16 fractional bits and
// the chroma rows are indexed at half the luma rate.
//
// Two output pixels are produced per loop iteration; they share the U/V
// lookup made at the first pixel's x. Table usage matches the unscaled
// converter: Y entries at byte offset 0 of kCoefficientsRgbY, U entries at
// 2048, V entries at 4096, combined with paddsw, shifted right by 6 and packed
// with unsigned saturation. A single leftover pixel is handled after the loop
// and stores 4 bytes.
//
// Only rgb_buf and width are modified by the asm; the source pointers stay
// fixed and are indexed by the shifted x.
74 | | void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi |
75 | | const uint8* u_buf, // rsi |
76 | | const uint8* v_buf, // rdx |
77 | | uint8* rgb_buf, // rcx |
78 | | int width, // r8 |
79 | 0 | int source_dx) { // r9 |
80 | 0 | asm volatile( |
// x = 0; skip the paired loop entirely when width < 2.
81 | 0 | "xor %%r11,%%r11\n" |
82 | 0 | "sub $0x2,%[width]\n" |
83 | 0 | "js 1f\n" |
84 | 0 |
|
// Label 0: two-pixel body. r10 first holds the chroma index (x >> 17), then
// the second pixel's x; r11 ends the iteration holding x + 2 * source_dx.
85 | 0 | "0:" |
86 | 0 | "mov %%r11,%%r10\n" |
87 | 0 | "sar $0x11,%%r10\n" |
88 | 0 | "movzb (%[u_buf],%%r10,1),%%rax\n" |
89 | 0 | "movq 2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n" |
90 | 0 | "movzb (%[v_buf],%%r10,1),%%rax\n" |
91 | 0 | "movq 4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" |
92 | 0 | "lea (%%r11,%[source_dx]),%%r10\n" |
93 | 0 | "sar $0x10,%%r11\n" |
94 | 0 | "movzb (%[y_buf],%%r11,1),%%rax\n" |
95 | 0 | "paddsw %%xmm1,%%xmm0\n" |
96 | 0 | "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" |
97 | 0 | "lea (%%r10,%[source_dx]),%%r11\n" |
98 | 0 | "sar $0x10,%%r10\n" |
99 | 0 | "movzb (%[y_buf],%%r10,1),%%rax\n" |
100 | 0 | "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm2\n" |
101 | 0 | "paddsw %%xmm0,%%xmm1\n" |
102 | 0 | "paddsw %%xmm0,%%xmm2\n" |
103 | 0 | "shufps $0x44,%%xmm2,%%xmm1\n" |
104 | 0 | "psraw $0x6,%%xmm1\n" |
105 | 0 | "packuswb %%xmm1,%%xmm1\n" |
106 | 0 | "movq %%xmm1,0x0(%[rgb_buf])\n" |
107 | 0 | "add $0x8,%[rgb_buf]\n" |
108 | 0 | "sub $0x2,%[width]\n" |
109 | 0 | "jns 0b\n" |
110 | 0 |
|
// Label 1: width is -2 or -1 here; adding 1 is non-negative only when one
// pixel remains.
111 | 0 | "1:" |
112 | 0 | "add $0x1,%[width]\n" |
113 | 0 | "js 2f\n" |
114 | 0 |
|
// Single trailing pixel at the current x.
115 | 0 | "mov %%r11,%%r10\n" |
116 | 0 | "sar $0x11,%%r10\n" |
117 | 0 | "movzb (%[u_buf],%%r10,1),%%rax\n" |
118 | 0 | "movq 2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n" |
119 | 0 | "movzb (%[v_buf],%%r10,1),%%rax\n" |
120 | 0 | "movq 4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" |
121 | 0 | "paddsw %%xmm1,%%xmm0\n" |
122 | 0 | "sar $0x10,%%r11\n" |
123 | 0 | "movzb (%[y_buf],%%r11,1),%%rax\n" |
124 | 0 | "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" |
125 | 0 | "paddsw %%xmm0,%%xmm1\n" |
126 | 0 | "psraw $0x6,%%xmm1\n" |
127 | 0 | "packuswb %%xmm1,%%xmm1\n" |
128 | 0 | "movd %%xmm1,0x0(%[rgb_buf])\n" |
129 | 0 |
|
130 | 0 | "2:" |
// Outputs ("+r"): advanced / decremented by the asm.
131 | 0 | : [rgb_buf] "+r"(rgb_buf), |
132 | 0 | [width] "+r"(width) |
// Inputs: source_dx is widened to long so it can be used in 64-bit lea
// address arithmetic with r10/r11.
133 | 0 | : [y_buf] "r"(y_buf), |
134 | 0 | [u_buf] "r"(u_buf), |
135 | 0 | [v_buf] "r"(v_buf), |
136 | 0 | [kCoefficientsRgbY] "r" (kCoefficientsRgbY), |
137 | 0 | [source_dx] "r"(static_cast<long>(source_dx)) |
138 | 0 | : "cc", "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" |
139 | 0 | ); |
140 | 0 | } |
141 | | |
// x86-64 inline-assembly row converter with linearly interpolated horizontal
// scaling.
//
// r11 holds the running source position x (16 fractional bits for Y, 17 for
// U/V). x starts at 0, or at 0x8000 when source_dx >= 0x20000 and width >= 2.
//
// In the paired loop every sample is a blend of two adjacent source bytes:
//   Y:   (y[i] * (frac ^ 0xffff) + y[i + 1] * frac) >> 16,
//        with i = x >> 16 and frac = x & 0xffff
//   U/V: (c[j] * (frac ^ 0x1fffe) + c[j + 1] * frac) >> 17,
//        with j = x >> 17 and frac = x & 0x1fffe
// The blended value indexes kCoefficientsRgbY (Y at byte offset 0, U at 2048,
// V at 4096); entries are combined with paddsw, shifted right by 6 and packed
// with unsigned saturation. Two pixels (8 bytes) are stored per iteration and
// share one U/V lookup.
//
// The single trailing pixel after the loop is NOT interpolated: it reads one
// byte from each plane.
//
// NOTE(review): the interpolating loads read index + 1 in each plane, so the
// source rows presumably need one readable byte past the last sampled
// position - confirm against callers.
142 | | void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
143 | | const uint8* u_buf, |
144 | | const uint8* v_buf, |
145 | | uint8* rgb_buf, |
146 | | int width, |
147 | 0 | int source_dx) { |
148 | 0 | asm volatile( |
149 | 0 | "xor %%r11,%%r11\n" // x = 0 |
150 | 0 | "sub $0x2,%[width]\n" |
151 | 0 | "js 2f\n" |
152 | 0 | "cmp $0x20000,%[source_dx]\n" // if source_dx >= 2.0 |
153 | 0 | "jl 0f\n" |
154 | 0 | "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less |
// Label 0 is only the skip target for the x = 0.5 assignment above.
155 | 0 | "0:" |
156 | 0 |
|
// Label 1: two-pixel body. r10 = chroma index (x >> 17).
157 | 0 | "1:" |
158 | 0 | "mov %%r11,%%r10\n" |
159 | 0 | "sar $0x11,%%r10\n" |
160 | 0 |
|
// Blend two adjacent U bytes: r13 = current, r14 = next, rax = weight.
161 | 0 | "movzb (%[u_buf], %%r10, 1), %%r13 \n" |
162 | 0 | "movzb 1(%[u_buf], %%r10, 1), %%r14 \n" |
163 | 0 | "mov %%r11, %%rax \n" |
164 | 0 | "and $0x1fffe, %%rax \n" |
165 | 0 | "imul %%rax, %%r14 \n" |
166 | 0 | "xor $0x1fffe, %%rax \n" |
167 | 0 | "imul %%rax, %%r13 \n" |
168 | 0 | "add %%r14, %%r13 \n" |
169 | 0 | "shr $17, %%r13 \n" |
170 | 0 | "movq 2048(%[kCoefficientsRgbY],%%r13,8), %%xmm0\n" |
171 | 0 |
|
// Same blend for V.
172 | 0 | "movzb (%[v_buf], %%r10, 1), %%r13 \n" |
173 | 0 | "movzb 1(%[v_buf], %%r10, 1), %%r14 \n" |
174 | 0 | "mov %%r11, %%rax \n" |
175 | 0 | "and $0x1fffe, %%rax \n" |
176 | 0 | "imul %%rax, %%r14 \n" |
177 | 0 | "xor $0x1fffe, %%rax \n" |
178 | 0 | "imul %%rax, %%r13 \n" |
179 | 0 | "add %%r14, %%r13 \n" |
180 | 0 | "shr $17, %%r13 \n" |
181 | 0 | "movq 4096(%[kCoefficientsRgbY],%%r13,8), %%xmm1\n" |
182 | 0 |
|
// rax keeps the first pixel's x for its fraction; r10 becomes the second
// pixel's x; r11 becomes the first pixel's Y index.
183 | 0 | "mov %%r11, %%rax \n" |
184 | 0 | "lea (%%r11,%[source_dx]),%%r10\n" |
185 | 0 | "sar $0x10,%%r11\n" |
186 | 0 | "paddsw %%xmm1,%%xmm0\n" |
187 | 0 |
|
// First pixel: blend two adjacent Y bytes with the 16-bit fraction.
188 | 0 | "movzb (%[y_buf], %%r11, 1), %%r13 \n" |
189 | 0 | "movzb 1(%[y_buf], %%r11, 1), %%r14 \n" |
190 | 0 | "and $0xffff, %%rax \n" |
191 | 0 | "imul %%rax, %%r14 \n" |
192 | 0 | "xor $0xffff, %%rax \n" |
193 | 0 | "imul %%rax, %%r13 \n" |
194 | 0 | "add %%r14, %%r13 \n" |
195 | 0 | "shr $16, %%r13 \n" |
196 | 0 | "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n" |
197 | 0 |
|
// Second pixel: r11 is set to the x for the next iteration, r10 becomes the
// second pixel's Y index.
198 | 0 | "mov %%r10, %%rax \n" |
199 | 0 | "lea (%%r10,%[source_dx]),%%r11\n" |
200 | 0 | "sar $0x10,%%r10\n" |
201 | 0 |
|
202 | 0 | "movzb (%[y_buf],%%r10,1), %%r13 \n" |
203 | 0 | "movzb 1(%[y_buf],%%r10,1), %%r14 \n" |
204 | 0 | "and $0xffff, %%rax \n" |
205 | 0 | "imul %%rax, %%r14 \n" |
206 | 0 | "xor $0xffff, %%rax \n" |
207 | 0 | "imul %%rax, %%r13 \n" |
208 | 0 | "add %%r14, %%r13 \n" |
209 | 0 | "shr $16, %%r13 \n" |
210 | 0 | "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm2\n" |
211 | 0 |
|
// Add the shared chroma term to both pixels, then shift, pack and store
// 8 bytes.
212 | 0 | "paddsw %%xmm0,%%xmm1\n" |
213 | 0 | "paddsw %%xmm0,%%xmm2\n" |
214 | 0 | "shufps $0x44,%%xmm2,%%xmm1\n" |
215 | 0 | "psraw $0x6,%%xmm1\n" |
216 | 0 | "packuswb %%xmm1,%%xmm1\n" |
217 | 0 | "movq %%xmm1,0x0(%[rgb_buf])\n" |
218 | 0 | "add $0x8,%[rgb_buf]\n" |
219 | 0 | "sub $0x2,%[width]\n" |
220 | 0 | "jns 1b\n" |
221 | 0 |
|
// Label 2: width is -2 or -1 here; adding 1 is non-negative only when one
// pixel remains.
222 | 0 | "2:" |
223 | 0 | "add $0x1,%[width]\n" |
224 | 0 | "js 3f\n" |
225 | 0 |
|
// Trailing pixel: plain (non-interpolated) lookups at the current x.
226 | 0 | "mov %%r11,%%r10\n" |
227 | 0 | "sar $0x11,%%r10\n" |
228 | 0 |
|
229 | 0 | "movzb (%[u_buf],%%r10,1), %%r13 \n" |
230 | 0 | "movq 2048(%[kCoefficientsRgbY],%%r13,8),%%xmm0\n" |
231 | 0 |
|
232 | 0 | "movzb (%[v_buf],%%r10,1), %%r13 \n" |
233 | 0 | "movq 4096(%[kCoefficientsRgbY],%%r13,8),%%xmm1\n" |
234 | 0 |
|
235 | 0 | "paddsw %%xmm1,%%xmm0\n" |
236 | 0 | "sar $0x10,%%r11\n" |
237 | 0 |
|
238 | 0 | "movzb (%[y_buf],%%r11,1), %%r13 \n" |
239 | 0 | "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n" |
240 | 0 |
|
241 | 0 | "paddsw %%xmm0,%%xmm1\n" |
242 | 0 | "psraw $0x6,%%xmm1\n" |
243 | 0 | "packuswb %%xmm1,%%xmm1\n" |
244 | 0 | "movd %%xmm1,0x0(%[rgb_buf])\n" |
245 | 0 |
|
246 | 0 | "3:" |
// Outputs ("+r"): advanced / decremented by the asm.
247 | 0 | : [rgb_buf] "+r"(rgb_buf), |
248 | 0 | [width] "+r"(width) |
// Inputs: source_dx is widened to long for the 64-bit cmp/lea above.
249 | 0 | : [y_buf] "r"(y_buf), |
250 | 0 | [u_buf] "r"(u_buf), |
251 | 0 | [v_buf] "r"(v_buf), |
252 | 0 | [kCoefficientsRgbY] "r" (kCoefficientsRgbY), |
253 | 0 | [source_dx] "r"(static_cast<long>(source_dx)) |
254 | 0 | : "cc", "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" |
255 | 0 | ); |
256 | 0 | } |
257 | | |
258 | | #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) |
259 | | |
260 | | // The PIC version is slower because fewer registers are available, so |
261 | | // the non-PIC version is used on platforms where it is possible. |
// 32-bit x86, non-PIC: row converter written as a file-scope asm function.
//
// The routine saves all general registers with pusha, so the stack arguments
// are read starting at 0x24(%esp):
//   0x24 -> edx (y_buf), 0x28 -> edi (u_buf), 0x2c -> esi (v_buf),
//   0x30 -> ebp (rgb_buf), 0x34 -> ecx (width)
// kCoefficientsRgbY is addressed absolutely by symbol name (hence non-PIC),
// with Y entries at byte offset 0, U entries at 2048 and V entries at 4096.
//
// Two pixels are produced per iteration using MMX registers mm0-mm2 and
// stored as 8 bytes with the non-temporal movntq; one leftover pixel is
// stored with movd (4 bytes).
//
// NOTE(review): no emms is executed before returning, so MMX state is
// presumably cleared elsewhere - confirm with callers.
262 | | void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
263 | | const uint8* u_buf, |
264 | | const uint8* v_buf, |
265 | | uint8* rgb_buf, |
266 | | int width); |
267 | | asm( |
268 | | ".text\n" |
269 | | ".global FastConvertYUVToRGB32Row_SSE\n" |
270 | | ".type FastConvertYUVToRGB32Row_SSE, @function\n" |
271 | | "FastConvertYUVToRGB32Row_SSE:\n" |
272 | | "pusha\n" |
273 | | "mov 0x24(%esp),%edx\n" |
274 | | "mov 0x28(%esp),%edi\n" |
275 | | "mov 0x2c(%esp),%esi\n" |
276 | | "mov 0x30(%esp),%ebp\n" |
277 | | "mov 0x34(%esp),%ecx\n" |
// Enter at the loop test so width < 2 skips the paired body.
278 | | "jmp 1f\n" |
279 | |
// Label 0: two-pixel body. eax/ebx hold the U and V bytes, then the two
// Y bytes.
280 | | "0:" |
281 | | "movzbl (%edi),%eax\n" |
282 | | "add $0x1,%edi\n" |
283 | | "movzbl (%esi),%ebx\n" |
284 | | "add $0x1,%esi\n" |
285 | | "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
286 | | "movzbl (%edx),%eax\n" |
287 | | "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" |
288 | | "movzbl 0x1(%edx),%ebx\n" |
289 | | "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
290 | | "add $0x2,%edx\n" |
291 | | "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" |
292 | | "paddsw %mm0,%mm1\n" |
293 | | "paddsw %mm0,%mm2\n" |
294 | | "psraw $0x6,%mm1\n" |
295 | | "psraw $0x6,%mm2\n" |
296 | | "packuswb %mm2,%mm1\n" |
297 | | "movntq %mm1,0x0(%ebp)\n" |
298 | | "add $0x8,%ebp\n" |
// Label 1: loop test. Continue while width - 2 is non-negative.
299 | | "1:" |
300 | | "sub $0x2,%ecx\n" |
301 | | "jns 0b\n" |
302 | |
// ecx is -2 or -1 here; its low bit is set only when one pixel remains.
303 | | "and $0x1,%ecx\n" |
304 | | "je 2f\n" |
305 | |
// Single trailing pixel.
306 | | "movzbl (%edi),%eax\n" |
307 | | "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
308 | | "movzbl (%esi),%eax\n" |
309 | | "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
310 | | "movzbl (%edx),%eax\n" |
311 | | "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
312 | | "paddsw %mm0,%mm1\n" |
313 | | "psraw $0x6,%mm1\n" |
314 | | "packuswb %mm1,%mm1\n" |
315 | | "movd %mm1,0x0(%ebp)\n" |
316 | | "2:" |
317 | | "popa\n" |
318 | | "ret\n" |
319 | | #if !defined(XP_MACOSX) |
320 | | ".previous\n" |
321 | | #endif |
322 | | ); |
323 | | |
// 32-bit non-PIC entry point: uses the assembly routine
// FastConvertYUVToRGB32Row_SSE when mozilla::supports_sse() reports support,
// and otherwise falls back to FastConvertYUVToRGB32Row_C.
324 | | void FastConvertYUVToRGB32Row(const uint8* y_buf, |
325 | | const uint8* u_buf, |
326 | | const uint8* v_buf, |
327 | | uint8* rgb_buf, |
328 | | int width) |
329 | | { |
330 | | if (mozilla::supports_sse()) { |
331 | | FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); |
332 | | return; |
333 | | } |
334 | |
// NOTE(review): the meaning of the trailing argument 1 is defined by
// FastConvertYUVToRGB32Row_C, which is not visible here.
335 | | FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); |
336 | | } |
337 | | |
338 | | |
// 32-bit x86, non-PIC: point-sampled scaling row converter written as a
// file-scope asm function.
//
// After pusha the stack arguments are read starting at 0x24(%esp):
//   0x24 -> edx (y_buf), 0x28 -> edi (u_buf), 0x2c -> esi (v_buf),
//   0x30 -> ebp (rgb_buf), 0x34 -> ecx (width); source_dx stays on the stack
//   at 0x38(%esp) and is added to x from there.
// ebx holds the source position x, starting at 0. Y is sampled at x >> 16 and
// U/V at x >> 17. kCoefficientsRgbY is addressed absolutely by symbol name
// (Y entries at byte offset 0, U at 2048, V at 4096).
//
// Two pixels per iteration are computed in MMX registers and stored with
// movntq; one leftover pixel is stored with movd.
//
// NOTE(review): no emms is executed before returning - confirm callers
// handle MMX state.
339 | | void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
340 | | const uint8* u_buf, |
341 | | const uint8* v_buf, |
342 | | uint8* rgb_buf, |
343 | | int width, |
344 | | int source_dx); |
345 | | asm( |
346 | | ".text\n" |
347 | | ".global ScaleYUVToRGB32Row_SSE\n" |
348 | | ".type ScaleYUVToRGB32Row_SSE, @function\n" |
349 | | "ScaleYUVToRGB32Row_SSE:\n" |
350 | | "pusha\n" |
351 | | "mov 0x24(%esp),%edx\n" |
352 | | "mov 0x28(%esp),%edi\n" |
353 | | "mov 0x2c(%esp),%esi\n" |
354 | | "mov 0x30(%esp),%ebp\n" |
355 | | "mov 0x34(%esp),%ecx\n" |
// x = 0, then enter at the loop test.
356 | | "xor %ebx,%ebx\n" |
357 | | "jmp 1f\n" |
358 | |
// Label 0: two-pixel body. Both pixels share the U/V lookup made at the
// first pixel's x.
359 | | "0:" |
360 | | "mov %ebx,%eax\n" |
361 | | "sar $0x11,%eax\n" |
362 | | "movzbl (%edi,%eax,1),%eax\n" |
363 | | "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
364 | | "mov %ebx,%eax\n" |
365 | | "sar $0x11,%eax\n" |
366 | | "movzbl (%esi,%eax,1),%eax\n" |
367 | | "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
// First Y sample; x advances by source_dx right after it is copied to eax.
368 | | "mov %ebx,%eax\n" |
369 | | "add 0x38(%esp),%ebx\n" |
370 | | "sar $0x10,%eax\n" |
371 | | "movzbl (%edx,%eax,1),%eax\n" |
372 | | "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
// Second Y sample.
373 | | "mov %ebx,%eax\n" |
374 | | "add 0x38(%esp),%ebx\n" |
375 | | "sar $0x10,%eax\n" |
376 | | "movzbl (%edx,%eax,1),%eax\n" |
377 | | "movq kCoefficientsRgbY(,%eax,8),%mm2\n" |
378 | | "paddsw %mm0,%mm1\n" |
379 | | "paddsw %mm0,%mm2\n" |
380 | | "psraw $0x6,%mm1\n" |
381 | | "psraw $0x6,%mm2\n" |
382 | | "packuswb %mm2,%mm1\n" |
383 | | "movntq %mm1,0x0(%ebp)\n" |
384 | | "add $0x8,%ebp\n" |
// Label 1: loop test. Continue while width - 2 is non-negative.
385 | | "1:" |
386 | | "sub $0x2,%ecx\n" |
387 | | "jns 0b\n" |
388 | |
// ecx is -2 or -1 here; its low bit is set only when one pixel remains.
389 | | "and $0x1,%ecx\n" |
390 | | "je 2f\n" |
391 | |
// Single trailing pixel at the current x.
392 | | "mov %ebx,%eax\n" |
393 | | "sar $0x11,%eax\n" |
394 | | "movzbl (%edi,%eax,1),%eax\n" |
395 | | "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
396 | | "mov %ebx,%eax\n" |
397 | | "sar $0x11,%eax\n" |
398 | | "movzbl (%esi,%eax,1),%eax\n" |
399 | | "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
400 | | "mov %ebx,%eax\n" |
401 | | "sar $0x10,%eax\n" |
402 | | "movzbl (%edx,%eax,1),%eax\n" |
403 | | "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
404 | | "paddsw %mm0,%mm1\n" |
405 | | "psraw $0x6,%mm1\n" |
406 | | "packuswb %mm1,%mm1\n" |
407 | | "movd %mm1,0x0(%ebp)\n" |
408 | |
409 | | "2:" |
410 | | "popa\n" |
411 | | "ret\n" |
412 | | #if !defined(XP_MACOSX) |
413 | | ".previous\n" |
414 | | #endif |
415 | | ); |
416 | | |
// 32-bit non-PIC entry point: uses the assembly routine
// ScaleYUVToRGB32Row_SSE when mozilla::supports_sse() reports support, and
// otherwise falls back to ScaleYUVToRGB32Row_C with the same arguments.
417 | | void ScaleYUVToRGB32Row(const uint8* y_buf, |
418 | | const uint8* u_buf, |
419 | | const uint8* v_buf, |
420 | | uint8* rgb_buf, |
421 | | int width, |
422 | | int source_dx) |
423 | | { |
424 | | if (mozilla::supports_sse()) { |
425 | | ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, |
426 | | width, source_dx); |
427 | | return; |
428 | | } |
429 | |
430 | | ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, |
431 | | width, source_dx); |
432 | | } |
433 | | |
// 32-bit x86, non-PIC: linearly interpolated scaling row converter written as
// a file-scope asm function.
//
// After pusha: 0x24 -> edx (y_buf), 0x28 -> edi (u_buf), 0x30 -> ebp
// (rgb_buf). v_buf is not kept in a register: esi is used as scratch, so
// v_buf is reloaded from 0x2c(%esp) in every iteration. source_dx is read
// from 0x38(%esp).
//
// The width argument slot at 0x34(%esp) is overwritten with
// width * source_dx, and the loop runs while the source position x (ebx) is
// below that value. x starts at 0, or at 0x8000 when source_dx >= 0x20000.
//
// Every sample is a blend of two adjacent bytes: Y uses the low 16 bits of x
// as the weight and shifts the result right by 16; U/V use x & 0x1fffe and
// shift right by 17. The blended value indexes kCoefficientsRgbY (Y at byte
// offset 0, U at 2048, V at 4096), addressed absolutely by symbol name.
//
// NOTE(review): no emms is executed before returning - confirm callers
// handle MMX state.
434 | | void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
435 | | const uint8* u_buf, |
436 | | const uint8* v_buf, |
437 | | uint8* rgb_buf, |
438 | | int width, |
439 | | int source_dx); |
440 | | asm( |
441 | | ".text\n" |
442 | | ".global LinearScaleYUVToRGB32Row_SSE\n" |
443 | | ".type LinearScaleYUVToRGB32Row_SSE, @function\n" |
444 | | "LinearScaleYUVToRGB32Row_SSE:\n" |
445 | | "pusha\n" |
446 | | "mov 0x24(%esp),%edx\n" |
447 | | "mov 0x28(%esp),%edi\n" |
448 | | "mov 0x30(%esp),%ebp\n" |
449 | |
// NOTE(review): the three instructions below store width * source_dx only;
// ebx is not added despite what the next comment says.
450 | | // source_width = width * source_dx + ebx |
451 | | "mov 0x34(%esp), %ecx\n" |
452 | | "imull 0x38(%esp), %ecx\n" |
453 | | "mov %ecx, 0x34(%esp)\n" |
454 | |
455 | | "mov 0x38(%esp), %ecx\n" |
456 | | "xor %ebx,%ebx\n" // x = 0 |
457 | | "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 |
458 | | "jl 1f\n" |
459 | | "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less |
460 | | "jmp 1f\n" |
461 | |
// Label 0: loop body. Blend two adjacent U bytes (ecx = current, esi = next,
// eax = weight).
462 | | "0:" |
463 | | "mov %ebx,%eax\n" |
464 | | "sar $0x11,%eax\n" |
465 | |
466 | | "movzbl (%edi,%eax,1),%ecx\n" |
467 | | "movzbl 1(%edi,%eax,1),%esi\n" |
468 | | "mov %ebx,%eax\n" |
469 | | "andl $0x1fffe, %eax \n" |
470 | | "imul %eax, %esi \n" |
471 | | "xorl $0x1fffe, %eax \n" |
472 | | "imul %eax, %ecx \n" |
473 | | "addl %esi, %ecx \n" |
474 | | "shrl $17, %ecx \n" |
475 | | "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n" |
476 | |
// Reload v_buf (esi was clobbered above) and blend two adjacent V bytes.
477 | | "mov 0x2c(%esp),%esi\n" |
478 | | "mov %ebx,%eax\n" |
479 | | "sar $0x11,%eax\n" |
480 | |
481 | | "movzbl (%esi,%eax,1),%ecx\n" |
482 | | "movzbl 1(%esi,%eax,1),%esi\n" |
483 | | "mov %ebx,%eax\n" |
484 | | "andl $0x1fffe, %eax \n" |
485 | | "imul %eax, %esi \n" |
486 | | "xorl $0x1fffe, %eax \n" |
487 | | "imul %eax, %ecx \n" |
488 | | "addl %esi, %ecx \n" |
489 | | "shrl $17, %ecx \n" |
490 | | "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n" |
491 | |
// First pixel: blend two adjacent Y bytes, then advance x by source_dx.
492 | | "mov %ebx,%eax\n" |
493 | | "sar $0x10,%eax\n" |
494 | | "movzbl (%edx,%eax,1),%ecx\n" |
495 | | "movzbl 1(%edx,%eax,1),%esi\n" |
496 | | "mov %ebx,%eax\n" |
497 | | "add 0x38(%esp),%ebx\n" |
498 | | "andl $0xffff, %eax \n" |
499 | | "imul %eax, %esi \n" |
500 | | "xorl $0xffff, %eax \n" |
501 | | "imul %eax, %ecx \n" |
502 | | "addl %esi, %ecx \n" |
503 | | "shrl $16, %ecx \n" |
504 | | "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" |
505 | |
// If x has reached the end, only the first pixel of this pair is valid:
// finish it alone at label 2.
506 | | "cmp 0x34(%esp), %ebx\n" |
507 | | "jge 2f\n" |
508 | |
// Second pixel: same Y blend at the advanced x.
509 | | "mov %ebx,%eax\n" |
510 | | "sar $0x10,%eax\n" |
511 | | "movzbl (%edx,%eax,1),%ecx\n" |
512 | | "movzbl 1(%edx,%eax,1),%esi\n" |
513 | | "mov %ebx,%eax\n" |
514 | | "add 0x38(%esp),%ebx\n" |
515 | | "andl $0xffff, %eax \n" |
516 | | "imul %eax, %esi \n" |
517 | | "xorl $0xffff, %eax \n" |
518 | | "imul %eax, %ecx \n" |
519 | | "addl %esi, %ecx \n" |
520 | | "shrl $16, %ecx \n" |
521 | | "movq kCoefficientsRgbY(,%ecx,8),%mm2\n" |
522 | |
523 | | "paddsw %mm0,%mm1\n" |
524 | | "paddsw %mm0,%mm2\n" |
525 | | "psraw $0x6,%mm1\n" |
526 | | "psraw $0x6,%mm2\n" |
527 | | "packuswb %mm2,%mm1\n" |
528 | | "movntq %mm1,0x0(%ebp)\n" |
529 | | "add $0x8,%ebp\n" |
530 | |
// Label 1: loop test. Continue while x < width * source_dx.
531 | | "1:" |
532 | | "cmp 0x34(%esp), %ebx\n" |
533 | | "jl 0b\n" |
534 | | "popa\n" |
535 | | "ret\n" |
536 | |
// Label 2: emit the single remaining pixel (4 bytes) and return.
537 | | "2:" |
538 | | "paddsw %mm0, %mm1\n" |
539 | | "psraw $6, %mm1\n" |
540 | | "packuswb %mm1, %mm1\n" |
541 | | "movd %mm1, (%ebp)\n" |
542 | | "popa\n" |
543 | | "ret\n" |
544 | | #if !defined(XP_MACOSX) |
545 | | ".previous\n" |
546 | | #endif |
547 | | ); |
548 | | |
// 32-bit non-PIC entry point: uses the assembly routine
// LinearScaleYUVToRGB32Row_SSE when mozilla::supports_sse() reports support,
// and otherwise falls back to LinearScaleYUVToRGB32Row_C with the same
// arguments.
549 | | void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
550 | | const uint8* u_buf, |
551 | | const uint8* v_buf, |
552 | | uint8* rgb_buf, |
553 | | int width, |
554 | | int source_dx) |
555 | | { |
556 | | if (mozilla::supports_sse()) { |
557 | | LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, |
558 | | width, source_dx); |
559 | | return; |
560 | | } |
561 | |
562 | | LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, |
563 | | width, source_dx); |
564 | | } |
565 | | |
566 | | #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__) |
567 | | |
// 32-bit x86, PIC build: row converter written as a file-scope asm function.
//
// Unlike the non-PIC variant, the coefficient table is not referenced by
// symbol; its address arrives as the sixth argument and is kept in ecx.
// After pusha the stack arguments are read starting at 0x24(%esp):
//   0x24 -> edx (y_buf), 0x28 -> edi (u_buf), 0x2c -> esi (v_buf),
//   0x30 -> ebp (rgb_buf), 0x38 -> ecx (table pointer)
// Because ecx is taken by the table pointer, the width counter is decremented
// in place in its stack slot at 0x34(%esp).
//
// Table layout used: Y entries at byte offset 0, U at 2048, V at 4096. Two
// pixels per iteration are stored with movntq; one leftover pixel is stored
// with movd.
//
// NOTE(review): no emms is executed before returning - confirm callers
// handle MMX state.
568 | | void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
569 | | const uint8* u_buf, |
570 | | const uint8* v_buf, |
571 | | uint8* rgb_buf, |
572 | | int width, |
573 | | const int16 *kCoefficientsRgbY); |
574 | |
575 | | asm( |
576 | | ".text\n" |
// The label carries a leading underscore when XP_MACOSX is defined.
577 | | #if defined(XP_MACOSX) |
578 | | "_PICConvertYUVToRGB32Row_SSE:\n" |
579 | | #else |
580 | | "PICConvertYUVToRGB32Row_SSE:\n" |
581 | | #endif |
582 | | "pusha\n" |
583 | | "mov 0x24(%esp),%edx\n" |
584 | | "mov 0x28(%esp),%edi\n" |
585 | | "mov 0x2c(%esp),%esi\n" |
586 | | "mov 0x30(%esp),%ebp\n" |
587 | | "mov 0x38(%esp),%ecx\n" |
588 | |
// Enter at the loop test so width < 2 skips the paired body.
589 | | "jmp 1f\n" |
590 | |
// Label 0: two-pixel body.
591 | | "0:" |
592 | | "movzbl (%edi),%eax\n" |
593 | | "add $0x1,%edi\n" |
594 | | "movzbl (%esi),%ebx\n" |
595 | | "add $0x1,%esi\n" |
596 | | "movq 2048(%ecx,%eax,8),%mm0\n" |
597 | | "movzbl (%edx),%eax\n" |
598 | | "paddsw 4096(%ecx,%ebx,8),%mm0\n" |
599 | | "movzbl 0x1(%edx),%ebx\n" |
600 | | "movq 0(%ecx,%eax,8),%mm1\n" |
601 | | "add $0x2,%edx\n" |
602 | | "movq 0(%ecx,%ebx,8),%mm2\n" |
603 | | "paddsw %mm0,%mm1\n" |
604 | | "paddsw %mm0,%mm2\n" |
605 | | "psraw $0x6,%mm1\n" |
606 | | "psraw $0x6,%mm2\n" |
607 | | "packuswb %mm2,%mm1\n" |
608 | | "movntq %mm1,0x0(%ebp)\n" |
609 | | "add $0x8,%ebp\n" |
// Label 1: loop test on the in-memory width counter.
610 | | "1:" |
611 | | "subl $0x2,0x34(%esp)\n" |
612 | | "jns 0b\n" |
613 | |
// The counter is -2 or -1 here; its low bit is set only when one pixel
// remains.
614 | | "andl $0x1,0x34(%esp)\n" |
615 | | "je 2f\n" |
616 | |
// Single trailing pixel.
617 | | "movzbl (%edi),%eax\n" |
618 | | "movq 2048(%ecx,%eax,8),%mm0\n" |
619 | | "movzbl (%esi),%eax\n" |
620 | | "paddsw 4096(%ecx,%eax,8),%mm0\n" |
621 | | "movzbl (%edx),%eax\n" |
622 | | "movq 0(%ecx,%eax,8),%mm1\n" |
623 | | "paddsw %mm0,%mm1\n" |
624 | | "psraw $0x6,%mm1\n" |
625 | | "packuswb %mm1,%mm1\n" |
626 | | "movd %mm1,0x0(%ebp)\n" |
627 | | "2:" |
628 | | "popa\n" |
629 | | "ret\n" |
630 | | #if !defined(XP_MACOSX) |
631 | | ".previous\n" |
632 | | #endif |
633 | | ); |
634 | | |
// 32-bit PIC entry point: when mozilla::supports_sse() reports support, calls
// PICConvertYUVToRGB32Row_SSE and passes the address of the first element of
// kCoefficientsRgbY explicitly; otherwise falls back to
// FastConvertYUVToRGB32Row_C.
635 | | void FastConvertYUVToRGB32Row(const uint8* y_buf, |
636 | | const uint8* u_buf, |
637 | | const uint8* v_buf, |
638 | | uint8* rgb_buf, |
639 | | int width) |
640 | | { |
641 | | if (mozilla::supports_sse()) { |
642 | | PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, |
643 | | &kCoefficientsRgbY[0][0]); |
644 | | return; |
645 | | } |
646 | |
// NOTE(review): the meaning of the trailing argument 1 is defined by
// FastConvertYUVToRGB32Row_C, which is not visible here.
647 | | FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); |
648 | | } |
649 | | |
// 32-bit x86, PIC build: point-sampled scaling row converter written as a
// file-scope asm function.
//
// The coefficient table address arrives as the seventh argument and is kept
// in ecx. After pusha the stack arguments are read starting at 0x24(%esp):
//   0x24 -> edx (y_buf), 0x28 -> edi (u_buf), 0x2c -> esi (v_buf),
//   0x30 -> ebp (rgb_buf), 0x3c -> ecx (table pointer)
// The width counter is decremented in place at 0x34(%esp) and source_dx is
// added to x from 0x38(%esp).
//
// ebx holds the source position x, starting at 0. Y is sampled at x >> 16
// and U/V at x >> 17. Table layout used: Y entries at byte offset 0, U at
// 2048, V at 4096.
//
// NOTE(review): no emms is executed before returning - confirm callers
// handle MMX state.
650 | | void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
651 | | const uint8* u_buf, |
652 | | const uint8* v_buf, |
653 | | uint8* rgb_buf, |
654 | | int width, |
655 | | int source_dx, |
656 | | const int16 *kCoefficientsRgbY); |
657 | |
658 | | asm( |
659 | | ".text\n" |
// The label carries a leading underscore when XP_MACOSX is defined.
660 | | #if defined(XP_MACOSX) |
661 | | "_PICScaleYUVToRGB32Row_SSE:\n" |
662 | | #else |
663 | | "PICScaleYUVToRGB32Row_SSE:\n" |
664 | | #endif |
665 | | "pusha\n" |
666 | | "mov 0x24(%esp),%edx\n" |
667 | | "mov 0x28(%esp),%edi\n" |
668 | | "mov 0x2c(%esp),%esi\n" |
669 | | "mov 0x30(%esp),%ebp\n" |
670 | | "mov 0x3c(%esp),%ecx\n" |
// x = 0, then enter at the loop test.
671 | | "xor %ebx,%ebx\n" |
672 | | "jmp 1f\n" |
673 | |
// Label 0: two-pixel body. Both pixels share the U/V lookup made at the
// first pixel's x.
674 | | "0:" |
675 | | "mov %ebx,%eax\n" |
676 | | "sar $0x11,%eax\n" |
677 | | "movzbl (%edi,%eax,1),%eax\n" |
678 | | "movq 2048(%ecx,%eax,8),%mm0\n" |
679 | | "mov %ebx,%eax\n" |
680 | | "sar $0x11,%eax\n" |
681 | | "movzbl (%esi,%eax,1),%eax\n" |
682 | | "paddsw 4096(%ecx,%eax,8),%mm0\n" |
// First Y sample; x advances by source_dx right after it is copied to eax.
683 | | "mov %ebx,%eax\n" |
684 | | "add 0x38(%esp),%ebx\n" |
685 | | "sar $0x10,%eax\n" |
686 | | "movzbl (%edx,%eax,1),%eax\n" |
687 | | "movq 0(%ecx,%eax,8),%mm1\n" |
// Second Y sample.
688 | | "mov %ebx,%eax\n" |
689 | | "add 0x38(%esp),%ebx\n" |
690 | | "sar $0x10,%eax\n" |
691 | | "movzbl (%edx,%eax,1),%eax\n" |
692 | | "movq 0(%ecx,%eax,8),%mm2\n" |
693 | | "paddsw %mm0,%mm1\n" |
694 | | "paddsw %mm0,%mm2\n" |
695 | | "psraw $0x6,%mm1\n" |
696 | | "psraw $0x6,%mm2\n" |
697 | | "packuswb %mm2,%mm1\n" |
698 | | "movntq %mm1,0x0(%ebp)\n" |
699 | | "add $0x8,%ebp\n" |
// Label 1: loop test on the in-memory width counter.
700 | | "1:" |
701 | | "subl $0x2,0x34(%esp)\n" |
702 | | "jns 0b\n" |
703 | |
// The counter is -2 or -1 here; its low bit is set only when one pixel
// remains.
704 | | "andl $0x1,0x34(%esp)\n" |
705 | | "je 2f\n" |
706 | |
// Single trailing pixel at the current x.
707 | | "mov %ebx,%eax\n" |
708 | | "sar $0x11,%eax\n" |
709 | | "movzbl (%edi,%eax,1),%eax\n" |
710 | | "movq 2048(%ecx,%eax,8),%mm0\n" |
711 | | "mov %ebx,%eax\n" |
712 | | "sar $0x11,%eax\n" |
713 | | "movzbl (%esi,%eax,1),%eax\n" |
714 | | "paddsw 4096(%ecx,%eax,8),%mm0\n" |
715 | | "mov %ebx,%eax\n" |
716 | | "sar $0x10,%eax\n" |
717 | | "movzbl (%edx,%eax,1),%eax\n" |
718 | | "movq 0(%ecx,%eax,8),%mm1\n" |
719 | | "paddsw %mm0,%mm1\n" |
720 | | "psraw $0x6,%mm1\n" |
721 | | "packuswb %mm1,%mm1\n" |
722 | | "movd %mm1,0x0(%ebp)\n" |
723 | |
724 | | "2:" |
725 | | "popa\n" |
726 | | "ret\n" |
727 | | #if !defined(XP_MACOSX) |
728 | | ".previous\n" |
729 | | #endif |
730 | | ); |
731 | | |
// 32-bit PIC entry point: when mozilla::supports_sse() reports support, calls
// PICScaleYUVToRGB32Row_SSE and passes the address of the first element of
// kCoefficientsRgbY explicitly; otherwise falls back to ScaleYUVToRGB32Row_C.
732 | | void ScaleYUVToRGB32Row(const uint8* y_buf, |
733 | | const uint8* u_buf, |
734 | | const uint8* v_buf, |
735 | | uint8* rgb_buf, |
736 | | int width, |
737 | | int source_dx) |
738 | | { |
739 | | if (mozilla::supports_sse()) { |
740 | | PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, |
741 | | &kCoefficientsRgbY[0][0]); |
742 | | return; |
743 | | } |
744 | |
745 | | ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
746 | | } |
747 | | |
// 32-bit x86, PIC build: linearly interpolated scaling row converter written
// as a file-scope asm function.
//
// The coefficient table address arrives as the seventh argument (0x3c(%esp)
// after pusha) and is kept in edi. edx = y_buf (0x24) and ebp = rgb_buf
// (0x30). u_buf and v_buf are reloaded into esi from 0x28(%esp) and
// 0x2c(%esp) in every iteration, because esi doubles as scratch. source_dx is
// read from 0x38(%esp).
//
// The width argument slot at 0x34(%esp) is overwritten with
// width * source_dx, and the loop runs while the source position x (ebx) is
// below that value. x starts at 0, or at 0x8000 when source_dx >= 0x20000.
//
// Every sample is a blend of two adjacent bytes: Y uses x & 0xffff as the
// weight and shifts right by 16; U/V use x & 0x1fffe and shift right by 17.
// Table layout used: Y entries at byte offset 0, U at 2048, V at 4096.
//
// NOTE(review): no emms is executed before returning - confirm callers
// handle MMX state.
748 | | void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
749 | | const uint8* u_buf, |
750 | | const uint8* v_buf, |
751 | | uint8* rgb_buf, |
752 | | int width, |
753 | | int source_dx, |
754 | | const int16 *kCoefficientsRgbY); |
755 | |
756 | | asm( |
757 | | ".text\n" |
// The label carries a leading underscore when XP_MACOSX is defined.
758 | | #if defined(XP_MACOSX) |
759 | | "_PICLinearScaleYUVToRGB32Row_SSE:\n" |
760 | | #else |
761 | | "PICLinearScaleYUVToRGB32Row_SSE:\n" |
762 | | #endif |
763 | | "pusha\n" |
764 | | "mov 0x24(%esp),%edx\n" |
765 | | "mov 0x30(%esp),%ebp\n" |
// NOTE(review): this load of ecx and the xor of ebx two lines below are both
// repeated a few instructions later, so they look redundant.
766 | | "mov 0x34(%esp),%ecx\n" |
767 | | "mov 0x3c(%esp),%edi\n" |
768 | | "xor %ebx,%ebx\n" |
769 | |
// NOTE(review): the three instructions below store width * source_dx only;
// ebx is not added despite what the next comment says.
770 | | // source_width = width * source_dx + ebx |
771 | | "mov 0x34(%esp), %ecx\n" |
772 | | "imull 0x38(%esp), %ecx\n" |
773 | | "mov %ecx, 0x34(%esp)\n" |
774 | |
775 | | "mov 0x38(%esp), %ecx\n" |
776 | | "xor %ebx,%ebx\n" // x = 0 |
777 | | "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 |
778 | | "jl 1f\n" |
779 | | "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less |
780 | | "jmp 1f\n" |
781 | |
// Label 0: loop body. Reload u_buf and blend two adjacent U bytes
// (ecx = current, esi = next, eax = weight).
782 | | "0:" |
783 | | "mov 0x28(%esp),%esi\n" |
784 | | "mov %ebx,%eax\n" |
785 | | "sar $0x11,%eax\n" |
786 | |
787 | | "movzbl (%esi,%eax,1),%ecx\n" |
788 | | "movzbl 1(%esi,%eax,1),%esi\n" |
789 | | "mov %ebx,%eax\n" |
790 | | "andl $0x1fffe, %eax \n" |
791 | | "imul %eax, %esi \n" |
792 | | "xorl $0x1fffe, %eax \n" |
793 | | "imul %eax, %ecx \n" |
794 | | "addl %esi, %ecx \n" |
795 | | "shrl $17, %ecx \n" |
796 | | "movq 2048(%edi,%ecx,8),%mm0\n" |
797 | |
// Reload v_buf and blend two adjacent V bytes.
798 | | "mov 0x2c(%esp),%esi\n" |
799 | | "mov %ebx,%eax\n" |
800 | | "sar $0x11,%eax\n" |
801 | |
802 | | "movzbl (%esi,%eax,1),%ecx\n" |
803 | | "movzbl 1(%esi,%eax,1),%esi\n" |
804 | | "mov %ebx,%eax\n" |
805 | | "andl $0x1fffe, %eax \n" |
806 | | "imul %eax, %esi \n" |
807 | | "xorl $0x1fffe, %eax \n" |
808 | | "imul %eax, %ecx \n" |
809 | | "addl %esi, %ecx \n" |
810 | | "shrl $17, %ecx \n" |
811 | | "paddsw 4096(%edi,%ecx,8),%mm0\n" |
812 | |
// First pixel: blend two adjacent Y bytes, then advance x by source_dx.
813 | | "mov %ebx,%eax\n" |
814 | | "sar $0x10,%eax\n" |
815 | | "movzbl (%edx,%eax,1),%ecx\n" |
816 | | "movzbl 1(%edx,%eax,1),%esi\n" |
817 | | "mov %ebx,%eax\n" |
818 | | "add 0x38(%esp),%ebx\n" |
819 | | "andl $0xffff, %eax \n" |
820 | | "imul %eax, %esi \n" |
821 | | "xorl $0xffff, %eax \n" |
822 | | "imul %eax, %ecx \n" |
823 | | "addl %esi, %ecx \n" |
824 | | "shrl $16, %ecx \n" |
825 | | "movq (%edi,%ecx,8),%mm1\n" |
826 | |
// If x has reached the end, only the first pixel of this pair is valid:
// finish it alone at label 2.
827 | | "cmp 0x34(%esp), %ebx\n" |
828 | | "jge 2f\n" |
829 | |
// Second pixel: same Y blend at the advanced x.
830 | | "mov %ebx,%eax\n" |
831 | | "sar $0x10,%eax\n" |
832 | | "movzbl (%edx,%eax,1),%ecx\n" |
833 | | "movzbl 1(%edx,%eax,1),%esi\n" |
834 | | "mov %ebx,%eax\n" |
835 | | "add 0x38(%esp),%ebx\n" |
836 | | "andl $0xffff, %eax \n" |
837 | | "imul %eax, %esi \n" |
838 | | "xorl $0xffff, %eax \n" |
839 | | "imul %eax, %ecx \n" |
840 | | "addl %esi, %ecx \n" |
841 | | "shrl $16, %ecx \n" |
842 | | "movq (%edi,%ecx,8),%mm2\n" |
843 | |
844 | | "paddsw %mm0,%mm1\n" |
845 | | "paddsw %mm0,%mm2\n" |
846 | | "psraw $0x6,%mm1\n" |
847 | | "psraw $0x6,%mm2\n" |
848 | | "packuswb %mm2,%mm1\n" |
849 | | "movntq %mm1,0x0(%ebp)\n" |
850 | | "add $0x8,%ebp\n" |
851 | |
// Label 1: loop test. Continue while x < width * source_dx.
852 | | "1:" |
853 | | "cmp %ebx, 0x34(%esp)\n" |
854 | | "jg 0b\n" |
855 | | "popa\n" |
856 | | "ret\n" |
857 | |
// Label 2: emit the single remaining pixel (4 bytes) and return.
858 | | "2:" |
859 | | "paddsw %mm0, %mm1\n" |
860 | | "psraw $6, %mm1\n" |
861 | | "packuswb %mm1, %mm1\n" |
862 | | "movd %mm1, (%ebp)\n" |
863 | | "popa\n" |
864 | | "ret\n" |
865 | | #if !defined(XP_MACOSX) |
866 | | ".previous\n" |
867 | | #endif |
868 | | ); |
869 | | |
870 | | |
// 32-bit PIC entry point: when mozilla::supports_sse() reports support, calls
// PICLinearScaleYUVToRGB32Row_SSE and passes the address of the first element
// of kCoefficientsRgbY explicitly; otherwise falls back to
// LinearScaleYUVToRGB32Row_C.
871 | | void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
872 | | const uint8* u_buf, |
873 | | const uint8* v_buf, |
874 | | uint8* rgb_buf, |
875 | | int width, |
876 | | int source_dx) |
877 | | { |
878 | | if (mozilla::supports_sse()) { |
879 | | PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, |
880 | | source_dx, &kCoefficientsRgbY[0][0]); |
881 | | return; |
882 | | } |
883 | |
884 | | LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
885 | | } |
886 | | #else |
// Portable fallback used when none of the assembly variants above is
// selected by the preprocessor: forwards to FastConvertYUVToRGB32Row_C.
// NOTE(review): the meaning of the trailing argument 1 is defined by
// FastConvertYUVToRGB32Row_C, which is not visible here.
887 | | void FastConvertYUVToRGB32Row(const uint8* y_buf, |
888 | | const uint8* u_buf, |
889 | | const uint8* v_buf, |
890 | | uint8* rgb_buf, |
891 | | int width) { |
892 | | FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); |
893 | | } |
894 | | |
// Portable fallback used when none of the assembly variants above is
// selected by the preprocessor: forwards all arguments unchanged to
// ScaleYUVToRGB32Row_C.
895 | | void ScaleYUVToRGB32Row(const uint8* y_buf, |
896 | | const uint8* u_buf, |
897 | | const uint8* v_buf, |
898 | | uint8* rgb_buf, |
899 | | int width, |
900 | | int source_dx) { |
901 | | ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
902 | | } |
903 | | |
// Portable fallback used when none of the assembly variants above is
// selected by the preprocessor: forwards all arguments unchanged to
// LinearScaleYUVToRGB32Row_C.
904 | | void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
905 | | const uint8* u_buf, |
906 | | const uint8* v_buf, |
907 | | uint8* rgb_buf, |
908 | | int width, |
909 | | int source_dx) { |
910 | | LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
911 | | } |
912 | | #endif |
913 | | |
914 | | } |