Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/gfx/ycbcr/yuv_row_posix.cpp
Line
Count
Source (jump to first uncovered line)
1
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style license that can be
3
// found in the LICENSE file.
4
5
#include "yuv_row.h"
6
#include "mozilla/SSE.h"
7
8
#define DCHECK(a)
9
10
extern "C" {
11
12
#if defined(ARCH_CPU_X86_64)
13
14
// We don't need CPUID guards here, since x86-64 implies SSE2.
15
16
// AMD64 ABI uses register parameters.
17
// Converts one row of Y/U/V bytes to 4-byte output pixels through the
// kCoefficientsRgbY lookup table (x86-64 inline assembly).
// Each iteration of the main loop consumes two Y bytes plus one U and one V
// byte and stores 8 bytes to rgb_buf; an odd trailing pixel is converted
// after the loop and stores 4 bytes.
// Table entries are 8 bytes wide (the index is scaled by 8): Y-indexed
// entries start at byte offset 0, U-indexed at 2048 and V-indexed at 4096.
void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
18
                              const uint8* u_buf,  // rsi
19
                              const uint8* v_buf,  // rdx
20
                              uint8* rgb_buf,      // rcx
21
0
                              int width) {         // r8
22
0
  asm volatile(
23
0
  // Enter at the loop condition so that width < 2 skips the pair loop.
  "jmp    1f\n"
24
0
  // Pair loop: two output pixels share one U and one V sample.
"0:"
25
0
  "movzb  (%[u_buf]),%%r10\n"
26
0
  "add    $0x1,%[u_buf]\n"
27
0
  "movzb  (%[v_buf]),%%r11\n"
28
0
  "add    $0x1,%[v_buf]\n"
29
0
  "movq   2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n"
30
0
  "movzb  (%[y_buf]),%%r10\n"
31
0
  "movq   4096(%[kCoefficientsRgbY],%%r11,8),%%xmm1\n"
32
0
  "movzb  0x1(%[y_buf]),%%r11\n"
33
0
  // xmm0 = U entry + V entry (saturating signed 16-bit adds).
  "paddsw %%xmm1,%%xmm0\n"
34
0
  "movq   (%[kCoefficientsRgbY],%%r10,8),%%xmm2\n"
35
0
  "add    $0x2,%[y_buf]\n"
36
0
  "movq   (%[kCoefficientsRgbY],%%r11,8),%%xmm3\n"
37
0
  "paddsw %%xmm0,%%xmm2\n"
38
0
  "paddsw %%xmm0,%%xmm3\n"
39
0
  // Merge the low 64 bits of xmm2 and xmm3 into xmm2, shift every 16-bit
  // lane right by 6, then saturate the lanes to unsigned bytes.
  "shufps $0x44,%%xmm3,%%xmm2\n"
40
0
  "psraw  $0x6,%%xmm2\n"
41
0
  "packuswb %%xmm2,%%xmm2\n"
42
0
  "movq   %%xmm2,0x0(%[rgb_buf])\n"
43
0
  "add    $0x8,%[rgb_buf]\n"
44
0
  // width -= 2; keep looping while the result is non-negative.
"1:"
45
0
  "sub    $0x2,%[width]\n"
46
0
  "jns    0b\n"
47
0
48
0
  // For a non-negative input width the counter is now -2 (even) or -1 (odd);
  // adding 1 leaves it negative only when no pixel remains.
"2:"
49
0
  "add    $0x1,%[width]\n"
50
0
  "js     3f\n"
51
0
52
0
  // Trailing single pixel: same lookups, 4-byte store.
  "movzb  (%[u_buf]),%%r10\n"
53
0
  "movq   2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n"
54
0
  "movzb  (%[v_buf]),%%r10\n"
55
0
  "movq   4096(%[kCoefficientsRgbY],%%r10,8),%%xmm1\n"
56
0
  "paddsw %%xmm1,%%xmm0\n"
57
0
  "movzb  (%[y_buf]),%%r10\n"
58
0
  "movq   (%[kCoefficientsRgbY],%%r10,8),%%xmm1\n"
59
0
  "paddsw %%xmm0,%%xmm1\n"
60
0
  "psraw  $0x6,%%xmm1\n"
61
0
  "packuswb %%xmm1,%%xmm1\n"
62
0
  "movd   %%xmm1,0x0(%[rgb_buf])\n"
63
0
"3:"
64
0
  // All pointer arguments and width are read-write operands because the
  // assembly advances/decrements them in place.
  : [y_buf] "+r"(y_buf),
65
0
    [u_buf] "+r"(u_buf),
66
0
    [v_buf] "+r"(v_buf),
67
0
    [rgb_buf] "+r"(rgb_buf),
68
0
    [width] "+r"(width)
69
0
  : [kCoefficientsRgbY] "r" (kCoefficientsRgbY)
70
0
  : "cc", "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
71
0
);
72
0
}
73
74
// Converts one row while stepping through the source planes with a
// fixed-point position x (x86-64 inline assembly).
// x is kept in r11, starts at 0 and advances by source_dx per output pixel;
// Y is read at index (x >> 16) and U/V at index (x >> 17). Each sample is the
// single byte at that index (no blending between neighbours).
// The main loop produces two pixels per iteration (8 bytes stored), both
// using the U/V sample taken at the first pixel's position; an odd trailing
// pixel is produced after the loop (4 bytes stored).
void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
75
                        const uint8* u_buf,  // rsi
76
                        const uint8* v_buf,  // rdx
77
                        uint8* rgb_buf,      // rcx
78
                        int width,           // r8
79
0
                        int source_dx) {     // r9
80
0
  asm volatile(
81
0
  // x = 0
  "xor    %%r11,%%r11\n"
82
0
  "sub    $0x2,%[width]\n"
83
0
  "js     1f\n"
84
0
85
0
  // Pair loop.
"0:"
86
0
  // Chroma index = x >> 17.
  "mov    %%r11,%%r10\n"
87
0
  "sar    $0x11,%%r10\n"
88
0
  "movzb  (%[u_buf],%%r10,1),%%rax\n"
89
0
  "movq   2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n"
90
0
  "movzb  (%[v_buf],%%r10,1),%%rax\n"
91
0
  "movq   4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
92
0
  // r10 = x + source_dx (position of the second pixel); r11 becomes the
  // first pixel's luma index (x >> 16).
  "lea    (%%r11,%[source_dx]),%%r10\n"
93
0
  "sar    $0x10,%%r11\n"
94
0
  "movzb  (%[y_buf],%%r11,1),%%rax\n"
95
0
  "paddsw %%xmm1,%%xmm0\n"
96
0
  "movq   (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
97
0
  // r11 = position for the next iteration; r10 becomes the second pixel's
  // luma index.
  "lea    (%%r10,%[source_dx]),%%r11\n"
98
0
  "sar    $0x10,%%r10\n"
99
0
  "movzb  (%[y_buf],%%r10,1),%%rax\n"
100
0
  "movq   (%[kCoefficientsRgbY],%%rax,8),%%xmm2\n"
101
0
  "paddsw %%xmm0,%%xmm1\n"
102
0
  "paddsw %%xmm0,%%xmm2\n"
103
0
  // Merge both pixels into xmm1, shift each 16-bit lane right by 6 and
  // saturate to unsigned bytes before the 8-byte store.
  "shufps $0x44,%%xmm2,%%xmm1\n"
104
0
  "psraw  $0x6,%%xmm1\n"
105
0
  "packuswb %%xmm1,%%xmm1\n"
106
0
  "movq   %%xmm1,0x0(%[rgb_buf])\n"
107
0
  "add    $0x8,%[rgb_buf]\n"
108
0
  "sub    $0x2,%[width]\n"
109
0
  "jns    0b\n"
110
0
111
0
  // Adding 1 to the counter leaves it negative only when no pixel remains.
"1:"
112
0
  "add    $0x1,%[width]\n"
113
0
  "js     2f\n"
114
0
115
0
  // Trailing single pixel at the current position x.
  "mov    %%r11,%%r10\n"
116
0
  "sar    $0x11,%%r10\n"
117
0
  "movzb  (%[u_buf],%%r10,1),%%rax\n"
118
0
  "movq   2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n"
119
0
  "movzb  (%[v_buf],%%r10,1),%%rax\n"
120
0
  "movq   4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
121
0
  "paddsw %%xmm1,%%xmm0\n"
122
0
  "sar    $0x10,%%r11\n"
123
0
  "movzb  (%[y_buf],%%r11,1),%%rax\n"
124
0
  "movq   (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
125
0
  "paddsw %%xmm0,%%xmm1\n"
126
0
  "psraw  $0x6,%%xmm1\n"
127
0
  "packuswb %%xmm1,%%xmm1\n"
128
0
  "movd   %%xmm1,0x0(%[rgb_buf])\n"
129
0
130
0
"2:"
131
0
  // Only rgb_buf and width are modified by the assembly; the source pointers
  // are indexed, not advanced, so they are plain inputs.
  : [rgb_buf] "+r"(rgb_buf),
132
0
    [width] "+r"(width)
133
0
  : [y_buf] "r"(y_buf),
134
0
    [u_buf] "r"(u_buf),
135
0
    [v_buf] "r"(v_buf),
136
0
    [kCoefficientsRgbY] "r" (kCoefficientsRgbY),
137
0
    // source_dx is widened to long so it can be used in the 64-bit lea above.
    [source_dx] "r"(static_cast<long>(source_dx))
138
0
  : "cc", "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
139
0
);
140
0
}
141
142
// Converts one row while stepping through the source planes with a
// fixed-point position x (kept in r11), blending each Y/U/V sample with the
// next byte of its plane (x86-64 inline assembly).
// In the pair loop:
//   - Y uses index (x >> 16) and fraction f = x & 0xffff; the result is
//     (y[i] * (f ^ 0xffff) + y[i + 1] * f) >> 16.
//   - U and V use index (x >> 17) and fraction f = x & 0x1fffe; the result is
//     (c[i] * (f ^ 0x1fffe) + c[i + 1] * f) >> 17.
// x starts at 0, or at 0x8000 when at least two pixels are requested and
// source_dx >= 0x20000. Two pixels are produced per iteration (8 bytes
// stored). The odd trailing pixel after the loop is looked up without any
// blending (4 bytes stored).
// NOTE(review): the pair loop reads the byte at index + 1 of every plane;
// confirm that callers guarantee that byte is readable.
void LinearScaleYUVToRGB32Row(const uint8* y_buf,
143
                              const uint8* u_buf,
144
                              const uint8* v_buf,
145
                              uint8* rgb_buf,
146
                              int width,
147
0
                              int source_dx) {
148
0
  asm volatile(
149
0
  "xor    %%r11,%%r11\n"   // x = 0
150
0
  "sub    $0x2,%[width]\n"
151
0
  "js     2f\n"
152
0
  "cmp    $0x20000,%[source_dx]\n"   // if source_dx >= 2.0
153
0
  "jl     0f\n"
154
0
  "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
155
0
"0:"
156
0
157
0
  // Pair loop.
"1:"
158
0
  "mov    %%r11,%%r10\n"
159
0
  "sar    $0x11,%%r10\n"
160
0
161
0
  // Blend u[i] and u[i + 1] and look up the U-indexed table entry.
  "movzb  (%[u_buf], %%r10, 1), %%r13 \n"
162
0
  "movzb  1(%[u_buf], %%r10, 1), %%r14 \n"
163
0
  "mov    %%r11, %%rax \n"
164
0
  "and    $0x1fffe, %%rax \n"
165
0
  "imul   %%rax, %%r14 \n"
166
0
  "xor    $0x1fffe, %%rax \n"
167
0
  "imul   %%rax, %%r13 \n"
168
0
  "add    %%r14, %%r13 \n"
169
0
  "shr    $17, %%r13 \n"
170
0
  "movq   2048(%[kCoefficientsRgbY],%%r13,8), %%xmm0\n"
171
0
172
0
  // Same blend for V.
  "movzb  (%[v_buf], %%r10, 1), %%r13 \n"
173
0
  "movzb  1(%[v_buf], %%r10, 1), %%r14 \n"
174
0
  "mov    %%r11, %%rax \n"
175
0
  "and    $0x1fffe, %%rax \n"
176
0
  "imul   %%rax, %%r14 \n"
177
0
  "xor    $0x1fffe, %%rax \n"
178
0
  "imul   %%rax, %%r13 \n"
179
0
  "add    %%r14, %%r13 \n"
180
0
  "shr    $17, %%r13 \n"
181
0
  "movq   4096(%[kCoefficientsRgbY],%%r13,8), %%xmm1\n"
182
0
183
0
  // rax keeps x for the fraction; r10 = x + source_dx; r11 = luma index.
  "mov    %%r11, %%rax \n"
184
0
  "lea    (%%r11,%[source_dx]),%%r10\n"
185
0
  "sar    $0x10,%%r11\n"
186
0
  "paddsw %%xmm1,%%xmm0\n"
187
0
188
0
  // First pixel: blend y[i] and y[i + 1].
  "movzb  (%[y_buf], %%r11, 1), %%r13 \n"
189
0
  "movzb  1(%[y_buf], %%r11, 1), %%r14 \n"
190
0
  "and    $0xffff, %%rax \n"
191
0
  "imul   %%rax, %%r14 \n"
192
0
  "xor    $0xffff, %%rax \n"
193
0
  "imul   %%rax, %%r13 \n"
194
0
  "add    %%r14, %%r13 \n"
195
0
  "shr    $16, %%r13 \n"
196
0
  "movq   (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
197
0
198
0
  // Second pixel: r11 = position for the next iteration, r10 = luma index.
  "mov    %%r10, %%rax \n"
199
0
  "lea    (%%r10,%[source_dx]),%%r11\n"
200
0
  "sar    $0x10,%%r10\n"
201
0
202
0
  "movzb  (%[y_buf],%%r10,1), %%r13 \n"
203
0
  "movzb  1(%[y_buf],%%r10,1), %%r14 \n"
204
0
  "and    $0xffff, %%rax \n"
205
0
  "imul   %%rax, %%r14 \n"
206
0
  "xor    $0xffff, %%rax \n"
207
0
  "imul   %%rax, %%r13 \n"
208
0
  "add    %%r14, %%r13 \n"
209
0
  "shr    $16, %%r13 \n"
210
0
  "movq   (%[kCoefficientsRgbY],%%r13,8),%%xmm2\n"
211
0
212
0
  // Add the shared U+V term to both pixels, merge, shift each 16-bit lane
  // right by 6, saturate to unsigned bytes and store 8 bytes.
  "paddsw %%xmm0,%%xmm1\n"
213
0
  "paddsw %%xmm0,%%xmm2\n"
214
0
  "shufps $0x44,%%xmm2,%%xmm1\n"
215
0
  "psraw  $0x6,%%xmm1\n"
216
0
  "packuswb %%xmm1,%%xmm1\n"
217
0
  "movq   %%xmm1,0x0(%[rgb_buf])\n"
218
0
  "add    $0x8,%[rgb_buf]\n"
219
0
  "sub    $0x2,%[width]\n"
220
0
  "jns    1b\n"
221
0
222
0
  // Adding 1 to the counter leaves it negative only when no pixel remains.
"2:"
223
0
  "add    $0x1,%[width]\n"
224
0
  "js     3f\n"
225
0
226
0
  // Trailing single pixel: direct lookups at the current position, with no
  // blending against the neighbouring bytes.
  "mov    %%r11,%%r10\n"
227
0
  "sar    $0x11,%%r10\n"
228
0
229
0
  "movzb  (%[u_buf],%%r10,1), %%r13 \n"
230
0
  "movq   2048(%[kCoefficientsRgbY],%%r13,8),%%xmm0\n"
231
0
232
0
  "movzb  (%[v_buf],%%r10,1), %%r13 \n"
233
0
  "movq   4096(%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
234
0
235
0
  "paddsw %%xmm1,%%xmm0\n"
236
0
  "sar    $0x10,%%r11\n"
237
0
238
0
  "movzb  (%[y_buf],%%r11,1), %%r13 \n"
239
0
  "movq   (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
240
0
241
0
  "paddsw %%xmm0,%%xmm1\n"
242
0
  "psraw  $0x6,%%xmm1\n"
243
0
  "packuswb %%xmm1,%%xmm1\n"
244
0
  "movd   %%xmm1,0x0(%[rgb_buf])\n"
245
0
246
0
"3:"
247
0
  : [rgb_buf] "+r"(rgb_buf),
248
0
    [width] "+r"(width)
249
0
  : [y_buf] "r"(y_buf),
250
0
    [u_buf] "r"(u_buf),
251
0
    [v_buf] "r"(v_buf),
252
0
    [kCoefficientsRgbY] "r" (kCoefficientsRgbY),
253
0
    // source_dx is widened to long so it can be used in the 64-bit cmp/lea.
    [source_dx] "r"(static_cast<long>(source_dx))
254
0
  : "cc", "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
255
0
);
256
0
}
257
258
#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
259
260
// PIC version is slower because fewer registers are available, so
261
// non-PIC is used on platforms where it is possible.
262
// 32-bit x86 routine defined in file-scope assembly; the declaration below
// gives it a C prototype.
// Converts one row of Y/U/V bytes to 4-byte output pixels by indexing the
// kCoefficientsRgbY symbol directly (absolute addressing). Two pixels are
// produced per loop iteration and written with an 8-byte non-temporal store
// (movntq); an odd trailing pixel is written with a 4-byte movd.
// All general-purpose registers are saved with pusha and restored with popa,
// so after pusha the first stack argument is found at 0x24(%esp).
// NOTE(review): the routine uses MMX registers (%mm0-%mm2) and returns
// without executing emms; presumably the caller is responsible for that --
// confirm.
void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
263
                                  const uint8* u_buf,
264
                                  const uint8* v_buf,
265
                                  uint8* rgb_buf,
266
                                  int width);
267
  asm(
268
  ".text\n"
269
  ".global FastConvertYUVToRGB32Row_SSE\n"
270
  ".type FastConvertYUVToRGB32Row_SSE, @function\n"
271
"FastConvertYUVToRGB32Row_SSE:\n"
272
  "pusha\n"
273
  // edx = y_buf, edi = u_buf, esi = v_buf, ebp = rgb_buf, ecx = width.
  "mov    0x24(%esp),%edx\n"
274
  "mov    0x28(%esp),%edi\n"
275
  "mov    0x2c(%esp),%esi\n"
276
  "mov    0x30(%esp),%ebp\n"
277
  "mov    0x34(%esp),%ecx\n"
278
  "jmp    1f\n"
279
280
  // Pair loop: two output pixels share one U and one V sample.
"0:"
281
  "movzbl (%edi),%eax\n"
282
  "add    $0x1,%edi\n"
283
  "movzbl (%esi),%ebx\n"
284
  "add    $0x1,%esi\n"
285
  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
286
  "movzbl (%edx),%eax\n"
287
  "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
288
  "movzbl 0x1(%edx),%ebx\n"
289
  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
290
  "add    $0x2,%edx\n"
291
  "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
292
  "paddsw %mm0,%mm1\n"
293
  "paddsw %mm0,%mm2\n"
294
  "psraw  $0x6,%mm1\n"
295
  "psraw  $0x6,%mm2\n"
296
  "packuswb %mm2,%mm1\n"
297
  "movntq %mm1,0x0(%ebp)\n"
298
  "add    $0x8,%ebp\n"
299
"1:"
300
  "sub    $0x2,%ecx\n"
301
  "jns    0b\n"
302
303
  // Bit 0 of the (now negative) counter is set only when one pixel remains.
  "and    $0x1,%ecx\n"
304
  "je     2f\n"
305
306
  // Trailing single pixel.
  "movzbl (%edi),%eax\n"
307
  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
308
  "movzbl (%esi),%eax\n"
309
  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
310
  "movzbl (%edx),%eax\n"
311
  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
312
  "paddsw %mm0,%mm1\n"
313
  "psraw  $0x6,%mm1\n"
314
  "packuswb %mm1,%mm1\n"
315
  "movd   %mm1,0x0(%ebp)\n"
316
"2:"
317
  "popa\n"
318
  "ret\n"
319
#if !defined(XP_MACOSX)
320
  ".previous\n"
321
#endif
322
);
323
324
// Entry point for the non-PIC 32-bit build: calls the assembly routine
// FastConvertYUVToRGB32Row_SSE when mozilla::supports_sse() returns true,
// otherwise falls back to FastConvertYUVToRGB32Row_C (passing 1 as its final
// argument).
void FastConvertYUVToRGB32Row(const uint8* y_buf,
325
                              const uint8* u_buf,
326
                              const uint8* v_buf,
327
                              uint8* rgb_buf,
328
                              int width)
329
{
330
  if (mozilla::supports_sse()) {
331
    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
332
    return;
333
  }
334
335
  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
336
}
337
338
339
// 32-bit x86 routine defined in file-scope assembly; the declaration below
// gives it a C prototype.
// Converts one row while stepping through the source planes with a
// fixed-point position kept in ebx (starts at 0, advances by source_dx, read
// from 0x38(%esp), once per output pixel). Y is read at index (x >> 16) and
// U/V at index (x >> 17); no blending between neighbouring bytes is done.
// Two pixels are produced per loop iteration (8-byte movntq store); an odd
// trailing pixel is written with a 4-byte movd.
// NOTE(review): the routine uses MMX registers and returns without executing
// emms; presumably the caller is responsible for that -- confirm.
void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
340
                            const uint8* u_buf,
341
                            const uint8* v_buf,
342
                            uint8* rgb_buf,
343
                            int width,
344
                            int source_dx);
345
  asm(
346
  ".text\n"
347
  ".global ScaleYUVToRGB32Row_SSE\n"
348
  ".type ScaleYUVToRGB32Row_SSE, @function\n"
349
"ScaleYUVToRGB32Row_SSE:\n"
350
  "pusha\n"
351
  // edx = y_buf, edi = u_buf, esi = v_buf, ebp = rgb_buf, ecx = width,
  // ebx = x = 0.
  "mov    0x24(%esp),%edx\n"
352
  "mov    0x28(%esp),%edi\n"
353
  "mov    0x2c(%esp),%esi\n"
354
  "mov    0x30(%esp),%ebp\n"
355
  "mov    0x34(%esp),%ecx\n"
356
  "xor    %ebx,%ebx\n"
357
  "jmp    1f\n"
358
359
  // Pair loop: both pixels use the U/V sample taken at the first position.
"0:"
360
  "mov    %ebx,%eax\n"
361
  "sar    $0x11,%eax\n"
362
  "movzbl (%edi,%eax,1),%eax\n"
363
  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
364
  "mov    %ebx,%eax\n"
365
  "sar    $0x11,%eax\n"
366
  "movzbl (%esi,%eax,1),%eax\n"
367
  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
368
  // First pixel's luma; x advances by source_dx.
  "mov    %ebx,%eax\n"
369
  "add    0x38(%esp),%ebx\n"
370
  "sar    $0x10,%eax\n"
371
  "movzbl (%edx,%eax,1),%eax\n"
372
  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
373
  // Second pixel's luma; x advances again.
  "mov    %ebx,%eax\n"
374
  "add    0x38(%esp),%ebx\n"
375
  "sar    $0x10,%eax\n"
376
  "movzbl (%edx,%eax,1),%eax\n"
377
  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
378
  "paddsw %mm0,%mm1\n"
379
  "paddsw %mm0,%mm2\n"
380
  "psraw  $0x6,%mm1\n"
381
  "psraw  $0x6,%mm2\n"
382
  "packuswb %mm2,%mm1\n"
383
  "movntq %mm1,0x0(%ebp)\n"
384
  "add    $0x8,%ebp\n"
385
"1:"
386
  "sub    $0x2,%ecx\n"
387
  "jns    0b\n"
388
389
  // Bit 0 of the (now negative) counter is set only when one pixel remains.
  "and    $0x1,%ecx\n"
390
  "je     2f\n"
391
392
  // Trailing single pixel at the current position.
  "mov    %ebx,%eax\n"
393
  "sar    $0x11,%eax\n"
394
  "movzbl (%edi,%eax,1),%eax\n"
395
  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
396
  "mov    %ebx,%eax\n"
397
  "sar    $0x11,%eax\n"
398
  "movzbl (%esi,%eax,1),%eax\n"
399
  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
400
  "mov    %ebx,%eax\n"
401
  "sar    $0x10,%eax\n"
402
  "movzbl (%edx,%eax,1),%eax\n"
403
  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
404
  "paddsw %mm0,%mm1\n"
405
  "psraw  $0x6,%mm1\n"
406
  "packuswb %mm1,%mm1\n"
407
  "movd   %mm1,0x0(%ebp)\n"
408
409
"2:"
410
  "popa\n"
411
  "ret\n"
412
#if !defined(XP_MACOSX)
413
  ".previous\n"
414
#endif
415
);
416
417
// Entry point for the non-PIC 32-bit build: calls the assembly routine
// ScaleYUVToRGB32Row_SSE when mozilla::supports_sse() returns true,
// otherwise falls back to ScaleYUVToRGB32Row_C with the same arguments.
void ScaleYUVToRGB32Row(const uint8* y_buf,
418
                        const uint8* u_buf,
419
                        const uint8* v_buf,
420
                        uint8* rgb_buf,
421
                        int width,
422
                        int source_dx)
423
{
424
  if (mozilla::supports_sse()) {
425
    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
426
                           width, source_dx);
427
    return;
428
  }
429
430
  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
431
                       width, source_dx);
432
}
433
434
// 32-bit x86 routine defined in file-scope assembly; the declaration below
// gives it a C prototype.
// Converts one row while stepping through the source planes with a
// fixed-point position x kept in ebx, blending each Y/U/V sample with the
// next byte of its plane:
//   - Y: index (x >> 16), fraction f = x & 0xffff,
//        (y[i] * (f ^ 0xffff) + y[i + 1] * f) >> 16.
//   - U/V: index (x >> 17), fraction f = x & 0x1fffe,
//        (c[i] * (f ^ 0x1fffe) + c[i + 1] * f) >> 17.
// The width stack slot at 0x34(%esp) is overwritten with width * source_dx
// and the loop runs while x is below that value. Pairs of pixels are written
// with an 8-byte movntq; when the limit is reached after the first pixel of
// a pair, that single pixel is written with a 4-byte movd at label 2.
// esi is used as scratch, so v_buf is reloaded from the stack each iteration.
// NOTE(review): the routine uses MMX registers and returns without executing
// emms; presumably the caller is responsible for that -- confirm.
void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
435
                                  const uint8* u_buf,
436
                                  const uint8* v_buf,
437
                                  uint8* rgb_buf,
438
                                  int width,
439
                                  int source_dx);
440
  asm(
441
  ".text\n"
442
  ".global LinearScaleYUVToRGB32Row_SSE\n"
443
  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
444
"LinearScaleYUVToRGB32Row_SSE:\n"
445
  "pusha\n"
446
  // edx = y_buf, edi = u_buf, ebp = rgb_buf.
  "mov    0x24(%esp),%edx\n"
447
  "mov    0x28(%esp),%edi\n"
448
  "mov    0x30(%esp),%ebp\n"
449
450
  // NOTE(review): the three instructions below store width * source_dx only;
  // ebx is not added, unlike what the following comment states.
  // source_width = width * source_dx + ebx
451
  "mov    0x34(%esp), %ecx\n"
452
  "imull  0x38(%esp), %ecx\n"
453
  "mov    %ecx, 0x34(%esp)\n"
454
455
  "mov    0x38(%esp), %ecx\n"
456
  "xor    %ebx,%ebx\n"     // x = 0
457
  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
458
  "jl     1f\n"
459
  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
460
  "jmp    1f\n"
461
462
  // Pair loop.
"0:"
463
  "mov    %ebx,%eax\n"
464
  "sar    $0x11,%eax\n"
465
466
  // Blend u[i] and u[i + 1] and load the U-indexed table entry.
  "movzbl (%edi,%eax,1),%ecx\n"
467
  "movzbl 1(%edi,%eax,1),%esi\n"
468
  "mov    %ebx,%eax\n"
469
  "andl   $0x1fffe, %eax \n"
470
  "imul   %eax, %esi \n"
471
  "xorl   $0x1fffe, %eax \n"
472
  "imul   %eax, %ecx \n"
473
  "addl   %esi, %ecx \n"
474
  "shrl   $17, %ecx \n"
475
  "movq   kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
476
477
  // Same blend for V; esi is reloaded with v_buf first.
  "mov    0x2c(%esp),%esi\n"
478
  "mov    %ebx,%eax\n"
479
  "sar    $0x11,%eax\n"
480
481
  "movzbl (%esi,%eax,1),%ecx\n"
482
  "movzbl 1(%esi,%eax,1),%esi\n"
483
  "mov    %ebx,%eax\n"
484
  "andl   $0x1fffe, %eax \n"
485
  "imul   %eax, %esi \n"
486
  "xorl   $0x1fffe, %eax \n"
487
  "imul   %eax, %ecx \n"
488
  "addl   %esi, %ecx \n"
489
  "shrl   $17, %ecx \n"
490
  "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
491
492
  // First pixel: blend y[i] and y[i + 1]; x advances by source_dx.
  "mov    %ebx,%eax\n"
493
  "sar    $0x10,%eax\n"
494
  "movzbl (%edx,%eax,1),%ecx\n"
495
  "movzbl 1(%edx,%eax,1),%esi\n"
496
  "mov    %ebx,%eax\n"
497
  "add    0x38(%esp),%ebx\n"
498
  "andl   $0xffff, %eax \n"
499
  "imul   %eax, %esi \n"
500
  "xorl   $0xffff, %eax \n"
501
  "imul   %eax, %ecx \n"
502
  "addl   %esi, %ecx \n"
503
  "shrl   $16, %ecx \n"
504
  "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
505
506
  // If x has reached the limit, finish this pixel alone at label 2.
  "cmp    0x34(%esp), %ebx\n"
507
  "jge    2f\n"
508
509
  // Second pixel of the pair.
  "mov    %ebx,%eax\n"
510
  "sar    $0x10,%eax\n"
511
  "movzbl (%edx,%eax,1),%ecx\n"
512
  "movzbl 1(%edx,%eax,1),%esi\n"
513
  "mov    %ebx,%eax\n"
514
  "add    0x38(%esp),%ebx\n"
515
  "andl   $0xffff, %eax \n"
516
  "imul   %eax, %esi \n"
517
  "xorl   $0xffff, %eax \n"
518
  "imul   %eax, %ecx \n"
519
  "addl   %esi, %ecx \n"
520
  "shrl   $16, %ecx \n"
521
  "movq   kCoefficientsRgbY(,%ecx,8),%mm2\n"
522
523
  "paddsw %mm0,%mm1\n"
524
  "paddsw %mm0,%mm2\n"
525
  "psraw  $0x6,%mm1\n"
526
  "psraw  $0x6,%mm2\n"
527
  "packuswb %mm2,%mm1\n"
528
  "movntq %mm1,0x0(%ebp)\n"
529
  "add    $0x8,%ebp\n"
530
531
  // Loop while x is below the stored limit.
"1:"
532
  "cmp    0x34(%esp), %ebx\n"
533
  "jl     0b\n"
534
  "popa\n"
535
  "ret\n"
536
537
  // Single final pixel: mm0 holds U+V, mm1 holds the Y entry.
"2:"
538
  "paddsw %mm0, %mm1\n"
539
  "psraw $6, %mm1\n"
540
  "packuswb %mm1, %mm1\n"
541
  "movd %mm1, (%ebp)\n"
542
  "popa\n"
543
  "ret\n"
544
#if !defined(XP_MACOSX)
545
  ".previous\n"
546
#endif
547
);
548
549
// Entry point for the non-PIC 32-bit build: calls the assembly routine
// LinearScaleYUVToRGB32Row_SSE when mozilla::supports_sse() returns true,
// otherwise falls back to LinearScaleYUVToRGB32Row_C with the same arguments.
void LinearScaleYUVToRGB32Row(const uint8* y_buf,
550
                              const uint8* u_buf,
551
                              const uint8* v_buf,
552
                              uint8* rgb_buf,
553
                              int width,
554
                              int source_dx)
555
{
556
  if (mozilla::supports_sse()) {
557
    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
558
                                 width, source_dx);
559
    return;
560
  }
561
562
  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
563
                             width, source_dx);
564
}
565
566
#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
567
568
// 32-bit x86 routine for position-independent builds, defined in file-scope
// assembly; the declaration below gives it a C prototype.
// Instead of referencing the kCoefficientsRgbY symbol from the assembly, the
// table address is received as the last argument (0x38(%esp) after pusha)
// and kept in ecx. Because ecx is occupied, the pixel counter stays in its
// stack slot at 0x34(%esp) and is decremented in place.
// Two pixels are produced per loop iteration (8-byte movntq store); an odd
// trailing pixel is written with a 4-byte movd.
// NOTE(review): the routine uses MMX registers and returns without executing
// emms; presumably the caller is responsible for that -- confirm.
void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
569
                                 const uint8* u_buf,
570
                                 const uint8* v_buf,
571
                                 uint8* rgb_buf,
572
                                 int width,
573
                                 const int16 *kCoefficientsRgbY);
574
575
  asm(
576
  ".text\n"
577
#if defined(XP_MACOSX)
578
"_PICConvertYUVToRGB32Row_SSE:\n"
579
#else
580
"PICConvertYUVToRGB32Row_SSE:\n"
581
#endif
582
  "pusha\n"
583
  // edx = y_buf, edi = u_buf, esi = v_buf, ebp = rgb_buf, ecx = table.
  "mov    0x24(%esp),%edx\n"
584
  "mov    0x28(%esp),%edi\n"
585
  "mov    0x2c(%esp),%esi\n"
586
  "mov    0x30(%esp),%ebp\n"
587
  "mov    0x38(%esp),%ecx\n"
588
589
  "jmp    1f\n"
590
591
  // Pair loop: two output pixels share one U and one V sample.
"0:"
592
  "movzbl (%edi),%eax\n"
593
  "add    $0x1,%edi\n"
594
  "movzbl (%esi),%ebx\n"
595
  "add    $0x1,%esi\n"
596
  "movq   2048(%ecx,%eax,8),%mm0\n"
597
  "movzbl (%edx),%eax\n"
598
  "paddsw 4096(%ecx,%ebx,8),%mm0\n"
599
  "movzbl 0x1(%edx),%ebx\n"
600
  "movq   0(%ecx,%eax,8),%mm1\n"
601
  "add    $0x2,%edx\n"
602
  "movq   0(%ecx,%ebx,8),%mm2\n"
603
  "paddsw %mm0,%mm1\n"
604
  "paddsw %mm0,%mm2\n"
605
  "psraw  $0x6,%mm1\n"
606
  "psraw  $0x6,%mm2\n"
607
  "packuswb %mm2,%mm1\n"
608
  "movntq %mm1,0x0(%ebp)\n"
609
  "add    $0x8,%ebp\n"
610
"1:"
611
  // The counter lives in the width argument slot on the stack.
  "subl   $0x2,0x34(%esp)\n"
612
  "jns    0b\n"
613
614
  // Bit 0 of the (now negative) counter is set only when one pixel remains.
  "andl   $0x1,0x34(%esp)\n"
615
  "je     2f\n"
616
617
  // Trailing single pixel.
  "movzbl (%edi),%eax\n"
618
  "movq   2048(%ecx,%eax,8),%mm0\n"
619
  "movzbl (%esi),%eax\n"
620
  "paddsw 4096(%ecx,%eax,8),%mm0\n"
621
  "movzbl (%edx),%eax\n"
622
  "movq   0(%ecx,%eax,8),%mm1\n"
623
  "paddsw %mm0,%mm1\n"
624
  "psraw  $0x6,%mm1\n"
625
  "packuswb %mm1,%mm1\n"
626
  "movd   %mm1,0x0(%ebp)\n"
627
"2:"
628
  "popa\n"
629
  "ret\n"
630
#if !defined(XP_MACOSX)
631
  ".previous\n"
632
#endif
633
);
634
635
// Entry point for the PIC 32-bit build: when mozilla::supports_sse() returns
// true, calls PICConvertYUVToRGB32Row_SSE and hands it the address of the
// first element of kCoefficientsRgbY; otherwise falls back to
// FastConvertYUVToRGB32Row_C (passing 1 as its final argument).
void FastConvertYUVToRGB32Row(const uint8* y_buf,
636
                              const uint8* u_buf,
637
                              const uint8* v_buf,
638
                              uint8* rgb_buf,
639
                              int width)
640
{
641
  if (mozilla::supports_sse()) {
642
    PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
643
                                &kCoefficientsRgbY[0][0]);
644
    return;
645
  }
646
647
  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
648
}
649
650
// 32-bit x86 routine for position-independent builds, defined in file-scope
// assembly; the declaration below gives it a C prototype.
// Converts one row while stepping through the source planes with a
// fixed-point position kept in ebx (starts at 0, advances by source_dx, read
// from 0x38(%esp), once per output pixel). Y is read at index (x >> 16) and
// U/V at index (x >> 17); no blending between neighbouring bytes is done.
// The table address is received as the last argument (0x3c(%esp) after
// pusha) and kept in ecx, so the pixel counter stays in its stack slot at
// 0x34(%esp) and is decremented in place.
// NOTE(review): the routine uses MMX registers and returns without executing
// emms; presumably the caller is responsible for that -- confirm.
void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
651
                               const uint8* u_buf,
652
                               const uint8* v_buf,
653
                               uint8* rgb_buf,
654
                               int width,
655
                               int source_dx,
656
                               const int16 *kCoefficientsRgbY);
657
658
  asm(
659
  ".text\n"
660
#if defined(XP_MACOSX)
661
"_PICScaleYUVToRGB32Row_SSE:\n"
662
#else
663
"PICScaleYUVToRGB32Row_SSE:\n"
664
#endif
665
  "pusha\n"
666
  // edx = y_buf, edi = u_buf, esi = v_buf, ebp = rgb_buf, ecx = table,
  // ebx = x = 0.
  "mov    0x24(%esp),%edx\n"
667
  "mov    0x28(%esp),%edi\n"
668
  "mov    0x2c(%esp),%esi\n"
669
  "mov    0x30(%esp),%ebp\n"
670
  "mov    0x3c(%esp),%ecx\n"
671
  "xor    %ebx,%ebx\n"
672
  "jmp    1f\n"
673
674
  // Pair loop: both pixels use the U/V sample taken at the first position.
"0:"
675
  "mov    %ebx,%eax\n"
676
  "sar    $0x11,%eax\n"
677
  "movzbl (%edi,%eax,1),%eax\n"
678
  "movq   2048(%ecx,%eax,8),%mm0\n"
679
  "mov    %ebx,%eax\n"
680
  "sar    $0x11,%eax\n"
681
  "movzbl (%esi,%eax,1),%eax\n"
682
  "paddsw 4096(%ecx,%eax,8),%mm0\n"
683
  // First pixel's luma; x advances by source_dx.
  "mov    %ebx,%eax\n"
684
  "add    0x38(%esp),%ebx\n"
685
  "sar    $0x10,%eax\n"
686
  "movzbl (%edx,%eax,1),%eax\n"
687
  "movq   0(%ecx,%eax,8),%mm1\n"
688
  // Second pixel's luma; x advances again.
  "mov    %ebx,%eax\n"
689
  "add    0x38(%esp),%ebx\n"
690
  "sar    $0x10,%eax\n"
691
  "movzbl (%edx,%eax,1),%eax\n"
692
  "movq   0(%ecx,%eax,8),%mm2\n"
693
  "paddsw %mm0,%mm1\n"
694
  "paddsw %mm0,%mm2\n"
695
  "psraw  $0x6,%mm1\n"
696
  "psraw  $0x6,%mm2\n"
697
  "packuswb %mm2,%mm1\n"
698
  "movntq %mm1,0x0(%ebp)\n"
699
  "add    $0x8,%ebp\n"
700
"1:"
701
  // The counter lives in the width argument slot on the stack.
  "subl   $0x2,0x34(%esp)\n"
702
  "jns    0b\n"
703
704
  // Bit 0 of the (now negative) counter is set only when one pixel remains.
  "andl   $0x1,0x34(%esp)\n"
705
  "je     2f\n"
706
707
  // Trailing single pixel at the current position.
  "mov    %ebx,%eax\n"
708
  "sar    $0x11,%eax\n"
709
  "movzbl (%edi,%eax,1),%eax\n"
710
  "movq   2048(%ecx,%eax,8),%mm0\n"
711
  "mov    %ebx,%eax\n"
712
  "sar    $0x11,%eax\n"
713
  "movzbl (%esi,%eax,1),%eax\n"
714
  "paddsw 4096(%ecx,%eax,8),%mm0\n"
715
  "mov    %ebx,%eax\n"
716
  "sar    $0x10,%eax\n"
717
  "movzbl (%edx,%eax,1),%eax\n"
718
  "movq   0(%ecx,%eax,8),%mm1\n"
719
  "paddsw %mm0,%mm1\n"
720
  "psraw  $0x6,%mm1\n"
721
  "packuswb %mm1,%mm1\n"
722
  "movd   %mm1,0x0(%ebp)\n"
723
724
"2:"
725
  "popa\n"
726
  "ret\n"
727
#if !defined(XP_MACOSX)
728
  ".previous\n"
729
#endif
730
);
731
732
// Entry point for the PIC 32-bit build: when mozilla::supports_sse() returns
// true, calls PICScaleYUVToRGB32Row_SSE and hands it the address of the first
// element of kCoefficientsRgbY; otherwise falls back to ScaleYUVToRGB32Row_C.
void ScaleYUVToRGB32Row(const uint8* y_buf,
733
                        const uint8* u_buf,
734
                        const uint8* v_buf,
735
                        uint8* rgb_buf,
736
                        int width,
737
                        int source_dx)
738
{
739
  if (mozilla::supports_sse()) {
740
    PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
741
                              &kCoefficientsRgbY[0][0]);
742
    return;
743
  }
744
745
  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
746
}
747
748
// 32-bit x86 routine for position-independent builds, defined in file-scope
// assembly; the declaration below gives it a C prototype.
// Converts one row while stepping through the source planes with a
// fixed-point position x kept in ebx, blending each Y/U/V sample with the
// next byte of its plane:
//   - Y: index (x >> 16), fraction f = x & 0xffff,
//        (y[i] * (f ^ 0xffff) + y[i + 1] * f) >> 16.
//   - U/V: index (x >> 17), fraction f = x & 0x1fffe,
//        (c[i] * (f ^ 0x1fffe) + c[i + 1] * f) >> 17.
// The table address is received as the last argument (0x3c(%esp) after
// pusha) and kept in edi; u_buf and v_buf are reloaded from the stack into
// esi on every iteration because esi doubles as scratch.
// The width stack slot at 0x34(%esp) is overwritten with width * source_dx
// and the loop runs while x is below that value. Pairs of pixels are written
// with an 8-byte movntq; when the limit is reached after the first pixel of
// a pair, that single pixel is written with a 4-byte movd at label 2.
// NOTE(review): the routine uses MMX registers and returns without executing
// emms; presumably the caller is responsible for that -- confirm.
void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
749
                                     const uint8* u_buf,
750
                                     const uint8* v_buf,
751
                                     uint8* rgb_buf,
752
                                     int width,
753
                                     int source_dx,
754
                                     const int16 *kCoefficientsRgbY);
755
756
  asm(
757
  ".text\n"
758
#if defined(XP_MACOSX)
759
"_PICLinearScaleYUVToRGB32Row_SSE:\n"
760
#else
761
"PICLinearScaleYUVToRGB32Row_SSE:\n"
762
#endif
763
  "pusha\n"
764
  // edx = y_buf, ebp = rgb_buf, edi = table.
  // NOTE(review): the load of width into ecx and the first xor of ebx are
  // both repeated a few instructions later before either value is used.
  "mov    0x24(%esp),%edx\n"
765
  "mov    0x30(%esp),%ebp\n"
766
  "mov    0x34(%esp),%ecx\n"
767
  "mov    0x3c(%esp),%edi\n"
768
  "xor    %ebx,%ebx\n"
769
770
  // NOTE(review): the three instructions below store width * source_dx only;
  // ebx is not added, unlike what the following comment states.
  // source_width = width * source_dx + ebx
771
  "mov    0x34(%esp), %ecx\n"
772
  "imull  0x38(%esp), %ecx\n"
773
  "mov    %ecx, 0x34(%esp)\n"
774
775
  "mov    0x38(%esp), %ecx\n"
776
  "xor    %ebx,%ebx\n"     // x = 0
777
  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
778
  "jl     1f\n"
779
  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
780
  "jmp    1f\n"
781
782
  // Pair loop.
"0:"
783
  // esi = u_buf (reloaded each iteration).
  "mov    0x28(%esp),%esi\n"
784
  "mov    %ebx,%eax\n"
785
  "sar    $0x11,%eax\n"
786
787
  // Blend u[i] and u[i + 1] and load the U-indexed table entry.
  "movzbl (%esi,%eax,1),%ecx\n"
788
  "movzbl 1(%esi,%eax,1),%esi\n"
789
  "mov    %ebx,%eax\n"
790
  "andl   $0x1fffe, %eax \n"
791
  "imul   %eax, %esi \n"
792
  "xorl   $0x1fffe, %eax \n"
793
  "imul   %eax, %ecx \n"
794
  "addl   %esi, %ecx \n"
795
  "shrl   $17, %ecx \n"
796
  "movq   2048(%edi,%ecx,8),%mm0\n"
797
798
  // Same blend for V; esi = v_buf.
  "mov    0x2c(%esp),%esi\n"
799
  "mov    %ebx,%eax\n"
800
  "sar    $0x11,%eax\n"
801
802
  "movzbl (%esi,%eax,1),%ecx\n"
803
  "movzbl 1(%esi,%eax,1),%esi\n"
804
  "mov    %ebx,%eax\n"
805
  "andl   $0x1fffe, %eax \n"
806
  "imul   %eax, %esi \n"
807
  "xorl   $0x1fffe, %eax \n"
808
  "imul   %eax, %ecx \n"
809
  "addl   %esi, %ecx \n"
810
  "shrl   $17, %ecx \n"
811
  "paddsw 4096(%edi,%ecx,8),%mm0\n"
812
813
  // First pixel: blend y[i] and y[i + 1]; x advances by source_dx.
  "mov    %ebx,%eax\n"
814
  "sar    $0x10,%eax\n"
815
  "movzbl (%edx,%eax,1),%ecx\n"
816
  "movzbl 1(%edx,%eax,1),%esi\n"
817
  "mov    %ebx,%eax\n"
818
  "add    0x38(%esp),%ebx\n"
819
  "andl   $0xffff, %eax \n"
820
  "imul   %eax, %esi \n"
821
  "xorl   $0xffff, %eax \n"
822
  "imul   %eax, %ecx \n"
823
  "addl   %esi, %ecx \n"
824
  "shrl   $16, %ecx \n"
825
  "movq   (%edi,%ecx,8),%mm1\n"
826
827
  // If x has reached the limit, finish this pixel alone at label 2.
  "cmp    0x34(%esp), %ebx\n"
828
  "jge    2f\n"
829
830
  // Second pixel of the pair.
  "mov    %ebx,%eax\n"
831
  "sar    $0x10,%eax\n"
832
  "movzbl (%edx,%eax,1),%ecx\n"
833
  "movzbl 1(%edx,%eax,1),%esi\n"
834
  "mov    %ebx,%eax\n"
835
  "add    0x38(%esp),%ebx\n"
836
  "andl   $0xffff, %eax \n"
837
  "imul   %eax, %esi \n"
838
  "xorl   $0xffff, %eax \n"
839
  "imul   %eax, %ecx \n"
840
  "addl   %esi, %ecx \n"
841
  "shrl   $16, %ecx \n"
842
  "movq   (%edi,%ecx,8),%mm2\n"
843
844
  "paddsw %mm0,%mm1\n"
845
  "paddsw %mm0,%mm2\n"
846
  "psraw  $0x6,%mm1\n"
847
  "psraw  $0x6,%mm2\n"
848
  "packuswb %mm2,%mm1\n"
849
  "movntq %mm1,0x0(%ebp)\n"
850
  "add    $0x8,%ebp\n"
851
852
  // Loop while the stored limit is greater than x.
"1:"
853
  "cmp    %ebx, 0x34(%esp)\n"
854
  "jg     0b\n"
855
  "popa\n"
856
  "ret\n"
857
858
  // Single final pixel: mm0 holds U+V, mm1 holds the Y entry.
"2:"
859
  "paddsw %mm0, %mm1\n"
860
  "psraw $6, %mm1\n"
861
  "packuswb %mm1, %mm1\n"
862
  "movd %mm1, (%ebp)\n"
863
  "popa\n"
864
  "ret\n"
865
#if !defined(XP_MACOSX)
866
  ".previous\n"
867
#endif
868
);
869
870
871
// Entry point for the PIC 32-bit build: when mozilla::supports_sse() returns
// true, calls PICLinearScaleYUVToRGB32Row_SSE and hands it the address of the
// first element of kCoefficientsRgbY; otherwise falls back to
// LinearScaleYUVToRGB32Row_C.
void LinearScaleYUVToRGB32Row(const uint8* y_buf,
872
                              const uint8* u_buf,
873
                              const uint8* v_buf,
874
                              uint8* rgb_buf,
875
                              int width,
876
                              int source_dx)
877
{
878
  if (mozilla::supports_sse()) {
879
    PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
880
                                    source_dx, &kCoefficientsRgbY[0][0]);
881
    return;
882
  }
883
884
  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
885
}
886
#else
887
// Variant compiled when none of the preceding #if/#elif conditions holds:
// forwards straight to FastConvertYUVToRGB32Row_C, passing 1 as its final
// argument.
void FastConvertYUVToRGB32Row(const uint8* y_buf,
888
                              const uint8* u_buf,
889
                              const uint8* v_buf,
890
                              uint8* rgb_buf,
891
                              int width) {
892
  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
893
}
894
895
// Variant compiled when none of the preceding #if/#elif conditions holds:
// forwards all arguments unchanged to ScaleYUVToRGB32Row_C.
void ScaleYUVToRGB32Row(const uint8* y_buf,
896
                        const uint8* u_buf,
897
                        const uint8* v_buf,
898
                        uint8* rgb_buf,
899
                        int width,
900
                        int source_dx) {
901
  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
902
}
903
904
// Variant compiled when none of the preceding #if/#elif conditions holds:
// forwards all arguments unchanged to LinearScaleYUVToRGB32Row_C.
void LinearScaleYUVToRGB32Row(const uint8* y_buf,
905
                              const uint8* u_buf,
906
                              const uint8* v_buf,
907
                              uint8* rgb_buf,
908
                              int width,
909
                              int source_dx) {
910
  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
911
}
912
#endif
913
914
}