Coverage Report

Created: 2025-11-16 07:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/ffmpeg/libswscale/x86/rgb2rgb.c
Line
Count
Source
1
/*
2
 * software RGB to RGB converter
3
 * pluralize by software PAL8 to RGB converter
4
 *              software YUV to YUV converter
5
 *              software YUV to RGB converter
6
 * Written by Nick Kurshev.
7
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8
 *
9
 * This file is part of FFmpeg.
10
 *
11
 * FFmpeg is free software; you can redistribute it and/or
12
 * modify it under the terms of the GNU Lesser General Public
13
 * License as published by the Free Software Foundation; either
14
 * version 2.1 of the License, or (at your option) any later version.
15
 *
16
 * FFmpeg is distributed in the hope that it will be useful,
17
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19
 * Lesser General Public License for more details.
20
 *
21
 * You should have received a copy of the GNU Lesser General Public
22
 * License along with FFmpeg; if not, write to the Free Software
23
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24
 */
25
26
#include <stdint.h>
27
28
#include "config.h"
29
#include "libavutil/attributes.h"
30
#include "libavutil/x86/cpu.h"
31
#include "libavutil/cpu.h"
32
#include "libavutil/bswap.h"
33
#include "libavutil/mem_internal.h"
34
35
#include "libswscale/rgb2rgb.h"
36
#include "libswscale/swscale.h"
37
#include "libswscale/swscale_internal.h"
38
39
#if HAVE_INLINE_ASM
40
#include "libavutil/x86/asm.h"
41
42
DECLARE_ASM_CONST(8, uint64_t, mmx_ff)       = 0x00000000000000FFULL;
43
DECLARE_ASM_CONST(8, uint64_t, mmx_null)     = 0x0000000000000000ULL;
44
DECLARE_ASM_CONST(8, uint64_t, mask32a)      = 0xFF000000FF000000ULL;
45
DECLARE_ASM_CONST(8, uint64_t, mask3216br)   = 0x00F800F800F800F8ULL;
46
DECLARE_ASM_CONST(8, uint64_t, mask3216g)    = 0x0000FC000000FC00ULL;
47
DECLARE_ASM_CONST(8, uint64_t, mask3215g)    = 0x0000F8000000F800ULL;
48
DECLARE_ASM_CONST(8, uint64_t, mul3216)      = 0x2000000420000004ULL;
49
DECLARE_ASM_CONST(8, uint64_t, mul3215)      = 0x2000000820000008ULL;
50
DECLARE_ASM_CONST(8, uint64_t, mask24b)      = 0x00FF0000FF0000FFULL;
51
DECLARE_ASM_CONST(8, uint64_t, mask24g)      = 0xFF0000FF0000FF00ULL;
52
DECLARE_ASM_CONST(8, uint64_t, mask24r)      = 0x0000FF0000FF0000ULL;
53
DECLARE_ASM_CONST(8, uint64_t, mask24l)      = 0x0000000000FFFFFFULL;
54
DECLARE_ASM_CONST(8, uint64_t, mask24h)      = 0x0000FFFFFF000000ULL;
55
DECLARE_ASM_CONST(8, uint64_t, mask15b)      = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
56
DECLARE_ASM_CONST(8, uint64_t, mask15rg)     = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
57
DECLARE_ASM_CONST(8, uint64_t, mask15s)      = 0xFFE0FFE0FFE0FFE0ULL;
58
DECLARE_ASM_CONST(8, uint64_t, mask15g)      = 0x03E003E003E003E0ULL;
59
DECLARE_ASM_CONST(8, uint64_t, mask15r)      = 0x7C007C007C007C00ULL;
60
13.5k
#define mask16b mask15b
61
DECLARE_ASM_CONST(8, uint64_t, mask16g)      = 0x07E007E007E007E0ULL;
62
DECLARE_ASM_CONST(8, uint64_t, mask16r)      = 0xF800F800F800F800ULL;
63
1.03k
#define red_16mask mask3215g
64
DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
65
DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL;
66
DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL;
67
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
68
6.82k
#define blue_15mask blue_16mask
69
DECLARE_ASM_CONST(8, uint64_t, mul15_mid)    = 0x4200420042004200ULL;
70
DECLARE_ASM_CONST(8, uint64_t, mul15_hi)     = 0x0210021002100210ULL;
71
DECLARE_ASM_CONST(8, uint64_t, mul16_mid)    = 0x2080208020802080ULL;
72
73
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
74
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
75
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
76
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
77
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
78
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
79
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
80
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
81
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
82
83
// MMXEXT versions
84
#define PREFETCH "prefetchnta"
85
#define PAVGB     "pavgb"
86
#define MOVNTQ "movntq"
87
#define SFENCE "sfence"
88
89
#define EMMS     "emms"
90
91
static inline void rgb24tobgr32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
92
1.36k
{
93
1.36k
    uint8_t *dest = dst;
94
1.36k
    const uint8_t *s = src;
95
1.36k
    const uint8_t *end;
96
1.36k
    const uint8_t *mm_end;
97
1.36k
    end = s + src_size;
98
1.36k
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
99
1.36k
    mm_end = end - 23;
100
1.36k
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
101
3.34k
    while (s < mm_end) {
102
1.97k
        __asm__ volatile(
103
1.97k
            PREFETCH"  32(%1)           \n\t"
104
1.97k
            "movd        (%1), %%mm0    \n\t"
105
1.97k
            "punpckldq  3(%1), %%mm0    \n\t"
106
1.97k
            "movd       6(%1), %%mm1    \n\t"
107
1.97k
            "punpckldq  9(%1), %%mm1    \n\t"
108
1.97k
            "movd      12(%1), %%mm2    \n\t"
109
1.97k
            "punpckldq 15(%1), %%mm2    \n\t"
110
1.97k
            "movd      18(%1), %%mm3    \n\t"
111
1.97k
            "punpckldq 21(%1), %%mm3    \n\t"
112
1.97k
            "por        %%mm7, %%mm0    \n\t"
113
1.97k
            "por        %%mm7, %%mm1    \n\t"
114
1.97k
            "por        %%mm7, %%mm2    \n\t"
115
1.97k
            "por        %%mm7, %%mm3    \n\t"
116
1.97k
            MOVNTQ"     %%mm0,   (%0)   \n\t"
117
1.97k
            MOVNTQ"     %%mm1,  8(%0)   \n\t"
118
1.97k
            MOVNTQ"     %%mm2, 16(%0)   \n\t"
119
1.97k
            MOVNTQ"     %%mm3, 24(%0)"
120
1.97k
            :: "r"(dest), "r"(s)
121
1.97k
            :"memory");
122
1.97k
        dest += 32;
123
1.97k
        s += 24;
124
1.97k
    }
125
1.36k
    __asm__ volatile(SFENCE:::"memory");
126
1.36k
    __asm__ volatile(EMMS:::"memory");
127
3.08k
    while (s < end) {
128
1.71k
        *dest++ = *s++;
129
1.71k
        *dest++ = *s++;
130
1.71k
        *dest++ = *s++;
131
1.71k
        *dest++ = 255;
132
1.71k
    }
133
1.36k
}
134
135
#define STORE_BGR24_MMX \
136
            "psrlq         $8, %%mm2    \n\t" \
137
            "psrlq         $8, %%mm3    \n\t" \
138
            "psrlq         $8, %%mm6    \n\t" \
139
            "psrlq         $8, %%mm7    \n\t" \
140
            "pand "MANGLE(mask24l)", %%mm0\n\t" \
141
            "pand "MANGLE(mask24l)", %%mm1\n\t" \
142
            "pand "MANGLE(mask24l)", %%mm4\n\t" \
143
            "pand "MANGLE(mask24l)", %%mm5\n\t" \
144
            "pand "MANGLE(mask24h)", %%mm2\n\t" \
145
            "pand "MANGLE(mask24h)", %%mm3\n\t" \
146
            "pand "MANGLE(mask24h)", %%mm6\n\t" \
147
            "pand "MANGLE(mask24h)", %%mm7\n\t" \
148
            "por        %%mm2, %%mm0    \n\t" \
149
            "por        %%mm3, %%mm1    \n\t" \
150
            "por        %%mm6, %%mm4    \n\t" \
151
            "por        %%mm7, %%mm5    \n\t" \
152
 \
153
            "movq       %%mm1, %%mm2    \n\t" \
154
            "movq       %%mm4, %%mm3    \n\t" \
155
            "psllq        $48, %%mm2    \n\t" \
156
            "psllq        $32, %%mm3    \n\t" \
157
            "por        %%mm2, %%mm0    \n\t" \
158
            "psrlq        $16, %%mm1    \n\t" \
159
            "psrlq        $32, %%mm4    \n\t" \
160
            "psllq        $16, %%mm5    \n\t" \
161
            "por        %%mm3, %%mm1    \n\t" \
162
            "por        %%mm5, %%mm4    \n\t" \
163
 \
164
            MOVNTQ"     %%mm0,   (%0)    \n\t" \
165
            MOVNTQ"     %%mm1,  8(%0)    \n\t" \
166
            MOVNTQ"     %%mm4, 16(%0)"
167
168
169
static inline void rgb32tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
170
1.04k
{
171
1.04k
    uint8_t *dest = dst;
172
1.04k
    const uint8_t *s = src;
173
1.04k
    const uint8_t *end;
174
1.04k
    const uint8_t *mm_end;
175
1.04k
    end = s + src_size;
176
1.04k
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
177
1.04k
    mm_end = end - 31;
178
6.02k
    while (s < mm_end) {
179
4.98k
        __asm__ volatile(
180
4.98k
            PREFETCH"  32(%1)           \n\t"
181
4.98k
            "movq        (%1), %%mm0    \n\t"
182
4.98k
            "movq       8(%1), %%mm1    \n\t"
183
4.98k
            "movq      16(%1), %%mm4    \n\t"
184
4.98k
            "movq      24(%1), %%mm5    \n\t"
185
4.98k
            "movq       %%mm0, %%mm2    \n\t"
186
4.98k
            "movq       %%mm1, %%mm3    \n\t"
187
4.98k
            "movq       %%mm4, %%mm6    \n\t"
188
4.98k
            "movq       %%mm5, %%mm7    \n\t"
189
4.98k
            STORE_BGR24_MMX
190
4.98k
            :: "r"(dest), "r"(s)
191
4.98k
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
192
4.98k
            :"memory");
193
4.98k
        dest += 24;
194
4.98k
        s += 32;
195
4.98k
    }
196
1.04k
    __asm__ volatile(SFENCE:::"memory");
197
1.04k
    __asm__ volatile(EMMS:::"memory");
198
3.05k
    while (s < end) {
199
2.01k
        *dest++ = *s++;
200
2.01k
        *dest++ = *s++;
201
2.01k
        *dest++ = *s++;
202
2.01k
        s++;
203
2.01k
    }
204
1.04k
}
205
206
/*
207
 original by Strepto/Astral
208
 ported to gcc & bugfixed: A'rpi
209
 MMXEXT, 3DNOW optimization by Nick Kurshev
210
 32-bit C version, and and&add trick by Michael Niedermayer
211
*/
212
static inline void rgb15to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
213
20
{
214
20
    register const uint8_t* s=src;
215
20
    register uint8_t* d=dst;
216
20
    register const uint8_t *end;
217
20
    const uint8_t *mm_end;
218
20
    end = s + src_size;
219
20
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
220
20
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
221
20
    mm_end = end - 15;
222
957
    while (s<mm_end) {
223
937
        __asm__ volatile(
224
937
            PREFETCH" 32(%1)        \n\t"
225
937
            "movq      (%1), %%mm0  \n\t"
226
937
            "movq     8(%1), %%mm2  \n\t"
227
937
            "movq     %%mm0, %%mm1  \n\t"
228
937
            "movq     %%mm2, %%mm3  \n\t"
229
937
            "pand     %%mm4, %%mm0  \n\t"
230
937
            "pand     %%mm4, %%mm2  \n\t"
231
937
            "paddw    %%mm1, %%mm0  \n\t"
232
937
            "paddw    %%mm3, %%mm2  \n\t"
233
937
            MOVNTQ"   %%mm0,  (%0)  \n\t"
234
937
            MOVNTQ"   %%mm2, 8(%0)"
235
937
            :: "r"(d), "r"(s)
236
937
        );
237
937
        d+=16;
238
937
        s+=16;
239
937
    }
240
20
    __asm__ volatile(SFENCE:::"memory");
241
20
    __asm__ volatile(EMMS:::"memory");
242
20
    mm_end = end - 3;
243
50
    while (s < mm_end) {
244
30
        register unsigned x= *((const uint32_t *)s);
245
30
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
246
30
        d+=4;
247
30
        s+=4;
248
30
    }
249
20
    if (s < end) {
250
6
        register unsigned short x= *((const uint16_t *)s);
251
6
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
252
6
    }
253
20
}
254
255
static inline void rgb16to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
256
1.38k
{
257
1.38k
    register const uint8_t* s=src;
258
1.38k
    register uint8_t* d=dst;
259
1.38k
    register const uint8_t *end;
260
1.38k
    const uint8_t *mm_end;
261
1.38k
    end = s + src_size;
262
1.38k
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
263
1.38k
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
264
1.38k
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
265
1.38k
    mm_end = end - 15;
266
5.28k
    while (s<mm_end) {
267
3.90k
        __asm__ volatile(
268
3.90k
            PREFETCH" 32(%1)        \n\t"
269
3.90k
            "movq      (%1), %%mm0  \n\t"
270
3.90k
            "movq     8(%1), %%mm2  \n\t"
271
3.90k
            "movq     %%mm0, %%mm1  \n\t"
272
3.90k
            "movq     %%mm2, %%mm3  \n\t"
273
3.90k
            "psrlq       $1, %%mm0  \n\t"
274
3.90k
            "psrlq       $1, %%mm2  \n\t"
275
3.90k
            "pand     %%mm7, %%mm0  \n\t"
276
3.90k
            "pand     %%mm7, %%mm2  \n\t"
277
3.90k
            "pand     %%mm6, %%mm1  \n\t"
278
3.90k
            "pand     %%mm6, %%mm3  \n\t"
279
3.90k
            "por      %%mm1, %%mm0  \n\t"
280
3.90k
            "por      %%mm3, %%mm2  \n\t"
281
3.90k
            MOVNTQ"   %%mm0,  (%0)  \n\t"
282
3.90k
            MOVNTQ"   %%mm2, 8(%0)"
283
3.90k
            :: "r"(d), "r"(s)
284
3.90k
        );
285
3.90k
        d+=16;
286
3.90k
        s+=16;
287
3.90k
    }
288
1.38k
    __asm__ volatile(SFENCE:::"memory");
289
1.38k
    __asm__ volatile(EMMS:::"memory");
290
1.38k
    mm_end = end - 3;
291
1.70k
    while (s < mm_end) {
292
324
        register uint32_t x= *((const uint32_t*)s);
293
324
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
294
324
        s+=4;
295
324
        d+=4;
296
324
    }
297
1.38k
    if (s < end) {
298
470
        register uint16_t x= *((const uint16_t*)s);
299
470
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
300
470
    }
301
1.38k
}
302
303
static inline void rgb32to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
304
10
{
305
10
    const uint8_t *s = src;
306
10
    const uint8_t *end;
307
10
    const uint8_t *mm_end;
308
10
    uint16_t *d = (uint16_t *)dst;
309
10
    end = s + src_size;
310
10
    mm_end = end - 15;
311
10
    __asm__ volatile(
312
10
        "movq           %3, %%mm5   \n\t"
313
10
        "movq           %4, %%mm6   \n\t"
314
10
        "movq           %5, %%mm7   \n\t"
315
10
        "jmp 2f                     \n\t"
316
10
        ".p2align        4          \n\t"
317
10
        "1:                         \n\t"
318
10
        PREFETCH"   32(%1)          \n\t"
319
10
        "movd         (%1), %%mm0   \n\t"
320
10
        "movd        4(%1), %%mm3   \n\t"
321
10
        "punpckldq   8(%1), %%mm0   \n\t"
322
10
        "punpckldq  12(%1), %%mm3   \n\t"
323
10
        "movq        %%mm0, %%mm1   \n\t"
324
10
        "movq        %%mm3, %%mm4   \n\t"
325
10
        "pand        %%mm6, %%mm0   \n\t"
326
10
        "pand        %%mm6, %%mm3   \n\t"
327
10
        "pmaddwd     %%mm7, %%mm0   \n\t"
328
10
        "pmaddwd     %%mm7, %%mm3   \n\t"
329
10
        "pand        %%mm5, %%mm1   \n\t"
330
10
        "pand        %%mm5, %%mm4   \n\t"
331
10
        "por         %%mm1, %%mm0   \n\t"
332
10
        "por         %%mm4, %%mm3   \n\t"
333
10
        "psrld          $5, %%mm0   \n\t"
334
10
        "pslld         $11, %%mm3   \n\t"
335
10
        "por         %%mm3, %%mm0   \n\t"
336
10
        MOVNTQ"      %%mm0, (%0)    \n\t"
337
10
        "add           $16,  %1     \n\t"
338
10
        "add            $8,  %0     \n\t"
339
10
        "2:                         \n\t"
340
10
        "cmp            %2,  %1     \n\t"
341
10
        " jb            1b          \n\t"
342
10
        : "+r" (d), "+r"(s)
343
10
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
344
10
    );
345
10
    __asm__ volatile(SFENCE:::"memory");
346
10
    __asm__ volatile(EMMS:::"memory");
347
22
    while (s < end) {
348
12
        register int rgb = *(const uint32_t*)s; s += 4;
349
12
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
350
12
    }
351
10
}
352
353
static inline void rgb32tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
354
28
{
355
28
    const uint8_t *s = src;
356
28
    const uint8_t *end;
357
28
    const uint8_t *mm_end;
358
28
    uint16_t *d = (uint16_t *)dst;
359
28
    end = s + src_size;
360
28
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
361
28
    __asm__ volatile(
362
28
        "movq          %0, %%mm7    \n\t"
363
28
        "movq          %1, %%mm6    \n\t"
364
28
        ::"m"(red_16mask),"m"(green_16mask));
365
28
    mm_end = end - 15;
366
3.54k
    while (s < mm_end) {
367
3.51k
        __asm__ volatile(
368
3.51k
            PREFETCH"  32(%1)           \n\t"
369
3.51k
            "movd        (%1), %%mm0    \n\t"
370
3.51k
            "movd       4(%1), %%mm3    \n\t"
371
3.51k
            "punpckldq  8(%1), %%mm0    \n\t"
372
3.51k
            "punpckldq 12(%1), %%mm3    \n\t"
373
3.51k
            "movq       %%mm0, %%mm1    \n\t"
374
3.51k
            "movq       %%mm0, %%mm2    \n\t"
375
3.51k
            "movq       %%mm3, %%mm4    \n\t"
376
3.51k
            "movq       %%mm3, %%mm5    \n\t"
377
3.51k
            "psllq         $8, %%mm0    \n\t"
378
3.51k
            "psllq         $8, %%mm3    \n\t"
379
3.51k
            "pand       %%mm7, %%mm0    \n\t"
380
3.51k
            "pand       %%mm7, %%mm3    \n\t"
381
3.51k
            "psrlq         $5, %%mm1    \n\t"
382
3.51k
            "psrlq         $5, %%mm4    \n\t"
383
3.51k
            "pand       %%mm6, %%mm1    \n\t"
384
3.51k
            "pand       %%mm6, %%mm4    \n\t"
385
3.51k
            "psrlq        $19, %%mm2    \n\t"
386
3.51k
            "psrlq        $19, %%mm5    \n\t"
387
3.51k
            "pand          %2, %%mm2    \n\t"
388
3.51k
            "pand          %2, %%mm5    \n\t"
389
3.51k
            "por        %%mm1, %%mm0    \n\t"
390
3.51k
            "por        %%mm4, %%mm3    \n\t"
391
3.51k
            "por        %%mm2, %%mm0    \n\t"
392
3.51k
            "por        %%mm5, %%mm3    \n\t"
393
3.51k
            "psllq        $16, %%mm3    \n\t"
394
3.51k
            "por        %%mm3, %%mm0    \n\t"
395
3.51k
            MOVNTQ"     %%mm0, (%0)     \n\t"
396
3.51k
            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
397
3.51k
        d += 4;
398
3.51k
        s += 16;
399
3.51k
    }
400
28
    __asm__ volatile(SFENCE:::"memory");
401
28
    __asm__ volatile(EMMS:::"memory");
402
58
    while (s < end) {
403
30
        register int rgb = *(const uint32_t*)s; s += 4;
404
30
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
405
30
    }
406
28
}
407
408
static inline void rgb32to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
409
1.70k
{
410
1.70k
    const uint8_t *s = src;
411
1.70k
    const uint8_t *end;
412
1.70k
    const uint8_t *mm_end;
413
1.70k
    uint16_t *d = (uint16_t *)dst;
414
1.70k
    end = s + src_size;
415
1.70k
    mm_end = end - 15;
416
1.70k
    __asm__ volatile(
417
1.70k
        "movq           %3, %%mm5   \n\t"
418
1.70k
        "movq           %4, %%mm6   \n\t"
419
1.70k
        "movq           %5, %%mm7   \n\t"
420
1.70k
        "jmp            2f          \n\t"
421
1.70k
        ".p2align        4          \n\t"
422
1.70k
        "1:                         \n\t"
423
1.70k
        PREFETCH"   32(%1)          \n\t"
424
1.70k
        "movd         (%1), %%mm0   \n\t"
425
1.70k
        "movd        4(%1), %%mm3   \n\t"
426
1.70k
        "punpckldq   8(%1), %%mm0   \n\t"
427
1.70k
        "punpckldq  12(%1), %%mm3   \n\t"
428
1.70k
        "movq        %%mm0, %%mm1   \n\t"
429
1.70k
        "movq        %%mm3, %%mm4   \n\t"
430
1.70k
        "pand        %%mm6, %%mm0   \n\t"
431
1.70k
        "pand        %%mm6, %%mm3   \n\t"
432
1.70k
        "pmaddwd     %%mm7, %%mm0   \n\t"
433
1.70k
        "pmaddwd     %%mm7, %%mm3   \n\t"
434
1.70k
        "pand        %%mm5, %%mm1   \n\t"
435
1.70k
        "pand        %%mm5, %%mm4   \n\t"
436
1.70k
        "por         %%mm1, %%mm0   \n\t"
437
1.70k
        "por         %%mm4, %%mm3   \n\t"
438
1.70k
        "psrld          $6, %%mm0   \n\t"
439
1.70k
        "pslld         $10, %%mm3   \n\t"
440
1.70k
        "por         %%mm3, %%mm0   \n\t"
441
1.70k
        MOVNTQ"      %%mm0, (%0)    \n\t"
442
1.70k
        "add           $16,  %1     \n\t"
443
1.70k
        "add            $8,  %0     \n\t"
444
1.70k
        "2:                         \n\t"
445
1.70k
        "cmp            %2,  %1     \n\t"
446
1.70k
        " jb            1b          \n\t"
447
1.70k
        : "+r" (d), "+r"(s)
448
1.70k
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
449
1.70k
    );
450
1.70k
    __asm__ volatile(SFENCE:::"memory");
451
1.70k
    __asm__ volatile(EMMS:::"memory");
452
3.14k
    while (s < end) {
453
1.43k
        register int rgb = *(const uint32_t*)s; s += 4;
454
1.43k
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
455
1.43k
    }
456
1.70k
}
457
458
static inline void rgb32tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
459
598
{
460
598
    const uint8_t *s = src;
461
598
    const uint8_t *end;
462
598
    const uint8_t *mm_end;
463
598
    uint16_t *d = (uint16_t *)dst;
464
598
    end = s + src_size;
465
598
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
466
598
    __asm__ volatile(
467
598
        "movq          %0, %%mm7    \n\t"
468
598
        "movq          %1, %%mm6    \n\t"
469
598
        ::"m"(red_15mask),"m"(green_15mask));
470
598
    mm_end = end - 15;
471
3.32k
    while (s < mm_end) {
472
2.72k
        __asm__ volatile(
473
2.72k
            PREFETCH"  32(%1)           \n\t"
474
2.72k
            "movd        (%1), %%mm0    \n\t"
475
2.72k
            "movd       4(%1), %%mm3    \n\t"
476
2.72k
            "punpckldq  8(%1), %%mm0    \n\t"
477
2.72k
            "punpckldq 12(%1), %%mm3    \n\t"
478
2.72k
            "movq       %%mm0, %%mm1    \n\t"
479
2.72k
            "movq       %%mm0, %%mm2    \n\t"
480
2.72k
            "movq       %%mm3, %%mm4    \n\t"
481
2.72k
            "movq       %%mm3, %%mm5    \n\t"
482
2.72k
            "psllq         $7, %%mm0    \n\t"
483
2.72k
            "psllq         $7, %%mm3    \n\t"
484
2.72k
            "pand       %%mm7, %%mm0    \n\t"
485
2.72k
            "pand       %%mm7, %%mm3    \n\t"
486
2.72k
            "psrlq         $6, %%mm1    \n\t"
487
2.72k
            "psrlq         $6, %%mm4    \n\t"
488
2.72k
            "pand       %%mm6, %%mm1    \n\t"
489
2.72k
            "pand       %%mm6, %%mm4    \n\t"
490
2.72k
            "psrlq        $19, %%mm2    \n\t"
491
2.72k
            "psrlq        $19, %%mm5    \n\t"
492
2.72k
            "pand          %2, %%mm2    \n\t"
493
2.72k
            "pand          %2, %%mm5    \n\t"
494
2.72k
            "por        %%mm1, %%mm0    \n\t"
495
2.72k
            "por        %%mm4, %%mm3    \n\t"
496
2.72k
            "por        %%mm2, %%mm0    \n\t"
497
2.72k
            "por        %%mm5, %%mm3    \n\t"
498
2.72k
            "psllq        $16, %%mm3    \n\t"
499
2.72k
            "por        %%mm3, %%mm0    \n\t"
500
2.72k
            MOVNTQ"     %%mm0, (%0)     \n\t"
501
2.72k
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
502
2.72k
        d += 4;
503
2.72k
        s += 16;
504
2.72k
    }
505
598
    __asm__ volatile(SFENCE:::"memory");
506
598
    __asm__ volatile(EMMS:::"memory");
507
954
    while (s < end) {
508
356
        register int rgb = *(const uint32_t*)s; s += 4;
509
356
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
510
356
    }
511
598
}
512
513
static inline void rgb24tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
514
130
{
515
130
    const uint8_t *s = src;
516
130
    const uint8_t *end;
517
130
    const uint8_t *mm_end;
518
130
    uint16_t *d = (uint16_t *)dst;
519
130
    end = s + src_size;
520
130
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
521
130
    __asm__ volatile(
522
130
        "movq         %0, %%mm7     \n\t"
523
130
        "movq         %1, %%mm6     \n\t"
524
130
        ::"m"(red_16mask),"m"(green_16mask));
525
130
    mm_end = end - 11;
526
1.91k
    while (s < mm_end) {
527
1.78k
        __asm__ volatile(
528
1.78k
            PREFETCH"  32(%1)           \n\t"
529
1.78k
            "movd        (%1), %%mm0    \n\t"
530
1.78k
            "movd       3(%1), %%mm3    \n\t"
531
1.78k
            "punpckldq  6(%1), %%mm0    \n\t"
532
1.78k
            "punpckldq  9(%1), %%mm3    \n\t"
533
1.78k
            "movq       %%mm0, %%mm1    \n\t"
534
1.78k
            "movq       %%mm0, %%mm2    \n\t"
535
1.78k
            "movq       %%mm3, %%mm4    \n\t"
536
1.78k
            "movq       %%mm3, %%mm5    \n\t"
537
1.78k
            "psrlq         $3, %%mm0    \n\t"
538
1.78k
            "psrlq         $3, %%mm3    \n\t"
539
1.78k
            "pand          %2, %%mm0    \n\t"
540
1.78k
            "pand          %2, %%mm3    \n\t"
541
1.78k
            "psrlq         $5, %%mm1    \n\t"
542
1.78k
            "psrlq         $5, %%mm4    \n\t"
543
1.78k
            "pand       %%mm6, %%mm1    \n\t"
544
1.78k
            "pand       %%mm6, %%mm4    \n\t"
545
1.78k
            "psrlq         $8, %%mm2    \n\t"
546
1.78k
            "psrlq         $8, %%mm5    \n\t"
547
1.78k
            "pand       %%mm7, %%mm2    \n\t"
548
1.78k
            "pand       %%mm7, %%mm5    \n\t"
549
1.78k
            "por        %%mm1, %%mm0    \n\t"
550
1.78k
            "por        %%mm4, %%mm3    \n\t"
551
1.78k
            "por        %%mm2, %%mm0    \n\t"
552
1.78k
            "por        %%mm5, %%mm3    \n\t"
553
1.78k
            "psllq        $16, %%mm3    \n\t"
554
1.78k
            "por        %%mm3, %%mm0    \n\t"
555
1.78k
            MOVNTQ"     %%mm0, (%0)     \n\t"
556
1.78k
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
557
1.78k
        d += 4;
558
1.78k
        s += 12;
559
1.78k
    }
560
130
    __asm__ volatile(SFENCE:::"memory");
561
130
    __asm__ volatile(EMMS:::"memory");
562
478
    while (s < end) {
563
348
        const int b = *s++;
564
348
        const int g = *s++;
565
348
        const int r = *s++;
566
348
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
567
348
    }
568
130
}
569
570
static inline void rgb24to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
571
872
{
572
872
    const uint8_t *s = src;
573
872
    const uint8_t *end;
574
872
    const uint8_t *mm_end;
575
872
    uint16_t *d = (uint16_t *)dst;
576
872
    end = s + src_size;
577
872
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
578
872
    __asm__ volatile(
579
872
        "movq         %0, %%mm7     \n\t"
580
872
        "movq         %1, %%mm6     \n\t"
581
872
        ::"m"(red_16mask),"m"(green_16mask));
582
872
    mm_end = end - 15;
583
6.29k
    while (s < mm_end) {
584
5.42k
        __asm__ volatile(
585
5.42k
            PREFETCH"  32(%1)           \n\t"
586
5.42k
            "movd        (%1), %%mm0    \n\t"
587
5.42k
            "movd       3(%1), %%mm3    \n\t"
588
5.42k
            "punpckldq  6(%1), %%mm0    \n\t"
589
5.42k
            "punpckldq  9(%1), %%mm3    \n\t"
590
5.42k
            "movq       %%mm0, %%mm1    \n\t"
591
5.42k
            "movq       %%mm0, %%mm2    \n\t"
592
5.42k
            "movq       %%mm3, %%mm4    \n\t"
593
5.42k
            "movq       %%mm3, %%mm5    \n\t"
594
5.42k
            "psllq         $8, %%mm0    \n\t"
595
5.42k
            "psllq         $8, %%mm3    \n\t"
596
5.42k
            "pand       %%mm7, %%mm0    \n\t"
597
5.42k
            "pand       %%mm7, %%mm3    \n\t"
598
5.42k
            "psrlq         $5, %%mm1    \n\t"
599
5.42k
            "psrlq         $5, %%mm4    \n\t"
600
5.42k
            "pand       %%mm6, %%mm1    \n\t"
601
5.42k
            "pand       %%mm6, %%mm4    \n\t"
602
5.42k
            "psrlq        $19, %%mm2    \n\t"
603
5.42k
            "psrlq        $19, %%mm5    \n\t"
604
5.42k
            "pand          %2, %%mm2    \n\t"
605
5.42k
            "pand          %2, %%mm5    \n\t"
606
5.42k
            "por        %%mm1, %%mm0    \n\t"
607
5.42k
            "por        %%mm4, %%mm3    \n\t"
608
5.42k
            "por        %%mm2, %%mm0    \n\t"
609
5.42k
            "por        %%mm5, %%mm3    \n\t"
610
5.42k
            "psllq        $16, %%mm3    \n\t"
611
5.42k
            "por        %%mm3, %%mm0    \n\t"
612
5.42k
            MOVNTQ"     %%mm0, (%0)     \n\t"
613
5.42k
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
614
5.42k
        d += 4;
615
5.42k
        s += 12;
616
5.42k
    }
617
872
    __asm__ volatile(SFENCE:::"memory");
618
872
    __asm__ volatile(EMMS:::"memory");
619
3.45k
    while (s < end) {
620
2.57k
        const int r = *s++;
621
2.57k
        const int g = *s++;
622
2.57k
        const int b = *s++;
623
2.57k
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
624
2.57k
    }
625
872
}
626
627
static inline void rgb24tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
628
1.00k
{
629
1.00k
    const uint8_t *s = src;
630
1.00k
    const uint8_t *end;
631
1.00k
    const uint8_t *mm_end;
632
1.00k
    uint16_t *d = (uint16_t *)dst;
633
1.00k
    end = s + src_size;
634
1.00k
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
635
1.00k
    __asm__ volatile(
636
1.00k
        "movq          %0, %%mm7    \n\t"
637
1.00k
        "movq          %1, %%mm6    \n\t"
638
1.00k
        ::"m"(red_15mask),"m"(green_15mask));
639
1.00k
    mm_end = end - 11;
640
2.20k
    while (s < mm_end) {
641
1.19k
        __asm__ volatile(
642
1.19k
            PREFETCH"  32(%1)           \n\t"
643
1.19k
            "movd        (%1), %%mm0    \n\t"
644
1.19k
            "movd       3(%1), %%mm3    \n\t"
645
1.19k
            "punpckldq  6(%1), %%mm0    \n\t"
646
1.19k
            "punpckldq  9(%1), %%mm3    \n\t"
647
1.19k
            "movq       %%mm0, %%mm1    \n\t"
648
1.19k
            "movq       %%mm0, %%mm2    \n\t"
649
1.19k
            "movq       %%mm3, %%mm4    \n\t"
650
1.19k
            "movq       %%mm3, %%mm5    \n\t"
651
1.19k
            "psrlq         $3, %%mm0    \n\t"
652
1.19k
            "psrlq         $3, %%mm3    \n\t"
653
1.19k
            "pand          %2, %%mm0    \n\t"
654
1.19k
            "pand          %2, %%mm3    \n\t"
655
1.19k
            "psrlq         $6, %%mm1    \n\t"
656
1.19k
            "psrlq         $6, %%mm4    \n\t"
657
1.19k
            "pand       %%mm6, %%mm1    \n\t"
658
1.19k
            "pand       %%mm6, %%mm4    \n\t"
659
1.19k
            "psrlq         $9, %%mm2    \n\t"
660
1.19k
            "psrlq         $9, %%mm5    \n\t"
661
1.19k
            "pand       %%mm7, %%mm2    \n\t"
662
1.19k
            "pand       %%mm7, %%mm5    \n\t"
663
1.19k
            "por        %%mm1, %%mm0    \n\t"
664
1.19k
            "por        %%mm4, %%mm3    \n\t"
665
1.19k
            "por        %%mm2, %%mm0    \n\t"
666
1.19k
            "por        %%mm5, %%mm3    \n\t"
667
1.19k
            "psllq        $16, %%mm3    \n\t"
668
1.19k
            "por        %%mm3, %%mm0    \n\t"
669
1.19k
            MOVNTQ"     %%mm0, (%0)     \n\t"
670
1.19k
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
671
1.19k
        d += 4;
672
1.19k
        s += 12;
673
1.19k
    }
674
1.00k
    __asm__ volatile(SFENCE:::"memory");
675
1.00k
    __asm__ volatile(EMMS:::"memory");
676
3.66k
    while (s < end) {
677
2.65k
        const int b = *s++;
678
2.65k
        const int g = *s++;
679
2.65k
        const int r = *s++;
680
2.65k
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
681
2.65k
    }
682
1.00k
}
683
684
static inline void rgb24to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
685
406
{
686
406
    const uint8_t *s = src;
687
406
    const uint8_t *end;
688
406
    const uint8_t *mm_end;
689
406
    uint16_t *d = (uint16_t *)dst;
690
406
    end = s + src_size;
691
406
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
692
406
    __asm__ volatile(
693
406
        "movq         %0, %%mm7     \n\t"
694
406
        "movq         %1, %%mm6     \n\t"
695
406
        ::"m"(red_15mask),"m"(green_15mask));
696
406
    mm_end = end - 15;
697
3.30k
    while (s < mm_end) {
698
2.90k
        __asm__ volatile(
699
2.90k
            PREFETCH" 32(%1)            \n\t"
700
2.90k
            "movd       (%1), %%mm0     \n\t"
701
2.90k
            "movd      3(%1), %%mm3     \n\t"
702
2.90k
            "punpckldq 6(%1), %%mm0     \n\t"
703
2.90k
            "punpckldq 9(%1), %%mm3     \n\t"
704
2.90k
            "movq      %%mm0, %%mm1     \n\t"
705
2.90k
            "movq      %%mm0, %%mm2     \n\t"
706
2.90k
            "movq      %%mm3, %%mm4     \n\t"
707
2.90k
            "movq      %%mm3, %%mm5     \n\t"
708
2.90k
            "psllq        $7, %%mm0     \n\t"
709
2.90k
            "psllq        $7, %%mm3     \n\t"
710
2.90k
            "pand      %%mm7, %%mm0     \n\t"
711
2.90k
            "pand      %%mm7, %%mm3     \n\t"
712
2.90k
            "psrlq        $6, %%mm1     \n\t"
713
2.90k
            "psrlq        $6, %%mm4     \n\t"
714
2.90k
            "pand      %%mm6, %%mm1     \n\t"
715
2.90k
            "pand      %%mm6, %%mm4     \n\t"
716
2.90k
            "psrlq       $19, %%mm2     \n\t"
717
2.90k
            "psrlq       $19, %%mm5     \n\t"
718
2.90k
            "pand         %2, %%mm2     \n\t"
719
2.90k
            "pand         %2, %%mm5     \n\t"
720
2.90k
            "por       %%mm1, %%mm0     \n\t"
721
2.90k
            "por       %%mm4, %%mm3     \n\t"
722
2.90k
            "por       %%mm2, %%mm0     \n\t"
723
2.90k
            "por       %%mm5, %%mm3     \n\t"
724
2.90k
            "psllq       $16, %%mm3     \n\t"
725
2.90k
            "por       %%mm3, %%mm0     \n\t"
726
2.90k
            MOVNTQ"    %%mm0, (%0)      \n\t"
727
2.90k
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
728
2.90k
        d += 4;
729
2.90k
        s += 12;
730
2.90k
    }
731
406
    __asm__ volatile(SFENCE:::"memory");
732
406
    __asm__ volatile(EMMS:::"memory");
733
1.33k
    while (s < end) {
734
928
        const int r = *s++;
735
928
        const int g = *s++;
736
928
        const int b = *s++;
737
928
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
738
928
    }
739
406
}
740
741
static inline void rgb15tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
742
88
{
743
88
    const uint16_t *end;
744
88
    const uint16_t *mm_end;
745
88
    uint8_t *d = dst;
746
88
    const uint16_t *s = (const uint16_t*)src;
747
88
    end = s + src_size/2;
748
88
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
749
88
    mm_end = end - 7;
750
5.06k
    while (s < mm_end) {
751
4.97k
        __asm__ volatile(
752
4.97k
            PREFETCH"  32(%1)           \n\t"
753
4.97k
            "movq        (%1), %%mm0    \n\t"
754
4.97k
            "movq        (%1), %%mm1    \n\t"
755
4.97k
            "movq        (%1), %%mm2    \n\t"
756
4.97k
            "pand          %2, %%mm0    \n\t"
757
4.97k
            "pand          %3, %%mm1    \n\t"
758
4.97k
            "pand          %4, %%mm2    \n\t"
759
4.97k
            "psllq         $5, %%mm0    \n\t"
760
4.97k
            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
761
4.97k
            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
762
4.97k
            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
763
4.97k
            "movq       %%mm0, %%mm3    \n\t"
764
4.97k
            "movq       %%mm1, %%mm4    \n\t"
765
4.97k
            "movq       %%mm2, %%mm5    \n\t"
766
4.97k
            "punpcklwd     %5, %%mm0    \n\t"
767
4.97k
            "punpcklwd     %5, %%mm1    \n\t"
768
4.97k
            "punpcklwd     %5, %%mm2    \n\t"
769
4.97k
            "punpckhwd     %5, %%mm3    \n\t"
770
4.97k
            "punpckhwd     %5, %%mm4    \n\t"
771
4.97k
            "punpckhwd     %5, %%mm5    \n\t"
772
4.97k
            "psllq         $8, %%mm1    \n\t"
773
4.97k
            "psllq        $16, %%mm2    \n\t"
774
4.97k
            "por        %%mm1, %%mm0    \n\t"
775
4.97k
            "por        %%mm2, %%mm0    \n\t"
776
4.97k
            "psllq         $8, %%mm4    \n\t"
777
4.97k
            "psllq        $16, %%mm5    \n\t"
778
4.97k
            "por        %%mm4, %%mm3    \n\t"
779
4.97k
            "por        %%mm5, %%mm3    \n\t"
780
781
4.97k
            "movq       %%mm0, %%mm6    \n\t"
782
4.97k
            "movq       %%mm3, %%mm7    \n\t"
783
784
4.97k
            "movq       8(%1), %%mm0    \n\t"
785
4.97k
            "movq       8(%1), %%mm1    \n\t"
786
4.97k
            "movq       8(%1), %%mm2    \n\t"
787
4.97k
            "pand          %2, %%mm0    \n\t"
788
4.97k
            "pand          %3, %%mm1    \n\t"
789
4.97k
            "pand          %4, %%mm2    \n\t"
790
4.97k
            "psllq         $5, %%mm0    \n\t"
791
4.97k
            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
792
4.97k
            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
793
4.97k
            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
794
4.97k
            "movq       %%mm0, %%mm3    \n\t"
795
4.97k
            "movq       %%mm1, %%mm4    \n\t"
796
4.97k
            "movq       %%mm2, %%mm5    \n\t"
797
4.97k
            "punpcklwd     %5, %%mm0    \n\t"
798
4.97k
            "punpcklwd     %5, %%mm1    \n\t"
799
4.97k
            "punpcklwd     %5, %%mm2    \n\t"
800
4.97k
            "punpckhwd     %5, %%mm3    \n\t"
801
4.97k
            "punpckhwd     %5, %%mm4    \n\t"
802
4.97k
            "punpckhwd     %5, %%mm5    \n\t"
803
4.97k
            "psllq         $8, %%mm1    \n\t"
804
4.97k
            "psllq        $16, %%mm2    \n\t"
805
4.97k
            "por        %%mm1, %%mm0    \n\t"
806
4.97k
            "por        %%mm2, %%mm0    \n\t"
807
4.97k
            "psllq         $8, %%mm4    \n\t"
808
4.97k
            "psllq        $16, %%mm5    \n\t"
809
4.97k
            "por        %%mm4, %%mm3    \n\t"
810
4.97k
            "por        %%mm5, %%mm3    \n\t"
811
812
4.97k
            :"=m"(*d)
813
4.97k
            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
814
4.97k
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
815
4.97k
            :"memory");
816
        /* borrowed 32 to 24 */
817
4.97k
        __asm__ volatile(
818
4.97k
            "movq       %%mm0, %%mm4    \n\t"
819
4.97k
            "movq       %%mm3, %%mm5    \n\t"
820
4.97k
            "movq       %%mm6, %%mm0    \n\t"
821
4.97k
            "movq       %%mm7, %%mm1    \n\t"
822
823
4.97k
            "movq       %%mm4, %%mm6    \n\t"
824
4.97k
            "movq       %%mm5, %%mm7    \n\t"
825
4.97k
            "movq       %%mm0, %%mm2    \n\t"
826
4.97k
            "movq       %%mm1, %%mm3    \n\t"
827
828
4.97k
            STORE_BGR24_MMX
829
830
4.97k
            :: "r"(d), "m"(*s)
831
4.97k
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
832
4.97k
            :"memory");
833
4.97k
        d += 24;
834
4.97k
        s += 8;
835
4.97k
    }
836
88
    __asm__ volatile(SFENCE:::"memory");
837
88
    __asm__ volatile(EMMS:::"memory");
838
507
    while (s < end) {
839
419
        register uint16_t bgr;
840
419
        bgr = *s++;
841
419
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
842
419
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
843
419
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
844
419
    }
845
88
}
846
847
static inline void rgb16tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
848
72
{
849
72
    const uint16_t *end;
850
72
    const uint16_t *mm_end;
851
72
    uint8_t *d = (uint8_t *)dst;
852
72
    const uint16_t *s = (const uint16_t *)src;
853
72
    end = s + src_size/2;
854
72
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
855
72
    mm_end = end - 7;
856
2.37k
    while (s < mm_end) {
857
2.30k
        __asm__ volatile(
858
2.30k
            PREFETCH"  32(%1)           \n\t"
859
2.30k
            "movq        (%1), %%mm0    \n\t"
860
2.30k
            "movq        (%1), %%mm1    \n\t"
861
2.30k
            "movq        (%1), %%mm2    \n\t"
862
2.30k
            "pand          %2, %%mm0    \n\t"
863
2.30k
            "pand          %3, %%mm1    \n\t"
864
2.30k
            "pand          %4, %%mm2    \n\t"
865
2.30k
            "psllq         $5, %%mm0    \n\t"
866
2.30k
            "psrlq         $1, %%mm2    \n\t"
867
2.30k
            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
868
2.30k
            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
869
2.30k
            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
870
2.30k
            "movq       %%mm0, %%mm3    \n\t"
871
2.30k
            "movq       %%mm1, %%mm4    \n\t"
872
2.30k
            "movq       %%mm2, %%mm5    \n\t"
873
2.30k
            "punpcklwd     %5, %%mm0    \n\t"
874
2.30k
            "punpcklwd     %5, %%mm1    \n\t"
875
2.30k
            "punpcklwd     %5, %%mm2    \n\t"
876
2.30k
            "punpckhwd     %5, %%mm3    \n\t"
877
2.30k
            "punpckhwd     %5, %%mm4    \n\t"
878
2.30k
            "punpckhwd     %5, %%mm5    \n\t"
879
2.30k
            "psllq         $8, %%mm1    \n\t"
880
2.30k
            "psllq        $16, %%mm2    \n\t"
881
2.30k
            "por        %%mm1, %%mm0    \n\t"
882
2.30k
            "por        %%mm2, %%mm0    \n\t"
883
2.30k
            "psllq         $8, %%mm4    \n\t"
884
2.30k
            "psllq        $16, %%mm5    \n\t"
885
2.30k
            "por        %%mm4, %%mm3    \n\t"
886
2.30k
            "por        %%mm5, %%mm3    \n\t"
887
888
2.30k
            "movq       %%mm0, %%mm6    \n\t"
889
2.30k
            "movq       %%mm3, %%mm7    \n\t"
890
891
2.30k
            "movq       8(%1), %%mm0    \n\t"
892
2.30k
            "movq       8(%1), %%mm1    \n\t"
893
2.30k
            "movq       8(%1), %%mm2    \n\t"
894
2.30k
            "pand          %2, %%mm0    \n\t"
895
2.30k
            "pand          %3, %%mm1    \n\t"
896
2.30k
            "pand          %4, %%mm2    \n\t"
897
2.30k
            "psllq         $5, %%mm0    \n\t"
898
2.30k
            "psrlq         $1, %%mm2    \n\t"
899
2.30k
            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
900
2.30k
            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
901
2.30k
            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
902
2.30k
            "movq       %%mm0, %%mm3    \n\t"
903
2.30k
            "movq       %%mm1, %%mm4    \n\t"
904
2.30k
            "movq       %%mm2, %%mm5    \n\t"
905
2.30k
            "punpcklwd     %5, %%mm0    \n\t"
906
2.30k
            "punpcklwd     %5, %%mm1    \n\t"
907
2.30k
            "punpcklwd     %5, %%mm2    \n\t"
908
2.30k
            "punpckhwd     %5, %%mm3    \n\t"
909
2.30k
            "punpckhwd     %5, %%mm4    \n\t"
910
2.30k
            "punpckhwd     %5, %%mm5    \n\t"
911
2.30k
            "psllq         $8, %%mm1    \n\t"
912
2.30k
            "psllq        $16, %%mm2    \n\t"
913
2.30k
            "por        %%mm1, %%mm0    \n\t"
914
2.30k
            "por        %%mm2, %%mm0    \n\t"
915
2.30k
            "psllq         $8, %%mm4    \n\t"
916
2.30k
            "psllq        $16, %%mm5    \n\t"
917
2.30k
            "por        %%mm4, %%mm3    \n\t"
918
2.30k
            "por        %%mm5, %%mm3    \n\t"
919
2.30k
            :"=m"(*d)
920
2.30k
            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
921
2.30k
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
922
2.30k
            :"memory");
923
        /* borrowed 32 to 24 */
924
2.30k
        __asm__ volatile(
925
2.30k
            "movq       %%mm0, %%mm4    \n\t"
926
2.30k
            "movq       %%mm3, %%mm5    \n\t"
927
2.30k
            "movq       %%mm6, %%mm0    \n\t"
928
2.30k
            "movq       %%mm7, %%mm1    \n\t"
929
930
2.30k
            "movq       %%mm4, %%mm6    \n\t"
931
2.30k
            "movq       %%mm5, %%mm7    \n\t"
932
2.30k
            "movq       %%mm0, %%mm2    \n\t"
933
2.30k
            "movq       %%mm1, %%mm3    \n\t"
934
935
2.30k
            STORE_BGR24_MMX
936
937
2.30k
            :: "r"(d), "m"(*s)
938
2.30k
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
939
2.30k
            :"memory");
940
2.30k
        d += 24;
941
2.30k
        s += 8;
942
2.30k
    }
943
72
    __asm__ volatile(SFENCE:::"memory");
944
72
    __asm__ volatile(EMMS:::"memory");
945
164
    while (s < end) {
946
92
        register uint16_t bgr;
947
92
        bgr = *s++;
948
92
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
949
92
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
950
92
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
951
92
    }
952
72
}
953
954
/*
955
 * mm0 = 00 B3 00 B2 00 B1 00 B0
956
 * mm1 = 00 G3 00 G2 00 G1 00 G0
957
 * mm2 = 00 R3 00 R2 00 R1 00 R0
958
 * mm6 = FF FF FF FF FF FF FF FF
959
 * mm7 = 00 00 00 00 00 00 00 00
960
 */
961
#define PACK_RGB32 \
962
    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
963
    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
964
    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
965
    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
966
    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
967
    "movq       %%mm0, %%mm3    \n\t"                               \
968
    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
969
    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
970
    MOVNTQ"     %%mm0,  (%0)    \n\t"                               \
971
    MOVNTQ"     %%mm3, 8(%0)    \n\t"                               \
972
973
static inline void rgb15to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
974
275
{
975
275
    const uint16_t *end;
976
275
    const uint16_t *mm_end;
977
275
    uint8_t *d = dst;
978
275
    const uint16_t *s = (const uint16_t *)src;
979
275
    end = s + src_size/2;
980
275
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
981
275
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
982
275
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
983
275
    mm_end = end - 3;
984
1.62k
    while (s < mm_end) {
985
1.35k
        __asm__ volatile(
986
1.35k
            PREFETCH"  32(%1)           \n\t"
987
1.35k
            "movq        (%1), %%mm0    \n\t"
988
1.35k
            "movq        (%1), %%mm1    \n\t"
989
1.35k
            "movq        (%1), %%mm2    \n\t"
990
1.35k
            "pand          %2, %%mm0    \n\t"
991
1.35k
            "pand          %3, %%mm1    \n\t"
992
1.35k
            "pand          %4, %%mm2    \n\t"
993
1.35k
            "psllq         $5, %%mm0    \n\t"
994
1.35k
            "pmulhw        %5, %%mm0    \n\t"
995
1.35k
            "pmulhw        %5, %%mm1    \n\t"
996
1.35k
            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
997
1.35k
            PACK_RGB32
998
1.35k
            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
999
1.35k
              NAMED_CONSTRAINTS_ADD(mul15_hi)
1000
1.35k
            :"memory");
1001
1.35k
        d += 16;
1002
1.35k
        s += 4;
1003
1.35k
    }
1004
275
    __asm__ volatile(SFENCE:::"memory");
1005
275
    __asm__ volatile(EMMS:::"memory");
1006
281
    while (s < end) {
1007
6
        register uint16_t bgr;
1008
6
        bgr = *s++;
1009
6
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
1010
6
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
1011
6
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
1012
6
        *d++ = 255;
1013
6
    }
1014
275
}
1015
1016
static inline void rgb16to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
1017
292
{
1018
292
    const uint16_t *end;
1019
292
    const uint16_t *mm_end;
1020
292
    uint8_t *d = dst;
1021
292
    const uint16_t *s = (const uint16_t*)src;
1022
292
    end = s + src_size/2;
1023
292
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
1024
292
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
1025
292
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
1026
292
    mm_end = end - 3;
1027
11.5k
    while (s < mm_end) {
1028
11.2k
        __asm__ volatile(
1029
11.2k
            PREFETCH"  32(%1)           \n\t"
1030
11.2k
            "movq        (%1), %%mm0    \n\t"
1031
11.2k
            "movq        (%1), %%mm1    \n\t"
1032
11.2k
            "movq        (%1), %%mm2    \n\t"
1033
11.2k
            "pand          %2, %%mm0    \n\t"
1034
11.2k
            "pand          %3, %%mm1    \n\t"
1035
11.2k
            "pand          %4, %%mm2    \n\t"
1036
11.2k
            "psllq         $5, %%mm0    \n\t"
1037
11.2k
            "psrlq         $1, %%mm2    \n\t"
1038
11.2k
            "pmulhw        %5, %%mm0    \n\t"
1039
11.2k
            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
1040
11.2k
            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
1041
11.2k
            PACK_RGB32
1042
11.2k
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
1043
11.2k
              NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
1044
11.2k
            :"memory");
1045
11.2k
        d += 16;
1046
11.2k
        s += 4;
1047
11.2k
    }
1048
292
    __asm__ volatile(SFENCE:::"memory");
1049
292
    __asm__ volatile(EMMS:::"memory");
1050
991
    while (s < end) {
1051
699
        register uint16_t bgr;
1052
699
        bgr = *s++;
1053
699
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
1054
699
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
1055
699
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
1056
699
        *d++ = 255;
1057
699
    }
1058
292
}
1059
1060
static inline void rgb24tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
1061
871
{
1062
871
    x86_reg mmx_size= 23 - src_size;
1063
871
    __asm__ volatile (
1064
871
        "test             %%"FF_REG_a", %%"FF_REG_a"    \n\t"
1065
871
        "jns                     2f                     \n\t"
1066
871
        "movq     "MANGLE(mask24r)", %%mm5              \n\t"
1067
871
        "movq     "MANGLE(mask24g)", %%mm6              \n\t"
1068
871
        "movq     "MANGLE(mask24b)", %%mm7              \n\t"
1069
871
        ".p2align                 4                     \n\t"
1070
871
        "1:                                             \n\t"
1071
871
        PREFETCH" 32(%1, %%"FF_REG_a")                  \n\t"
1072
871
        "movq    (%1, %%"FF_REG_a"), %%mm0              \n\t" // BGR BGR BG
1073
871
        "movq    (%1, %%"FF_REG_a"), %%mm1              \n\t" // BGR BGR BG
1074
871
        "movq   2(%1, %%"FF_REG_a"), %%mm2              \n\t" // R BGR BGR B
1075
871
        "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
1076
871
        "pand                 %%mm5, %%mm0              \n\t"
1077
871
        "pand                 %%mm6, %%mm1              \n\t"
1078
871
        "pand                 %%mm7, %%mm2              \n\t"
1079
871
        "por                  %%mm0, %%mm1              \n\t"
1080
871
        "por                  %%mm2, %%mm1              \n\t"
1081
871
        "movq   6(%1, %%"FF_REG_a"), %%mm0              \n\t" // BGR BGR BG
1082
871
        MOVNTQ"               %%mm1,(%2, %%"FF_REG_a")  \n\t" // RGB RGB RG
1083
871
        "movq   8(%1, %%"FF_REG_a"), %%mm1              \n\t" // R BGR BGR B
1084
871
        "movq  10(%1, %%"FF_REG_a"), %%mm2              \n\t" // GR BGR BGR
1085
871
        "pand                 %%mm7, %%mm0              \n\t"
1086
871
        "pand                 %%mm5, %%mm1              \n\t"
1087
871
        "pand                 %%mm6, %%mm2              \n\t"
1088
871
        "por                  %%mm0, %%mm1              \n\t"
1089
871
        "por                  %%mm2, %%mm1              \n\t"
1090
871
        "movq  14(%1, %%"FF_REG_a"), %%mm0              \n\t" // R BGR BGR B
1091
871
        MOVNTQ"               %%mm1, 8(%2, %%"FF_REG_a")\n\t" // B RGB RGB R
1092
871
        "movq  16(%1, %%"FF_REG_a"), %%mm1              \n\t" // GR BGR BGR
1093
871
        "movq  18(%1, %%"FF_REG_a"), %%mm2              \n\t" // BGR BGR BG
1094
871
        "pand                 %%mm6, %%mm0              \n\t"
1095
871
        "pand                 %%mm7, %%mm1              \n\t"
1096
871
        "pand                 %%mm5, %%mm2              \n\t"
1097
871
        "por                  %%mm0, %%mm1              \n\t"
1098
871
        "por                  %%mm2, %%mm1              \n\t"
1099
871
        MOVNTQ"               %%mm1, 16(%2, %%"FF_REG_a") \n\t"
1100
871
        "add                    $24, %%"FF_REG_a"       \n\t"
1101
871
        " js                     1b                     \n\t"
1102
871
        "2:                                             \n\t"
1103
871
        : "+a" (mmx_size)
1104
871
        : "r" (src-mmx_size), "r"(dst-mmx_size)
1105
871
          NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
1106
871
    );
1107
1108
871
    __asm__ volatile(SFENCE:::"memory");
1109
871
    __asm__ volatile(EMMS:::"memory");
1110
1111
871
    if (mmx_size==23) return; //finished, was multiple of 8
1112
1113
869
    src+= src_size;
1114
869
    dst+= src_size;
1115
869
    src_size= 23-mmx_size;
1116
869
    src-= src_size;
1117
869
    dst-= src_size;
1118
4.86k
    for (unsigned i = 0; i < src_size; i +=3) {
1119
3.99k
        register uint8_t x;
1120
3.99k
        x          = src[i + 2];
1121
3.99k
        dst[i + 1] = src[i + 1];
1122
3.99k
        dst[i + 2] = src[i + 0];
1123
3.99k
        dst[i + 0] = x;
1124
3.99k
    }
1125
869
}
1126
1127
static inline void yuvPlanartoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1128
                                           int width, int height,
1129
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1130
13
{
1131
13
    const x86_reg chromWidth= width>>1;
1132
1.11k
    for (int y = 0; y < height; y++) {
1133
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1134
1.09k
        __asm__ volatile(
1135
1.09k
            "xor                 %%"FF_REG_a", %%"FF_REG_a" \n\t"
1136
1.09k
            ".p2align                    4              \n\t"
1137
1.09k
            "1:                                         \n\t"
1138
1.09k
            PREFETCH" 32(%1, %%"FF_REG_a", 2)           \n\t"
1139
1.09k
            PREFETCH" 32(%2, %%"FF_REG_a")              \n\t"
1140
1.09k
            PREFETCH" 32(%3, %%"FF_REG_a")              \n\t"
1141
1.09k
            "movq       (%2, %%"FF_REG_a"), %%mm0       \n\t" // U(0)
1142
1.09k
            "movq                    %%mm0, %%mm2       \n\t" // U(0)
1143
1.09k
            "movq       (%3, %%"FF_REG_a"), %%mm1       \n\t" // V(0)
1144
1.09k
            "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1145
1.09k
            "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)
1146
1147
1.09k
            "movq     (%1, %%"FF_REG_a",2), %%mm3       \n\t" // Y(0)
1148
1.09k
            "movq    8(%1, %%"FF_REG_a",2), %%mm5       \n\t" // Y(8)
1149
1.09k
            "movq                    %%mm3, %%mm4       \n\t" // Y(0)
1150
1.09k
            "movq                    %%mm5, %%mm6       \n\t" // Y(8)
1151
1.09k
            "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
1152
1.09k
            "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
1153
1.09k
            "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
1154
1.09k
            "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)
1155
1156
1.09k
            MOVNTQ"                  %%mm3,   (%0, %%"FF_REG_a", 4)    \n\t"
1157
1.09k
            MOVNTQ"                  %%mm4,  8(%0, %%"FF_REG_a", 4)    \n\t"
1158
1.09k
            MOVNTQ"                  %%mm5, 16(%0, %%"FF_REG_a", 4)    \n\t"
1159
1.09k
            MOVNTQ"                  %%mm6, 24(%0, %%"FF_REG_a", 4)    \n\t"
1160
1161
1.09k
            "add                        $8, %%"FF_REG_a" \n\t"
1162
1.09k
            "cmp                        %4, %%"FF_REG_a" \n\t"
1163
1.09k
            " jb                        1b               \n\t"
1164
1.09k
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1165
1.09k
            : "%"FF_REG_a
1166
1.09k
        );
1167
1.09k
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1168
808
            usrc += chromStride;
1169
808
            vsrc += chromStride;
1170
808
        }
1171
1.09k
        ysrc += lumStride;
1172
1.09k
        dst  += dstStride;
1173
1.09k
    }
1174
13
    __asm__(EMMS"       \n\t"
1175
13
            SFENCE"     \n\t"
1176
13
            :::"memory");
1177
13
}
1178
1179
/**
1180
 * Height should be a multiple of 2 and width should be a multiple of 16.
1181
 * (If this is a problem for anyone then tell me, and I will fix it.)
1182
 */
1183
static inline void yv12toyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1184
                                      int width, int height,
1185
                                      int lumStride, int chromStride, int dstStride)
1186
7
{
1187
    //FIXME interpolate chroma
1188
7
    yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1189
7
}
1190
1191
static inline void yuvPlanartouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1192
                                           int width, int height,
1193
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1194
8
{
1195
8
    const x86_reg chromWidth= width>>1;
1196
206
    for (int y = 0; y < height; y++) {
1197
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1198
198
        __asm__ volatile(
1199
198
            "xor             %%"FF_REG_a", %%"FF_REG_a" \n\t"
1200
198
            ".p2align                   4               \n\t"
1201
198
            "1:                                         \n\t"
1202
198
            PREFETCH" 32(%1, %%"FF_REG_a", 2)           \n\t"
1203
198
            PREFETCH" 32(%2, %%"FF_REG_a")              \n\t"
1204
198
            PREFETCH" 32(%3, %%"FF_REG_a")              \n\t"
1205
198
            "movq      (%2, %%"FF_REG_a"), %%mm0        \n\t" // U(0)
1206
198
            "movq                   %%mm0, %%mm2        \n\t" // U(0)
1207
198
            "movq      (%3, %%"FF_REG_a"), %%mm1        \n\t" // V(0)
1208
198
            "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
1209
198
            "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)
1210
1211
198
            "movq    (%1, %%"FF_REG_a",2), %%mm3        \n\t" // Y(0)
1212
198
            "movq   8(%1, %%"FF_REG_a",2), %%mm5        \n\t" // Y(8)
1213
198
            "movq                   %%mm0, %%mm4        \n\t" // Y(0)
1214
198
            "movq                   %%mm2, %%mm6        \n\t" // Y(8)
1215
198
            "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
1216
198
            "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
1217
198
            "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
1218
198
            "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)
1219
1220
198
            MOVNTQ"                 %%mm0,   (%0, %%"FF_REG_a", 4)     \n\t"
1221
198
            MOVNTQ"                 %%mm4,  8(%0, %%"FF_REG_a", 4)     \n\t"
1222
198
            MOVNTQ"                 %%mm2, 16(%0, %%"FF_REG_a", 4)     \n\t"
1223
198
            MOVNTQ"                 %%mm6, 24(%0, %%"FF_REG_a", 4)     \n\t"
1224
1225
198
            "add                       $8, %%"FF_REG_a" \n\t"
1226
198
            "cmp                       %4, %%"FF_REG_a" \n\t"
1227
198
            " jb                       1b               \n\t"
1228
198
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1229
198
            : "%"FF_REG_a
1230
198
        );
1231
198
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1232
114
            usrc += chromStride;
1233
114
            vsrc += chromStride;
1234
114
        }
1235
198
        ysrc += lumStride;
1236
198
        dst += dstStride;
1237
198
    }
1238
8
    __asm__(EMMS"       \n\t"
1239
8
            SFENCE"     \n\t"
1240
8
            :::"memory");
1241
8
}
1242
1243
/**
1244
 * Height should be a multiple of 2 and width should be a multiple of 16
1245
 * (If this is a problem for anyone then tell me, and I will fix it.)
1246
 */
1247
static inline void yv12touyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1248
                                      int width, int height,
1249
                                      int lumStride, int chromStride, int dstStride)
1250
5
{
1251
    //FIXME interpolate chroma
1252
5
    yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1253
5
}
1254
1255
/**
1256
 * Width should be a multiple of 16.
1257
 */
1258
static inline void yuv422ptouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1259
                                         int width, int height,
1260
                                         int lumStride, int chromStride, int dstStride)
1261
3
{
1262
3
    yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1263
3
}
1264
1265
/**
1266
 * Width should be a multiple of 16.
1267
 */
1268
static inline void yuv422ptoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1269
                                         int width, int height,
1270
                                         int lumStride, int chromStride, int dstStride)
1271
6
{
1272
6
    yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1273
6
}
1274
1275
/**
1276
 * Height should be a multiple of 2 and width should be a multiple of 16.
1277
 * (If this is a problem for anyone then tell me, and I will fix it.)
1278
 */
1279
static inline void yuy2toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1280
                                      int width, int height,
1281
                                      int lumStride, int chromStride, int srcStride)
1282
0
{
1283
0
    const x86_reg chromWidth= width>>1;
1284
0
    for (int y = 0; y < height; y += 2) {
1285
0
        __asm__ volatile(
1286
0
            "xor              %%"FF_REG_a", %%"FF_REG_a"\n\t"
1287
0
            "pcmpeqw                 %%mm7, %%mm7       \n\t"
1288
0
            "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
1289
0
            ".p2align                    4              \n\t"
1290
0
            "1:                \n\t"
1291
0
            PREFETCH" 64(%0, %%"FF_REG_a", 4)           \n\t"
1292
0
            "movq    (%0, %%"FF_REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1293
0
            "movq   8(%0, %%"FF_REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1294
0
            "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
1295
0
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
1296
0
            "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
1297
0
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
1298
0
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
1299
0
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
1300
0
            "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1301
0
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)
1302
1303
0
            MOVNTQ"                  %%mm2, (%1, %%"FF_REG_a", 2) \n\t"
1304
1305
0
            "movq  16(%0, %%"FF_REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
1306
0
            "movq  24(%0, %%"FF_REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
1307
0
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
1308
0
            "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
1309
0
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
1310
0
            "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
1311
0
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
1312
0
            "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
1313
0
            "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
1314
0
            "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)
1315
1316
0
            MOVNTQ"                  %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t"
1317
1318
0
            "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
1319
0
            "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
1320
0
            "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
1321
0
            "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
1322
0
            "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
1323
0
            "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
1324
0
            "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
1325
0
            "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)
1326
1327
0
            MOVNTQ"                  %%mm0, (%3, %%"FF_REG_a")     \n\t"
1328
0
            MOVNTQ"                  %%mm2, (%2, %%"FF_REG_a")     \n\t"
1329
1330
0
            "add                        $8, %%"FF_REG_a" \n\t"
1331
0
            "cmp                        %4, %%"FF_REG_a" \n\t"
1332
0
            " jb                        1b               \n\t"
1333
0
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1334
0
            : "memory", "%"FF_REG_a
1335
0
        );
1336
1337
0
        ydst += lumStride;
1338
0
        src  += srcStride;
1339
1340
0
        __asm__ volatile(
1341
0
            "xor              %%"FF_REG_a", %%"FF_REG_a"\n\t"
1342
0
            ".p2align                    4              \n\t"
1343
0
            "1:                                         \n\t"
1344
0
            PREFETCH" 64(%0, %%"FF_REG_a", 4)           \n\t"
1345
0
            "movq    (%0, %%"FF_REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1346
0
            "movq   8(%0, %%"FF_REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1347
0
            "movq  16(%0, %%"FF_REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
1348
0
            "movq  24(%0, %%"FF_REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
1349
0
            "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
1350
0
            "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
1351
0
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
1352
0
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
1353
0
            "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
1354
0
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)
1355
1356
0
            MOVNTQ"                  %%mm0,  (%1, %%"FF_REG_a", 2) \n\t"
1357
0
            MOVNTQ"                  %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t"
1358
1359
0
            "add                        $8, %%"FF_REG_a"\n\t"
1360
0
            "cmp                        %4, %%"FF_REG_a"\n\t"
1361
0
            " jb                        1b              \n\t"
1362
1363
0
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1364
0
            : "memory", "%"FF_REG_a
1365
0
        );
1366
0
        udst += chromStride;
1367
0
        vdst += chromStride;
1368
0
        ydst += lumStride;
1369
0
        src  += srcStride;
1370
0
    }
1371
0
    __asm__ volatile(EMMS"       \n\t"
1372
0
                     SFENCE"     \n\t"
1373
0
                     :::"memory");
1374
0
}
1375
1376
static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1377
18
{
1378
18
    dst[0]= src[0];
1379
1380
    // first line
1381
262
    for (int x = 0; x < srcWidth - 1; x++) {
1382
244
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1383
244
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1384
244
    }
1385
18
    dst[2*srcWidth-1]= src[srcWidth-1];
1386
1387
18
    dst+= dstStride;
1388
1389
2.01k
    for (int y = 1; y < srcHeight; y++) {
1390
1.99k
        x86_reg mmxSize= srcWidth&~15;
1391
1392
1.99k
        if (mmxSize) {
1393
10
        __asm__ volatile(
1394
10
            "mov                       %4, %%"FF_REG_a" \n\t"
1395
10
            "movq        "MANGLE(mmx_ff)", %%mm0    \n\t"
1396
10
            "movq      (%0, %%"FF_REG_a"), %%mm4    \n\t"
1397
10
            "movq                   %%mm4, %%mm2    \n\t"
1398
10
            "psllq                     $8, %%mm4    \n\t"
1399
10
            "pand                   %%mm0, %%mm2    \n\t"
1400
10
            "por                    %%mm2, %%mm4    \n\t"
1401
10
            "movq      (%1, %%"FF_REG_a"), %%mm5    \n\t"
1402
10
            "movq                   %%mm5, %%mm3    \n\t"
1403
10
            "psllq                     $8, %%mm5    \n\t"
1404
10
            "pand                   %%mm0, %%mm3    \n\t"
1405
10
            "por                    %%mm3, %%mm5    \n\t"
1406
10
            "1:                                     \n\t"
1407
10
            "movq      (%0, %%"FF_REG_a"), %%mm0    \n\t"
1408
10
            "movq      (%1, %%"FF_REG_a"), %%mm1    \n\t"
1409
10
            "movq     1(%0, %%"FF_REG_a"), %%mm2    \n\t"
1410
10
            "movq     1(%1, %%"FF_REG_a"), %%mm3    \n\t"
1411
10
            PAVGB"                  %%mm0, %%mm5    \n\t"
1412
10
            PAVGB"                  %%mm0, %%mm3    \n\t"
1413
10
            PAVGB"                  %%mm0, %%mm5    \n\t"
1414
10
            PAVGB"                  %%mm0, %%mm3    \n\t"
1415
10
            PAVGB"                  %%mm1, %%mm4    \n\t"
1416
10
            PAVGB"                  %%mm1, %%mm2    \n\t"
1417
10
            PAVGB"                  %%mm1, %%mm4    \n\t"
1418
10
            PAVGB"                  %%mm1, %%mm2    \n\t"
1419
10
            "movq                   %%mm5, %%mm7    \n\t"
1420
10
            "movq                   %%mm4, %%mm6    \n\t"
1421
10
            "punpcklbw              %%mm3, %%mm5    \n\t"
1422
10
            "punpckhbw              %%mm3, %%mm7    \n\t"
1423
10
            "punpcklbw              %%mm2, %%mm4    \n\t"
1424
10
            "punpckhbw              %%mm2, %%mm6    \n\t"
1425
10
            MOVNTQ"                 %%mm5,  (%2, %%"FF_REG_a", 2)  \n\t"
1426
10
            MOVNTQ"                 %%mm7, 8(%2, %%"FF_REG_a", 2)  \n\t"
1427
10
            MOVNTQ"                 %%mm4,  (%3, %%"FF_REG_a", 2)  \n\t"
1428
10
            MOVNTQ"                 %%mm6, 8(%3, %%"FF_REG_a", 2)  \n\t"
1429
10
            "add                       $8, %%"FF_REG_a"            \n\t"
1430
10
            "movq    -1(%0, %%"FF_REG_a"), %%mm4    \n\t"
1431
10
            "movq    -1(%1, %%"FF_REG_a"), %%mm5    \n\t"
1432
10
            " js                       1b           \n\t"
1433
10
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1434
10
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1435
10
               "g" (-mmxSize)
1436
10
               NAMED_CONSTRAINTS_ADD(mmx_ff)
1437
10
            : "%"FF_REG_a
1438
10
        );
1439
1.98k
        } else {
1440
1.98k
            mmxSize = 1;
1441
1.98k
            dst[0]         = (src[0] * 3 + src[srcStride]) >> 2;
1442
1.98k
            dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
1443
1.98k
        }
1444
1445
2.88k
        for (int x = mmxSize - 1; x < srcWidth - 1; x++) {
1446
894
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1447
894
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1448
894
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1449
894
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1450
894
        }
1451
1.99k
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1452
1.99k
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1453
1454
1.99k
        dst+=dstStride*2;
1455
1.99k
        src+=srcStride;
1456
1.99k
    }
1457
1458
    // last line
1459
18
    dst[0]= src[0];
1460
1461
262
    for (int x = 0; x < srcWidth - 1; x++) {
1462
244
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1463
244
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1464
244
    }
1465
18
    dst[2*srcWidth-1]= src[srcWidth-1];
1466
1467
18
    __asm__ volatile(EMMS"       \n\t"
1468
18
                     SFENCE"     \n\t"
1469
18
                     :::"memory");
1470
18
}
1471
1472
/**
1473
 * Height should be a multiple of 2 and width should be a multiple of 2.
1474
 * (If this is a problem for anyone then tell me, and I will fix it.)
1475
 * Chrominance data is only taken from every second line,
1476
 * others are ignored in the C version.
1477
 * FIXME: Write HQ version.
1478
 */
1479
#if ARCH_X86_32 && HAVE_7REGS
1480
DECLARE_ASM_CONST(8, uint64_t, bgr2YOffset)  = 0x1010101010101010ULL;
1481
DECLARE_ASM_CONST(8, uint64_t, bgr2UVOffset) = 0x8080808080808080ULL;
1482
DECLARE_ASM_CONST(8, uint64_t, w1111)        = 0x0001000100010001ULL;
1483
1484
static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1485
                                       int width, int height,
1486
                                       int lumStride, int chromStride, int srcStride,
1487
                                       const int32_t *rgb2yuv)
1488
{
1489
#define BGR2Y_IDX "16*4+16*32"
1490
#define BGR2U_IDX "16*4+16*33"
1491
#define BGR2V_IDX "16*4+16*34"
1492
    int y;
1493
    const x86_reg chromWidth= width>>1;
1494
1495
    if (height > 2) {
1496
        ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
1497
        src  += 2*srcStride;
1498
        ydst += 2*lumStride;
1499
        udst += chromStride;
1500
        vdst += chromStride;
1501
        height -= 2;
1502
    }
1503
1504
    for (y = 0; y < height - 2; y += 2) {
1505
        for (int i = 0; i < 2; i++) {
1506
            __asm__ volatile(
1507
                "mov                        %2, %%"FF_REG_a"\n\t"
1508
                "movq          "BGR2Y_IDX"(%3), %%mm6       \n\t"
1509
                "movq          "MANGLE(w1111)", %%mm5       \n\t"
1510
                "pxor                    %%mm7, %%mm7       \n\t"
1511
                "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
1512
                ".p2align                    4              \n\t"
1513
                "1:                                         \n\t"
1514
                PREFETCH" 64(%0, %%"FF_REG_d")              \n\t"
1515
                "movd       (%0, %%"FF_REG_d"), %%mm0       \n\t"
1516
                "movd      3(%0, %%"FF_REG_d"), %%mm1       \n\t"
1517
                "punpcklbw               %%mm7, %%mm0       \n\t"
1518
                "punpcklbw               %%mm7, %%mm1       \n\t"
1519
                "movd      6(%0, %%"FF_REG_d"), %%mm2       \n\t"
1520
                "movd      9(%0, %%"FF_REG_d"), %%mm3       \n\t"
1521
                "punpcklbw               %%mm7, %%mm2       \n\t"
1522
                "punpcklbw               %%mm7, %%mm3       \n\t"
1523
                "pmaddwd                 %%mm6, %%mm0       \n\t"
1524
                "pmaddwd                 %%mm6, %%mm1       \n\t"
1525
                "pmaddwd                 %%mm6, %%mm2       \n\t"
1526
                "pmaddwd                 %%mm6, %%mm3       \n\t"
1527
                "psrad                      $8, %%mm0       \n\t"
1528
                "psrad                      $8, %%mm1       \n\t"
1529
                "psrad                      $8, %%mm2       \n\t"
1530
                "psrad                      $8, %%mm3       \n\t"
1531
                "packssdw                %%mm1, %%mm0       \n\t"
1532
                "packssdw                %%mm3, %%mm2       \n\t"
1533
                "pmaddwd                 %%mm5, %%mm0       \n\t"
1534
                "pmaddwd                 %%mm5, %%mm2       \n\t"
1535
                "packssdw                %%mm2, %%mm0       \n\t"
1536
                "psraw                      $7, %%mm0       \n\t"
1537
1538
                "movd     12(%0, %%"FF_REG_d"), %%mm4       \n\t"
1539
                "movd     15(%0, %%"FF_REG_d"), %%mm1       \n\t"
1540
                "punpcklbw               %%mm7, %%mm4       \n\t"
1541
                "punpcklbw               %%mm7, %%mm1       \n\t"
1542
                "movd     18(%0, %%"FF_REG_d"), %%mm2       \n\t"
1543
                "movd     21(%0, %%"FF_REG_d"), %%mm3       \n\t"
1544
                "punpcklbw               %%mm7, %%mm2       \n\t"
1545
                "punpcklbw               %%mm7, %%mm3       \n\t"
1546
                "pmaddwd                 %%mm6, %%mm4       \n\t"
1547
                "pmaddwd                 %%mm6, %%mm1       \n\t"
1548
                "pmaddwd                 %%mm6, %%mm2       \n\t"
1549
                "pmaddwd                 %%mm6, %%mm3       \n\t"
1550
                "psrad                      $8, %%mm4       \n\t"
1551
                "psrad                      $8, %%mm1       \n\t"
1552
                "psrad                      $8, %%mm2       \n\t"
1553
                "psrad                      $8, %%mm3       \n\t"
1554
                "packssdw                %%mm1, %%mm4       \n\t"
1555
                "packssdw                %%mm3, %%mm2       \n\t"
1556
                "pmaddwd                 %%mm5, %%mm4       \n\t"
1557
                "pmaddwd                 %%mm5, %%mm2       \n\t"
1558
                "add                       $24, %%"FF_REG_d"\n\t"
1559
                "packssdw                %%mm2, %%mm4       \n\t"
1560
                "psraw                      $7, %%mm4       \n\t"
1561
1562
                "packuswb                %%mm4, %%mm0       \n\t"
1563
                "paddusb "MANGLE(bgr2YOffset)", %%mm0       \n\t"
1564
1565
                MOVNTQ"                  %%mm0, (%1, %%"FF_REG_a") \n\t"
1566
                "add                        $8,      %%"FF_REG_a"  \n\t"
1567
                " js                        1b                     \n\t"
1568
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
1569
                  NAMED_CONSTRAINTS_ADD(w1111,bgr2YOffset)
1570
                : "%"FF_REG_a, "%"FF_REG_d
1571
            );
1572
            ydst += lumStride;
1573
            src  += srcStride;
1574
        }
1575
        src -= srcStride*2;
1576
        __asm__ volatile(
1577
            "mov                        %4, %%"FF_REG_a"\n\t"
1578
            "movq          "MANGLE(w1111)", %%mm5       \n\t"
1579
            "movq          "BGR2U_IDX"(%5), %%mm6       \n\t"
1580
            "pxor                    %%mm7, %%mm7       \n\t"
1581
            "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
1582
            "add              %%"FF_REG_d", %%"FF_REG_d"\n\t"
1583
            ".p2align                    4              \n\t"
1584
            "1:                                         \n\t"
1585
            PREFETCH" 64(%0, %%"FF_REG_d")              \n\t"
1586
            PREFETCH" 64(%1, %%"FF_REG_d")              \n\t"
1587
            "movq       (%0, %%"FF_REG_d"), %%mm0       \n\t"
1588
            "movq       (%1, %%"FF_REG_d"), %%mm1       \n\t"
1589
            "movq      6(%0, %%"FF_REG_d"), %%mm2       \n\t"
1590
            "movq      6(%1, %%"FF_REG_d"), %%mm3       \n\t"
1591
            PAVGB"                   %%mm1, %%mm0       \n\t"
1592
            PAVGB"                   %%mm3, %%mm2       \n\t"
1593
            "movq                    %%mm0, %%mm1       \n\t"
1594
            "movq                    %%mm2, %%mm3       \n\t"
1595
            "psrlq                     $24, %%mm0       \n\t"
1596
            "psrlq                     $24, %%mm2       \n\t"
1597
            PAVGB"                   %%mm1, %%mm0       \n\t"
1598
            PAVGB"                   %%mm3, %%mm2       \n\t"
1599
            "punpcklbw               %%mm7, %%mm0       \n\t"
1600
            "punpcklbw               %%mm7, %%mm2       \n\t"
1601
            "movq          "BGR2V_IDX"(%5), %%mm1       \n\t"
1602
            "movq          "BGR2V_IDX"(%5), %%mm3       \n\t"
1603
1604
            "pmaddwd                 %%mm0, %%mm1       \n\t"
1605
            "pmaddwd                 %%mm2, %%mm3       \n\t"
1606
            "pmaddwd                 %%mm6, %%mm0       \n\t"
1607
            "pmaddwd                 %%mm6, %%mm2       \n\t"
1608
            "psrad                      $8, %%mm0       \n\t"
1609
            "psrad                      $8, %%mm1       \n\t"
1610
            "psrad                      $8, %%mm2       \n\t"
1611
            "psrad                      $8, %%mm3       \n\t"
1612
            "packssdw                %%mm2, %%mm0       \n\t"
1613
            "packssdw                %%mm3, %%mm1       \n\t"
1614
            "pmaddwd                 %%mm5, %%mm0       \n\t"
1615
            "pmaddwd                 %%mm5, %%mm1       \n\t"
1616
            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
1617
            "psraw                      $7, %%mm0       \n\t"
1618
1619
            "movq     12(%0, %%"FF_REG_d"), %%mm4       \n\t"
1620
            "movq     12(%1, %%"FF_REG_d"), %%mm1       \n\t"
1621
            "movq     18(%0, %%"FF_REG_d"), %%mm2       \n\t"
1622
            "movq     18(%1, %%"FF_REG_d"), %%mm3       \n\t"
1623
            PAVGB"                   %%mm1, %%mm4       \n\t"
1624
            PAVGB"                   %%mm3, %%mm2       \n\t"
1625
            "movq                    %%mm4, %%mm1       \n\t"
1626
            "movq                    %%mm2, %%mm3       \n\t"
1627
            "psrlq                     $24, %%mm4       \n\t"
1628
            "psrlq                     $24, %%mm2       \n\t"
1629
            PAVGB"                   %%mm1, %%mm4       \n\t"
1630
            PAVGB"                   %%mm3, %%mm2       \n\t"
1631
            "punpcklbw               %%mm7, %%mm4       \n\t"
1632
            "punpcklbw               %%mm7, %%mm2       \n\t"
1633
            "movq          "BGR2V_IDX"(%5), %%mm1       \n\t"
1634
            "movq          "BGR2V_IDX"(%5), %%mm3       \n\t"
1635
1636
            "pmaddwd                 %%mm4, %%mm1       \n\t"
1637
            "pmaddwd                 %%mm2, %%mm3       \n\t"
1638
            "pmaddwd                 %%mm6, %%mm4       \n\t"
1639
            "pmaddwd                 %%mm6, %%mm2       \n\t"
1640
            "psrad                      $8, %%mm4       \n\t"
1641
            "psrad                      $8, %%mm1       \n\t"
1642
            "psrad                      $8, %%mm2       \n\t"
1643
            "psrad                      $8, %%mm3       \n\t"
1644
            "packssdw                %%mm2, %%mm4       \n\t"
1645
            "packssdw                %%mm3, %%mm1       \n\t"
1646
            "pmaddwd                 %%mm5, %%mm4       \n\t"
1647
            "pmaddwd                 %%mm5, %%mm1       \n\t"
1648
            "add                       $24, %%"FF_REG_d"\n\t"
1649
            "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
1650
            "psraw                      $7, %%mm4       \n\t"
1651
1652
            "movq                    %%mm0, %%mm1           \n\t"
1653
            "punpckldq               %%mm4, %%mm0           \n\t"
1654
            "punpckhdq               %%mm4, %%mm1           \n\t"
1655
            "packsswb                %%mm1, %%mm0           \n\t"
1656
            "paddb  "MANGLE(bgr2UVOffset)", %%mm0           \n\t"
1657
            "movd                    %%mm0, (%2, %%"FF_REG_a") \n\t"
1658
            "punpckhdq               %%mm0, %%mm0              \n\t"
1659
            "movd                    %%mm0, (%3, %%"FF_REG_a") \n\t"
1660
            "add                        $4, %%"FF_REG_a"       \n\t"
1661
            " js                        1b              \n\t"
1662
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
1663
              NAMED_CONSTRAINTS_ADD(w1111,bgr2UVOffset)
1664
            : "%"FF_REG_a, "%"FF_REG_d
1665
        );
1666
1667
        udst += chromStride;
1668
        vdst += chromStride;
1669
        src  += srcStride*2;
1670
    }
1671
1672
    __asm__ volatile(EMMS"       \n\t"
1673
                     SFENCE"     \n\t"
1674
                     :::"memory");
1675
1676
     ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
1677
}
1678
#endif /* HAVE_7REGS */
1679
1680
static inline void vu9_to_vu12_mmxext(const uint8_t *src1, const uint8_t *src2,
1681
                                       uint8_t *dst1, uint8_t *dst2,
1682
                                       int width, int height,
1683
                                       int srcStride1, int srcStride2,
1684
                                       int dstStride1, int dstStride2)
1685
0
{
1686
0
    int w,h;
1687
0
    w=width/2; h=height/2;
1688
0
    __asm__ volatile(
1689
0
        PREFETCH" %0    \n\t"
1690
0
        PREFETCH" %1    \n\t"
1691
0
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
1692
0
    for (x86_reg y = 0; y < h; y++) {
1693
0
        const uint8_t* s1=src1+srcStride1*(y>>1);
1694
0
        uint8_t* d=dst1+dstStride1*y;
1695
0
        x86_reg x = 0;
1696
0
        for (;x<w-31;x+=32) {
1697
0
            __asm__ volatile(
1698
0
                PREFETCH"   32(%1,%2)        \n\t"
1699
0
                "movq         (%1,%2), %%mm0 \n\t"
1700
0
                "movq        8(%1,%2), %%mm2 \n\t"
1701
0
                "movq       16(%1,%2), %%mm4 \n\t"
1702
0
                "movq       24(%1,%2), %%mm6 \n\t"
1703
0
                "movq      %%mm0, %%mm1 \n\t"
1704
0
                "movq      %%mm2, %%mm3 \n\t"
1705
0
                "movq      %%mm4, %%mm5 \n\t"
1706
0
                "movq      %%mm6, %%mm7 \n\t"
1707
0
                "punpcklbw %%mm0, %%mm0 \n\t"
1708
0
                "punpckhbw %%mm1, %%mm1 \n\t"
1709
0
                "punpcklbw %%mm2, %%mm2 \n\t"
1710
0
                "punpckhbw %%mm3, %%mm3 \n\t"
1711
0
                "punpcklbw %%mm4, %%mm4 \n\t"
1712
0
                "punpckhbw %%mm5, %%mm5 \n\t"
1713
0
                "punpcklbw %%mm6, %%mm6 \n\t"
1714
0
                "punpckhbw %%mm7, %%mm7 \n\t"
1715
0
                MOVNTQ"    %%mm0,   (%0,%2,2)  \n\t"
1716
0
                MOVNTQ"    %%mm1,  8(%0,%2,2)  \n\t"
1717
0
                MOVNTQ"    %%mm2, 16(%0,%2,2)  \n\t"
1718
0
                MOVNTQ"    %%mm3, 24(%0,%2,2)  \n\t"
1719
0
                MOVNTQ"    %%mm4, 32(%0,%2,2)  \n\t"
1720
0
                MOVNTQ"    %%mm5, 40(%0,%2,2)  \n\t"
1721
0
                MOVNTQ"    %%mm6, 48(%0,%2,2)  \n\t"
1722
0
                MOVNTQ"    %%mm7, 56(%0,%2,2)"
1723
0
                :: "r"(d), "r"(s1), "r"(x)
1724
0
                :"memory");
1725
0
        }
1726
0
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
1727
0
    }
1728
0
    for (x86_reg y = 0; y < h; y++) {
1729
0
        const uint8_t* s2=src2+srcStride2*(y>>1);
1730
0
        uint8_t* d=dst2+dstStride2*y;
1731
0
        x86_reg x = 0;
1732
0
        for (;x<w-31;x+=32) {
1733
0
            __asm__ volatile(
1734
0
                PREFETCH"   32(%1,%2)        \n\t"
1735
0
                "movq         (%1,%2), %%mm0 \n\t"
1736
0
                "movq        8(%1,%2), %%mm2 \n\t"
1737
0
                "movq       16(%1,%2), %%mm4 \n\t"
1738
0
                "movq       24(%1,%2), %%mm6 \n\t"
1739
0
                "movq      %%mm0, %%mm1 \n\t"
1740
0
                "movq      %%mm2, %%mm3 \n\t"
1741
0
                "movq      %%mm4, %%mm5 \n\t"
1742
0
                "movq      %%mm6, %%mm7 \n\t"
1743
0
                "punpcklbw %%mm0, %%mm0 \n\t"
1744
0
                "punpckhbw %%mm1, %%mm1 \n\t"
1745
0
                "punpcklbw %%mm2, %%mm2 \n\t"
1746
0
                "punpckhbw %%mm3, %%mm3 \n\t"
1747
0
                "punpcklbw %%mm4, %%mm4 \n\t"
1748
0
                "punpckhbw %%mm5, %%mm5 \n\t"
1749
0
                "punpcklbw %%mm6, %%mm6 \n\t"
1750
0
                "punpckhbw %%mm7, %%mm7 \n\t"
1751
0
                MOVNTQ"    %%mm0,   (%0,%2,2)  \n\t"
1752
0
                MOVNTQ"    %%mm1,  8(%0,%2,2)  \n\t"
1753
0
                MOVNTQ"    %%mm2, 16(%0,%2,2)  \n\t"
1754
0
                MOVNTQ"    %%mm3, 24(%0,%2,2)  \n\t"
1755
0
                MOVNTQ"    %%mm4, 32(%0,%2,2)  \n\t"
1756
0
                MOVNTQ"    %%mm5, 40(%0,%2,2)  \n\t"
1757
0
                MOVNTQ"    %%mm6, 48(%0,%2,2)  \n\t"
1758
0
                MOVNTQ"    %%mm7, 56(%0,%2,2)"
1759
0
                :: "r"(d), "r"(s2), "r"(x)
1760
0
                :"memory");
1761
0
        }
1762
0
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
1763
0
    }
1764
0
    __asm__(
1765
0
            EMMS"       \n\t"
1766
0
            SFENCE"     \n\t"
1767
0
            ::: "memory"
1768
0
        );
1769
0
}
1770
1771
static inline void yvu9_to_yuy2_mmxext(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
1772
                                        uint8_t *dst,
1773
                                        int width, int height,
1774
                                        int srcStride1, int srcStride2,
1775
                                        int srcStride3, int dstStride)
1776
0
{
1777
0
    int w,h;
1778
0
    w=width/2; h=height;
1779
0
    for (int y = 0; y < h; y++) {
1780
0
        const uint8_t* yp=src1+srcStride1*y;
1781
0
        const uint8_t* up=src2+srcStride2*(y>>2);
1782
0
        const uint8_t* vp=src3+srcStride3*(y>>2);
1783
0
        uint8_t* d=dst+dstStride*y;
1784
0
        x86_reg x = 0;
1785
0
        for (;x<w-7;x+=8) {
1786
0
            __asm__ volatile(
1787
0
                PREFETCH"   32(%1, %0)          \n\t"
1788
0
                PREFETCH"   32(%2, %0)          \n\t"
1789
0
                PREFETCH"   32(%3, %0)          \n\t"
1790
0
                "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
1791
0
                "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
1792
0
                "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
1793
0
                "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
1794
0
                "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
1795
0
                "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
1796
0
                "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
1797
0
                "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
1798
0
                "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
1799
0
                "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */
1800
1801
0
                "movq            %%mm1, %%mm6   \n\t"
1802
0
                "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
1803
0
                "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
1804
0
                "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
1805
0
                MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
1806
0
                MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"
1807
1808
0
                "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
1809
0
                "movq     8(%1, %0, 4), %%mm0   \n\t"
1810
0
                "movq            %%mm0, %%mm3   \n\t"
1811
0
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
1812
0
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
1813
0
                MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
1814
0
                MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"
1815
1816
0
                "movq            %%mm4, %%mm6   \n\t"
1817
0
                "movq    16(%1, %0, 4), %%mm0   \n\t"
1818
0
                "movq            %%mm0, %%mm3   \n\t"
1819
0
                "punpcklbw       %%mm5, %%mm4   \n\t"
1820
0
                "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
1821
0
                "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
1822
0
                MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
1823
0
                MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"
1824
1825
0
                "punpckhbw       %%mm5, %%mm6   \n\t"
1826
0
                "movq    24(%1, %0, 4), %%mm0   \n\t"
1827
0
                "movq            %%mm0, %%mm3   \n\t"
1828
0
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
1829
0
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
1830
0
                MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
1831
0
                MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"
1832
1833
0
                : "+r" (x)
1834
0
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
1835
0
                :"memory");
1836
0
        }
1837
0
        for (; x<w; x++) {
1838
0
            const int x2 = x<<2;
1839
0
            d[8*x+0] = yp[x2];
1840
0
            d[8*x+1] = up[x];
1841
0
            d[8*x+2] = yp[x2+1];
1842
0
            d[8*x+3] = vp[x];
1843
0
            d[8*x+4] = yp[x2+2];
1844
0
            d[8*x+5] = up[x];
1845
0
            d[8*x+6] = yp[x2+3];
1846
0
            d[8*x+7] = vp[x];
1847
0
        }
1848
0
    }
1849
0
    __asm__(
1850
0
            EMMS"       \n\t"
1851
0
            SFENCE"     \n\t"
1852
0
            ::: "memory"
1853
0
        );
1854
0
}
1855
1856
static void extract_even_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
1857
2.90k
{
1858
2.90k
    dst +=   count;
1859
2.90k
    src += 2*count;
1860
2.90k
    count= - count;
1861
1862
2.90k
    if(count <= -16) {
1863
504
        count += 15;
1864
504
        __asm__ volatile(
1865
504
            "pcmpeqw       %%mm7, %%mm7        \n\t"
1866
504
            "psrlw            $8, %%mm7        \n\t"
1867
504
            "1:                                \n\t"
1868
504
            "movq -30(%1, %0, 2), %%mm0        \n\t"
1869
504
            "movq -22(%1, %0, 2), %%mm1        \n\t"
1870
504
            "movq -14(%1, %0, 2), %%mm2        \n\t"
1871
504
            "movq  -6(%1, %0, 2), %%mm3        \n\t"
1872
504
            "pand          %%mm7, %%mm0        \n\t"
1873
504
            "pand          %%mm7, %%mm1        \n\t"
1874
504
            "pand          %%mm7, %%mm2        \n\t"
1875
504
            "pand          %%mm7, %%mm3        \n\t"
1876
504
            "packuswb      %%mm1, %%mm0        \n\t"
1877
504
            "packuswb      %%mm3, %%mm2        \n\t"
1878
504
            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
1879
504
            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
1880
504
            "add             $16, %0           \n\t"
1881
504
            " js 1b                            \n\t"
1882
504
            : "+r"(count)
1883
504
            : "r"(src), "r"(dst)
1884
504
        );
1885
504
        count -= 15;
1886
504
    }
1887
15.9k
    while(count<0) {
1888
13.0k
        dst[count]= src[2*count];
1889
13.0k
        count++;
1890
13.0k
    }
1891
2.90k
}
1892
1893
static void extract_odd_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
1894
1.38k
{
1895
1.38k
    src ++;
1896
1.38k
    dst +=   count;
1897
1.38k
    src += 2*count;
1898
1.38k
    count= - count;
1899
1900
1.38k
    if(count < -16) {
1901
279
        count += 16;
1902
279
        __asm__ volatile(
1903
279
            "pcmpeqw       %%mm7, %%mm7        \n\t"
1904
279
            "psrlw            $8, %%mm7        \n\t"
1905
279
            "1:                                \n\t"
1906
279
            "movq -32(%1, %0, 2), %%mm0        \n\t"
1907
279
            "movq -24(%1, %0, 2), %%mm1        \n\t"
1908
279
            "movq -16(%1, %0, 2), %%mm2        \n\t"
1909
279
            "movq  -8(%1, %0, 2), %%mm3        \n\t"
1910
279
            "pand          %%mm7, %%mm0        \n\t"
1911
279
            "pand          %%mm7, %%mm1        \n\t"
1912
279
            "pand          %%mm7, %%mm2        \n\t"
1913
279
            "pand          %%mm7, %%mm3        \n\t"
1914
279
            "packuswb      %%mm1, %%mm0        \n\t"
1915
279
            "packuswb      %%mm3, %%mm2        \n\t"
1916
279
            MOVNTQ"        %%mm0,-16(%2, %0)   \n\t"
1917
279
            MOVNTQ"        %%mm2,- 8(%2, %0)   \n\t"
1918
279
            "add             $16, %0           \n\t"
1919
279
            " js 1b                            \n\t"
1920
279
            : "+r"(count)
1921
279
            : "r"(src), "r"(dst)
1922
279
        );
1923
279
        count -= 16;
1924
279
    }
1925
12.6k
    while(count<0) {
1926
11.2k
        dst[count]= src[2*count];
1927
11.2k
        count++;
1928
11.2k
    }
1929
1.38k
}
1930
1931
#if ARCH_X86_32
1932
static void extract_even2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
1933
{
1934
    dst0+=   count;
1935
    dst1+=   count;
1936
    src += 4*count;
1937
    count= - count;
1938
    if(count <= -8) {
1939
        count += 7;
1940
        __asm__ volatile(
1941
            "pcmpeqw       %%mm7, %%mm7        \n\t"
1942
            "psrlw            $8, %%mm7        \n\t"
1943
            "1:                                \n\t"
1944
            "movq -28(%1, %0, 4), %%mm0        \n\t"
1945
            "movq -20(%1, %0, 4), %%mm1        \n\t"
1946
            "movq -12(%1, %0, 4), %%mm2        \n\t"
1947
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
1948
            "pand          %%mm7, %%mm0        \n\t"
1949
            "pand          %%mm7, %%mm1        \n\t"
1950
            "pand          %%mm7, %%mm2        \n\t"
1951
            "pand          %%mm7, %%mm3        \n\t"
1952
            "packuswb      %%mm1, %%mm0        \n\t"
1953
            "packuswb      %%mm3, %%mm2        \n\t"
1954
            "movq          %%mm0, %%mm1        \n\t"
1955
            "movq          %%mm2, %%mm3        \n\t"
1956
            "psrlw            $8, %%mm0        \n\t"
1957
            "psrlw            $8, %%mm2        \n\t"
1958
            "pand          %%mm7, %%mm1        \n\t"
1959
            "pand          %%mm7, %%mm3        \n\t"
1960
            "packuswb      %%mm2, %%mm0        \n\t"
1961
            "packuswb      %%mm3, %%mm1        \n\t"
1962
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
1963
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
1964
            "add              $8, %0           \n\t"
1965
            " js 1b                            \n\t"
1966
            : "+r"(count)
1967
            : "r"(src), "r"(dst0), "r"(dst1)
1968
        );
1969
        count -= 7;
1970
    }
1971
    while(count<0) {
1972
        dst0[count]= src[4*count+0];
1973
        dst1[count]= src[4*count+2];
1974
        count++;
1975
    }
1976
}
1977
#endif /* ARCH_X86_32 */
1978
1979
static void extract_even2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
1980
690
{
1981
690
    dst0 +=   count;
1982
690
    dst1 +=   count;
1983
690
    src0 += 4*count;
1984
690
    src1 += 4*count;
1985
690
    count= - count;
1986
690
#ifdef PAVGB
1987
690
    if(count <= -8) {
1988
138
        count += 7;
1989
138
        __asm__ volatile(
1990
138
            "pcmpeqw        %%mm7, %%mm7        \n\t"
1991
138
            "psrlw             $8, %%mm7        \n\t"
1992
138
            "1:                                \n\t"
1993
138
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
1994
138
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
1995
138
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
1996
138
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
1997
138
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
1998
138
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
1999
138
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
2000
138
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
2001
138
            "pand           %%mm7, %%mm0        \n\t"
2002
138
            "pand           %%mm7, %%mm1        \n\t"
2003
138
            "pand           %%mm7, %%mm2        \n\t"
2004
138
            "pand           %%mm7, %%mm3        \n\t"
2005
138
            "packuswb       %%mm1, %%mm0        \n\t"
2006
138
            "packuswb       %%mm3, %%mm2        \n\t"
2007
138
            "movq           %%mm0, %%mm1        \n\t"
2008
138
            "movq           %%mm2, %%mm3        \n\t"
2009
138
            "psrlw             $8, %%mm0        \n\t"
2010
138
            "psrlw             $8, %%mm2        \n\t"
2011
138
            "pand           %%mm7, %%mm1        \n\t"
2012
138
            "pand           %%mm7, %%mm3        \n\t"
2013
138
            "packuswb       %%mm2, %%mm0        \n\t"
2014
138
            "packuswb       %%mm3, %%mm1        \n\t"
2015
138
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
2016
138
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
2017
138
            "add               $8, %0           \n\t"
2018
138
            " js 1b                            \n\t"
2019
138
            : "+r"(count)
2020
138
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2021
138
        );
2022
138
        count -= 7;
2023
138
    }
2024
690
#endif
2025
3.57k
    while(count<0) {
2026
2.88k
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2027
2.88k
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2028
2.88k
        count++;
2029
2.88k
    }
2030
690
}
2031
2032
static void extract_odd2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2033
1.22k
{
2034
1.22k
    dst0+=   count;
2035
1.22k
    dst1+=   count;
2036
1.22k
    src += 4*count;
2037
1.22k
    count= - count;
2038
1.22k
    if(count <= -8) {
2039
8
        count += 7;
2040
8
        __asm__ volatile(
2041
8
            "pcmpeqw       %%mm7, %%mm7        \n\t"
2042
8
            "psrlw            $8, %%mm7        \n\t"
2043
8
            "1:                                \n\t"
2044
8
            "movq -28(%1, %0, 4), %%mm0        \n\t"
2045
8
            "movq -20(%1, %0, 4), %%mm1        \n\t"
2046
8
            "movq -12(%1, %0, 4), %%mm2        \n\t"
2047
8
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
2048
8
            "psrlw            $8, %%mm0        \n\t"
2049
8
            "psrlw            $8, %%mm1        \n\t"
2050
8
            "psrlw            $8, %%mm2        \n\t"
2051
8
            "psrlw            $8, %%mm3        \n\t"
2052
8
            "packuswb      %%mm1, %%mm0        \n\t"
2053
8
            "packuswb      %%mm3, %%mm2        \n\t"
2054
8
            "movq          %%mm0, %%mm1        \n\t"
2055
8
            "movq          %%mm2, %%mm3        \n\t"
2056
8
            "psrlw            $8, %%mm0        \n\t"
2057
8
            "psrlw            $8, %%mm2        \n\t"
2058
8
            "pand          %%mm7, %%mm1        \n\t"
2059
8
            "pand          %%mm7, %%mm3        \n\t"
2060
8
            "packuswb      %%mm2, %%mm0        \n\t"
2061
8
            "packuswb      %%mm3, %%mm1        \n\t"
2062
8
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
2063
8
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
2064
8
            "add              $8, %0           \n\t"
2065
8
            " js 1b                            \n\t"
2066
8
            : "+r"(count)
2067
8
            : "r"(src), "r"(dst0), "r"(dst1)
2068
8
        );
2069
8
        count -= 7;
2070
8
    }
2071
1.22k
    src++;
2072
3.81k
    while(count<0) {
2073
2.59k
        dst0[count]= src[4*count+0];
2074
2.59k
        dst1[count]= src[4*count+2];
2075
2.59k
        count++;
2076
2.59k
    }
2077
1.22k
}
2078
2079
static void extract_odd2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2080
832
{
2081
832
    dst0 +=   count;
2082
832
    dst1 +=   count;
2083
832
    src0 += 4*count;
2084
832
    src1 += 4*count;
2085
832
    count= - count;
2086
832
#ifdef PAVGB
2087
832
    if(count <= -8) {
2088
246
        count += 7;
2089
246
        __asm__ volatile(
2090
246
            "pcmpeqw        %%mm7, %%mm7        \n\t"
2091
246
            "psrlw             $8, %%mm7        \n\t"
2092
246
            "1:                                \n\t"
2093
246
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
2094
246
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
2095
246
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
2096
246
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
2097
246
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
2098
246
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
2099
246
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
2100
246
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
2101
246
            "psrlw             $8, %%mm0        \n\t"
2102
246
            "psrlw             $8, %%mm1        \n\t"
2103
246
            "psrlw             $8, %%mm2        \n\t"
2104
246
            "psrlw             $8, %%mm3        \n\t"
2105
246
            "packuswb       %%mm1, %%mm0        \n\t"
2106
246
            "packuswb       %%mm3, %%mm2        \n\t"
2107
246
            "movq           %%mm0, %%mm1        \n\t"
2108
246
            "movq           %%mm2, %%mm3        \n\t"
2109
246
            "psrlw             $8, %%mm0        \n\t"
2110
246
            "psrlw             $8, %%mm2        \n\t"
2111
246
            "pand           %%mm7, %%mm1        \n\t"
2112
246
            "pand           %%mm7, %%mm3        \n\t"
2113
246
            "packuswb       %%mm2, %%mm0        \n\t"
2114
246
            "packuswb       %%mm3, %%mm1        \n\t"
2115
246
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
2116
246
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
2117
246
            "add               $8, %0           \n\t"
2118
246
            " js 1b                            \n\t"
2119
246
            : "+r"(count)
2120
246
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2121
246
        );
2122
246
        count -= 7;
2123
246
    }
2124
832
#endif
2125
832
    src0++;
2126
832
    src1++;
2127
2.65k
    while(count<0) {
2128
1.82k
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2129
1.82k
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2130
1.82k
        count++;
2131
1.82k
    }
2132
832
}
2133
2134
static void yuyvtoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2135
                                 int width, int height,
2136
                                 int lumStride, int chromStride, int srcStride)
2137
22
{
2138
22
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2139
2140
1.69k
    for (int y = 0; y < height; y++) {
2141
1.67k
        extract_even_mmxext(src, ydst, width);
2142
1.67k
        if(y&1) {
2143
832
            extract_odd2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
2144
832
            udst+= chromStride;
2145
832
            vdst+= chromStride;
2146
832
        }
2147
2148
1.67k
        src += srcStride;
2149
1.67k
        ydst+= lumStride;
2150
1.67k
    }
2151
22
    __asm__(
2152
22
            EMMS"       \n\t"
2153
22
            SFENCE"     \n\t"
2154
22
            ::: "memory"
2155
22
        );
2156
22
}
2157
2158
static void yuyvtoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2159
                                 int width, int height,
2160
                                 int lumStride, int chromStride, int srcStride)
2161
13
{
2162
13
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2163
2164
1.24k
    for (int y = 0; y < height; y++) {
2165
1.22k
        extract_even_mmxext(src, ydst, width);
2166
1.22k
        extract_odd2_mmxext(src, udst, vdst, chromWidth);
2167
2168
1.22k
        src += srcStride;
2169
1.22k
        ydst+= lumStride;
2170
1.22k
        udst+= chromStride;
2171
1.22k
        vdst+= chromStride;
2172
1.22k
    }
2173
13
    __asm__(
2174
13
            EMMS"       \n\t"
2175
13
            SFENCE"     \n\t"
2176
13
            ::: "memory"
2177
13
        );
2178
13
}
2179
2180
static void uyvytoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2181
                                 int width, int height,
2182
                                 int lumStride, int chromStride, int srcStride)
2183
14
{
2184
14
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2185
2186
1.40k
    for (int y = 0; y < height; y++) {
2187
1.38k
        extract_odd_mmxext(src, ydst, width);
2188
1.38k
        if(y&1) {
2189
690
            extract_even2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
2190
690
            udst+= chromStride;
2191
690
            vdst+= chromStride;
2192
690
        }
2193
2194
1.38k
        src += srcStride;
2195
1.38k
        ydst+= lumStride;
2196
1.38k
    }
2197
14
    __asm__(
2198
14
            EMMS"       \n\t"
2199
14
            SFENCE"     \n\t"
2200
14
            ::: "memory"
2201
14
        );
2202
14
}
2203
2204
#if ARCH_X86_32
2205
static void uyvytoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2206
                                 int width, int height,
2207
                                 int lumStride, int chromStride, int srcStride)
2208
{
2209
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2210
2211
    for (int y = 0; y < height; y++) {
2212
        extract_odd_mmxext(src, ydst, width);
2213
        extract_even2_mmxext(src, udst, vdst, chromWidth);
2214
2215
        src += srcStride;
2216
        ydst+= lumStride;
2217
        udst+= chromStride;
2218
        vdst+= chromStride;
2219
    }
2220
    __asm__(
2221
            EMMS"       \n\t"
2222
            SFENCE"     \n\t"
2223
            ::: "memory"
2224
        );
2225
}
2226
#endif /* ARCH_X86_32 */
2227
2228
static av_cold void rgb2rgb_init_mmxext(void)
2229
1
{
2230
1
    rgb15to16          = rgb15to16_mmxext;
2231
1
    rgb15tobgr24       = rgb15tobgr24_mmxext;
2232
1
    rgb15to32          = rgb15to32_mmxext;
2233
1
    rgb16tobgr24       = rgb16tobgr24_mmxext;
2234
1
    rgb16to32          = rgb16to32_mmxext;
2235
1
    rgb16to15          = rgb16to15_mmxext;
2236
1
    rgb24tobgr16       = rgb24tobgr16_mmxext;
2237
1
    rgb24tobgr15       = rgb24tobgr15_mmxext;
2238
1
    rgb24tobgr32       = rgb24tobgr32_mmxext;
2239
1
    rgb32to16          = rgb32to16_mmxext;
2240
1
    rgb32to15          = rgb32to15_mmxext;
2241
1
    rgb32tobgr24       = rgb32tobgr24_mmxext;
2242
1
    rgb24to15          = rgb24to15_mmxext;
2243
1
    rgb24to16          = rgb24to16_mmxext;
2244
1
    rgb24tobgr24       = rgb24tobgr24_mmxext;
2245
1
    rgb32tobgr16       = rgb32tobgr16_mmxext;
2246
1
    rgb32tobgr15       = rgb32tobgr15_mmxext;
2247
1
    yv12toyuy2         = yv12toyuy2_mmxext;
2248
1
    yv12touyvy         = yv12touyvy_mmxext;
2249
1
    yuv422ptoyuy2      = yuv422ptoyuy2_mmxext;
2250
1
    yuv422ptouyvy      = yuv422ptouyvy_mmxext;
2251
1
    yuy2toyv12         = yuy2toyv12_mmxext;
2252
1
    vu9_to_vu12        = vu9_to_vu12_mmxext;
2253
1
    yvu9_to_yuy2       = yvu9_to_yuy2_mmxext;
2254
#if ARCH_X86_32
2255
    uyvytoyuv422       = uyvytoyuv422_mmxext;
2256
#endif
2257
1
    yuyvtoyuv422       = yuyvtoyuv422_mmxext;
2258
2259
1
    planar2x           = planar2x_mmxext;
2260
#if ARCH_X86_32 && HAVE_7REGS
2261
    ff_rgb24toyv12     = rgb24toyv12_mmxext;
2262
#endif /* ARCH_X86_32 && HAVE_7REGS */
2263
2264
1
    yuyvtoyuv420       = yuyvtoyuv420_mmxext;
2265
1
    uyvytoyuv420       = uyvytoyuv420_mmxext;
2266
1
}
2267
2268
//SSE2 versions
2269
static void interleave_bytes_sse2(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
2270
                                  int width, int height, int src1Stride,
2271
                                  int src2Stride, int dstStride)
2272
7
{
2273
254
    for (int h = 0; h < height; h++) {
2274
247
        if (width >= 16) {
2275
241
            if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
2276
241
        __asm__(
2277
241
            "xor              %%"FF_REG_a", %%"FF_REG_a"  \n\t"
2278
241
            "1:                                     \n\t"
2279
241
            PREFETCH" 64(%1, %%"FF_REG_a")          \n\t"
2280
241
            PREFETCH" 64(%2, %%"FF_REG_a")          \n\t"
2281
241
            "movdqa  (%1, %%"FF_REG_a"), %%xmm0     \n\t"
2282
241
            "movdqa  (%1, %%"FF_REG_a"), %%xmm1     \n\t"
2283
241
            "movdqa  (%2, %%"FF_REG_a"), %%xmm2     \n\t"
2284
241
            "punpcklbw           %%xmm2, %%xmm0     \n\t"
2285
241
            "punpckhbw           %%xmm2, %%xmm1     \n\t"
2286
241
            "movntdq             %%xmm0,   (%0, %%"FF_REG_a", 2) \n\t"
2287
241
            "movntdq             %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t"
2288
241
            "add                    $16, %%"FF_REG_a"            \n\t"
2289
241
            "cmp                     %3, %%"FF_REG_a"            \n\t"
2290
241
            " jb                     1b             \n\t"
2291
241
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2292
241
            : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a
2293
241
        );
2294
241
            } else
2295
0
        __asm__(
2296
0
            "xor %%"FF_REG_a", %%"FF_REG_a"         \n\t"
2297
0
            "1:                                     \n\t"
2298
0
            PREFETCH" 64(%1, %%"FF_REG_a")          \n\t"
2299
0
            PREFETCH" 64(%2, %%"FF_REG_a")          \n\t"
2300
0
            "movq    (%1, %%"FF_REG_a"), %%mm0      \n\t"
2301
0
            "movq   8(%1, %%"FF_REG_a"), %%mm2      \n\t"
2302
0
            "movq                 %%mm0, %%mm1      \n\t"
2303
0
            "movq                 %%mm2, %%mm3      \n\t"
2304
0
            "movq    (%2, %%"FF_REG_a"), %%mm4      \n\t"
2305
0
            "movq   8(%2, %%"FF_REG_a"), %%mm5      \n\t"
2306
0
            "punpcklbw            %%mm4, %%mm0      \n\t"
2307
0
            "punpckhbw            %%mm4, %%mm1      \n\t"
2308
0
            "punpcklbw            %%mm5, %%mm2      \n\t"
2309
0
            "punpckhbw            %%mm5, %%mm3      \n\t"
2310
0
            MOVNTQ"               %%mm0,   (%0, %%"FF_REG_a", 2) \n\t"
2311
0
            MOVNTQ"               %%mm1,  8(%0, %%"FF_REG_a", 2) \n\t"
2312
0
            MOVNTQ"               %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t"
2313
0
            MOVNTQ"               %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t"
2314
0
            "add                    $16, %%"FF_REG_a"            \n\t"
2315
0
            "cmp                     %3, %%"FF_REG_a"            \n\t"
2316
0
            " jb                     1b                          \n\t"
2317
0
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2318
0
            : "memory", "%"FF_REG_a
2319
0
        );
2320
2321
241
        }
2322
844
        for (int w = (width & (~15)); w < width; w++) {
2323
597
            dest[2*w+0] = src1[w];
2324
597
            dest[2*w+1] = src2[w];
2325
597
        }
2326
247
        dest += dstStride;
2327
247
        src1 += src1Stride;
2328
247
        src2 += src2Stride;
2329
247
    }
2330
7
    __asm__(
2331
7
            EMMS"       \n\t"
2332
7
            SFENCE"     \n\t"
2333
7
            ::: "memory"
2334
7
            );
2335
7
}
2336
2337
/*
2338
 RGB15->RGB16 original by Strepto/Astral
2339
 ported to gcc & bugfixed : A'rpi
2340
 MMXEXT, 3DNOW optimization by Nick Kurshev
2341
 32-bit C version, and and&add trick by Michael Niedermayer
2342
*/
2343
2344
#endif /* HAVE_INLINE_ASM */
2345
2346
void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2347
void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2348
void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2349
void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2350
void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2351
void ff_shuffle_bytes_3102_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2352
void ff_shuffle_bytes_2013_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2353
void ff_shuffle_bytes_2130_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2354
void ff_shuffle_bytes_1203_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2355
2356
#if ARCH_X86_64
2357
void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2358
void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2359
void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2360
void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2361
void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2362
void ff_shuffle_bytes_3102_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2363
void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2364
void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2365
void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2366
2367
void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2368
void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2369
void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2370
void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2371
void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2372
void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2373
void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2374
void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2375
void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2376
2377
void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2378
                          const uint8_t *src, int width, int height,
2379
                          int lumStride, int chromStride, int srcStride);
2380
void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2381
                         const uint8_t *src, int width, int height,
2382
                         int lumStride, int chromStride, int srcStride);
2383
void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2384
                          const uint8_t *src, int width, int height,
2385
                          int lumStride, int chromStride, int srcStride);
2386
void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2387
                               const uint8_t *src, int width, int height,
2388
                               int lumStride, int chromStride, int srcStride);
2389
#endif
2390
2391
#define DEINTERLEAVE_BYTES(cpuext)                                            \
2392
void ff_nv12ToUV_ ## cpuext(uint8_t *dstU, uint8_t *dstV,                     \
2393
                            const uint8_t *unused,                            \
2394
                            const uint8_t *src1,                              \
2395
                            const uint8_t *src2,                              \
2396
                            int w,                                            \
2397
                            uint32_t *unused2,                                \
2398
                            void *opq);                                       \
2399
static void deinterleave_bytes_ ## cpuext(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, \
2400
                                          int width, int height, int srcStride, \
2401
9
                                          int dst1Stride, int dst2Stride)     \
2402
9
{                                                                             \
2403
213
    for (int h = 0; h < height; h++) {                                        \
2404
204
        if (width >= 16)                                                      \
2405
204
            ff_nv12ToUV_ ## cpuext(dst1, dst2, NULL, src, NULL, width - 15, NULL, NULL); \
2406
777
        for (int w = (width & (~15)); w < width; w++) {                       \
2407
573
            dst1[w] = src[2*w+0];                                             \
2408
573
            dst2[w] = src[2*w+1];                                             \
2409
573
        }                                                                     \
2410
204
        src  += srcStride;                                                    \
2411
204
        dst1 += dst1Stride;                                                   \
2412
204
        dst2 += dst2Stride;                                                   \
2413
204
    }                                                                         \
2414
9
}
2415
2416
#if HAVE_SSE2_EXTERNAL
2417
0
DEINTERLEAVE_BYTES(sse2)
2418
#endif
2419
#if HAVE_AVX_EXTERNAL
2420
9
DEINTERLEAVE_BYTES(avx)
2421
#endif
2422
2423
av_cold void rgb2rgb_init_x86(void)
2424
1
{
2425
1
    int cpu_flags = av_get_cpu_flags();
2426
2427
1
#if HAVE_INLINE_ASM
2428
1
    if (INLINE_MMXEXT(cpu_flags))
2429
1
        rgb2rgb_init_mmxext();
2430
1
    if (INLINE_SSE2(cpu_flags))
2431
1
        interleaveBytes = interleave_bytes_sse2;
2432
1
#endif /* HAVE_INLINE_ASM */
2433
2434
1
#if HAVE_SSE2_EXTERNAL
2435
1
    if (EXTERNAL_SSE2(cpu_flags)) {
2436
1
#if ARCH_X86_64
2437
1
        uyvytoyuv422 = ff_uyvytoyuv422_sse2;
2438
1
#endif
2439
1
        deinterleaveBytes = deinterleave_bytes_sse2;
2440
1
    }
2441
1
#endif
2442
1
    if (EXTERNAL_SSSE3(cpu_flags)) {
2443
1
        shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3;
2444
1
        shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3;
2445
1
        shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3;
2446
1
        shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
2447
1
        shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
2448
1
        shuffle_bytes_3102 = ff_shuffle_bytes_3102_ssse3;
2449
1
        shuffle_bytes_2013 = ff_shuffle_bytes_2013_ssse3;
2450
1
        shuffle_bytes_2130 = ff_shuffle_bytes_2130_ssse3;
2451
1
        shuffle_bytes_1203 = ff_shuffle_bytes_1203_ssse3;
2452
1
    }
2453
1
#if HAVE_AVX_EXTERNAL
2454
1
    if (EXTERNAL_AVX(cpu_flags)) {
2455
1
        deinterleaveBytes = deinterleave_bytes_avx;
2456
1
#if ARCH_X86_64
2457
1
        uyvytoyuv422 = ff_uyvytoyuv422_avx;
2458
1
    }
2459
1
    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
2460
1
        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2;
2461
1
        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2;
2462
1
        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2;
2463
1
        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
2464
1
        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
2465
1
        shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx2;
2466
1
        shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx2;
2467
1
        shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
2468
1
        shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
2469
1
    }
2470
1
    if (EXTERNAL_AVX512ICL(cpu_flags)) {
2471
0
        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
2472
0
        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
2473
0
        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
2474
0
        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
2475
0
        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
2476
0
        shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
2477
0
        shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
2478
0
        shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
2479
0
        shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
2480
0
    }
2481
1
    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
2482
1
        uyvytoyuv422 = ff_uyvytoyuv422_avx2;
2483
1
    }
2484
1
    if (EXTERNAL_AVX512ICL(cpu_flags)) {
2485
0
        uyvytoyuv422 = ff_uyvytoyuv422_avx512icl;
2486
0
#endif
2487
0
    }
2488
1
#endif
2489
1
}