Coverage Report

Created: 2026-01-25 07:18

/src/ffmpeg/libswscale/x86/rgb2rgb.c
Line | Count | Source
1
/*
2
 * software RGB to RGB converter
3
 * pluralize by software PAL8 to RGB converter
4
 *              software YUV to YUV converter
5
 *              software YUV to RGB converter
6
 * Written by Nick Kurshev.
7
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8
 *
9
 * This file is part of FFmpeg.
10
 *
11
 * FFmpeg is free software; you can redistribute it and/or
12
 * modify it under the terms of the GNU Lesser General Public
13
 * License as published by the Free Software Foundation; either
14
 * version 2.1 of the License, or (at your option) any later version.
15
 *
16
 * FFmpeg is distributed in the hope that it will be useful,
17
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19
 * Lesser General Public License for more details.
20
 *
21
 * You should have received a copy of the GNU Lesser General Public
22
 * License along with FFmpeg; if not, write to the Free Software
23
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24
 */
25
26
#include <stdint.h>
27
28
#include "config.h"
29
#include "libavutil/attributes.h"
30
#include "libavutil/x86/cpu.h"
31
#include "libavutil/cpu.h"
32
#include "libavutil/bswap.h"
33
#include "libavutil/mem_internal.h"
34
35
#include "libswscale/rgb2rgb.h"
36
#include "libswscale/swscale.h"
37
#include "libswscale/swscale_internal.h"
38
39
#if HAVE_INLINE_ASM
40
#include "libavutil/x86/asm.h"
41
42
DECLARE_ASM_CONST(8, uint64_t, mmx_ff)       = 0x00000000000000FFULL;
43
DECLARE_ASM_CONST(8, uint64_t, mmx_null)     = 0x0000000000000000ULL;
44
DECLARE_ASM_CONST(8, uint64_t, mask32a)      = 0xFF000000FF000000ULL;
45
DECLARE_ASM_CONST(8, uint64_t, mask3216br)   = 0x00F800F800F800F8ULL;
46
DECLARE_ASM_CONST(8, uint64_t, mask3216g)    = 0x0000FC000000FC00ULL;
47
DECLARE_ASM_CONST(8, uint64_t, mask3215g)    = 0x0000F8000000F800ULL;
48
DECLARE_ASM_CONST(8, uint64_t, mul3216)      = 0x2000000420000004ULL;
49
DECLARE_ASM_CONST(8, uint64_t, mul3215)      = 0x2000000820000008ULL;
50
DECLARE_ASM_CONST(8, uint64_t, mask24b)      = 0x00FF0000FF0000FFULL;
51
DECLARE_ASM_CONST(8, uint64_t, mask24g)      = 0xFF0000FF0000FF00ULL;
52
DECLARE_ASM_CONST(8, uint64_t, mask24r)      = 0x0000FF0000FF0000ULL;
53
DECLARE_ASM_CONST(8, uint64_t, mask24l)      = 0x0000000000FFFFFFULL;
54
DECLARE_ASM_CONST(8, uint64_t, mask24h)      = 0x0000FFFFFF000000ULL;
55
DECLARE_ASM_CONST(8, uint64_t, mask15b)      = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
56
DECLARE_ASM_CONST(8, uint64_t, mask15rg)     = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
57
DECLARE_ASM_CONST(8, uint64_t, mask15s)      = 0xFFE0FFE0FFE0FFE0ULL;
58
DECLARE_ASM_CONST(8, uint64_t, mask15g)      = 0x03E003E003E003E0ULL;
59
DECLARE_ASM_CONST(8, uint64_t, mask15r)      = 0x7C007C007C007C00ULL;
60
0
#define mask16b mask15b
61
DECLARE_ASM_CONST(8, uint64_t, mask16g)      = 0x07E007E007E007E0ULL;
62
DECLARE_ASM_CONST(8, uint64_t, mask16r)      = 0xF800F800F800F800ULL;
63
0
#define red_16mask mask3215g
64
DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
65
DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL;
66
DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL;
67
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
68
0
#define blue_15mask blue_16mask
69
DECLARE_ASM_CONST(8, uint64_t, mul15_mid)    = 0x4200420042004200ULL;
70
DECLARE_ASM_CONST(8, uint64_t, mul15_hi)     = 0x0210021002100210ULL;
71
DECLARE_ASM_CONST(8, uint64_t, mul16_mid)    = 0x2080208020802080ULL;
72
73
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
74
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
75
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
76
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
77
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
78
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
79
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
80
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
81
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
82
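
The BY..RU macros above are the BT.601 limited-range RGB-to-YUV weights scaled to fixed point by RGB2YUV_SHIFT (defined in swscale_internal.h). A minimal sketch of how the luma weights would be used, assuming the same +16 limited-range bias as libswscale's scalar converters:

    /* Hypothetical helper, not part of this file: computes limited-range
     * luma from 8-bit RGB using the fixed-point macros above. */
    static int rgb_to_y(int r, int g, int b)
    {
        /* weighted channel sum scaled back down; +16 biases the result
         * into the 16..235 limited luma range */
        return ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16;
    }
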
83
// MMXEXT versions
84
#define PREFETCH "prefetchnta"
85
#define PAVGB     "pavgb"
86
#define MOVNTQ "movntq"
87
#define SFENCE "sfence"
88
89
#define EMMS     "emms"
90
91
static inline void rgb24tobgr32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
92
0
{
93
0
    uint8_t *dest = dst;
94
0
    const uint8_t *s = src;
95
0
    const uint8_t *end;
96
0
    const uint8_t *mm_end;
97
0
    end = s + src_size;
98
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
99
0
    mm_end = end - 23;
100
0
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
101
0
    while (s < mm_end) {
102
0
        __asm__ volatile(
103
0
            PREFETCH"  32(%1)           \n\t"
104
0
            "movd        (%1), %%mm0    \n\t"
105
0
            "punpckldq  3(%1), %%mm0    \n\t"
106
0
            "movd       6(%1), %%mm1    \n\t"
107
0
            "punpckldq  9(%1), %%mm1    \n\t"
108
0
            "movd      12(%1), %%mm2    \n\t"
109
0
            "punpckldq 15(%1), %%mm2    \n\t"
110
0
            "movd      18(%1), %%mm3    \n\t"
111
0
            "punpckldq 21(%1), %%mm3    \n\t"
112
0
            "por        %%mm7, %%mm0    \n\t"
113
0
            "por        %%mm7, %%mm1    \n\t"
114
0
            "por        %%mm7, %%mm2    \n\t"
115
0
            "por        %%mm7, %%mm3    \n\t"
116
0
            MOVNTQ"     %%mm0,   (%0)   \n\t"
117
0
            MOVNTQ"     %%mm1,  8(%0)   \n\t"
118
0
            MOVNTQ"     %%mm2, 16(%0)   \n\t"
119
0
            MOVNTQ"     %%mm3, 24(%0)"
120
0
            :: "r"(dest), "r"(s)
121
0
            :"memory");
122
0
        dest += 32;
123
0
        s += 24;
124
0
    }
125
0
    __asm__ volatile(SFENCE:::"memory");
126
0
    __asm__ volatile(EMMS:::"memory");
127
0
    while (s < end) {
128
0
        *dest++ = *s++;
129
0
        *dest++ = *s++;
130
0
        *dest++ = *s++;
131
0
        *dest++ = 255;
132
0
    }
133
0
}
134
135
#define STORE_BGR24_MMX \
136
            "psrlq         $8, %%mm2    \n\t" \
137
            "psrlq         $8, %%mm3    \n\t" \
138
            "psrlq         $8, %%mm6    \n\t" \
139
            "psrlq         $8, %%mm7    \n\t" \
140
            "pand "MANGLE(mask24l)", %%mm0\n\t" \
141
            "pand "MANGLE(mask24l)", %%mm1\n\t" \
142
            "pand "MANGLE(mask24l)", %%mm4\n\t" \
143
            "pand "MANGLE(mask24l)", %%mm5\n\t" \
144
            "pand "MANGLE(mask24h)", %%mm2\n\t" \
145
            "pand "MANGLE(mask24h)", %%mm3\n\t" \
146
            "pand "MANGLE(mask24h)", %%mm6\n\t" \
147
            "pand "MANGLE(mask24h)", %%mm7\n\t" \
148
            "por        %%mm2, %%mm0    \n\t" \
149
            "por        %%mm3, %%mm1    \n\t" \
150
            "por        %%mm6, %%mm4    \n\t" \
151
            "por        %%mm7, %%mm5    \n\t" \
152
 \
153
            "movq       %%mm1, %%mm2    \n\t" \
154
            "movq       %%mm4, %%mm3    \n\t" \
155
            "psllq        $48, %%mm2    \n\t" \
156
            "psllq        $32, %%mm3    \n\t" \
157
            "por        %%mm2, %%mm0    \n\t" \
158
            "psrlq        $16, %%mm1    \n\t" \
159
            "psrlq        $32, %%mm4    \n\t" \
160
            "psllq        $16, %%mm5    \n\t" \
161
            "por        %%mm3, %%mm1    \n\t" \
162
            "por        %%mm5, %%mm4    \n\t" \
163
 \
164
            MOVNTQ"     %%mm0,   (%0)    \n\t" \
165
            MOVNTQ"     %%mm1,  8(%0)    \n\t" \
166
            MOVNTQ"     %%mm4, 16(%0)"
167
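
STORE_BGR24_MMX packs eight 4-byte BGRX pixels (held pairwise in the mm registers) down to 24 contiguous BGR bytes: each quadword keeps its low 3-byte pixel via mask24l, recovers the high pixel via the 8-bit shift plus mask24h, and the resulting 6-byte fragments are shifted and OR-ed across register boundaries before the three MOVNTQ stores. A scalar sketch of the same repacking, with a hypothetical helper name:

    /* Hypothetical scalar equivalent of STORE_BGR24_MMX: drop the pad
     * byte of each 4-byte pixel (32 bytes in, 24 bytes out). */
    static void store_bgr24_scalar(uint8_t *d, const uint8_t *s)
    {
        for (int i = 0; i < 8; i++) {
            d[3 * i + 0] = s[4 * i + 0];
            d[3 * i + 1] = s[4 * i + 1];
            d[3 * i + 2] = s[4 * i + 2];
            /* s[4 * i + 3], the X/alpha byte, is discarded */
        }
    }
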
168
169
static inline void rgb32tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
170
0
{
171
0
    uint8_t *dest = dst;
172
0
    const uint8_t *s = src;
173
0
    const uint8_t *end;
174
0
    const uint8_t *mm_end;
175
0
    end = s + src_size;
176
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
177
0
    mm_end = end - 31;
178
0
    while (s < mm_end) {
179
0
        __asm__ volatile(
180
0
            PREFETCH"  32(%1)           \n\t"
181
0
            "movq        (%1), %%mm0    \n\t"
182
0
            "movq       8(%1), %%mm1    \n\t"
183
0
            "movq      16(%1), %%mm4    \n\t"
184
0
            "movq      24(%1), %%mm5    \n\t"
185
0
            "movq       %%mm0, %%mm2    \n\t"
186
0
            "movq       %%mm1, %%mm3    \n\t"
187
0
            "movq       %%mm4, %%mm6    \n\t"
188
0
            "movq       %%mm5, %%mm7    \n\t"
189
0
            STORE_BGR24_MMX
190
0
            :: "r"(dest), "r"(s)
191
0
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
192
0
            :"memory");
193
0
        dest += 24;
194
0
        s += 32;
195
0
    }
196
0
    __asm__ volatile(SFENCE:::"memory");
197
0
    __asm__ volatile(EMMS:::"memory");
198
0
    while (s < end) {
199
0
        *dest++ = *s++;
200
0
        *dest++ = *s++;
201
0
        *dest++ = *s++;
202
0
        s++;
203
0
    }
204
0
}
205
206
/*
207
 original by Strepto/Astral
208
 ported to gcc & bugfixed: A'rpi
209
 MMXEXT, 3DNOW optimization by Nick Kurshev
210
 32-bit C version, and and&add trick by Michael Niedermayer
211
*/
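
The "and&add trick" credited above is what the scalar tail of rgb15to16 relies on: for a 15-bit pixel x (0RRRRRGGGGGBBBBB), adding x to (x & 0x7FE0) doubles bits 5..14, which shifts the R and G fields up one position while leaving B in place — exactly the RGB565 layout, with the new green LSB left at zero. A worked example under that layout assumption:

    /* One pixel through the and&add trick (same math as the scalar tail
     * below): x + (x & 0x7FE0) == (x & 0x001F) | ((x & 0x7FE0) << 1) */
    uint16_t x555 = 0x7FFF;                           /* white in RGB555 */
    uint16_t x565 = (x555 & 0x7FFF) + (x555 & 0x7FE0);
    /* 0x7FFF + 0x7FE0 = 0xFFDF: R=31, G=62, B=31 -- green is doubled,
     * so its low bit stays 0 */
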
212
static inline void rgb15to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
213
0
{
214
0
    register const uint8_t* s=src;
215
0
    register uint8_t* d=dst;
216
0
    register const uint8_t *end;
217
0
    const uint8_t *mm_end;
218
0
    end = s + src_size;
219
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
220
0
    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
221
0
    mm_end = end - 15;
222
0
    while (s<mm_end) {
223
0
        __asm__ volatile(
224
0
            PREFETCH" 32(%1)        \n\t"
225
0
            "movq      (%1), %%mm0  \n\t"
226
0
            "movq     8(%1), %%mm2  \n\t"
227
0
            "movq     %%mm0, %%mm1  \n\t"
228
0
            "movq     %%mm2, %%mm3  \n\t"
229
0
            "pand     %%mm4, %%mm0  \n\t"
230
0
            "pand     %%mm4, %%mm2  \n\t"
231
0
            "paddw    %%mm1, %%mm0  \n\t"
232
0
            "paddw    %%mm3, %%mm2  \n\t"
233
0
            MOVNTQ"   %%mm0,  (%0)  \n\t"
234
0
            MOVNTQ"   %%mm2, 8(%0)"
235
0
            :: "r"(d), "r"(s)
236
0
        );
237
0
        d+=16;
238
0
        s+=16;
239
0
    }
240
0
    __asm__ volatile(SFENCE:::"memory");
241
0
    __asm__ volatile(EMMS:::"memory");
242
0
    mm_end = end - 3;
243
0
    while (s < mm_end) {
244
0
        register unsigned x= *((const uint32_t *)s);
245
0
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
246
0
        d+=4;
247
0
        s+=4;
248
0
    }
249
0
    if (s < end) {
250
0
        register unsigned short x= *((const uint16_t *)s);
251
0
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
252
0
    }
253
0
}
254
255
static inline void rgb16to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
256
0
{
257
0
    register const uint8_t* s=src;
258
0
    register uint8_t* d=dst;
259
0
    register const uint8_t *end;
260
0
    const uint8_t *mm_end;
261
0
    end = s + src_size;
262
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
263
0
    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
264
0
    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
265
0
    mm_end = end - 15;
266
0
    while (s<mm_end) {
267
0
        __asm__ volatile(
268
0
            PREFETCH" 32(%1)        \n\t"
269
0
            "movq      (%1), %%mm0  \n\t"
270
0
            "movq     8(%1), %%mm2  \n\t"
271
0
            "movq     %%mm0, %%mm1  \n\t"
272
0
            "movq     %%mm2, %%mm3  \n\t"
273
0
            "psrlq       $1, %%mm0  \n\t"
274
0
            "psrlq       $1, %%mm2  \n\t"
275
0
            "pand     %%mm7, %%mm0  \n\t"
276
0
            "pand     %%mm7, %%mm2  \n\t"
277
0
            "pand     %%mm6, %%mm1  \n\t"
278
0
            "pand     %%mm6, %%mm3  \n\t"
279
0
            "por      %%mm1, %%mm0  \n\t"
280
0
            "por      %%mm3, %%mm2  \n\t"
281
0
            MOVNTQ"   %%mm0,  (%0)  \n\t"
282
0
            MOVNTQ"   %%mm2, 8(%0)"
283
0
            :: "r"(d), "r"(s)
284
0
        );
285
0
        d+=16;
286
0
        s+=16;
287
0
    }
288
0
    __asm__ volatile(SFENCE:::"memory");
289
0
    __asm__ volatile(EMMS:::"memory");
290
0
    mm_end = end - 3;
291
0
    while (s < mm_end) {
292
0
        register uint32_t x= *((const uint32_t*)s);
293
0
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
294
0
        s+=4;
295
0
        d+=4;
296
0
    }
297
0
    if (s < end) {
298
0
        register uint16_t x= *((const uint16_t*)s);
299
0
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
300
0
    }
301
0
}
302
303
static inline void rgb32to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
304
0
{
305
0
    const uint8_t *s = src;
306
0
    const uint8_t *end;
307
0
    const uint8_t *mm_end;
308
0
    uint16_t *d = (uint16_t *)dst;
309
0
    end = s + src_size;
310
0
    mm_end = end - 15;
311
0
    __asm__ volatile(
312
0
        "movq           %3, %%mm5   \n\t"
313
0
        "movq           %4, %%mm6   \n\t"
314
0
        "movq           %5, %%mm7   \n\t"
315
0
        "jmp 2f                     \n\t"
316
0
        ".p2align        4          \n\t"
317
0
        "1:                         \n\t"
318
0
        PREFETCH"   32(%1)          \n\t"
319
0
        "movd         (%1), %%mm0   \n\t"
320
0
        "movd        4(%1), %%mm3   \n\t"
321
0
        "punpckldq   8(%1), %%mm0   \n\t"
322
0
        "punpckldq  12(%1), %%mm3   \n\t"
323
0
        "movq        %%mm0, %%mm1   \n\t"
324
0
        "movq        %%mm3, %%mm4   \n\t"
325
0
        "pand        %%mm6, %%mm0   \n\t"
326
0
        "pand        %%mm6, %%mm3   \n\t"
327
0
        "pmaddwd     %%mm7, %%mm0   \n\t"
328
0
        "pmaddwd     %%mm7, %%mm3   \n\t"
329
0
        "pand        %%mm5, %%mm1   \n\t"
330
0
        "pand        %%mm5, %%mm4   \n\t"
331
0
        "por         %%mm1, %%mm0   \n\t"
332
0
        "por         %%mm4, %%mm3   \n\t"
333
0
        "psrld          $5, %%mm0   \n\t"
334
0
        "pslld         $11, %%mm3   \n\t"
335
0
        "por         %%mm3, %%mm0   \n\t"
336
0
        MOVNTQ"      %%mm0, (%0)    \n\t"
337
0
        "add           $16,  %1     \n\t"
338
0
        "add            $8,  %0     \n\t"
339
0
        "2:                         \n\t"
340
0
        "cmp            %2,  %1     \n\t"
341
0
        " jb            1b          \n\t"
342
0
        : "+r" (d), "+r"(s)
343
0
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
344
0
    );
345
0
    __asm__ volatile(SFENCE:::"memory");
346
0
    __asm__ volatile(EMMS:::"memory");
347
0
    while (s < end) {
348
0
        register int rgb = *(const uint32_t*)s; s += 4;
349
0
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
350
0
    }
351
0
}
352
353
static inline void rgb32tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
354
0
{
355
0
    const uint8_t *s = src;
356
0
    const uint8_t *end;
357
0
    const uint8_t *mm_end;
358
0
    uint16_t *d = (uint16_t *)dst;
359
0
    end = s + src_size;
360
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
361
0
    __asm__ volatile(
362
0
        "movq          %0, %%mm7    \n\t"
363
0
        "movq          %1, %%mm6    \n\t"
364
0
        ::"m"(red_16mask),"m"(green_16mask));
365
0
    mm_end = end - 15;
366
0
    while (s < mm_end) {
367
0
        __asm__ volatile(
368
0
            PREFETCH"  32(%1)           \n\t"
369
0
            "movd        (%1), %%mm0    \n\t"
370
0
            "movd       4(%1), %%mm3    \n\t"
371
0
            "punpckldq  8(%1), %%mm0    \n\t"
372
0
            "punpckldq 12(%1), %%mm3    \n\t"
373
0
            "movq       %%mm0, %%mm1    \n\t"
374
0
            "movq       %%mm0, %%mm2    \n\t"
375
0
            "movq       %%mm3, %%mm4    \n\t"
376
0
            "movq       %%mm3, %%mm5    \n\t"
377
0
            "psllq         $8, %%mm0    \n\t"
378
0
            "psllq         $8, %%mm3    \n\t"
379
0
            "pand       %%mm7, %%mm0    \n\t"
380
0
            "pand       %%mm7, %%mm3    \n\t"
381
0
            "psrlq         $5, %%mm1    \n\t"
382
0
            "psrlq         $5, %%mm4    \n\t"
383
0
            "pand       %%mm6, %%mm1    \n\t"
384
0
            "pand       %%mm6, %%mm4    \n\t"
385
0
            "psrlq        $19, %%mm2    \n\t"
386
0
            "psrlq        $19, %%mm5    \n\t"
387
0
            "pand          %2, %%mm2    \n\t"
388
0
            "pand          %2, %%mm5    \n\t"
389
0
            "por        %%mm1, %%mm0    \n\t"
390
0
            "por        %%mm4, %%mm3    \n\t"
391
0
            "por        %%mm2, %%mm0    \n\t"
392
0
            "por        %%mm5, %%mm3    \n\t"
393
0
            "psllq        $16, %%mm3    \n\t"
394
0
            "por        %%mm3, %%mm0    \n\t"
395
0
            MOVNTQ"     %%mm0, (%0)     \n\t"
396
0
            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
397
0
        d += 4;
398
0
        s += 16;
399
0
    }
400
0
    __asm__ volatile(SFENCE:::"memory");
401
0
    __asm__ volatile(EMMS:::"memory");
402
0
    while (s < end) {
403
0
        register int rgb = *(const uint32_t*)s; s += 4;
404
0
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
405
0
    }
406
0
}
407
408
static inline void rgb32to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
409
0
{
410
0
    const uint8_t *s = src;
411
0
    const uint8_t *end;
412
0
    const uint8_t *mm_end;
413
0
    uint16_t *d = (uint16_t *)dst;
414
0
    end = s + src_size;
415
0
    mm_end = end - 15;
416
0
    __asm__ volatile(
417
0
        "movq           %3, %%mm5   \n\t"
418
0
        "movq           %4, %%mm6   \n\t"
419
0
        "movq           %5, %%mm7   \n\t"
420
0
        "jmp            2f          \n\t"
421
0
        ".p2align        4          \n\t"
422
0
        "1:                         \n\t"
423
0
        PREFETCH"   32(%1)          \n\t"
424
0
        "movd         (%1), %%mm0   \n\t"
425
0
        "movd        4(%1), %%mm3   \n\t"
426
0
        "punpckldq   8(%1), %%mm0   \n\t"
427
0
        "punpckldq  12(%1), %%mm3   \n\t"
428
0
        "movq        %%mm0, %%mm1   \n\t"
429
0
        "movq        %%mm3, %%mm4   \n\t"
430
0
        "pand        %%mm6, %%mm0   \n\t"
431
0
        "pand        %%mm6, %%mm3   \n\t"
432
0
        "pmaddwd     %%mm7, %%mm0   \n\t"
433
0
        "pmaddwd     %%mm7, %%mm3   \n\t"
434
0
        "pand        %%mm5, %%mm1   \n\t"
435
0
        "pand        %%mm5, %%mm4   \n\t"
436
0
        "por         %%mm1, %%mm0   \n\t"
437
0
        "por         %%mm4, %%mm3   \n\t"
438
0
        "psrld          $6, %%mm0   \n\t"
439
0
        "pslld         $10, %%mm3   \n\t"
440
0
        "por         %%mm3, %%mm0   \n\t"
441
0
        MOVNTQ"      %%mm0, (%0)    \n\t"
442
0
        "add           $16,  %1     \n\t"
443
0
        "add            $8,  %0     \n\t"
444
0
        "2:                         \n\t"
445
0
        "cmp            %2,  %1     \n\t"
446
0
        " jb            1b          \n\t"
447
0
        : "+r" (d), "+r"(s)
448
0
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
449
0
    );
450
0
    __asm__ volatile(SFENCE:::"memory");
451
0
    __asm__ volatile(EMMS:::"memory");
452
0
    while (s < end) {
453
0
        register int rgb = *(const uint32_t*)s; s += 4;
454
0
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
455
0
    }
456
0
}
457
458
static inline void rgb32tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
459
0
{
460
0
    const uint8_t *s = src;
461
0
    const uint8_t *end;
462
0
    const uint8_t *mm_end;
463
0
    uint16_t *d = (uint16_t *)dst;
464
0
    end = s + src_size;
465
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
466
0
    __asm__ volatile(
467
0
        "movq          %0, %%mm7    \n\t"
468
0
        "movq          %1, %%mm6    \n\t"
469
0
        ::"m"(red_15mask),"m"(green_15mask));
470
0
    mm_end = end - 15;
471
0
    while (s < mm_end) {
472
0
        __asm__ volatile(
473
0
            PREFETCH"  32(%1)           \n\t"
474
0
            "movd        (%1), %%mm0    \n\t"
475
0
            "movd       4(%1), %%mm3    \n\t"
476
0
            "punpckldq  8(%1), %%mm0    \n\t"
477
0
            "punpckldq 12(%1), %%mm3    \n\t"
478
0
            "movq       %%mm0, %%mm1    \n\t"
479
0
            "movq       %%mm0, %%mm2    \n\t"
480
0
            "movq       %%mm3, %%mm4    \n\t"
481
0
            "movq       %%mm3, %%mm5    \n\t"
482
0
            "psllq         $7, %%mm0    \n\t"
483
0
            "psllq         $7, %%mm3    \n\t"
484
0
            "pand       %%mm7, %%mm0    \n\t"
485
0
            "pand       %%mm7, %%mm3    \n\t"
486
0
            "psrlq         $6, %%mm1    \n\t"
487
0
            "psrlq         $6, %%mm4    \n\t"
488
0
            "pand       %%mm6, %%mm1    \n\t"
489
0
            "pand       %%mm6, %%mm4    \n\t"
490
0
            "psrlq        $19, %%mm2    \n\t"
491
0
            "psrlq        $19, %%mm5    \n\t"
492
0
            "pand          %2, %%mm2    \n\t"
493
0
            "pand          %2, %%mm5    \n\t"
494
0
            "por        %%mm1, %%mm0    \n\t"
495
0
            "por        %%mm4, %%mm3    \n\t"
496
0
            "por        %%mm2, %%mm0    \n\t"
497
0
            "por        %%mm5, %%mm3    \n\t"
498
0
            "psllq        $16, %%mm3    \n\t"
499
0
            "por        %%mm3, %%mm0    \n\t"
500
0
            MOVNTQ"     %%mm0, (%0)     \n\t"
501
0
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
502
0
        d += 4;
503
0
        s += 16;
504
0
    }
505
0
    __asm__ volatile(SFENCE:::"memory");
506
0
    __asm__ volatile(EMMS:::"memory");
507
0
    while (s < end) {
508
0
        register int rgb = *(const uint32_t*)s; s += 4;
509
0
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
510
0
    }
511
0
}
512
513
static inline void rgb24tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
514
0
{
515
0
    const uint8_t *s = src;
516
0
    const uint8_t *end;
517
0
    const uint8_t *mm_end;
518
0
    uint16_t *d = (uint16_t *)dst;
519
0
    end = s + src_size;
520
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
521
0
    __asm__ volatile(
522
0
        "movq         %0, %%mm7     \n\t"
523
0
        "movq         %1, %%mm6     \n\t"
524
0
        ::"m"(red_16mask),"m"(green_16mask));
525
0
    mm_end = end - 11;
526
0
    while (s < mm_end) {
527
0
        __asm__ volatile(
528
0
            PREFETCH"  32(%1)           \n\t"
529
0
            "movd        (%1), %%mm0    \n\t"
530
0
            "movd       3(%1), %%mm3    \n\t"
531
0
            "punpckldq  6(%1), %%mm0    \n\t"
532
0
            "punpckldq  9(%1), %%mm3    \n\t"
533
0
            "movq       %%mm0, %%mm1    \n\t"
534
0
            "movq       %%mm0, %%mm2    \n\t"
535
0
            "movq       %%mm3, %%mm4    \n\t"
536
0
            "movq       %%mm3, %%mm5    \n\t"
537
0
            "psrlq         $3, %%mm0    \n\t"
538
0
            "psrlq         $3, %%mm3    \n\t"
539
0
            "pand          %2, %%mm0    \n\t"
540
0
            "pand          %2, %%mm3    \n\t"
541
0
            "psrlq         $5, %%mm1    \n\t"
542
0
            "psrlq         $5, %%mm4    \n\t"
543
0
            "pand       %%mm6, %%mm1    \n\t"
544
0
            "pand       %%mm6, %%mm4    \n\t"
545
0
            "psrlq         $8, %%mm2    \n\t"
546
0
            "psrlq         $8, %%mm5    \n\t"
547
0
            "pand       %%mm7, %%mm2    \n\t"
548
0
            "pand       %%mm7, %%mm5    \n\t"
549
0
            "por        %%mm1, %%mm0    \n\t"
550
0
            "por        %%mm4, %%mm3    \n\t"
551
0
            "por        %%mm2, %%mm0    \n\t"
552
0
            "por        %%mm5, %%mm3    \n\t"
553
0
            "psllq        $16, %%mm3    \n\t"
554
0
            "por        %%mm3, %%mm0    \n\t"
555
0
            MOVNTQ"     %%mm0, (%0)     \n\t"
556
0
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
557
0
        d += 4;
558
0
        s += 12;
559
0
    }
560
0
    __asm__ volatile(SFENCE:::"memory");
561
0
    __asm__ volatile(EMMS:::"memory");
562
0
    while (s < end) {
563
0
        const int b = *s++;
564
0
        const int g = *s++;
565
0
        const int r = *s++;
566
0
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
567
0
    }
568
0
}
569
570
static inline void rgb24to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
571
0
{
572
0
    const uint8_t *s = src;
573
0
    const uint8_t *end;
574
0
    const uint8_t *mm_end;
575
0
    uint16_t *d = (uint16_t *)dst;
576
0
    end = s + src_size;
577
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
578
0
    __asm__ volatile(
579
0
        "movq         %0, %%mm7     \n\t"
580
0
        "movq         %1, %%mm6     \n\t"
581
0
        ::"m"(red_16mask),"m"(green_16mask));
582
0
    mm_end = end - 15;
583
0
    while (s < mm_end) {
584
0
        __asm__ volatile(
585
0
            PREFETCH"  32(%1)           \n\t"
586
0
            "movd        (%1), %%mm0    \n\t"
587
0
            "movd       3(%1), %%mm3    \n\t"
588
0
            "punpckldq  6(%1), %%mm0    \n\t"
589
0
            "punpckldq  9(%1), %%mm3    \n\t"
590
0
            "movq       %%mm0, %%mm1    \n\t"
591
0
            "movq       %%mm0, %%mm2    \n\t"
592
0
            "movq       %%mm3, %%mm4    \n\t"
593
0
            "movq       %%mm3, %%mm5    \n\t"
594
0
            "psllq         $8, %%mm0    \n\t"
595
0
            "psllq         $8, %%mm3    \n\t"
596
0
            "pand       %%mm7, %%mm0    \n\t"
597
0
            "pand       %%mm7, %%mm3    \n\t"
598
0
            "psrlq         $5, %%mm1    \n\t"
599
0
            "psrlq         $5, %%mm4    \n\t"
600
0
            "pand       %%mm6, %%mm1    \n\t"
601
0
            "pand       %%mm6, %%mm4    \n\t"
602
0
            "psrlq        $19, %%mm2    \n\t"
603
0
            "psrlq        $19, %%mm5    \n\t"
604
0
            "pand          %2, %%mm2    \n\t"
605
0
            "pand          %2, %%mm5    \n\t"
606
0
            "por        %%mm1, %%mm0    \n\t"
607
0
            "por        %%mm4, %%mm3    \n\t"
608
0
            "por        %%mm2, %%mm0    \n\t"
609
0
            "por        %%mm5, %%mm3    \n\t"
610
0
            "psllq        $16, %%mm3    \n\t"
611
0
            "por        %%mm3, %%mm0    \n\t"
612
0
            MOVNTQ"     %%mm0, (%0)     \n\t"
613
0
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
614
0
        d += 4;
615
0
        s += 12;
616
0
    }
617
0
    __asm__ volatile(SFENCE:::"memory");
618
0
    __asm__ volatile(EMMS:::"memory");
619
0
    while (s < end) {
620
0
        const int r = *s++;
621
0
        const int g = *s++;
622
0
        const int b = *s++;
623
0
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
624
0
    }
625
0
}
626
627
static inline void rgb24tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
628
0
{
629
0
    const uint8_t *s = src;
630
0
    const uint8_t *end;
631
0
    const uint8_t *mm_end;
632
0
    uint16_t *d = (uint16_t *)dst;
633
0
    end = s + src_size;
634
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
635
0
    __asm__ volatile(
636
0
        "movq          %0, %%mm7    \n\t"
637
0
        "movq          %1, %%mm6    \n\t"
638
0
        ::"m"(red_15mask),"m"(green_15mask));
639
0
    mm_end = end - 11;
640
0
    while (s < mm_end) {
641
0
        __asm__ volatile(
642
0
            PREFETCH"  32(%1)           \n\t"
643
0
            "movd        (%1), %%mm0    \n\t"
644
0
            "movd       3(%1), %%mm3    \n\t"
645
0
            "punpckldq  6(%1), %%mm0    \n\t"
646
0
            "punpckldq  9(%1), %%mm3    \n\t"
647
0
            "movq       %%mm0, %%mm1    \n\t"
648
0
            "movq       %%mm0, %%mm2    \n\t"
649
0
            "movq       %%mm3, %%mm4    \n\t"
650
0
            "movq       %%mm3, %%mm5    \n\t"
651
0
            "psrlq         $3, %%mm0    \n\t"
652
0
            "psrlq         $3, %%mm3    \n\t"
653
0
            "pand          %2, %%mm0    \n\t"
654
0
            "pand          %2, %%mm3    \n\t"
655
0
            "psrlq         $6, %%mm1    \n\t"
656
0
            "psrlq         $6, %%mm4    \n\t"
657
0
            "pand       %%mm6, %%mm1    \n\t"
658
0
            "pand       %%mm6, %%mm4    \n\t"
659
0
            "psrlq         $9, %%mm2    \n\t"
660
0
            "psrlq         $9, %%mm5    \n\t"
661
0
            "pand       %%mm7, %%mm2    \n\t"
662
0
            "pand       %%mm7, %%mm5    \n\t"
663
0
            "por        %%mm1, %%mm0    \n\t"
664
0
            "por        %%mm4, %%mm3    \n\t"
665
0
            "por        %%mm2, %%mm0    \n\t"
666
0
            "por        %%mm5, %%mm3    \n\t"
667
0
            "psllq        $16, %%mm3    \n\t"
668
0
            "por        %%mm3, %%mm0    \n\t"
669
0
            MOVNTQ"     %%mm0, (%0)     \n\t"
670
0
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
671
0
        d += 4;
672
0
        s += 12;
673
0
    }
674
0
    __asm__ volatile(SFENCE:::"memory");
675
0
    __asm__ volatile(EMMS:::"memory");
676
0
    while (s < end) {
677
0
        const int b = *s++;
678
0
        const int g = *s++;
679
0
        const int r = *s++;
680
0
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
681
0
    }
682
0
}
683
684
static inline void rgb24to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
685
0
{
686
0
    const uint8_t *s = src;
687
0
    const uint8_t *end;
688
0
    const uint8_t *mm_end;
689
0
    uint16_t *d = (uint16_t *)dst;
690
0
    end = s + src_size;
691
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
692
0
    __asm__ volatile(
693
0
        "movq         %0, %%mm7     \n\t"
694
0
        "movq         %1, %%mm6     \n\t"
695
0
        ::"m"(red_15mask),"m"(green_15mask));
696
0
    mm_end = end - 15;
697
0
    while (s < mm_end) {
698
0
        __asm__ volatile(
699
0
            PREFETCH" 32(%1)            \n\t"
700
0
            "movd       (%1), %%mm0     \n\t"
701
0
            "movd      3(%1), %%mm3     \n\t"
702
0
            "punpckldq 6(%1), %%mm0     \n\t"
703
0
            "punpckldq 9(%1), %%mm3     \n\t"
704
0
            "movq      %%mm0, %%mm1     \n\t"
705
0
            "movq      %%mm0, %%mm2     \n\t"
706
0
            "movq      %%mm3, %%mm4     \n\t"
707
0
            "movq      %%mm3, %%mm5     \n\t"
708
0
            "psllq        $7, %%mm0     \n\t"
709
0
            "psllq        $7, %%mm3     \n\t"
710
0
            "pand      %%mm7, %%mm0     \n\t"
711
0
            "pand      %%mm7, %%mm3     \n\t"
712
0
            "psrlq        $6, %%mm1     \n\t"
713
0
            "psrlq        $6, %%mm4     \n\t"
714
0
            "pand      %%mm6, %%mm1     \n\t"
715
0
            "pand      %%mm6, %%mm4     \n\t"
716
0
            "psrlq       $19, %%mm2     \n\t"
717
0
            "psrlq       $19, %%mm5     \n\t"
718
0
            "pand         %2, %%mm2     \n\t"
719
0
            "pand         %2, %%mm5     \n\t"
720
0
            "por       %%mm1, %%mm0     \n\t"
721
0
            "por       %%mm4, %%mm3     \n\t"
722
0
            "por       %%mm2, %%mm0     \n\t"
723
0
            "por       %%mm5, %%mm3     \n\t"
724
0
            "psllq       $16, %%mm3     \n\t"
725
0
            "por       %%mm3, %%mm0     \n\t"
726
0
            MOVNTQ"    %%mm0, (%0)      \n\t"
727
0
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
728
0
        d += 4;
729
0
        s += 12;
730
0
    }
731
0
    __asm__ volatile(SFENCE:::"memory");
732
0
    __asm__ volatile(EMMS:::"memory");
733
0
    while (s < end) {
734
0
        const int r = *s++;
735
0
        const int g = *s++;
736
0
        const int b = *s++;
737
0
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
738
0
    }
739
0
}
740
741
static inline void rgb15tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
742
0
{
743
0
    const uint16_t *end;
744
0
    const uint16_t *mm_end;
745
0
    uint8_t *d = dst;
746
0
    const uint16_t *s = (const uint16_t*)src;
747
0
    end = s + src_size/2;
748
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
749
0
    mm_end = end - 7;
750
0
    while (s < mm_end) {
751
0
        __asm__ volatile(
752
0
            PREFETCH"  32(%1)           \n\t"
753
0
            "movq        (%1), %%mm0    \n\t"
754
0
            "movq        (%1), %%mm1    \n\t"
755
0
            "movq        (%1), %%mm2    \n\t"
756
0
            "pand          %2, %%mm0    \n\t"
757
0
            "pand          %3, %%mm1    \n\t"
758
0
            "pand          %4, %%mm2    \n\t"
759
0
            "psllq         $5, %%mm0    \n\t"
760
0
            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
761
0
            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
762
0
            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
763
0
            "movq       %%mm0, %%mm3    \n\t"
764
0
            "movq       %%mm1, %%mm4    \n\t"
765
0
            "movq       %%mm2, %%mm5    \n\t"
766
0
            "punpcklwd     %5, %%mm0    \n\t"
767
0
            "punpcklwd     %5, %%mm1    \n\t"
768
0
            "punpcklwd     %5, %%mm2    \n\t"
769
0
            "punpckhwd     %5, %%mm3    \n\t"
770
0
            "punpckhwd     %5, %%mm4    \n\t"
771
0
            "punpckhwd     %5, %%mm5    \n\t"
772
0
            "psllq         $8, %%mm1    \n\t"
773
0
            "psllq        $16, %%mm2    \n\t"
774
0
            "por        %%mm1, %%mm0    \n\t"
775
0
            "por        %%mm2, %%mm0    \n\t"
776
0
            "psllq         $8, %%mm4    \n\t"
777
0
            "psllq        $16, %%mm5    \n\t"
778
0
            "por        %%mm4, %%mm3    \n\t"
779
0
            "por        %%mm5, %%mm3    \n\t"
780
781
0
            "movq       %%mm0, %%mm6    \n\t"
782
0
            "movq       %%mm3, %%mm7    \n\t"
783
784
0
            "movq       8(%1), %%mm0    \n\t"
785
0
            "movq       8(%1), %%mm1    \n\t"
786
0
            "movq       8(%1), %%mm2    \n\t"
787
0
            "pand          %2, %%mm0    \n\t"
788
0
            "pand          %3, %%mm1    \n\t"
789
0
            "pand          %4, %%mm2    \n\t"
790
0
            "psllq         $5, %%mm0    \n\t"
791
0
            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
792
0
            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
793
0
            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
794
0
            "movq       %%mm0, %%mm3    \n\t"
795
0
            "movq       %%mm1, %%mm4    \n\t"
796
0
            "movq       %%mm2, %%mm5    \n\t"
797
0
            "punpcklwd     %5, %%mm0    \n\t"
798
0
            "punpcklwd     %5, %%mm1    \n\t"
799
0
            "punpcklwd     %5, %%mm2    \n\t"
800
0
            "punpckhwd     %5, %%mm3    \n\t"
801
0
            "punpckhwd     %5, %%mm4    \n\t"
802
0
            "punpckhwd     %5, %%mm5    \n\t"
803
0
            "psllq         $8, %%mm1    \n\t"
804
0
            "psllq        $16, %%mm2    \n\t"
805
0
            "por        %%mm1, %%mm0    \n\t"
806
0
            "por        %%mm2, %%mm0    \n\t"
807
0
            "psllq         $8, %%mm4    \n\t"
808
0
            "psllq        $16, %%mm5    \n\t"
809
0
            "por        %%mm4, %%mm3    \n\t"
810
0
            "por        %%mm5, %%mm3    \n\t"
811
812
0
            :"=m"(*d)
813
0
            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
814
0
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
815
0
            :"memory");
816
        /* borrowed 32 to 24 */
817
0
        __asm__ volatile(
818
0
            "movq       %%mm0, %%mm4    \n\t"
819
0
            "movq       %%mm3, %%mm5    \n\t"
820
0
            "movq       %%mm6, %%mm0    \n\t"
821
0
            "movq       %%mm7, %%mm1    \n\t"
822
823
0
            "movq       %%mm4, %%mm6    \n\t"
824
0
            "movq       %%mm5, %%mm7    \n\t"
825
0
            "movq       %%mm0, %%mm2    \n\t"
826
0
            "movq       %%mm1, %%mm3    \n\t"
827
828
0
            STORE_BGR24_MMX
829
830
0
            :: "r"(d), "m"(*s)
831
0
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
832
0
            :"memory");
833
0
        d += 24;
834
0
        s += 8;
835
0
    }
836
0
    __asm__ volatile(SFENCE:::"memory");
837
0
    __asm__ volatile(EMMS:::"memory");
838
0
    while (s < end) {
839
0
        register uint16_t bgr;
840
0
        bgr = *s++;
841
0
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
842
0
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
843
0
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
844
0
    }
845
0
}
846
847
static inline void rgb16tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
848
0
{
849
0
    const uint16_t *end;
850
0
    const uint16_t *mm_end;
851
0
    uint8_t *d = (uint8_t *)dst;
852
0
    const uint16_t *s = (const uint16_t *)src;
853
0
    end = s + src_size/2;
854
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
855
0
    mm_end = end - 7;
856
0
    while (s < mm_end) {
857
0
        __asm__ volatile(
858
0
            PREFETCH"  32(%1)           \n\t"
859
0
            "movq        (%1), %%mm0    \n\t"
860
0
            "movq        (%1), %%mm1    \n\t"
861
0
            "movq        (%1), %%mm2    \n\t"
862
0
            "pand          %2, %%mm0    \n\t"
863
0
            "pand          %3, %%mm1    \n\t"
864
0
            "pand          %4, %%mm2    \n\t"
865
0
            "psllq         $5, %%mm0    \n\t"
866
0
            "psrlq         $1, %%mm2    \n\t"
867
0
            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
868
0
            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
869
0
            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
870
0
            "movq       %%mm0, %%mm3    \n\t"
871
0
            "movq       %%mm1, %%mm4    \n\t"
872
0
            "movq       %%mm2, %%mm5    \n\t"
873
0
            "punpcklwd     %5, %%mm0    \n\t"
874
0
            "punpcklwd     %5, %%mm1    \n\t"
875
0
            "punpcklwd     %5, %%mm2    \n\t"
876
0
            "punpckhwd     %5, %%mm3    \n\t"
877
0
            "punpckhwd     %5, %%mm4    \n\t"
878
0
            "punpckhwd     %5, %%mm5    \n\t"
879
0
            "psllq         $8, %%mm1    \n\t"
880
0
            "psllq        $16, %%mm2    \n\t"
881
0
            "por        %%mm1, %%mm0    \n\t"
882
0
            "por        %%mm2, %%mm0    \n\t"
883
0
            "psllq         $8, %%mm4    \n\t"
884
0
            "psllq        $16, %%mm5    \n\t"
885
0
            "por        %%mm4, %%mm3    \n\t"
886
0
            "por        %%mm5, %%mm3    \n\t"
887
888
0
            "movq       %%mm0, %%mm6    \n\t"
889
0
            "movq       %%mm3, %%mm7    \n\t"
890
891
0
            "movq       8(%1), %%mm0    \n\t"
892
0
            "movq       8(%1), %%mm1    \n\t"
893
0
            "movq       8(%1), %%mm2    \n\t"
894
0
            "pand          %2, %%mm0    \n\t"
895
0
            "pand          %3, %%mm1    \n\t"
896
0
            "pand          %4, %%mm2    \n\t"
897
0
            "psllq         $5, %%mm0    \n\t"
898
0
            "psrlq         $1, %%mm2    \n\t"
899
0
            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
900
0
            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
901
0
            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
902
0
            "movq       %%mm0, %%mm3    \n\t"
903
0
            "movq       %%mm1, %%mm4    \n\t"
904
0
            "movq       %%mm2, %%mm5    \n\t"
905
0
            "punpcklwd     %5, %%mm0    \n\t"
906
0
            "punpcklwd     %5, %%mm1    \n\t"
907
0
            "punpcklwd     %5, %%mm2    \n\t"
908
0
            "punpckhwd     %5, %%mm3    \n\t"
909
0
            "punpckhwd     %5, %%mm4    \n\t"
910
0
            "punpckhwd     %5, %%mm5    \n\t"
911
0
            "psllq         $8, %%mm1    \n\t"
912
0
            "psllq        $16, %%mm2    \n\t"
913
0
            "por        %%mm1, %%mm0    \n\t"
914
0
            "por        %%mm2, %%mm0    \n\t"
915
0
            "psllq         $8, %%mm4    \n\t"
916
0
            "psllq        $16, %%mm5    \n\t"
917
0
            "por        %%mm4, %%mm3    \n\t"
918
0
            "por        %%mm5, %%mm3    \n\t"
919
0
            :"=m"(*d)
920
0
            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
921
0
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
922
0
            :"memory");
923
        /* borrowed 32 to 24 */
924
0
        __asm__ volatile(
925
0
            "movq       %%mm0, %%mm4    \n\t"
926
0
            "movq       %%mm3, %%mm5    \n\t"
927
0
            "movq       %%mm6, %%mm0    \n\t"
928
0
            "movq       %%mm7, %%mm1    \n\t"
929
930
0
            "movq       %%mm4, %%mm6    \n\t"
931
0
            "movq       %%mm5, %%mm7    \n\t"
932
0
            "movq       %%mm0, %%mm2    \n\t"
933
0
            "movq       %%mm1, %%mm3    \n\t"
934
935
0
            STORE_BGR24_MMX
936
937
0
            :: "r"(d), "m"(*s)
938
0
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
939
0
            :"memory");
940
0
        d += 24;
941
0
        s += 8;
942
0
    }
943
0
    __asm__ volatile(SFENCE:::"memory");
944
0
    __asm__ volatile(EMMS:::"memory");
945
0
    while (s < end) {
946
0
        register uint16_t bgr;
947
0
        bgr = *s++;
948
0
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
949
0
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
950
0
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
951
0
    }
952
0
}
953
954
/*
955
 * mm0 = 00 B3 00 B2 00 B1 00 B0
956
 * mm1 = 00 G3 00 G2 00 G1 00 G0
957
 * mm2 = 00 R3 00 R2 00 R1 00 R0
958
 * mm6 = FF FF FF FF FF FF FF FF
959
 * mm7 = 00 00 00 00 00 00 00 00
960
 */
961
#define PACK_RGB32 \
962
    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
963
    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
964
    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
965
    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
966
    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
967
    "movq       %%mm0, %%mm3    \n\t"                               \
968
    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
969
    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
970
    MOVNTQ"     %%mm0,  (%0)    \n\t"                               \
971
    MOVNTQ"     %%mm3, 8(%0)    \n\t"                               \
972
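
PACK_RGB32 narrows the three word-per-sample channel registers produced by the pmulhw stage to bytes and interleaves them with an all-ones alpha byte from mm6, emitting four 4-byte pixels per invocation. A scalar sketch of the same interleave, with a hypothetical helper name:

    /* Hypothetical scalar equivalent of PACK_RGB32: b/g/r hold the four
     * 8-bit channel values kept in the word lanes of mm0/mm1/mm2. */
    static void pack_rgb32_scalar(uint8_t *d, const uint8_t b[4],
                                  const uint8_t g[4], const uint8_t r[4])
    {
        for (int i = 0; i < 4; i++) {
            d[4 * i + 0] = b[i];
            d[4 * i + 1] = g[i];
            d[4 * i + 2] = r[i];
            d[4 * i + 3] = 0xFF;  /* mm6 (pcmpeqd) supplies the alpha */
        }
    }
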
973
static inline void rgb15to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
974
0
{
975
0
    const uint16_t *end;
976
0
    const uint16_t *mm_end;
977
0
    uint8_t *d = dst;
978
0
    const uint16_t *s = (const uint16_t *)src;
979
0
    end = s + src_size/2;
980
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
981
0
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
982
0
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
983
0
    mm_end = end - 3;
984
0
    while (s < mm_end) {
985
0
        __asm__ volatile(
986
0
            PREFETCH"  32(%1)           \n\t"
987
0
            "movq        (%1), %%mm0    \n\t"
988
0
            "movq        (%1), %%mm1    \n\t"
989
0
            "movq        (%1), %%mm2    \n\t"
990
0
            "pand          %2, %%mm0    \n\t"
991
0
            "pand          %3, %%mm1    \n\t"
992
0
            "pand          %4, %%mm2    \n\t"
993
0
            "psllq         $5, %%mm0    \n\t"
994
0
            "pmulhw        %5, %%mm0    \n\t"
995
0
            "pmulhw        %5, %%mm1    \n\t"
996
0
            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
997
0
            PACK_RGB32
998
0
            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
999
0
              NAMED_CONSTRAINTS_ADD(mul15_hi)
1000
0
            :"memory");
1001
0
        d += 16;
1002
0
        s += 4;
1003
0
    }
1004
0
    __asm__ volatile(SFENCE:::"memory");
1005
0
    __asm__ volatile(EMMS:::"memory");
1006
0
    while (s < end) {
1007
0
        register uint16_t bgr;
1008
0
        bgr = *s++;
1009
0
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
1010
0
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
1011
0
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
1012
0
        *d++ = 255;
1013
0
    }
1014
0
}
1015
1016
static inline void rgb16to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
1017
0
{
1018
0
    const uint16_t *end;
1019
0
    const uint16_t *mm_end;
1020
0
    uint8_t *d = dst;
1021
0
    const uint16_t *s = (const uint16_t*)src;
1022
0
    end = s + src_size/2;
1023
0
    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
1024
0
    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
1025
0
    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
1026
0
    mm_end = end - 3;
1027
0
    while (s < mm_end) {
1028
0
        __asm__ volatile(
1029
0
            PREFETCH"  32(%1)           \n\t"
1030
0
            "movq        (%1), %%mm0    \n\t"
1031
0
            "movq        (%1), %%mm1    \n\t"
1032
0
            "movq        (%1), %%mm2    \n\t"
1033
0
            "pand          %2, %%mm0    \n\t"
1034
0
            "pand          %3, %%mm1    \n\t"
1035
0
            "pand          %4, %%mm2    \n\t"
1036
0
            "psllq         $5, %%mm0    \n\t"
1037
0
            "psrlq         $1, %%mm2    \n\t"
1038
0
            "pmulhw        %5, %%mm0    \n\t"
1039
0
            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
1040
0
            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
1041
0
            PACK_RGB32
1042
0
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
1043
0
              NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
1044
0
            :"memory");
1045
0
        d += 16;
1046
0
        s += 4;
1047
0
    }
1048
0
    __asm__ volatile(SFENCE:::"memory");
1049
0
    __asm__ volatile(EMMS:::"memory");
1050
0
    while (s < end) {
1051
0
        register uint16_t bgr;
1052
0
        bgr = *s++;
1053
0
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
1054
0
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
1055
0
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
1056
0
        *d++ = 255;
1057
0
    }
1058
0
}
1059
1060
static inline void rgb24tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
1061
0
{
1062
0
    x86_reg mmx_size= 23 - src_size;
1063
0
    __asm__ volatile (
1064
0
        "test             %%"FF_REG_a", %%"FF_REG_a"    \n\t"
1065
0
        "jns                     2f                     \n\t"
1066
0
        "movq     "MANGLE(mask24r)", %%mm5              \n\t"
1067
0
        "movq     "MANGLE(mask24g)", %%mm6              \n\t"
1068
0
        "movq     "MANGLE(mask24b)", %%mm7              \n\t"
1069
0
        ".p2align                 4                     \n\t"
1070
0
        "1:                                             \n\t"
1071
0
        PREFETCH" 32(%1, %%"FF_REG_a")                  \n\t"
1072
0
        "movq    (%1, %%"FF_REG_a"), %%mm0              \n\t" // BGR BGR BG
1073
0
        "movq    (%1, %%"FF_REG_a"), %%mm1              \n\t" // BGR BGR BG
1074
0
        "movq   2(%1, %%"FF_REG_a"), %%mm2              \n\t" // R BGR BGR B
1075
0
        "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
1076
0
        "pand                 %%mm5, %%mm0              \n\t"
1077
0
        "pand                 %%mm6, %%mm1              \n\t"
1078
0
        "pand                 %%mm7, %%mm2              \n\t"
1079
0
        "por                  %%mm0, %%mm1              \n\t"
1080
0
        "por                  %%mm2, %%mm1              \n\t"
1081
0
        "movq   6(%1, %%"FF_REG_a"), %%mm0              \n\t" // BGR BGR BG
1082
0
        MOVNTQ"               %%mm1,(%2, %%"FF_REG_a")  \n\t" // RGB RGB RG
1083
0
        "movq   8(%1, %%"FF_REG_a"), %%mm1              \n\t" // R BGR BGR B
1084
0
        "movq  10(%1, %%"FF_REG_a"), %%mm2              \n\t" // GR BGR BGR
1085
0
        "pand                 %%mm7, %%mm0              \n\t"
1086
0
        "pand                 %%mm5, %%mm1              \n\t"
1087
0
        "pand                 %%mm6, %%mm2              \n\t"
1088
0
        "por                  %%mm0, %%mm1              \n\t"
1089
0
        "por                  %%mm2, %%mm1              \n\t"
1090
0
        "movq  14(%1, %%"FF_REG_a"), %%mm0              \n\t" // R BGR BGR B
1091
0
        MOVNTQ"               %%mm1, 8(%2, %%"FF_REG_a")\n\t" // B RGB RGB R
1092
0
        "movq  16(%1, %%"FF_REG_a"), %%mm1              \n\t" // GR BGR BGR
1093
0
        "movq  18(%1, %%"FF_REG_a"), %%mm2              \n\t" // BGR BGR BG
1094
0
        "pand                 %%mm6, %%mm0              \n\t"
1095
0
        "pand                 %%mm7, %%mm1              \n\t"
1096
0
        "pand                 %%mm5, %%mm2              \n\t"
1097
0
        "por                  %%mm0, %%mm1              \n\t"
1098
0
        "por                  %%mm2, %%mm1              \n\t"
1099
0
        MOVNTQ"               %%mm1, 16(%2, %%"FF_REG_a") \n\t"
1100
0
        "add                    $24, %%"FF_REG_a"       \n\t"
1101
0
        " js                     1b                     \n\t"
1102
0
        "2:                                             \n\t"
1103
0
        : "+a" (mmx_size)
1104
0
        : "r" (src-mmx_size), "r"(dst-mmx_size)
1105
0
          NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
1106
0
    );
1107
1108
0
    __asm__ volatile(SFENCE:::"memory");
1109
0
    __asm__ volatile(EMMS:::"memory");
1110
1111
0
    if (mmx_size==23) return; //finished, was multiple of 8
1112
1113
0
    src+= src_size;
1114
0
    dst+= src_size;
1115
0
    src_size= 23-mmx_size;
1116
0
    src-= src_size;
1117
0
    dst-= src_size;
1118
0
    for (unsigned i = 0; i < src_size; i +=3) {
1119
0
        register uint8_t x;
1120
0
        x          = src[i + 2];
1121
0
        dst[i + 1] = src[i + 1];
1122
0
        dst[i + 2] = src[i + 0];
1123
0
        dst[i + 0] = x;
1124
0
    }
1125
0
}
1126
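
rgb24tobgr24_mmxext above drives its main loop with a negative induction variable: mmx_size starts at 23 - src_size (negative whenever more than 23 bytes remain), both pointers are pre-biased by -mmx_size, and each iteration adds 24 until the counter turns non-negative, so the single "js 1b" doubles as bounds check and loop branch. A sketch of the idiom, under the assumption that src_size > 23 (hypothetical helper, not from this file):

    #include <stdint.h>
    #include <string.h>

    /* Negative-offset loop idiom: i counts up toward zero, leaving the
     * last 0..23 bytes for a scalar tail. */
    static void copy_blocks24(uint8_t *dst, const uint8_t *src, int n)
    {
        long i = 23 - n;      /* negative; "jns 2f" would skip the loop */
        src -= i;             /* bias so src[i] is original src[0] at start */
        dst -= i;
        for (; i < 0; i += 24)          /* "add $24, ...; js 1b" */
            memcpy(dst + i, src + i, 24);
    }
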
1127
static inline void yuvPlanartoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1128
                                           int width, int height,
1129
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1130
0
{
1131
0
    const x86_reg chromWidth= width>>1;
1132
0
    for (int y = 0; y < height; y++) {
1133
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1134
0
        __asm__ volatile(
1135
0
            "xor                 %%"FF_REG_a", %%"FF_REG_a" \n\t"
1136
0
            ".p2align                    4              \n\t"
1137
0
            "1:                                         \n\t"
1138
0
            PREFETCH" 32(%1, %%"FF_REG_a", 2)           \n\t"
1139
0
            PREFETCH" 32(%2, %%"FF_REG_a")              \n\t"
1140
0
            PREFETCH" 32(%3, %%"FF_REG_a")              \n\t"
1141
0
            "movq       (%2, %%"FF_REG_a"), %%mm0       \n\t" // U(0)
1142
0
            "movq                    %%mm0, %%mm2       \n\t" // U(0)
1143
0
            "movq       (%3, %%"FF_REG_a"), %%mm1       \n\t" // V(0)
1144
0
            "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1145
0
            "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)
1146
1147
0
            "movq     (%1, %%"FF_REG_a",2), %%mm3       \n\t" // Y(0)
1148
0
            "movq    8(%1, %%"FF_REG_a",2), %%mm5       \n\t" // Y(8)
1149
0
            "movq                    %%mm3, %%mm4       \n\t" // Y(0)
1150
0
            "movq                    %%mm5, %%mm6       \n\t" // Y(8)
1151
0
            "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
1152
0
            "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
1153
0
            "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
1154
0
            "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)
1155
1156
0
            MOVNTQ"                  %%mm3,   (%0, %%"FF_REG_a", 4)    \n\t"
1157
0
            MOVNTQ"                  %%mm4,  8(%0, %%"FF_REG_a", 4)    \n\t"
1158
0
            MOVNTQ"                  %%mm5, 16(%0, %%"FF_REG_a", 4)    \n\t"
1159
0
            MOVNTQ"                  %%mm6, 24(%0, %%"FF_REG_a", 4)    \n\t"
1160
1161
0
            "add                        $8, %%"FF_REG_a" \n\t"
1162
0
            "cmp                        %4, %%"FF_REG_a" \n\t"
1163
0
            " jb                        1b               \n\t"
1164
0
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1165
0
            : "%"FF_REG_a
1166
0
        );
1167
0
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1168
0
            usrc += chromStride;
1169
0
            vsrc += chromStride;
1170
0
        }
1171
0
        ysrc += lumStride;
1172
0
        dst  += dstStride;
1173
0
    }
1174
0
    __asm__(EMMS"       \n\t"
1175
0
            SFENCE"     \n\t"
1176
0
            :::"memory");
1177
0
}
1178
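
yuvPlanartoyuy2_mmxext interleaves 16 luma bytes with 8 U and 8 V bytes per inner iteration into YUY2 byte order (Y0 U0 Y1 V0), and vertLumPerChroma (2 for YV12, 1 for YUV422P) decides how often the chroma pointers advance between rows. A scalar sketch of one output row, assuming an even width (hypothetical helper):

    /* Scalar equivalent of the punpcklbw/punpckhbw interleave: in 4:2:2
     * packing, two luma samples share one U and one V sample. */
    static void pack_yuy2_row(uint8_t *dst, const uint8_t *y,
                              const uint8_t *u, const uint8_t *v, int width)
    {
        for (int i = 0; i < width / 2; i++) {
            dst[4 * i + 0] = y[2 * i + 0];
            dst[4 * i + 1] = u[i];
            dst[4 * i + 2] = y[2 * i + 1];
            dst[4 * i + 3] = v[i];
        }
    }
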
1179
/**
1180
 * Height should be a multiple of 2 and width should be a multiple of 16.
1181
 * (If this is a problem for anyone then tell me, and I will fix it.)
1182
 */
1183
static inline void yv12toyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1184
                                      int width, int height,
1185
                                      int lumStride, int chromStride, int dstStride)
1186
0
{
1187
    //FIXME interpolate chroma
1188
0
    yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1189
0
}
1190
1191
static inline void yuvPlanartouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1192
                                           int width, int height,
1193
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1194
0
{
1195
0
    const x86_reg chromWidth= width>>1;
1196
0
    for (int y = 0; y < height; y++) {
1197
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1198
0
        __asm__ volatile(
1199
0
            "xor             %%"FF_REG_a", %%"FF_REG_a" \n\t"
1200
0
            ".p2align                   4               \n\t"
1201
0
            "1:                                         \n\t"
1202
0
            PREFETCH" 32(%1, %%"FF_REG_a", 2)           \n\t"
1203
0
            PREFETCH" 32(%2, %%"FF_REG_a")              \n\t"
1204
0
            PREFETCH" 32(%3, %%"FF_REG_a")              \n\t"
1205
0
            "movq      (%2, %%"FF_REG_a"), %%mm0        \n\t" // U(0)
1206
0
            "movq                   %%mm0, %%mm2        \n\t" // U(0)
1207
0
            "movq      (%3, %%"FF_REG_a"), %%mm1        \n\t" // V(0)
1208
0
            "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
1209
0
            "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)
1210
1211
0
            "movq    (%1, %%"FF_REG_a",2), %%mm3        \n\t" // Y(0)
1212
0
            "movq   8(%1, %%"FF_REG_a",2), %%mm5        \n\t" // Y(8)
1213
0
            "movq                   %%mm0, %%mm4        \n\t" // Y(0)
1214
0
            "movq                   %%mm2, %%mm6        \n\t" // Y(8)
1215
0
            "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
1216
0
            "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
1217
0
            "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
1218
0
            "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)
1219
1220
0
            MOVNTQ"                 %%mm0,   (%0, %%"FF_REG_a", 4)     \n\t"
1221
0
            MOVNTQ"                 %%mm4,  8(%0, %%"FF_REG_a", 4)     \n\t"
1222
0
            MOVNTQ"                 %%mm2, 16(%0, %%"FF_REG_a", 4)     \n\t"
1223
0
            MOVNTQ"                 %%mm6, 24(%0, %%"FF_REG_a", 4)     \n\t"
1224
1225
0
            "add                       $8, %%"FF_REG_a" \n\t"
1226
0
            "cmp                       %4, %%"FF_REG_a" \n\t"
1227
0
            " jb                       1b               \n\t"
1228
0
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1229
0
            : "%"FF_REG_a
1230
0
        );
1231
0
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1232
0
            usrc += chromStride;
1233
0
            vsrc += chromStride;
1234
0
        }
1235
0
        ysrc += lumStride;
1236
0
        dst += dstStride;
1237
0
    }
1238
0
    __asm__(EMMS"       \n\t"
1239
0
            SFENCE"     \n\t"
1240
0
            :::"memory");
1241
0
}
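
/*
 * [Editor's illustration, not part of the FFmpeg source] The scalar
 * equivalent of the packing loop above, for one output line: every pair
 * of luma samples shares one U and one V sample, written in UYVY byte
 * order (the first punpcklbw interleaves U/V, the second interleaves
 * the result with luma).
 */
static inline void uyvy_pack_line_ref(const uint8_t *ysrc, const uint8_t *usrc,
                                      const uint8_t *vsrc, uint8_t *dst,
                                      int chromWidth)
{
    for (int i = 0; i < chromWidth; i++) {
        dst[4*i+0] = usrc[i];     /* U  */
        dst[4*i+1] = ysrc[2*i];   /* Y0 */
        dst[4*i+2] = vsrc[i];     /* V  */
        dst[4*i+3] = ysrc[2*i+1]; /* Y1 */
    }
}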

/**
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void yv12touyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void yuv422ptouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void yuv422ptoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void yuy2toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    const x86_reg chromWidth= width>>1;
    for (int y = 0; y < height; y += 2) {
        __asm__ volatile(
            "xor              %%"FF_REG_a", %%"FF_REG_a"\n\t"
            "pcmpeqw                 %%mm7, %%mm7       \n\t"
            "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
            ".p2align                    4              \n\t"
            "1:                                         \n\t"
            PREFETCH" 64(%0, %%"FF_REG_a", 4)           \n\t"
            "movq    (%0, %%"FF_REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
            "movq   8(%0, %%"FF_REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
            "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
            "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
            "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)

            MOVNTQ"                  %%mm2, (%1, %%"FF_REG_a", 2) \n\t"

            "movq  16(%0, %%"FF_REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
            "movq  24(%0, %%"FF_REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
            "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
            "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
            "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
            "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
            "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)

            MOVNTQ"                  %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t"

            "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
            "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
            "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
            "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
            "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
            "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
            "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
            "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)

            MOVNTQ"                  %%mm0, (%3, %%"FF_REG_a")     \n\t"
            MOVNTQ"                  %%mm2, (%2, %%"FF_REG_a")     \n\t"

            "add                        $8, %%"FF_REG_a" \n\t"
            "cmp                        %4, %%"FF_REG_a" \n\t"
            " jb                        1b               \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"FF_REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor              %%"FF_REG_a", %%"FF_REG_a"\n\t"
            ".p2align                    4              \n\t"
            "1:                                         \n\t"
            PREFETCH" 64(%0, %%"FF_REG_a", 4)           \n\t"
            "movq    (%0, %%"FF_REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
            "movq   8(%0, %%"FF_REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
            "movq  16(%0, %%"FF_REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
            "movq  24(%0, %%"FF_REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
            "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
            "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
            "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)

            MOVNTQ"                  %%mm0,  (%1, %%"FF_REG_a", 2) \n\t"
            MOVNTQ"                  %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t"

            "add                        $8, %%"FF_REG_a"\n\t"
            "cmp                        %4, %%"FF_REG_a"\n\t"
            " jb                        1b              \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"FF_REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
}
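
/*
 * [Editor's illustration, not part of the FFmpeg source] What the two asm
 * blocks above compute for a pair of input lines: even bytes of a YUYV
 * stream are luma, odd bytes alternate U and V; chroma is taken from the
 * first line of each pair only (no vertical interpolation). Note the
 * second asm block reuses the 0x00FF mask left in mm7 by the first one.
 */
static inline void yuy2_unpack_pair_ref(const uint8_t *src0, const uint8_t *src1,
                                        uint8_t *ydst0, uint8_t *ydst1,
                                        uint8_t *udst, uint8_t *vdst,
                                        int chromWidth)
{
    for (int i = 0; i < chromWidth; i++) {
        ydst0[2*i]   = src0[4*i+0];
        udst[i]      = src0[4*i+1];
        ydst0[2*i+1] = src0[4*i+2];
        vdst[i]      = src0[4*i+3];
        ydst1[2*i]   = src1[4*i+0]; /* second line: luma only */
        ydst1[2*i+1] = src1[4*i+2];
    }
}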

static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    dst[0]= src[0];

    // first line
    for (int x = 0; x < srcWidth - 1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (int y = 1; y < srcHeight; y++) {
        x86_reg mmxSize= srcWidth&~15;

        if (mmxSize) {
        __asm__ volatile(
            "mov                       %4, %%"FF_REG_a" \n\t"
            "movq        "MANGLE(mmx_ff)", %%mm0    \n\t"
            "movq      (%0, %%"FF_REG_a"), %%mm4    \n\t"
            "movq                   %%mm4, %%mm2    \n\t"
            "psllq                     $8, %%mm4    \n\t"
            "pand                   %%mm0, %%mm2    \n\t"
            "por                    %%mm2, %%mm4    \n\t"
            "movq      (%1, %%"FF_REG_a"), %%mm5    \n\t"
            "movq                   %%mm5, %%mm3    \n\t"
            "psllq                     $8, %%mm5    \n\t"
            "pand                   %%mm0, %%mm3    \n\t"
            "por                    %%mm3, %%mm5    \n\t"
            "1:                                     \n\t"
            "movq      (%0, %%"FF_REG_a"), %%mm0    \n\t"
            "movq      (%1, %%"FF_REG_a"), %%mm1    \n\t"
            "movq     1(%0, %%"FF_REG_a"), %%mm2    \n\t"
            "movq     1(%1, %%"FF_REG_a"), %%mm3    \n\t"
            PAVGB"                  %%mm0, %%mm5    \n\t"
            PAVGB"                  %%mm0, %%mm3    \n\t"
            PAVGB"                  %%mm0, %%mm5    \n\t"
            PAVGB"                  %%mm0, %%mm3    \n\t"
            PAVGB"                  %%mm1, %%mm4    \n\t"
            PAVGB"                  %%mm1, %%mm2    \n\t"
            PAVGB"                  %%mm1, %%mm4    \n\t"
            PAVGB"                  %%mm1, %%mm2    \n\t"
            "movq                   %%mm5, %%mm7    \n\t"
            "movq                   %%mm4, %%mm6    \n\t"
            "punpcklbw              %%mm3, %%mm5    \n\t"
            "punpckhbw              %%mm3, %%mm7    \n\t"
            "punpcklbw              %%mm2, %%mm4    \n\t"
            "punpckhbw              %%mm2, %%mm6    \n\t"
            MOVNTQ"                 %%mm5,  (%2, %%"FF_REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm7, 8(%2, %%"FF_REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm4,  (%3, %%"FF_REG_a", 2)  \n\t"
            MOVNTQ"                 %%mm6, 8(%3, %%"FF_REG_a", 2)  \n\t"
            "add                       $8, %%"FF_REG_a"            \n\t"
            "movq    -1(%0, %%"FF_REG_a"), %%mm4    \n\t"
            "movq    -1(%1, %%"FF_REG_a"), %%mm5    \n\t"
            " js                       1b           \n\t"
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
               NAMED_CONSTRAINTS_ADD(mmx_ff)
            : "%"FF_REG_a
        );
        } else {
            mmxSize = 1;
            dst[0]         = (src[0] * 3 + src[srcStride]) >> 2;
            dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
        }

        for (int x = mmxSize - 1; x < srcWidth - 1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
    dst[0]= src[0];

    for (int x = 0; x < srcWidth - 1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
}
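
/*
 * [Editor's note, not part of the FFmpeg source] Each planar2x output
 * sample is a 3:1 blend of its two nearest source samples, horizontally
 * and vertically; the doubled PAVGB chains above approximate the same
 * weighting with rounding, since avg(a, avg(a, b)) ~ (3a + b) / 4:
 */
static inline uint8_t blend31_ref(uint8_t near, uint8_t far)
{
    return (3*near + far) >> 2; /* the C tails truncate; PAVGB rounds up */
}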

/**
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 */
#if ARCH_X86_32 && HAVE_7REGS
DECLARE_ASM_CONST(8, uint64_t, bgr2YOffset)  = 0x1010101010101010ULL;
DECLARE_ASM_CONST(8, uint64_t, bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ASM_CONST(8, uint64_t, w1111)        = 0x0001000100010001ULL;

static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       int width, int height,
                                       int lumStride, int chromStride, int srcStride,
                                       const int32_t *rgb2yuv)
{
#define BGR2Y_IDX "16*4+16*32"
#define BGR2U_IDX "16*4+16*33"
#define BGR2V_IDX "16*4+16*34"
    int y;
    const x86_reg chromWidth= width>>1;

    if (height > 2) {
        ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
        src  += 2*srcStride;
        ydst += 2*lumStride;
        udst += chromStride;
        vdst += chromStride;
        height -= 2;
    }

    for (y = 0; y < height - 2; y += 2) {
        for (int i = 0; i < 2; i++) {
            __asm__ volatile(
                "mov                        %2, %%"FF_REG_a"\n\t"
                "movq          "BGR2Y_IDX"(%3), %%mm6       \n\t"
                "movq          "MANGLE(w1111)", %%mm5       \n\t"
                "pxor                    %%mm7, %%mm7       \n\t"
                "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
                ".p2align                    4              \n\t"
                "1:                                         \n\t"
                PREFETCH" 64(%0, %%"FF_REG_d")              \n\t"
                "movd       (%0, %%"FF_REG_d"), %%mm0       \n\t"
                "movd      3(%0, %%"FF_REG_d"), %%mm1       \n\t"
                "punpcklbw               %%mm7, %%mm0       \n\t"
                "punpcklbw               %%mm7, %%mm1       \n\t"
                "movd      6(%0, %%"FF_REG_d"), %%mm2       \n\t"
                "movd      9(%0, %%"FF_REG_d"), %%mm3       \n\t"
                "punpcklbw               %%mm7, %%mm2       \n\t"
                "punpcklbw               %%mm7, %%mm3       \n\t"
                "pmaddwd                 %%mm6, %%mm0       \n\t"
                "pmaddwd                 %%mm6, %%mm1       \n\t"
                "pmaddwd                 %%mm6, %%mm2       \n\t"
                "pmaddwd                 %%mm6, %%mm3       \n\t"
                "psrad                      $8, %%mm0       \n\t"
                "psrad                      $8, %%mm1       \n\t"
                "psrad                      $8, %%mm2       \n\t"
                "psrad                      $8, %%mm3       \n\t"
                "packssdw                %%mm1, %%mm0       \n\t"
                "packssdw                %%mm3, %%mm2       \n\t"
                "pmaddwd                 %%mm5, %%mm0       \n\t"
                "pmaddwd                 %%mm5, %%mm2       \n\t"
                "packssdw                %%mm2, %%mm0       \n\t"
                "psraw                      $7, %%mm0       \n\t"

                "movd     12(%0, %%"FF_REG_d"), %%mm4       \n\t"
                "movd     15(%0, %%"FF_REG_d"), %%mm1       \n\t"
                "punpcklbw               %%mm7, %%mm4       \n\t"
                "punpcklbw               %%mm7, %%mm1       \n\t"
                "movd     18(%0, %%"FF_REG_d"), %%mm2       \n\t"
                "movd     21(%0, %%"FF_REG_d"), %%mm3       \n\t"
                "punpcklbw               %%mm7, %%mm2       \n\t"
                "punpcklbw               %%mm7, %%mm3       \n\t"
                "pmaddwd                 %%mm6, %%mm4       \n\t"
                "pmaddwd                 %%mm6, %%mm1       \n\t"
                "pmaddwd                 %%mm6, %%mm2       \n\t"
                "pmaddwd                 %%mm6, %%mm3       \n\t"
                "psrad                      $8, %%mm4       \n\t"
                "psrad                      $8, %%mm1       \n\t"
                "psrad                      $8, %%mm2       \n\t"
                "psrad                      $8, %%mm3       \n\t"
                "packssdw                %%mm1, %%mm4       \n\t"
                "packssdw                %%mm3, %%mm2       \n\t"
                "pmaddwd                 %%mm5, %%mm4       \n\t"
                "pmaddwd                 %%mm5, %%mm2       \n\t"
                "add                       $24, %%"FF_REG_d"\n\t"
                "packssdw                %%mm2, %%mm4       \n\t"
                "psraw                      $7, %%mm4       \n\t"

                "packuswb                %%mm4, %%mm0       \n\t"
                "paddusb "MANGLE(bgr2YOffset)", %%mm0       \n\t"

                MOVNTQ"                  %%mm0, (%1, %%"FF_REG_a") \n\t"
                "add                        $8,      %%"FF_REG_a"  \n\t"
                " js                        1b                     \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
                  NAMED_CONSTRAINTS_ADD(w1111,bgr2YOffset)
                : "%"FF_REG_a, "%"FF_REG_d
            );
            ydst += lumStride;
            src  += srcStride;
        }
        src -= srcStride*2;
        __asm__ volatile(
            "mov                        %4, %%"FF_REG_a"\n\t"
            "movq          "MANGLE(w1111)", %%mm5       \n\t"
            "movq          "BGR2U_IDX"(%5), %%mm6       \n\t"
            "pxor                    %%mm7, %%mm7       \n\t"
            "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
            "add              %%"FF_REG_d", %%"FF_REG_d"\n\t"
            ".p2align                    4              \n\t"
            "1:                                         \n\t"
            PREFETCH" 64(%0, %%"FF_REG_d")              \n\t"
            PREFETCH" 64(%1, %%"FF_REG_d")              \n\t"
            "movq       (%0, %%"FF_REG_d"), %%mm0       \n\t"
            "movq       (%1, %%"FF_REG_d"), %%mm1       \n\t"
            "movq      6(%0, %%"FF_REG_d"), %%mm2       \n\t"
            "movq      6(%1, %%"FF_REG_d"), %%mm3       \n\t"
            PAVGB"                   %%mm1, %%mm0       \n\t"
            PAVGB"                   %%mm3, %%mm2       \n\t"
            "movq                    %%mm0, %%mm1       \n\t"
            "movq                    %%mm2, %%mm3       \n\t"
            "psrlq                     $24, %%mm0       \n\t"
            "psrlq                     $24, %%mm2       \n\t"
            PAVGB"                   %%mm1, %%mm0       \n\t"
            PAVGB"                   %%mm3, %%mm2       \n\t"
            "punpcklbw               %%mm7, %%mm0       \n\t"
            "punpcklbw               %%mm7, %%mm2       \n\t"
            "movq          "BGR2V_IDX"(%5), %%mm1       \n\t"
            "movq          "BGR2V_IDX"(%5), %%mm3       \n\t"

            "pmaddwd                 %%mm0, %%mm1       \n\t"
            "pmaddwd                 %%mm2, %%mm3       \n\t"
            "pmaddwd                 %%mm6, %%mm0       \n\t"
            "pmaddwd                 %%mm6, %%mm2       \n\t"
            "psrad                      $8, %%mm0       \n\t"
            "psrad                      $8, %%mm1       \n\t"
            "psrad                      $8, %%mm2       \n\t"
            "psrad                      $8, %%mm3       \n\t"
            "packssdw                %%mm2, %%mm0       \n\t"
            "packssdw                %%mm3, %%mm1       \n\t"
            "pmaddwd                 %%mm5, %%mm0       \n\t"
            "pmaddwd                 %%mm5, %%mm1       \n\t"
            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
            "psraw                      $7, %%mm0       \n\t"

            "movq     12(%0, %%"FF_REG_d"), %%mm4       \n\t"
            "movq     12(%1, %%"FF_REG_d"), %%mm1       \n\t"
            "movq     18(%0, %%"FF_REG_d"), %%mm2       \n\t"
            "movq     18(%1, %%"FF_REG_d"), %%mm3       \n\t"
            PAVGB"                   %%mm1, %%mm4       \n\t"
            PAVGB"                   %%mm3, %%mm2       \n\t"
            "movq                    %%mm4, %%mm1       \n\t"
            "movq                    %%mm2, %%mm3       \n\t"
            "psrlq                     $24, %%mm4       \n\t"
            "psrlq                     $24, %%mm2       \n\t"
            PAVGB"                   %%mm1, %%mm4       \n\t"
            PAVGB"                   %%mm3, %%mm2       \n\t"
            "punpcklbw               %%mm7, %%mm4       \n\t"
            "punpcklbw               %%mm7, %%mm2       \n\t"
            "movq          "BGR2V_IDX"(%5), %%mm1       \n\t"
            "movq          "BGR2V_IDX"(%5), %%mm3       \n\t"

            "pmaddwd                 %%mm4, %%mm1       \n\t"
            "pmaddwd                 %%mm2, %%mm3       \n\t"
            "pmaddwd                 %%mm6, %%mm4       \n\t"
            "pmaddwd                 %%mm6, %%mm2       \n\t"
            "psrad                      $8, %%mm4       \n\t"
            "psrad                      $8, %%mm1       \n\t"
            "psrad                      $8, %%mm2       \n\t"
            "psrad                      $8, %%mm3       \n\t"
            "packssdw                %%mm2, %%mm4       \n\t"
            "packssdw                %%mm3, %%mm1       \n\t"
            "pmaddwd                 %%mm5, %%mm4       \n\t"
            "pmaddwd                 %%mm5, %%mm1       \n\t"
            "add                       $24, %%"FF_REG_d"\n\t"
            "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
            "psraw                      $7, %%mm4       \n\t"

            "movq                    %%mm0, %%mm1           \n\t"
            "punpckldq               %%mm4, %%mm0           \n\t"
            "punpckhdq               %%mm4, %%mm1           \n\t"
            "packsswb                %%mm1, %%mm0           \n\t"
            "paddb  "MANGLE(bgr2UVOffset)", %%mm0           \n\t"
            "movd                    %%mm0, (%2, %%"FF_REG_a") \n\t"
            "punpckhdq               %%mm0, %%mm0              \n\t"
            "movd                    %%mm0, (%3, %%"FF_REG_a") \n\t"
            "add                        $4, %%"FF_REG_a"       \n\t"
            " js                        1b              \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
              NAMED_CONSTRAINTS_ADD(w1111,bgr2UVOffset)
            : "%"FF_REG_a, "%"FF_REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");

    ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
}
#endif /* ARCH_X86_32 && HAVE_7REGS */
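
/*
 * [Editor's sketch, not part of the FFmpeg source] The pmaddwd/psrad/psraw
 * chain above evaluates, per pixel, a fixed-point dot product with the
 * coefficients stored in the rgb2yuv table, then adds the limited-range
 * luma offset (bgr2YOffset = 16). In scalar form, with illustrative
 * coefficient names and the table's fixed-point shift as parameters:
 */
static inline uint8_t bgr_to_y_ref(int b, int g, int r,
                                   int by, int gy, int ry, int shift)
{
    return ((by*b + gy*g + ry*r) >> shift) + 16;
}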

static inline void vu9_to_vu12_mmxext(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       int width, int height,
                                       int srcStride1, int srcStride2,
                                       int dstStride1, int dstStride2)
{
    int w,h;
    w=width/2; h=height/2;
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    for (x86_reg y = 0; y < h; y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x86_reg x = 0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32(%1,%2)        \n\t"
                "movq         (%1,%2), %%mm0 \n\t"
                "movq        8(%1,%2), %%mm2 \n\t"
                "movq       16(%1,%2), %%mm4 \n\t"
                "movq       24(%1,%2), %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0,   (%0,%2,2)  \n\t"
                MOVNTQ"    %%mm1,  8(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm2, 16(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm3, 24(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm4, 32(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm5, 40(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm6, 48(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s1), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for (x86_reg y = 0; y < h; y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x86_reg x = 0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH"   32(%1,%2)        \n\t"
                "movq         (%1,%2), %%mm0 \n\t"
                "movq        8(%1,%2), %%mm2 \n\t"
                "movq       16(%1,%2), %%mm4 \n\t"
                "movq       24(%1,%2), %%mm6 \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0,   (%0,%2,2)  \n\t"
                MOVNTQ"    %%mm1,  8(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm2, 16(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm3, 24(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm4, 32(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm5, 40(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm6, 48(%0,%2,2)  \n\t"
                MOVNTQ"    %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s2), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
}
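
/*
 * [Editor's note, not part of the FFmpeg source] The punpcklbw/punpckhbw
 * pairs above unpack each register with itself, which duplicates every
 * byte; combined with the y>>1 source line index this doubles the plane
 * in both directions. The scalar shape being vectorized is:
 */
static inline void double_bytes_ref(const uint8_t *s, uint8_t *d, int w)
{
    for (int x = 0; x < w; x++)
        d[2*x] = d[2*x+1] = s[x]; /* same as punpcklbw reg,reg on 8 bytes */
}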

static inline void yvu9_to_yuy2_mmxext(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        int width, int height,
                                        int srcStride1, int srcStride2,
                                        int srcStride3, int dstStride)
{
    int w,h;
    w=width/2; h=height;
    for (int y = 0; y < h; y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x86_reg x = 0;
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH"   32(%1, %0)          \n\t"
                PREFETCH"   32(%2, %0)          \n\t"
                PREFETCH"   32(%3, %0)          \n\t"
                "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq            %%mm1, %%mm6   \n\t"
                "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
                MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"

                "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq     8(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"

                "movq            %%mm4, %%mm6   \n\t"
                "movq    16(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm5, %%mm4   \n\t"
                "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"

                "punpckhbw       %%mm5, %%mm6   \n\t"
                "movq    24(%1, %0, 4), %%mm0   \n\t"
                "movq            %%mm0, %%mm3   \n\t"
                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
                MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
        for (; x<w; x++) {
            const int x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
}

static void extract_even_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst +=   count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        count += 15;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -30(%1, %0, 2), %%mm0        \n\t"
            "movq -22(%1, %0, 2), %%mm1        \n\t"
            "movq -14(%1, %0, 2), %%mm2        \n\t"
            "movq  -6(%1, %0, 2), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
            "add             $16, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}
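
/*
 * [Editor's note, not part of the FFmpeg source] All the extract_* helpers
 * in this file share one idiom: advance the pointers past the end of the
 * buffers, negate the element count, and loop while the index is still
 * negative. That lets the asm loop use a single "add/js" pair instead of
 * a separate compare. The scalar shape of extract_even is:
 */
static void extract_even_ref(const uint8_t *src, uint8_t *dst, long count)
{
    dst +=   count;
    src += 2*count;
    count = -count;              /* index runs from -count up to 0 */
    while (count < 0) {
        dst[count] = src[2*count];
        count++;
    }
}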

static void extract_odd_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    src ++;
    dst +=   count;
    src += 2*count;
    count= - count;

    if(count < -16) {
        count += 16;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -32(%1, %0, 2), %%mm0        \n\t"
            "movq -24(%1, %0, 2), %%mm1        \n\t"
            "movq -16(%1, %0, 2), %%mm2        \n\t"
            "movq  -8(%1, %0, 2), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            MOVNTQ"        %%mm0,-16(%2, %0)   \n\t"
            MOVNTQ"        %%mm2,- 8(%2, %0)   \n\t"
            "add             $16, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 16;
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}

#if ARCH_X86_32
static void extract_even2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "pand          %%mm7, %%mm0        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm2        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
#endif /* ARCH_X86_32 */

static void extract_even2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "pand           %%mm7, %%mm0        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm2        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
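
/*
 * [Editor's note, not part of the FFmpeg source] The averaging variants mix
 * two rounding rules: PAVGB computes (a + b + 1) >> 1 (round-to-nearest-up),
 * while the scalar tails use (a + b) >> 1 (truncation), so the SIMD path and
 * the tail can differ by one LSB on odd sums:
 */
static inline uint8_t avg_pavgb_ref(uint8_t a, uint8_t b)
{
    return (a + b + 1) >> 1; /* what PAVGB does; the C tails drop the +1 */
}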

static void extract_odd2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw       %%mm7, %%mm7        \n\t"
            "psrlw            $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq -28(%1, %0, 4), %%mm0        \n\t"
            "movq -20(%1, %0, 4), %%mm1        \n\t"
            "movq -12(%1, %0, 4), %%mm2        \n\t"
            "movq  -4(%1, %0, 4), %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm1        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "psrlw            $8, %%mm3        \n\t"
            "packuswb      %%mm1, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm2        \n\t"
            "movq          %%mm0, %%mm1        \n\t"
            "movq          %%mm2, %%mm3        \n\t"
            "psrlw            $8, %%mm0        \n\t"
            "psrlw            $8, %%mm2        \n\t"
            "pand          %%mm7, %%mm1        \n\t"
            "pand          %%mm7, %%mm3        \n\t"
            "packuswb      %%mm2, %%mm0        \n\t"
            "packuswb      %%mm3, %%mm1        \n\t"
            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
            "add              $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    src++;
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}

static void extract_odd2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw        %%mm7, %%mm7        \n\t"
            "psrlw             $8, %%mm7        \n\t"
            "1:                                \n\t"
            "movq  -28(%1, %0, 4), %%mm0        \n\t"
            "movq  -20(%1, %0, 4), %%mm1        \n\t"
            "movq  -12(%1, %0, 4), %%mm2        \n\t"
            "movq   -4(%1, %0, 4), %%mm3        \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm1        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "psrlw             $8, %%mm3        \n\t"
            "packuswb       %%mm1, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm2        \n\t"
            "movq           %%mm0, %%mm1        \n\t"
            "movq           %%mm2, %%mm3        \n\t"
            "psrlw             $8, %%mm0        \n\t"
            "psrlw             $8, %%mm2        \n\t"
            "pand           %%mm7, %%mm1        \n\t"
            "pand           %%mm7, %%mm3        \n\t"
            "packuswb       %%mm2, %%mm0        \n\t"
            "packuswb       %%mm3, %%mm1        \n\t"
            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
            "add               $8, %0           \n\t"
            " js 1b                            \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}

static void yuyvtoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);

    for (int y = 0; y < height; y++) {
        extract_even_mmxext(src, ydst, width);
        if(y&1) {
            extract_odd2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
}

static void yuyvtoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);

    for (int y = 0; y < height; y++) {
        extract_even_mmxext(src, ydst, width);
        extract_odd2_mmxext(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
}

static void uyvytoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);

    for (int y = 0; y < height; y++) {
        extract_odd_mmxext(src, ydst, width);
        if(y&1) {
            extract_even2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
}
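
/*
 * [Editor's note, not part of the FFmpeg source] In the 4:2:0 converters
 * above, chroma is emitted only on odd input lines (y&1): at that point
 * both lines of the pair have been seen, so extract_*2avg_mmxext can
 * average the chroma of line y-1 (src - srcStride) with line y before
 * the vertical downsampling. The 4:2:2 variants emit chroma on every
 * line and need no averaging.
 */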

#if ARCH_X86_32
static void uyvytoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);

    for (int y = 0; y < height; y++) {
        extract_odd_mmxext(src, ydst, width);
        extract_even2_mmxext(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
        );
}
#endif /* ARCH_X86_32 */

static av_cold void rgb2rgb_init_mmxext(void)
{
    rgb15to16          = rgb15to16_mmxext;
    rgb15tobgr24       = rgb15tobgr24_mmxext;
    rgb15to32          = rgb15to32_mmxext;
    rgb16tobgr24       = rgb16tobgr24_mmxext;
    rgb16to32          = rgb16to32_mmxext;
    rgb16to15          = rgb16to15_mmxext;
    rgb24tobgr16       = rgb24tobgr16_mmxext;
    rgb24tobgr15       = rgb24tobgr15_mmxext;
    rgb24tobgr32       = rgb24tobgr32_mmxext;
    rgb32to16          = rgb32to16_mmxext;
    rgb32to15          = rgb32to15_mmxext;
    rgb32tobgr24       = rgb32tobgr24_mmxext;
    rgb24to15          = rgb24to15_mmxext;
    rgb24to16          = rgb24to16_mmxext;
    rgb24tobgr24       = rgb24tobgr24_mmxext;
    rgb32tobgr16       = rgb32tobgr16_mmxext;
    rgb32tobgr15       = rgb32tobgr15_mmxext;
    yv12toyuy2         = yv12toyuy2_mmxext;
    yv12touyvy         = yv12touyvy_mmxext;
    yuv422ptoyuy2      = yuv422ptoyuy2_mmxext;
    yuv422ptouyvy      = yuv422ptouyvy_mmxext;
    yuy2toyv12         = yuy2toyv12_mmxext;
    vu9_to_vu12        = vu9_to_vu12_mmxext;
    yvu9_to_yuy2       = yvu9_to_yuy2_mmxext;
#if ARCH_X86_32
    uyvytoyuv422       = uyvytoyuv422_mmxext;
#endif
    yuyvtoyuv422       = yuyvtoyuv422_mmxext;

    planar2x           = planar2x_mmxext;
#if ARCH_X86_32 && HAVE_7REGS
    ff_rgb24toyv12     = rgb24toyv12_mmxext;
#endif /* ARCH_X86_32 && HAVE_7REGS */

    yuyvtoyuv420       = yuyvtoyuv420_mmxext;
    uyvytoyuv420       = uyvytoyuv420_mmxext;
}
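
/*
 * [Editor's sketch, not part of the FFmpeg source] rgb2rgb_init_mmxext()
 * only rebinds the global function pointers; the caller is expected to
 * gate it on runtime CPU detection, roughly like this (av_get_cpu_flags()
 * and the INLINE_MMXEXT() predicate come from the libavutil headers
 * included above; the real call site is rgb2rgb_init_x86()):
 */
static av_cold void rgb2rgb_dispatch_sketch(void)
{
    int cpu_flags = av_get_cpu_flags();
    if (INLINE_MMXEXT(cpu_flags))
        rgb2rgb_init_mmxext();  /* swap in the inline-asm versions */
}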

//SSE2 versions
static void interleave_bytes_sse2(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                  int width, int height, int src1Stride,
                                  int src2Stride, int dstStride)
{
    for (int h = 0; h < height; h++) {
        if (width >= 16) {
            if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
        __asm__(
            "xor              %%"FF_REG_a", %%"FF_REG_a"  \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%1, %%"FF_REG_a")          \n\t"
            PREFETCH" 64(%2, %%"FF_REG_a")          \n\t"
            "movdqa  (%1, %%"FF_REG_a"), %%xmm0     \n\t"
            "movdqa  (%1, %%"FF_REG_a"), %%xmm1     \n\t"
            "movdqa  (%2, %%"FF_REG_a"), %%xmm2     \n\t"
            "punpcklbw           %%xmm2, %%xmm0     \n\t"
            "punpckhbw           %%xmm2, %%xmm1     \n\t"
            "movntdq             %%xmm0,   (%0, %%"FF_REG_a", 2) \n\t"
            "movntdq             %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t"
            "add                    $16, %%"FF_REG_a"            \n\t"
            "cmp                     %3, %%"FF_REG_a"            \n\t"
            " jb                     1b             \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a
        );
            } else
        __asm__(
            "xor %%"FF_REG_a", %%"FF_REG_a"         \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%1, %%"FF_REG_a")          \n\t"
            PREFETCH" 64(%2, %%"FF_REG_a")          \n\t"
            "movq    (%1, %%"FF_REG_a"), %%mm0      \n\t"
            "movq   8(%1, %%"FF_REG_a"), %%mm2      \n\t"
            "movq                 %%mm0, %%mm1      \n\t"
            "movq                 %%mm2, %%mm3      \n\t"
            "movq    (%2, %%"FF_REG_a"), %%mm4      \n\t"
            "movq   8(%2, %%"FF_REG_a"), %%mm5      \n\t"
            "punpcklbw            %%mm4, %%mm0      \n\t"
            "punpckhbw            %%mm4, %%mm1      \n\t"
            "punpcklbw            %%mm5, %%mm2      \n\t"
            "punpckhbw            %%mm5, %%mm3      \n\t"
            MOVNTQ"               %%mm0,   (%0, %%"FF_REG_a", 2) \n\t"
            MOVNTQ"               %%mm1,  8(%0, %%"FF_REG_a", 2) \n\t"
            MOVNTQ"               %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t"
            MOVNTQ"               %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t"
            "add                    $16, %%"FF_REG_a"            \n\t"
            "cmp                     %3, %%"FF_REG_a"            \n\t"
            " jb                     1b                          \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"FF_REG_a
        );

        }
        for (int w = (width & (~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
    __asm__(
            EMMS"       \n\t"
            SFENCE"     \n\t"
            ::: "memory"
            );
}
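
/*
 * [Editor's illustration, not part of the FFmpeg source] Scalar equivalent
 * of interleave_bytes_sse2 for one line; the asm merely vectorizes this,
 * taking the movdqa/movntdq path only when src1, src2 and dest are all
 * 16-byte aligned (movdqa faults on misaligned addresses) and falling
 * back to the MMX path otherwise.
 */
static inline void interleave_line_ref(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dest, int width)
{
    for (int w = 0; w < width; w++) {
        dest[2*w+0] = src1[w];
        dest[2*w+1] = src2[w];
    }
}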
2336
2337
/*
2338
 RGB15->RGB16 original by Strepto/Astral
2339
 ported to gcc & bugfixed : A'rpi
2340
 MMXEXT, 3DNOW optimization by Nick Kurshev
2341
 32-bit C version, and and&add trick by Michael Niedermayer
2342
*/
2343
2344
#endif /* HAVE_INLINE_ASM */
2345
2346
void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2347
void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2348
void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2349
void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2350
void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2351
void ff_shuffle_bytes_3102_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2352
void ff_shuffle_bytes_2013_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2353
void ff_shuffle_bytes_2130_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2354
void ff_shuffle_bytes_1203_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2355
2356
#if ARCH_X86_64
2357
void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2358
void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2359
void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2360
void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2361
void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2362
void ff_shuffle_bytes_3102_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2363
void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2364
void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2365
void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2366
2367
void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2368
void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2369
void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2370
void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2371
void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2372
void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2373
void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2374
void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2375
void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2376
2377
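/*
 * All three ISA tiers implement the same permutation; the natural primitive
 * is a byte shuffle (pshufb / vpshufb). A hypothetical 16-byte SSSE3
 * intrinsics sketch of the 2103 variant (the real implementations live in
 * .asm and are wider and unrolled; this is an assumption about their shape,
 * not a copy):
 */
#include <tmmintrin.h>

static __m128i shuffle_2103_16(__m128i v)
{
    const __m128i m = _mm_setr_epi8( 2,  1,  0,  3,   6,  5,  4,  7,
                                    10,  9,  8, 11,  14, 13, 12, 15);
    return _mm_shuffle_epi8(v, m);  /* one pshufb permutes four pixels */
}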
void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2378
                          const uint8_t *src, int width, int height,
2379
                          int lumStride, int chromStride, int srcStride);
2380
void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2381
                         const uint8_t *src, int width, int height,
2382
                         int lumStride, int chromStride, int srcStride);
2383
void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2384
                          const uint8_t *src, int width, int height,
2385
                          int lumStride, int chromStride, int srcStride);
2386
void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2387
                               const uint8_t *src, int width, int height,
2388
                               int lumStride, int chromStride, int srcStride);
2389
#endif
2390
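/*
 * uyvytoyuv422 unpacks packed UYVY (U0 Y0 V0 Y1 per pixel pair) into separate
 * Y, U and V planes. A minimal scalar sketch of one row, assuming an even
 * width (illustrative only; the SIMD versions above additionally handle
 * strides and whole frames):
 */
static void uyvy_row_to_yuv422p(const uint8_t *src, uint8_t *y,
                                uint8_t *u, uint8_t *v, int width)
{
    for (int i = 0; i < width / 2; i++) {  /* one U/V sample per 2 pixels */
        u[i]         = src[4 * i + 0];
        y[2 * i + 0] = src[4 * i + 1];
        v[i]         = src[4 * i + 2];
        y[2 * i + 1] = src[4 * i + 3];
    }
}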
2391
#define DEINTERLEAVE_BYTES(cpuext)                                            \
2392
void ff_nv12ToUV_ ## cpuext(uint8_t *dstU, uint8_t *dstV,                     \
2393
                            const uint8_t *unused,                            \
2394
                            const uint8_t *src1,                              \
2395
                            const uint8_t *src2,                              \
2396
                            int w,                                            \
2397
                            uint32_t *unused2,                                \
2398
                            void *opq);                                       \
2399
static void deinterleave_bytes_ ## cpuext(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, \
2400
                                          int width, int height, int srcStride, \
2401
0
                                          int dst1Stride, int dst2Stride)     \
2402
0
{                                                                             \
2403
0
    for (int h = 0; h < height; h++) {                                        \
2404
0
        if (width >= 16)                                                      \
2405
0
            ff_nv12ToUV_ ## cpuext(dst1, dst2, NULL, src, NULL, width - 15, NULL, NULL); \
2406
0
        for (int w = (width & (~15)); w < width; w++) {                       \
2407
0
            dst1[w] = src[2*w+0];                                             \
2408
0
            dst2[w] = src[2*w+1];                                             \
2409
0
        }                                                                     \
2410
0
        src  += srcStride;                                                    \
2411
0
        dst1 += dst1Stride;                                                   \
2412
0
        dst2 += dst2Stride;                                                   \
2413
0
    }                                                                         \
2414
0
}
2415
2416
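/*
 * Width handling in the macro above (and in the interleave function earlier):
 * the SIMD helper is handed width - 15 elements, which it presumably rounds
 * up to a whole number of 16-element steps, and the scalar loop then covers
 * w = (width & ~15) .. width - 1. Worked example (illustrative): for
 * width == 37 the SIMD call is asked for at least 22 elements (in practice
 * covering 32) and the C loop finishes w = 32..36.
 */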
#if HAVE_SSE2_EXTERNAL
2417
0
DEINTERLEAVE_BYTES(sse2)
2418
#endif
2419
#if HAVE_AVX_EXTERNAL
2420
0
DEINTERLEAVE_BYTES(avx)
2421
#endif
2422
2423
av_cold void rgb2rgb_init_x86(void)
2424
0
{
2425
0
    int cpu_flags = av_get_cpu_flags();
2426
2427
0
#if HAVE_INLINE_ASM
2428
0
    if (INLINE_MMXEXT(cpu_flags))
2429
0
        rgb2rgb_init_mmxext();
2430
0
    if (INLINE_SSE2(cpu_flags))
2431
0
        interleaveBytes = interleave_bytes_sse2;
2432
0
#endif /* HAVE_INLINE_ASM */
2433
2434
0
#if HAVE_SSE2_EXTERNAL
2435
0
    if (EXTERNAL_SSE2(cpu_flags)) {
2436
0
#if ARCH_X86_64
2437
0
        uyvytoyuv422 = ff_uyvytoyuv422_sse2;
2438
0
#endif
2439
0
        deinterleaveBytes = deinterleave_bytes_sse2;
2440
0
    }
2441
0
#endif
2442
0
    if (EXTERNAL_SSSE3(cpu_flags)) {
2443
0
        shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3;
2444
0
        shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3;
2445
0
        shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3;
2446
0
        shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
2447
0
        shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
2448
0
        shuffle_bytes_3102 = ff_shuffle_bytes_3102_ssse3;
2449
0
        shuffle_bytes_2013 = ff_shuffle_bytes_2013_ssse3;
2450
0
        shuffle_bytes_2130 = ff_shuffle_bytes_2130_ssse3;
2451
0
        shuffle_bytes_1203 = ff_shuffle_bytes_1203_ssse3;
2452
0
    }
2453
0
#if HAVE_AVX_EXTERNAL
2454
0
    if (EXTERNAL_AVX(cpu_flags)) {
2455
0
        deinterleaveBytes = deinterleave_bytes_avx;
2456
0
#if ARCH_X86_64
2457
0
        uyvytoyuv422 = ff_uyvytoyuv422_avx;
2458
0
    }
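        /* note: when ARCH_X86_64 is defined, the brace above closes the
         * EXTERNAL_AVX block; otherwise the preprocessor removes it and the
         * block stays open until the matching brace after the final
         * uyvytoyuv422 assignment below */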
2459
0
    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
2460
0
        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2;
2461
0
        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2;
2462
0
        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2;
2463
0
        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
2464
0
        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
2465
0
        shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx2;
2466
0
        shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx2;
2467
0
        shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
2468
0
        shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
2469
0
    }
2470
0
    if (EXTERNAL_AVX512ICL(cpu_flags)) {
2471
0
        shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
2472
0
        shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
2473
0
        shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
2474
0
        shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
2475
0
        shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
2476
0
        shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
2477
0
        shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
2478
0
        shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
2479
0
        shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
2480
0
    }
2481
0
    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
2482
0
        uyvytoyuv422 = ff_uyvytoyuv422_avx2;
2483
0
    }
2484
0
    if (EXTERNAL_AVX512ICL(cpu_flags)) {
2485
0
        uyvytoyuv422 = ff_uyvytoyuv422_avx512icl;
2486
0
#endif
2487
0
    }
2488
0
#endif
2489
0
}
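/*
 * Dispatch note (sketch): the checks above run from weakest to strongest ISA
 * and overwrite the same function pointers, so on a CPU reporting several
 * levels the last matching assignment, i.e. the most capable implementation,
 * wins.
 */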