Coverage Report

Created: 2026-01-25 07:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/ffmpeg/libswscale/x86/swscale_template.c
Line
Count
Source
1
/*
2
 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20
21
#include <stdint.h>
22
23
#include "libavutil/x86/asm.h"
24
#include "libswscale/swscale_internal.h"
25
26
/*
 * Drop any earlier definitions of the store/prefetch helper macros, then
 * define the store helpers used by the WRITE* macros below.
 * REAL_MOVNTQ(a,b) stringifies its arguments into a "movntq a, b" asm line;
 * MOVNTQ(a,b) adds one level of indirection so that macro arguments are
 * expanded before they are stringified. MOVNTQ2 is just the mnemonic string.
 * PREFETCH is only undefined here, not redefined in the visible code.
 */
#undef REAL_MOVNTQ
27
#undef MOVNTQ
28
#undef MOVNTQ2
29
#undef PREFETCH
30
31
32
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
33
#define MOVNTQ2 "movntq "
34
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
35
36
/*
 * Opens an `__asm__ volatile(` statement (deliberately left unterminated; the
 * operand lists and closing `);` are supplied later, e.g. by
 * YSCALEYUV2PACKEDX_END) and emits the chroma part of the vertical filter.
 *
 * - FF_REG_a is zeroed and label "1:" marks the start of the per-pixel-group
 *   loop that the WRITE* macros jump back to.
 * - FF_REG_d walks a list starting at CHR_MMX_FILTER_OFFSET(%0) in 16-byte
 *   steps: offset 0 holds a source pointer (loaded into FF_REG_S), offset 8
 *   the coefficient (mm0). The inner loop "2:" ends when the next source
 *   pointer is zero.
 * - U samples are read at (FF_REG_S, FF_REG_a), V samples at the same address
 *   plus operand %6.
 * - Products (pmulhw) are accumulated on top of the VROUNDER_OFFSET(%0) value:
 *   U in mm3, V in mm4.
 */
#define YSCALEYUV2PACKEDX_UV \
37
0
    __asm__ volatile(\
38
0
        "xor                %%"FF_REG_a", %%"FF_REG_a"  \n\t"\
39
0
        ".p2align                      4                \n\t"\
40
0
        "nop                                            \n\t"\
41
0
        "1:                                             \n\t"\
42
0
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d"  \n\t"\
43
0
        "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
44
0
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
45
0
        "movq                      %%mm3, %%mm4         \n\t"\
46
0
        ".p2align                      4                \n\t"\
47
0
        "2:                                             \n\t"\
48
0
        "movq            8(%%"FF_REG_d"), %%mm0         \n\t" /* filterCoeff */\
49
0
        "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm2      \n\t" /* UsrcData */\
50
0
        "add                          %6, %%"FF_REG_S"  \n\t" \
51
0
        "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm5      \n\t" /* VsrcData */\
52
0
        "add                         $16, %%"FF_REG_d"  \n\t"\
53
0
        "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
54
0
        "pmulhw                    %%mm0, %%mm2         \n\t"\
55
0
        "pmulhw                    %%mm0, %%mm5         \n\t"\
56
0
        "paddw                     %%mm2, %%mm3         \n\t"\
57
0
        "paddw                     %%mm5, %%mm4         \n\t"\
58
0
        "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\
59
0
        " jnz                         2b                \n\t"\
60
61
/*
 * Vertical filter for one plane (used below for luma and for alpha), emitted
 * as asm strings to be placed inside an already opened asm statement.
 *
 *   offset     string constant: displacement from %0 of the filter list
 *   coeff      MMX register receiving the current coefficient
 *   src1/src2  scratch MMX registers for the two source quadwords
 *   dst1/dst2  MMX accumulators; both start from VROUNDER_OFFSET(%0)
 *
 * The list is walked like in YSCALEYUV2PACKEDX_UV (16-byte entries, source
 * pointer at offset 0, coefficient at offset 8, zero pointer terminates).
 * Two quadwords are read per entry, at (FF_REG_S, FF_REG_a, 2) and 8 bytes
 * further, and their pmulhw products are added to dst1 and dst2.
 * Clobbers FF_REG_d and FF_REG_S; reuses the local label "2:".
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
62
    "lea                "offset"(%0), %%"FF_REG_d"  \n\t"\
63
    "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
64
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
65
    "movq                    "#dst1", "#dst2"       \n\t"\
66
    ".p2align                      4                \n\t"\
67
    "2:                                             \n\t"\
68
    "movq            8(%%"FF_REG_d"), "#coeff"      \n\t" /* filterCoeff */\
69
    "movq  (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" /* Y1srcData */\
70
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\
71
    "add                         $16, %%"FF_REG_d"  \n\t"\
72
    "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
73
    "pmulhw                 "#coeff", "#src1"       \n\t"\
74
    "pmulhw                 "#coeff", "#src2"       \n\t"\
75
    "paddw                   "#src1", "#dst1"       \n\t"\
76
    "paddw                   "#src2", "#dst2"       \n\t"\
77
    "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\
78
    " jnz                         2b                \n\t"\
79
80
/*
 * Complete vertical filter prologue: chroma (YSCALEYUV2PACKEDX_UV) followed by
 * luma (YSCALEYUV2PACKEDX_YA on LUM_MMX_FILTER_OFFSET).
 * On exit: mm3 = U, mm4 = V, mm1 = first luma quadword, mm7 = second luma
 * quadword. The asm statement opened by the UV macro is still open.
 */
#define YSCALEYUV2PACKEDX \
81
0
    YSCALEYUV2PACKEDX_UV \
82
0
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
83
84
/*
 * Closes the asm statement opened by YSCALEYUV2PACKEDX_UV /
 * YSCALEYUV2PACKEDX_ACCURATE_UV. No output operands; inputs are:
 *   %0       &c->redDither (base address for all the *_OFFSET accesses)
 *   %1..%3   dummy (placeholders that keep the later operand numbers fixed)
 *   %4       dest
 *   %5       dstW_reg
 *   %6       uv_off
 * plus the named constants bF8 and bFC via NAMED_CONSTRAINTS_ADD.
 * Clobbers FF_REG_a, FF_REG_d and FF_REG_S.
 * Expects the locals c, dummy, dest, dstW_reg and uv_off to be in scope at the
 * point of use.
 */
#define YSCALEYUV2PACKEDX_END                     \
85
0
        :: "r" (&c->redDither),                   \
86
0
            "m" (dummy), "m" (dummy), "m" (dummy),\
87
0
            "r" (dest), "m" (dstW_reg), "m"(uv_off) \
88
0
            NAMED_CONSTRAINTS_ADD(bF8,bFC) \
89
0
        : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S            \
90
0
    );
91
92
/*
 * Chroma vertical filter using pmaddwd with 32-bit accumulators instead of
 * the pmulhw/paddw scheme of YSCALEYUV2PACKEDX_UV. Opens an
 * `__asm__ volatile(` statement that is left unterminated.
 *
 * Each list entry (APCK_SIZE bytes, starting at CHR_MMX_FILTER_OFFSET(%0))
 * provides two source pointers (offset 0 and APCK_PTR2) and one coefficient
 * quadword (APCK_COEF). Samples from the two sources are interleaved
 * (punpcklwd/punpckhwd) so that one pmaddwd handles both. The loop "2:" ends
 * when the first pointer of the next entry is zero.
 *
 * Accumulators: mm4/mm5 for U, mm6/mm7 for V. After the loop they are
 * shifted right by 16, packed to words, offset by VROUNDER_OFFSET(%0) and
 * stored to U_TEMP(%0) and V_TEMP(%0). V samples are read at the U address
 * plus operand %6.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
93
0
    __asm__ volatile(\
94
0
        "xor %%"FF_REG_a", %%"FF_REG_a"                 \n\t"\
95
0
        ".p2align                      4                \n\t"\
96
0
        "nop                                            \n\t"\
97
0
        "1:                                             \n\t"\
98
0
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d"  \n\t"\
99
0
        "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
100
0
        "pxor                      %%mm4, %%mm4         \n\t"\
101
0
        "pxor                      %%mm5, %%mm5         \n\t"\
102
0
        "pxor                      %%mm6, %%mm6         \n\t"\
103
0
        "pxor                      %%mm7, %%mm7         \n\t"\
104
0
        ".p2align                      4                \n\t"\
105
0
        "2:                                             \n\t"\
106
0
        "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm0      \n\t" /* UsrcData */\
107
0
        "add                          %6, %%"FF_REG_S"  \n\t" \
108
0
        "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm2      \n\t" /* VsrcData */\
109
0
        "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
110
0
        "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm1      \n\t" /* UsrcData */\
111
0
        "movq                      %%mm0, %%mm3         \n\t"\
112
0
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
113
0
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
114
0
        "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1      \n\t" /* filterCoeff */\
115
0
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
116
0
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
117
0
        "paddd                     %%mm0, %%mm4         \n\t"\
118
0
        "paddd                     %%mm3, %%mm5         \n\t"\
119
0
        "add                          %6, %%"FF_REG_S"  \n\t" \
120
0
        "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm3      \n\t" /* VsrcData */\
121
0
        "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
122
0
        "add           $"STR(APCK_SIZE)", %%"FF_REG_d"  \n\t"\
123
0
        "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\
124
0
        "movq                      %%mm2, %%mm0         \n\t"\
125
0
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
126
0
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
127
0
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
128
0
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
129
0
        "paddd                     %%mm2, %%mm6         \n\t"\
130
0
        "paddd                     %%mm0, %%mm7         \n\t"\
131
0
        " jnz                         2b                \n\t"\
132
0
        "psrad                       $16, %%mm4         \n\t"\
133
0
        "psrad                       $16, %%mm5         \n\t"\
134
0
        "psrad                       $16, %%mm6         \n\t"\
135
0
        "psrad                       $16, %%mm7         \n\t"\
136
0
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
137
0
        "packssdw                  %%mm5, %%mm4         \n\t"\
138
0
        "packssdw                  %%mm7, %%mm6         \n\t"\
139
0
        "paddw                     %%mm0, %%mm4         \n\t"\
140
0
        "paddw                     %%mm0, %%mm6         \n\t"\
141
0
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
142
0
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
143
144
/*
 * pmaddwd-based vertical filter for one plane (used below for luma and for
 * alpha); `offset` is the string displacement from %0 of the filter list.
 * The list layout and loop termination match YSCALEYUV2PACKEDX_ACCURATE_UV.
 *
 * Accumulators: mm1/mm5 for the first source quadword, mm7/mm6 for the
 * second. After the loop they are shifted right by 16, packed to words and
 * offset by VROUNDER_OFFSET(%0), leaving the results in mm1 and mm7.
 * Finally mm3 and mm4 are reloaded from U_TEMP(%0) and V_TEMP(%0).
 * All of mm0..mm7 are overwritten, as are FF_REG_d and FF_REG_S.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
145
    "lea                "offset"(%0), %%"FF_REG_d"      \n\t"\
146
    "mov                 (%%"FF_REG_d"), %%"FF_REG_S"   \n\t"\
147
    "pxor                      %%mm1, %%mm1         \n\t"\
148
    "pxor                      %%mm5, %%mm5         \n\t"\
149
    "pxor                      %%mm7, %%mm7         \n\t"\
150
    "pxor                      %%mm6, %%mm6         \n\t"\
151
    ".p2align                      4                \n\t"\
152
    "2:                                             \n\t"\
153
    "movq  (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0       \n\t" /* Y1srcData */\
154
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2       \n\t" /* Y2srcData */\
155
    "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S"   \n\t"\
156
    "movq  (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4       \n\t" /* Y1srcData */\
157
    "movq                      %%mm0, %%mm3         \n\t"\
158
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
159
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
160
    "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4     \n\t" /* filterCoeff */\
161
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
162
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
163
    "paddd                     %%mm0, %%mm1         \n\t"\
164
    "paddd                     %%mm3, %%mm5         \n\t"\
165
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3   \n\t" /* Y2srcData */\
166
    "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
167
    "add           $"STR(APCK_SIZE)", %%"FF_REG_d"  \n\t"\
168
    "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\
169
    "movq                      %%mm2, %%mm0         \n\t"\
170
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
171
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
172
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
173
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
174
    "paddd                     %%mm2, %%mm7         \n\t"\
175
    "paddd                     %%mm0, %%mm6         \n\t"\
176
    " jnz                         2b                \n\t"\
177
    "psrad                       $16, %%mm1         \n\t"\
178
    "psrad                       $16, %%mm5         \n\t"\
179
    "psrad                       $16, %%mm7         \n\t"\
180
    "psrad                       $16, %%mm6         \n\t"\
181
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
182
    "packssdw                  %%mm5, %%mm1         \n\t"\
183
    "packssdw                  %%mm6, %%mm7         \n\t"\
184
    "paddw                     %%mm0, %%mm1         \n\t"\
185
    "paddw                     %%mm0, %%mm7         \n\t"\
186
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
187
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\
188
189
/*
 * pmaddwd-based counterpart of YSCALEYUV2PACKEDX: chroma pass followed by the
 * luma pass on LUM_MMX_FILTER_OFFSET.
 * On exit: mm3 = U, mm4 = V (reloaded from U_TEMP/V_TEMP), mm1 and mm7 hold
 * the two luma quadwords. The asm statement is still open.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
190
0
    YSCALEYUV2PACKEDX_ACCURATE_UV \
191
0
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
192
193
/*
 * YUV -> RGB conversion on the registers produced by YSCALEYUV2PACKEDX /
 * YSCALEYUV2PACKEDX_ACCURATE.
 *
 * In:  mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2 (16-bit words).
 * Out: mm2 = B, mm4 = G, mm5 = R, each holding eight unsigned bytes
 *      (packuswb saturates).
 * Offsets and coefficients are read from U_OFFSET, V_OFFSET, Y_OFFSET,
 * UG/VG/UB/VR_COEFF and Y_COEFF relative to %0.
 * Each chroma word is duplicated (punpcklwd/punpckhwd with itself) so that it
 * is applied to two neighbouring luma samples.
 * Clobbers mm0, mm3 and mm6 as scratch.
 */
#define YSCALEYUV2RGBX \
194
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
195
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
196
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
197
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
198
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
199
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
200
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
201
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
202
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
203
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
204
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
205
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
206
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
207
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
208
    "paddw           %%mm3, %%mm4       \n\t"\
209
    "movq            %%mm2, %%mm0       \n\t"\
210
    "movq            %%mm5, %%mm6       \n\t"\
211
    "movq            %%mm4, %%mm3       \n\t"\
212
    "punpcklwd       %%mm2, %%mm2       \n\t"\
213
    "punpcklwd       %%mm5, %%mm5       \n\t"\
214
    "punpcklwd       %%mm4, %%mm4       \n\t"\
215
    "paddw           %%mm1, %%mm2       \n\t"\
216
    "paddw           %%mm1, %%mm5       \n\t"\
217
    "paddw           %%mm1, %%mm4       \n\t"\
218
    "punpckhwd       %%mm0, %%mm0       \n\t"\
219
    "punpckhwd       %%mm6, %%mm6       \n\t"\
220
    "punpckhwd       %%mm3, %%mm3       \n\t"\
221
    "paddw           %%mm7, %%mm0       \n\t"\
222
    "paddw           %%mm7, %%mm6       \n\t"\
223
    "paddw           %%mm7, %%mm3       \n\t"\
224
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
225
    "packuswb        %%mm0, %%mm2       \n\t"\
226
    "packuswb        %%mm6, %%mm5       \n\t"\
227
    "packuswb        %%mm3, %%mm4       \n\t"\
228
229
/*
 * Interleaves four byte-packed channel registers into eight 32-bit pixels
 * and stores them, then closes the per-group loop.
 *
 *   dst, index   base register and pixel index; stores go to (dst, index, 4)
 *   dstw         string naming the operand holding the width to compare with
 *   b, g, r, a   MMX registers with 8 bytes of each channel (b and r are
 *                overwritten)
 *   q0, q2, q3, t  scratch MMX registers
 *
 * 32 bytes are written with MOVNTQ, index is advanced by 8, and control
 * jumps back to label "1:" while index is below dstw (unsigned compare, jb).
 * WRITEBGR32 is the indirection layer that lets arguments be macro-expanded
 * before REAL_WRITEBGR32 stringifies them.
 */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
230
    "movq       "#b", "#q2"     \n\t" /* B */\
231
    "movq       "#r", "#t"      \n\t" /* R */\
232
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
233
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
234
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
235
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
236
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
237
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
238
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
239
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
240
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
241
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
242
\
243
    MOVNTQ(   q0,   (dst, index, 4))\
244
    MOVNTQ(    b,  8(dst, index, 4))\
245
    MOVNTQ(   q2, 16(dst, index, 4))\
246
    MOVNTQ(   q3, 24(dst, index, 4))\
247
\
248
    "add      $8, "#index"      \n\t"\
249
    "cmp  "dstw", "#index"      \n\t"\
250
    " jb      1b                \n\t"
251
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
252
253
/*
 * Vertically filters the input planes and writes one line of 32-bit packed
 * pixels, using the pmaddwd-based YSCALEYUV2PACKEDX_ACCURATE path.
 *
 * Only c, dest and dstW are referenced from C. The filter and source pointer
 * arguments and dstY are not used directly; the asm reads its inputs through
 * offsets relative to &c->redDither (operand %0).
 *
 * With CONFIG_SWSCALE_ALPHA and c->needAlpha set, B/G/R are parked in
 * U_TEMP/V_TEMP/Y_TEMP while the alpha plane is filtered (that macro
 * overwrites every MMX register and reloads mm3/mm4 from U_TEMP/V_TEMP), so
 * WRITEBGR32 receives B in mm3, G in mm4, R in mm5 and alpha in mm1.
 * Otherwise the alpha register mm7 is set to all ones (pcmpeqd).
 */
static void RENAME(yuv2rgb32_X_ar)(SwsInternal *c, const int16_t *lumFilter,
254
                                   const int16_t **lumSrc, int lumFilterSize,
255
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
256
                                   const int16_t **chrVSrc,
257
                                   int chrFilterSize, const int16_t **alpSrc,
258
                                   uint8_t *dest, int dstW, int dstY)
259
0
{
260
0
    x86_reg dummy=0;
261
0
    x86_reg dstW_reg = dstW;
262
0
    x86_reg uv_off = c->uv_offx2;
263
264
0
    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
265
0
        YSCALEYUV2PACKEDX_ACCURATE
266
0
        YSCALEYUV2RGBX
267
0
        "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
268
0
        "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
269
0
        "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
270
0
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
271
0
        "movq               "Y_TEMP"(%0), %%mm5         \n\t"
272
0
        "psraw                        $3, %%mm1         \n\t"
273
0
        "psraw                        $3, %%mm7         \n\t"
274
0
        "packuswb                  %%mm7, %%mm1         \n\t"
275
0
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
276
0
        YSCALEYUV2PACKEDX_END
277
0
    } else {
278
0
        YSCALEYUV2PACKEDX_ACCURATE
279
0
        YSCALEYUV2RGBX
280
0
        "pcmpeqd %%mm7, %%mm7 \n\t"
281
0
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
282
0
        YSCALEYUV2PACKEDX_END
283
0
    }
284
0
}
285
286
/*
 * Same output as the _ar variant above, but built on the pmulhw-based
 * YSCALEYUV2PACKEDX path.
 *
 * Only c, dest and dstW are referenced from C; the remaining arguments are
 * not used directly (the asm addresses its inputs relative to &c->redDither).
 *
 * With CONFIG_SWSCALE_ALPHA and c->needAlpha set, the alpha plane is filtered
 * into mm1/mm7 using mm0, mm3 and mm6 as scratch, which leaves B/G/R in
 * mm2/mm4/mm5 untouched; the result is shifted right by 3 and packed into
 * mm1. Otherwise the alpha register mm7 is set to all ones.
 */
static void RENAME(yuv2rgb32_X)(SwsInternal *c, const int16_t *lumFilter,
287
                                const int16_t **lumSrc, int lumFilterSize,
288
                                const int16_t *chrFilter, const int16_t **chrUSrc,
289
                                const int16_t **chrVSrc,
290
                                int chrFilterSize, const int16_t **alpSrc,
291
                                uint8_t *dest, int dstW, int dstY)
292
0
{
293
0
    x86_reg dummy=0;
294
0
    x86_reg dstW_reg = dstW;
295
0
    x86_reg uv_off = c->uv_offx2;
296
297
0
    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
298
0
        YSCALEYUV2PACKEDX
299
0
        YSCALEYUV2RGBX
300
0
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
301
0
        "psraw                        $3, %%mm1         \n\t"
302
0
        "psraw                        $3, %%mm7         \n\t"
303
0
        "packuswb                  %%mm7, %%mm1         \n\t"
304
0
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
305
0
        YSCALEYUV2PACKEDX_END
306
0
    } else {
307
0
        YSCALEYUV2PACKEDX
308
0
        YSCALEYUV2RGBX
309
0
        "pcmpeqd %%mm7, %%mm7 \n\t"
310
0
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
311
0
        YSCALEYUV2PACKEDX_END
312
0
    }
313
0
}
314
315
/*
 * Identical to yuv2rgb32_X except that the R (mm5) and B (mm2) registers are
 * passed to WRITEBGR32 in swapped positions, which swaps those two channels
 * in the stored pixels.
 *
 * Only c, dest and dstW are referenced from C; the remaining arguments are
 * not used directly (the asm addresses its inputs relative to &c->redDither).
 */
static void RENAME(yuv2bgr32_X)(SwsInternal *c, const int16_t *lumFilter,
316
                                const int16_t **lumSrc, int lumFilterSize,
317
                                const int16_t *chrFilter, const int16_t **chrUSrc,
318
                                const int16_t **chrVSrc,
319
                                int chrFilterSize, const int16_t **alpSrc,
320
                                uint8_t *dest, int dstW, int dstY)
321
0
{
322
0
    x86_reg dummy=0;
323
0
    x86_reg dstW_reg = dstW;
324
0
    x86_reg uv_off = c->uv_offx2;
325
326
0
    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
327
0
        YSCALEYUV2PACKEDX
328
0
        YSCALEYUV2RGBX
329
0
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
330
0
        "psraw                        $3, %%mm1         \n\t"
331
0
        "psraw                        $3, %%mm7         \n\t"
332
0
        "packuswb                  %%mm7, %%mm1         \n\t"
333
0
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
334
0
        YSCALEYUV2PACKEDX_END
335
0
    } else {
336
0
        YSCALEYUV2PACKEDX
337
0
        YSCALEYUV2RGBX
338
0
        "pcmpeqd %%mm7, %%mm7 \n\t"
339
0
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
340
0
        YSCALEYUV2PACKEDX_END
341
0
    }
342
0
}
343
344
/*
 * Packs byte-wide B (mm2), G (mm4) and R (mm5) into eight 16-bit pixels and
 * stores them at (dst, index, 2); mm7 is expected to be zero on entry (the
 * callers clear it with pxor).
 *
 * B and R are masked with bF8 and G with bFC (presumably 0xF8 / 0xFC byte
 * masks, i.e. 5/6/5 significant bits -- the constants are defined elsewhere),
 * then shifted and merged. 16 bytes are written with MOVNTQ, index is
 * advanced by 8 and control jumps back to label "1:" while index is below
 * dstw. Clobbers mm1..mm5.
 * WRITERGB16 is the argument-expanding wrapper.
 */
#define REAL_WRITERGB16(dst, dstw, index) \
345
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
346
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
347
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
348
    "psrlq           $3, %%mm2  \n\t"\
349
\
350
    "movq         %%mm2, %%mm1  \n\t"\
351
    "movq         %%mm4, %%mm3  \n\t"\
352
\
353
    "punpcklbw    %%mm7, %%mm3  \n\t"\
354
    "punpcklbw    %%mm5, %%mm2  \n\t"\
355
    "punpckhbw    %%mm7, %%mm4  \n\t"\
356
    "punpckhbw    %%mm5, %%mm1  \n\t"\
357
\
358
    "psllq           $3, %%mm3  \n\t"\
359
    "psllq           $3, %%mm4  \n\t"\
360
\
361
    "por          %%mm3, %%mm2  \n\t"\
362
    "por          %%mm4, %%mm1  \n\t"\
363
\
364
    MOVNTQ(%%mm2,  (dst, index, 2))\
365
    MOVNTQ(%%mm1, 8(dst, index, 2))\
366
\
367
    "add             $8, "#index"   \n\t"\
368
    "cmp         "dstw", "#index"   \n\t"\
369
    " jb             1b             \n\t"
370
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
371
372
/*
 * Vertically filters the input planes (pmaddwd-based ACCURATE path), converts
 * to RGB, adds the per-channel dither bytes found at BLUE_DITHER,
 * GREEN_DITHER and RED_DITHER relative to %0 with unsigned saturation
 * (paddusb), and writes 16-bit pixels through WRITERGB16.
 *
 * Only c, dest and dstW are referenced from C; the remaining arguments are
 * not used directly (the asm addresses its inputs relative to &c->redDither).
 */
static void RENAME(yuv2rgb565_X_ar)(SwsInternal *c, const int16_t *lumFilter,
373
                                    const int16_t **lumSrc, int lumFilterSize,
374
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
375
                                    const int16_t **chrVSrc,
376
                                    int chrFilterSize, const int16_t **alpSrc,
377
                                    uint8_t *dest, int dstW, int dstY)
378
0
{
379
0
    x86_reg dummy=0;
380
0
    x86_reg dstW_reg = dstW;
381
0
    x86_reg uv_off = c->uv_offx2;
382
383
0
    YSCALEYUV2PACKEDX_ACCURATE
384
0
    YSCALEYUV2RGBX
385
0
    "pxor %%mm7, %%mm7 \n\t"
386
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
387
0
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
388
0
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
389
0
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
390
0
    WRITERGB16(%4, "%5", %%FF_REGa)
391
0
    YSCALEYUV2PACKEDX_END
392
0
}
393
394
/*
 * Same as yuv2rgb565_X_ar but built on the pmulhw-based YSCALEYUV2PACKEDX
 * vertical filter: filter, convert to RGB, add saturating per-channel dither
 * and write 16-bit pixels through WRITERGB16.
 *
 * Only c, dest and dstW are referenced from C; the remaining arguments are
 * not used directly (the asm addresses its inputs relative to &c->redDither).
 */
static void RENAME(yuv2rgb565_X)(SwsInternal *c, const int16_t *lumFilter,
395
                                 const int16_t **lumSrc, int lumFilterSize,
396
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
397
                                 const int16_t **chrVSrc,
398
                                 int chrFilterSize, const int16_t **alpSrc,
399
                                 uint8_t *dest, int dstW, int dstY)
400
0
{
401
0
    x86_reg dummy=0;
402
0
    x86_reg dstW_reg = dstW;
403
0
    x86_reg uv_off = c->uv_offx2;
404
405
0
    YSCALEYUV2PACKEDX
406
0
    YSCALEYUV2RGBX
407
0
    "pxor %%mm7, %%mm7 \n\t"
408
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
409
0
    "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
410
0
    "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
411
0
    "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
412
0
    WRITERGB16(%4, "%5", %%FF_REGa)
413
0
    YSCALEYUV2PACKEDX_END
414
0
}
415
416
/*
 * 15-bit counterpart of REAL_WRITERGB16: all three channels are masked with
 * bF8, R is additionally shifted right by 1 and G is shifted left by 2
 * instead of 3 when merged. Inputs are B in mm2, G in mm4, R in mm5, with
 * mm7 expected to be zero (the callers clear it with pxor).
 *
 * 16 bytes (eight pixels) are written with MOVNTQ at (dst, index, 2), index
 * is advanced by 8 and control jumps back to label "1:" while index is below
 * dstw. Clobbers mm1..mm5.
 * WRITERGB15 is the argument-expanding wrapper.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
417
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
418
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
419
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
420
    "psrlq           $3, %%mm2  \n\t"\
421
    "psrlq           $1, %%mm5  \n\t"\
422
\
423
    "movq         %%mm2, %%mm1  \n\t"\
424
    "movq         %%mm4, %%mm3  \n\t"\
425
\
426
    "punpcklbw    %%mm7, %%mm3  \n\t"\
427
    "punpcklbw    %%mm5, %%mm2  \n\t"\
428
    "punpckhbw    %%mm7, %%mm4  \n\t"\
429
    "punpckhbw    %%mm5, %%mm1  \n\t"\
430
\
431
    "psllq           $2, %%mm3  \n\t"\
432
    "psllq           $2, %%mm4  \n\t"\
433
\
434
    "por          %%mm3, %%mm2  \n\t"\
435
    "por          %%mm4, %%mm1  \n\t"\
436
\
437
    MOVNTQ(%%mm2,  (dst, index, 2))\
438
    MOVNTQ(%%mm1, 8(dst, index, 2))\
439
\
440
    "add             $8, "#index"   \n\t"\
441
    "cmp         "dstw", "#index"   \n\t"\
442
    " jb             1b             \n\t"
443
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
444
445
/*
 * Vertically filters the input planes (pmaddwd-based ACCURATE path), converts
 * to RGB, adds the saturating per-channel dither bytes read relative to %0,
 * and writes 15-bit pixels through WRITERGB15.
 *
 * Only c, dest and dstW are referenced from C; the remaining arguments are
 * not used directly (the asm addresses its inputs relative to &c->redDither).
 */
static void RENAME(yuv2rgb555_X_ar)(SwsInternal *c, const int16_t *lumFilter,
446
                                    const int16_t **lumSrc, int lumFilterSize,
447
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
448
                                    const int16_t **chrVSrc,
449
                                    int chrFilterSize, const int16_t **alpSrc,
450
                                    uint8_t *dest, int dstW, int dstY)
451
0
{
452
0
    x86_reg dummy=0;
453
0
    x86_reg dstW_reg = dstW;
454
0
    x86_reg uv_off = c->uv_offx2;
455
456
0
    YSCALEYUV2PACKEDX_ACCURATE
457
0
    YSCALEYUV2RGBX
458
0
    "pxor %%mm7, %%mm7 \n\t"
459
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
460
0
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
461
0
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
462
0
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
463
0
    WRITERGB15(%4, "%5", %%FF_REGa)
464
0
    YSCALEYUV2PACKEDX_END
465
0
}
466
467
/*
 * Same as yuv2rgb555_X_ar but built on the pmulhw-based YSCALEYUV2PACKEDX
 * vertical filter: filter, convert to RGB, add saturating per-channel dither
 * and write 15-bit pixels through WRITERGB15.
 *
 * Only c, dest and dstW are referenced from C; the remaining arguments are
 * not used directly (the asm addresses its inputs relative to &c->redDither).
 */
static void RENAME(yuv2rgb555_X)(SwsInternal *c, const int16_t *lumFilter,
468
                                 const int16_t **lumSrc, int lumFilterSize,
469
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
470
                                 const int16_t **chrVSrc,
471
                                 int chrFilterSize, const int16_t **alpSrc,
472
                                 uint8_t *dest, int dstW, int dstY)
473
0
{
474
0
    x86_reg dummy=0;
475
0
    x86_reg dstW_reg = dstW;
476
0
    x86_reg uv_off = c->uv_offx2;
477
478
0
    YSCALEYUV2PACKEDX
479
0
    YSCALEYUV2RGBX
480
0
    "pxor %%mm7, %%mm7 \n\t"
481
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
482
0
    "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
483
0
    "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
484
0
    "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
485
0
    WRITERGB15(%4, "%5", %%FF_REGa)
486
0
    YSCALEYUV2PACKEDX_END
487
0
}
488
489
/*
 * Plain-MMX 24-bit writer: expands B (mm2), G (mm4), R (mm5) with mm7 = 0
 * into four "0RGB0RGB" quadwords, squeezes out the padding bytes with shifts
 * and ORs, and stores the resulting 24 bytes (eight pixels) at (dst),
 * 8(dst) and 16(dst).
 *
 * Afterwards dst is advanced by 24 and index by 8, and control jumps back to
 * label "1:" while index is below dstw. All of mm0..mm7 are overwritten.
 * Note that the WRITEBGR24 alias defined further down selects the MMXEXT
 * variant, not this one.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
490
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
491
    "movq      %%mm2, %%mm1     \n\t" /* B */\
492
    "movq      %%mm5, %%mm6     \n\t" /* R */\
493
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
494
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
495
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
496
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
497
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
498
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
499
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
500
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
501
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
502
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
503
\
504
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
505
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
506
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
507
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
508
\
509
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
510
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
511
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
512
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
513
\
514
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
515
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
516
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
517
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
518
\
519
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
520
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
521
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
522
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
523
    MOVNTQ(%%mm0, (dst))\
524
\
525
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
526
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
527
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
528
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
529
    MOVNTQ(%%mm6, 8(dst))\
530
\
531
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
532
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
533
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
534
    MOVNTQ(%%mm5, 16(dst))\
535
\
536
    "add         $24, "#dst"    \n\t"\
537
\
538
    "add          $8, "#index"  \n\t"\
539
    "cmp      "dstw", "#index"  \n\t"\
540
    " jb          1b            \n\t"
541
542
/*
 * 24-bit writer using pshufw: each of the three output quadwords is built by
 * shuffling B (mm2), G (mm4) and R (mm5) so the wanted bytes line up, masking
 * with the M24A / M24B / M24C constants, and OR-ing the three channels
 * together. G is shifted right by 8 after the first quadword so that its
 * bytes fall on the required positions for the remaining two.
 *
 * 24 bytes (eight pixels) are stored at (dst), 8(dst) and 16(dst); dst is
 * then advanced by 24 and index by 8, and control jumps back to label "1:"
 * while index is below dstw. Overwrites mm0, mm1, mm3, mm4, mm6 and mm7; the
 * incoming value of mm7 is not used because it is reloaded with M24C.
 */
#define WRITEBGR24MMXEXT(dst, dstw, index) \
543
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
544
    "movq "MANGLE(M24A)", %%mm0 \n\t"\
545
    "movq "MANGLE(M24C)", %%mm7 \n\t"\
546
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
547
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
548
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
549
\
550
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
551
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
552
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
553
\
554
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
555
    "por    %%mm1, %%mm6        \n\t"\
556
    "por    %%mm3, %%mm6        \n\t"\
557
    MOVNTQ(%%mm6, (dst))\
558
\
559
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
560
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
561
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
562
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
563
\
564
    "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
565
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
566
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
567
\
568
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
569
    "por    %%mm3, %%mm6        \n\t"\
570
    MOVNTQ(%%mm6, 8(dst))\
571
\
572
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
573
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
574
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
575
\
576
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
577
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
578
    "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
579
\
580
    "por    %%mm1, %%mm3        \n\t"\
581
    "por    %%mm3, %%mm6        \n\t"\
582
    MOVNTQ(%%mm6, 16(dst))\
583
\
584
    "add      $24, "#dst"       \n\t"\
585
\
586
    "add       $8, "#index"     \n\t"\
587
    "cmp   "dstw", "#index"     \n\t"\
588
    " jb       1b               \n\t"
589
590
/* Select the pshufw-based (MMXEXT) implementation as the 24-bit writer. */
#undef WRITEBGR24
591
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMXEXT(dst, dstw, index)
592
593
#if HAVE_6REGS
594
/*
 * Vertically filters the input planes (pmaddwd-based ACCURATE path), converts
 * to RGB and writes 24-bit packed pixels through WRITEBGR24.
 *
 * The output pointer is recomputed on every loop iteration as
 * dest + 3 * index in FF_REG_c (the lea/add pair sits after label "1:"), so
 * this function needs FF_REG_c in addition to the registers used by the
 * other writers. It therefore spells out its own operand and clobber lists
 * instead of using YSCALEYUV2PACKEDX_END, and names M24A/M24C/M24B rather
 * than bF8/bFC.
 *
 * Only c, dest and dstW are referenced from C; the remaining arguments are
 * not used directly (the asm addresses its inputs relative to &c->redDither).
 */
static void RENAME(yuv2bgr24_X_ar)(SwsInternal *c, const int16_t *lumFilter,
595
                                   const int16_t **lumSrc, int lumFilterSize,
596
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
597
                                   const int16_t **chrVSrc,
598
                                   int chrFilterSize, const int16_t **alpSrc,
599
                                   uint8_t *dest, int dstW, int dstY)
600
0
{
601
0
    x86_reg dummy=0;
602
0
    x86_reg dstW_reg = dstW;
603
0
    x86_reg uv_off = c->uv_offx2;
604
605
0
    YSCALEYUV2PACKEDX_ACCURATE
606
0
    YSCALEYUV2RGBX
607
0
    "pxor %%mm7, %%mm7 \n\t"
608
0
    "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" //FIXME optimize
609
0
    "add %4, %%"FF_REG_c"                        \n\t"
610
0
    WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
611
0
    :: "r" (&c->redDither),
612
0
       "m" (dummy), "m" (dummy), "m" (dummy),
613
0
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
614
0
       NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B)
615
0
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
616
0
    );
617
0
}
618
619
/*
 * Same as yuv2bgr24_X_ar but built on the pmulhw-based YSCALEYUV2PACKEDX
 * vertical filter.
 *
 * The output pointer is recomputed on every loop iteration as
 * dest + 3 * index in FF_REG_c, hence the explicit operand/clobber lists that
 * add FF_REG_c and the M24A/M24C/M24B constants.
 *
 * Only c, dest and dstW are referenced from C; the remaining arguments are
 * not used directly (the asm addresses its inputs relative to &c->redDither).
 */
static void RENAME(yuv2bgr24_X)(SwsInternal *c, const int16_t *lumFilter,
620
                                const int16_t **lumSrc, int lumFilterSize,
621
                                const int16_t *chrFilter, const int16_t **chrUSrc,
622
                                const int16_t **chrVSrc,
623
                                int chrFilterSize, const int16_t **alpSrc,
624
                                uint8_t *dest, int dstW, int dstY)
625
0
{
626
0
    x86_reg dummy=0;
627
0
    x86_reg dstW_reg = dstW;
628
0
    x86_reg uv_off = c->uv_offx2;
629
630
0
    YSCALEYUV2PACKEDX
631
0
    YSCALEYUV2RGBX
632
0
    "pxor                    %%mm7, %%mm7              \n\t"
633
0
    "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" //FIXME optimize
634
0
    "add                        %4, %%"FF_REG_c"       \n\t"
635
0
    WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
636
0
    :: "r" (&c->redDither),
637
0
       "m" (dummy), "m" (dummy), "m" (dummy),
638
0
       "r" (dest),  "m" (dstW_reg), "m"(uv_off)
639
0
       NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B)
640
0
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
641
0
    );
642
0
}
643
#endif /* HAVE_6REGS */
644
645
/*
 * Loop tail that packs and stores one group of 8 output pixels (16 bytes).
 *
 * Inputs (16-bit words): mm1 and mm7 are packed together as the eight
 * "every other byte" samples, mm3 and mm4 are packed and byte-interleaved
 * with each other and then interleaved with the mm1/mm7 bytes.  In the
 * callers visible in this file mm1/mm7 come from the luma buffers and
 * mm3/mm4 from the chroma buffers.
 *
 * The two result quadwords are stored with MOVNTQ at dst + index*2 and
 * dst + index*2 + 8, index is advanced by 8 and the code jumps back to
 * local label "1" while index is below dstw (unsigned compare, "jb").
 *
 * dst and index are stringified (register names), dstw is pasted as an
 * already-quoted string.  WRITEYUY2 is the extra expansion level, presumably
 * so that arguments such as %%FF_REGa are macro-expanded before being
 * stringified.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
646
    "packuswb  %%mm3, %%mm3     \n\t"\
647
    "packuswb  %%mm4, %%mm4     \n\t"\
648
    "packuswb  %%mm7, %%mm1     \n\t"\
649
    "punpcklbw %%mm4, %%mm3     \n\t"\
650
    "movq      %%mm1, %%mm7     \n\t"\
651
    "punpcklbw %%mm3, %%mm1     \n\t"\
652
    "punpckhbw %%mm3, %%mm7     \n\t"\
653
\
654
    MOVNTQ(%%mm1, (dst, index, 2))\
655
    MOVNTQ(%%mm7, 8(dst, index, 2))\
656
\
657
    "add          $8, "#index"  \n\t"\
658
    "cmp      "dstw", "#index"  \n\t"\
659
    " jb          1b            \n\t"
660
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
661
662
/*
 * Multi-tap vertical scale to packed YUY2 output, variant built on
 * YSCALEYUV2PACKEDX_ACCURATE.
 *
 * The body is one inline-asm statement: the macro above opens it and
 * YSCALEYUV2PACKEDX_END presumably supplies the operand list and closes it
 * (not visible in this part of the file).  The locals dummy, dstW_reg and
 * uv_off are not used in the visible text, so they are presumably referenced
 * by that closing macro — do not rename or remove them without checking.
 *
 * WRITEYUY2 is given operand %4 as destination, operand %5 as the width and
 * FF_REGa as the running index.
 */
static void RENAME(yuv2yuyv422_X_ar)(SwsInternal *c, const int16_t *lumFilter,
663
                                     const int16_t **lumSrc, int lumFilterSize,
664
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
665
                                     const int16_t **chrVSrc,
666
                                     int chrFilterSize, const int16_t **alpSrc,
667
                                     uint8_t *dest, int dstW, int dstY)
668
0
{
669
0
    x86_reg dummy=0;
670
0
    x86_reg dstW_reg = dstW;
671
0
    x86_reg uv_off = c->uv_offx2;
672
673
0
    YSCALEYUV2PACKEDX_ACCURATE
674
    /* mm1/mm7 and mm3/mm4 are the four word vectors consumed by WRITEYUY2
     * (presumably luma in mm1/mm7, chroma in mm3/mm4); drop 3 bits of each
     * before they are packed to bytes */
675
0
    "psraw $3, %%mm3    \n\t"
676
0
    "psraw $3, %%mm4    \n\t"
677
0
    "psraw $3, %%mm1    \n\t"
678
0
    "psraw $3, %%mm7    \n\t"
679
0
    WRITEYUY2(%4, "%5", %%FF_REGa)
680
0
    YSCALEYUV2PACKEDX_END
681
0
}
682
683
/*
 * Multi-tap vertical scale to packed YUY2 output, variant built on
 * YSCALEYUV2PACKEDX (the "_ar" sibling uses YSCALEYUV2PACKEDX_ACCURATE).
 *
 * The body is one inline-asm statement: the macro above opens it and
 * YSCALEYUV2PACKEDX_END presumably supplies the operand list and closes it
 * (not visible in this part of the file).  The locals dummy, dstW_reg and
 * uv_off are not used in the visible text, so they are presumably referenced
 * by that closing macro — do not rename or remove them without checking.
 *
 * WRITEYUY2 is given operand %4 as destination, operand %5 as the width and
 * FF_REGa as the running index.
 */
static void RENAME(yuv2yuyv422_X)(SwsInternal *c, const int16_t *lumFilter,
684
                                  const int16_t **lumSrc, int lumFilterSize,
685
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
686
                                  const int16_t **chrVSrc,
687
                                  int chrFilterSize, const int16_t **alpSrc,
688
                                  uint8_t *dest, int dstW, int dstY)
689
0
{
690
0
    x86_reg dummy=0;
691
0
    x86_reg dstW_reg = dstW;
692
0
    x86_reg uv_off = c->uv_offx2;
693
694
0
    YSCALEYUV2PACKEDX
695
    /* mm1/mm7 and mm3/mm4 are the four word vectors consumed by WRITEYUY2
     * (presumably luma in mm1/mm7, chroma in mm3/mm4); drop 3 bits of each
     * before they are packed to bytes */
696
0
    "psraw $3, %%mm3    \n\t"
697
0
    "psraw $3, %%mm4    \n\t"
698
0
    "psraw $3, %%mm1    \n\t"
699
0
    "psraw $3, %%mm7    \n\t"
700
0
    WRITEYUY2(%4, "%5", %%FF_REGa)
701
0
    YSCALEYUV2PACKEDX_END
702
0
}
703
704
/*
 * Chroma half of the two-row (bilinear) YUV->RGB loop.  Zeroes the index
 * register, defines local label "1" (the loop head the WRITE* macros jump
 * back to) and blends four chroma samples of each plane:
 *
 *     blended = (row1 >> 4) + pmulhw(row0 - row1, weight)
 *
 * where row0 is read through operand %2, row1 through operand %3 and the
 * weight from CHR_MMX_FILTER_OFFSET+8 relative to c.  The second plane is
 * read from the same pointers after temporarily adding UV_OFF_BYTE(c) to the
 * index (the "+2048" in the inline comments is that offset; "eax" stands for
 * the index register).
 *
 * On exit: mm2 = first plane minus U_OFFSET, mm5 = second plane minus
 * V_OFFSET, mm3 = pmulhw(mm2, UG_COEFF), mm4 = pmulhw(mm5, VG_COEFF).
 * mm0 is used as scratch.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
705
    "xor            "#index", "#index"  \n\t"\
706
    ".p2align              4            \n\t"\
707
    "1:                                 \n\t"\
708
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
709
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
710
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
711
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
712
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
713
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
714
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
715
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
716
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
717
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
718
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
719
    "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
720
    "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+2048] >>4*/\
721
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
722
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
723
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
724
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
725
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
726
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
727
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
728
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
729
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
730
731
/*
 * Two-row blend of eight 16-bit samples read from b1 and b2 at byte offset
 * index*2 (two quadwords each):
 *
 *     blended = (b2 >> 4) + pmulhw(b1 - b2, weight)
 *
 * with the weight taken from LUM_MMX_FILTER_OFFSET+8 relative to c.
 * Results: mm1 = samples 0..3, mm7 = samples 4..7; mm0 and mm6 are scratch.
 * Used for luma (b1/b2 = %0/%1) and, in yuv2rgb32_2, a second time for the
 * alpha rows.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
732
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
733
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
734
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
735
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
736
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
737
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
738
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
739
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
740
    "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
741
    "psraw                $4, %%mm7     \n\t" /* buf1[eax] >>4*/\
742
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
743
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
744
745
/*
 * Colour-matrix step shared by the two-row RGB paths.
 *
 * Expects mm1/mm7 = luma words, mm2/mm5 = offset-corrected chroma words and
 * mm3/mm4 = their green contributions (the register state left by
 * REAL_YSCALEYUV2RGB_UV followed by REAL_YSCALEYUV2RGB_YA).
 *
 * Multiplies chroma by UB_COEFF / VR_COEFF, subtracts Y_OFFSET from luma and
 * multiplies it by Y_COEFF (all read relative to c), sums the two green
 * terms, duplicates every chroma word with punpck{l,h}wd so that each chroma
 * sample is applied to two neighbouring luma samples, adds luma and packs
 * with unsigned saturation.
 *
 * On exit mm2, mm4 and mm5 each hold eight packed bytes (B, G and R per the
 * comments at the call sites); mm0, mm3 and mm6 are clobbered.
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
746
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
747
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
748
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
749
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
750
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
751
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
752
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
753
    "paddw             %%mm3, %%mm4     \n\t"\
754
    "movq              %%mm2, %%mm0     \n\t"\
755
    "movq              %%mm5, %%mm6     \n\t"\
756
    "movq              %%mm4, %%mm3     \n\t"\
757
    "punpcklwd         %%mm2, %%mm2     \n\t"\
758
    "punpcklwd         %%mm5, %%mm5     \n\t"\
759
    "punpcklwd         %%mm4, %%mm4     \n\t"\
760
    "paddw             %%mm1, %%mm2     \n\t"\
761
    "paddw             %%mm1, %%mm5     \n\t"\
762
    "paddw             %%mm1, %%mm4     \n\t"\
763
    "punpckhwd         %%mm0, %%mm0     \n\t"\
764
    "punpckhwd         %%mm6, %%mm6     \n\t"\
765
    "punpckhwd         %%mm3, %%mm3     \n\t"\
766
    "paddw             %%mm7, %%mm0     \n\t"\
767
    "paddw             %%mm7, %%mm6     \n\t"\
768
    "paddw             %%mm7, %%mm3     \n\t"\
769
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
770
    "packuswb          %%mm0, %%mm2     \n\t"\
771
    "packuswb          %%mm6, %%mm5     \n\t"\
772
    "packuswb          %%mm3, %%mm4     \n\t"\
773
774
/*
 * YSCALEYUV2RGB_YA: expansion wrapper around REAL_YSCALEYUV2RGB_YA, so that
 * its arguments are macro-expanded before the REAL_ macro stringifies them.
 *
 * YSCALEYUV2RGB: complete two-row loop body up to packed B/G/R bytes —
 * chroma blend, luma blend from asm operands %0/%1, then the colour matrix.
 * It opens local label "1"; a WRITE* macro is expected to close the loop.
 */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
775
776
#define YSCALEYUV2RGB(index, c) \
777
    REAL_YSCALEYUV2RGB_UV(index, c) \
778
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
779
    REAL_YSCALEYUV2RGB_COEFF(c)
780
781
/**
782
 * vertical bilinear scale YV12 to RGB
 *
 * Blends two input rows (buf[0]/buf[1], ubuf[0]/ubuf[1] and, when alpha is
 * needed, abuf[0]/abuf[1]) and writes 32-bit pixels through WRITEBGR32.
 *
 * asm operands common to all three paths:
 *   %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 *   %5 = &c->redDither (base for every *_OFFSET / *_COEFF access).
 *
 * The loop bound is read from DSTW_OFFSET(%5) and the blend weights from
 * the context; vbuf, dstW, yalpha, uvalpha and y are not referenced in this
 * body, so the context is presumably prepared by the caller — confirm.
 *
 * Paths:
 *  - alpha, x86-64: r8 is the index, abuf0/abuf1 are extra operands %6/%7.
 *  - alpha, 32-bit: the alpha pointers are parked in c->u_temp / c->v_temp
 *    and reloaded into %0/%1 inside the asm; FF_REG_b holds dest and
 *    FF_REG_BP is the index (both saved and restored by hand).
 *  - no alpha: pcmpeqd sets mm7 to all ones and that is passed as the
 *    alpha register.
783
 */
784
static void RENAME(yuv2rgb32_2)(SwsInternal *c, const int16_t *buf[2],
785
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
786
                                const int16_t *abuf[2], uint8_t *dest,
787
                                int dstW, int yalpha, int uvalpha, int y)
788
0
{
789
0
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
790
0
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
791
792
0
    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
793
0
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
794
0
#if ARCH_X86_64
795
0
        __asm__ volatile(
796
0
            YSCALEYUV2RGB(%%r8, %5)
797
0
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
798
0
            "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
799
0
            "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
800
0
            "packuswb            %%mm7, %%mm1       \n\t"
801
0
            WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
802
0
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
803
0
               "a" (&c->redDither),
804
0
               "r" (abuf0), "r" (abuf1)
805
0
            : "%r8"
806
0
        );
807
#else
808
        /* stash the alpha row pointers in the context; the asm reloads them
         * through U_TEMP / V_TEMP */
        c->u_temp=(intptr_t)abuf0;
809
        c->v_temp=(intptr_t)abuf1;
810
        __asm__ volatile(
811
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
812
            "mov        %4, %%"FF_REG_b"            \n\t"
813
            "push %%"FF_REG_BP"                     \n\t"
814
            YSCALEYUV2RGB(%%FF_REGBP, %5)
815
            "push                   %0              \n\t"
816
            "push                   %1              \n\t"
817
            "mov          "U_TEMP"(%5), %0          \n\t"
818
            "mov          "V_TEMP"(%5), %1          \n\t"
819
            YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1)
820
            "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
821
            "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
822
            "packuswb            %%mm7, %%mm1       \n\t"
823
            "pop                    %1              \n\t"
824
            "pop                    %0              \n\t"
825
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
826
            "pop %%"FF_REG_BP"                      \n\t"
827
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
828
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
829
               "a" (&c->redDither)
830
        );
831
#endif
832
0
    } else {
833
0
        __asm__ volatile(
834
0
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
835
0
            "mov        %4, %%"FF_REG_b"            \n\t"
836
0
            "push %%"FF_REG_BP"                     \n\t"
837
0
            YSCALEYUV2RGB(%%FF_REGBP, %5)
838
0
            "pcmpeqd %%mm7, %%mm7                   \n\t"
839
0
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
840
0
            "pop %%"FF_REG_BP"                      \n\t"
841
0
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
842
0
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
843
0
               "a" (&c->redDither)
844
0
        );
845
0
    }
846
0
}
847
848
/*
 * Two-row (bilinear) vertical scale to packed 24-bit output.
 *
 * asm operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 *               %5 = &c->redDither.
 * FF_REG_b is saved to ESP_OFFSET(%5), loaded with dest and restored at the
 * end; FF_REG_BP is pushed, used as the loop index and popped.  mm7 is
 * zeroed before WRITEBGR24.  The loop bound is read from DSTW_OFFSET(%5);
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in this body.
 */
static void RENAME(yuv2bgr24_2)(SwsInternal *c, const int16_t *buf[2],
849
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
850
                                const int16_t *abuf[2], uint8_t *dest,
851
                                int dstW, int yalpha, int uvalpha, int y)
852
0
{
853
0
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
854
0
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
855
856
0
    __asm__ volatile(
857
0
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
858
0
        "mov           %4, %%"FF_REG_b"         \n\t"
859
0
        "push %%"FF_REG_BP"                     \n\t"
860
0
        YSCALEYUV2RGB(%%FF_REGBP, %5)
861
0
        "pxor    %%mm7, %%mm7                   \n\t"
862
0
        WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
863
0
        "pop %%"FF_REG_BP"                      \n\t"
864
0
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
865
0
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
866
0
           "a" (&c->redDither)
867
0
           NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B)
868
0
    );
869
0
}
870
871
/*
 * Two-row (bilinear) vertical scale to 15-bit RGB output via WRITERGB15.
 *
 * asm operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 *               %5 = &c->redDither.
 * After the colour conversion the dither values at BLUE_DITHER,
 * GREEN_DITHER and RED_DITHER (relative to %5) are added to the packed
 * B/G/R bytes with unsigned saturation.  FF_REG_b (dest) and FF_REG_BP
 * (index) are saved and restored by hand.  vbuf, abuf, dstW, yalpha,
 * uvalpha and y are not referenced in this body.
 */
static void RENAME(yuv2rgb555_2)(SwsInternal *c, const int16_t *buf[2],
872
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
873
                                 const int16_t *abuf[2], uint8_t *dest,
874
                                 int dstW, int yalpha, int uvalpha, int y)
875
0
{
876
0
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
877
0
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
878
879
0
    __asm__ volatile(
880
0
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
881
0
        "mov        %4, %%"FF_REG_b"            \n\t"
882
0
        "push %%"FF_REG_BP"                     \n\t"
883
0
        YSCALEYUV2RGB(%%FF_REGBP, %5)
884
0
        "pxor    %%mm7, %%mm7                   \n\t"
885
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
886
0
        "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
887
0
        "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
888
0
        "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
889
0
        WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
890
0
        "pop %%"FF_REG_BP"                      \n\t"
891
0
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
892
0
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
893
0
           "a" (&c->redDither)
894
0
           NAMED_CONSTRAINTS_ADD(bF8)
895
0
    );
896
0
}
897
898
/*
 * Two-row (bilinear) vertical scale to 16-bit RGB output via WRITERGB16.
 *
 * asm operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 *               %5 = &c->redDither.
 * After the colour conversion the dither values at BLUE_DITHER,
 * GREEN_DITHER and RED_DITHER (relative to %5) are added to the packed
 * B/G/R bytes with unsigned saturation.  FF_REG_b (dest) and FF_REG_BP
 * (index) are saved and restored by hand.  vbuf, abuf, dstW, yalpha,
 * uvalpha and y are not referenced in this body.
 */
static void RENAME(yuv2rgb565_2)(SwsInternal *c, const int16_t *buf[2],
899
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
900
                                 const int16_t *abuf[2], uint8_t *dest,
901
                                 int dstW, int yalpha, int uvalpha, int y)
902
0
{
903
0
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
904
0
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
905
906
0
    __asm__ volatile(
907
0
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
908
0
        "mov           %4, %%"FF_REG_b"         \n\t"
909
0
        "push %%"FF_REG_BP"                     \n\t"
910
0
        YSCALEYUV2RGB(%%FF_REGBP, %5)
911
0
        "pxor    %%mm7, %%mm7                   \n\t"
912
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
913
0
        "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
914
0
        "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
915
0
        "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
916
0
        WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
917
0
        "pop %%"FF_REG_BP"                      \n\t"
918
0
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
919
0
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
920
0
           "a" (&c->redDither)
921
0
           NAMED_CONSTRAINTS_ADD(bF8,bFC)
922
0
    );
923
0
}
924
925
/*
 * Two-row (bilinear) blend for packed YUV output (no colour matrix).
 *
 * Prologue, executed once before the loop: the chroma and luma weights at
 * CHR_MMX_FILTER_OFFSET+8 and LUM_MMX_FILTER_OFFSET+8 (relative to c) are
 * shifted right by 3 and WRITTEN BACK to the same locations.
 * NOTE(review): because the weights are modified in place, this presumably
 * relies on the caller re-initialising them before every invocation —
 * confirm.
 *
 * Loop body (local label "1"), with rows read through operands %0/%1 (luma)
 * and %2/%3 (chroma, second plane at index + UV_OFF_BYTE(c)):
 *
 *     blended = (row1 >> 7) + pmulhw(row0 - row1, weight)
 *
 * On exit: mm3/mm4 = the two blended chroma planes, mm1/mm7 = blended luma
 * samples 0..3 / 4..7.  mm0, mm2, mm5 and mm6 are scratch.
 *
 * YSCALEYUV2PACKED is the expansion wrapper, so that arguments are
 * macro-expanded before REAL_YSCALEYUV2PACKED stringifies them.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
926
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
927
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
928
    "psraw                $3, %%mm0                           \n\t"\
929
    "psraw                $3, %%mm1                           \n\t"\
930
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
931
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
932
    "xor            "#index", "#index"                        \n\t"\
933
    ".p2align              4            \n\t"\
934
    "1:                                 \n\t"\
935
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
936
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
937
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
938
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
939
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
940
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
941
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
942
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
943
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
944
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
945
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
946
    "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
947
    "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+2048] >>7*/\
948
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
949
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
950
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
951
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
952
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
953
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
954
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
955
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
956
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
957
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
958
    "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
959
    "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
960
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
961
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
962
963
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
964
965
/*
 * Two-row (bilinear) vertical scale to packed YUY2 output.
 *
 * asm operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 *               %5 = &c->redDither.
 * FF_REG_b is saved to ESP_OFFSET(%5), loaded with dest and restored at the
 * end; FF_REG_BP is pushed, used as the loop index and popped.
 * YSCALEYUV2PACKED produces the blended samples and WRITEYUY2 packs/stores
 * them and closes the loop against DSTW_OFFSET(%5).  vbuf, abuf, dstW,
 * yalpha, uvalpha and y are not referenced in this body.
 */
static void RENAME(yuv2yuyv422_2)(SwsInternal *c, const int16_t *buf[2],
966
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
967
                                  const int16_t *abuf[2], uint8_t *dest,
968
                                  int dstW, int yalpha, int uvalpha, int y)
969
0
{
970
0
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
971
0
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
972
973
0
    __asm__ volatile(
974
0
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
975
0
        "mov           %4, %%"FF_REG_b"         \n\t"
976
0
        "push %%"FF_REG_BP"                     \n\t"
977
0
        YSCALEYUV2PACKED(%%FF_REGBP, %5)
978
0
        WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
979
0
        "pop %%"FF_REG_BP"                      \n\t"
980
0
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
981
0
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
982
0
           "a" (&c->redDither)
983
0
    );
984
0
}
985
986
/*
 * Single-row YUV->RGB loop body: no vertical blending at all.
 *
 * Chroma is read only through operand %2 (second plane at index +
 * UV_OFF_BYTE(c)) and luma only through operand %0; operands %1 and %3 are
 * not touched by this macro.  Each sample is arithmetically shifted right by
 * 4, offset-corrected (U_OFFSET / V_OFFSET / Y_OFFSET) and run through the
 * same coefficient, word-duplication and packuswb sequence as
 * REAL_YSCALEYUV2RGB_COEFF.
 *
 * Opens local label "1"; on exit mm2, mm4 and mm5 hold eight packed bytes
 * each (B, G, R per the call-site comments).  mm0, mm1, mm3, mm6 and mm7 are
 * clobbered.
 *
 * YSCALEYUV2RGB1 is the expansion wrapper, so that arguments are
 * macro-expanded before REAL_YSCALEYUV2RGB1 stringifies them.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
987
    "xor            "#index", "#index"  \n\t"\
988
    ".p2align              4            \n\t"\
989
    "1:                                 \n\t"\
990
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
991
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
992
    "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
993
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
994
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] >>4*/\
995
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] >>4*/\
996
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
997
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
998
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
999
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
1000
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
1001
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
1002
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1003
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
1004
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
1005
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
1006
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] >>4*/\
1007
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
1008
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
1009
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
1010
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
1011
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
1012
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
1013
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1014
    "paddw             %%mm3, %%mm4     \n\t"\
1015
    "movq              %%mm2, %%mm0     \n\t"\
1016
    "movq              %%mm5, %%mm6     \n\t"\
1017
    "movq              %%mm4, %%mm3     \n\t"\
1018
    "punpcklwd         %%mm2, %%mm2     \n\t"\
1019
    "punpcklwd         %%mm5, %%mm5     \n\t"\
1020
    "punpcklwd         %%mm4, %%mm4     \n\t"\
1021
    "paddw             %%mm1, %%mm2     \n\t"\
1022
    "paddw             %%mm1, %%mm5     \n\t"\
1023
    "paddw             %%mm1, %%mm4     \n\t"\
1024
    "punpckhwd         %%mm0, %%mm0     \n\t"\
1025
    "punpckhwd         %%mm6, %%mm6     \n\t"\
1026
    "punpckhwd         %%mm3, %%mm3     \n\t"\
1027
    "paddw             %%mm7, %%mm0     \n\t"\
1028
    "paddw             %%mm7, %%mm6     \n\t"\
1029
    "paddw             %%mm7, %%mm3     \n\t"\
1030
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1031
    "packuswb          %%mm0, %%mm2     \n\t"\
1032
    "packuswb          %%mm6, %%mm5     \n\t"\
1033
    "packuswb          %%mm3, %%mm4     \n\t"\
1034
1035
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
1036
1037
// do vertical chrominance interpolation
//
// Same as REAL_YSCALEYUV2RGB1 except for chroma: the rows behind operands %2
// and %3 are added and the sum is logically shifted right by 5 (i.e. the
// average of the two rows, scaled down by 4 bits).  The 16-bit sum can
// overflow, as the FIXME on the psrlw lines notes.  Luma is still taken from
// operand %0 only; operand %1 is not touched by this macro.
//
// YSCALEYUV2RGB1b is the expansion wrapper, so that arguments are
// macro-expanded before REAL_YSCALEYUV2RGB1b stringifies them.
1038
#define REAL_YSCALEYUV2RGB1b(index, c) \
1039
    "xor            "#index", "#index"  \n\t"\
1040
    ".p2align              4            \n\t"\
1041
    "1:                                 \n\t"\
1042
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
1043
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
1044
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1045
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
1046
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
1047
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1048
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1049
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1050
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
1051
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
1052
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
1053
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
1054
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
1055
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
1056
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
1057
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
1058
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1059
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
1060
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
1061
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
1062
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] >>4*/\
1063
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
1064
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
1065
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
1066
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
1067
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
1068
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
1069
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1070
    "paddw             %%mm3, %%mm4     \n\t"\
1071
    "movq              %%mm2, %%mm0     \n\t"\
1072
    "movq              %%mm5, %%mm6     \n\t"\
1073
    "movq              %%mm4, %%mm3     \n\t"\
1074
    "punpcklwd         %%mm2, %%mm2     \n\t"\
1075
    "punpcklwd         %%mm5, %%mm5     \n\t"\
1076
    "punpcklwd         %%mm4, %%mm4     \n\t"\
1077
    "paddw             %%mm1, %%mm2     \n\t"\
1078
    "paddw             %%mm1, %%mm5     \n\t"\
1079
    "paddw             %%mm1, %%mm4     \n\t"\
1080
    "punpckhwd         %%mm0, %%mm0     \n\t"\
1081
    "punpckhwd         %%mm6, %%mm6     \n\t"\
1082
    "punpckhwd         %%mm3, %%mm3     \n\t"\
1083
    "paddw             %%mm7, %%mm0     \n\t"\
1084
    "paddw             %%mm7, %%mm6     \n\t"\
1085
    "paddw             %%mm7, %%mm3     \n\t"\
1086
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1087
    "packuswb          %%mm0, %%mm2     \n\t"\
1088
    "packuswb          %%mm6, %%mm5     \n\t"\
1089
    "packuswb          %%mm3, %%mm4     \n\t"\
1090
1091
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
1092
1093
/*
 * Loads eight 16-bit alpha samples through asm operand %1 at byte offset
 * index*2, shifts each right by 7 and packs them with unsigned saturation
 * into eight bytes in mm7 (mm1 is scratch).  Callers must therefore bind the
 * alpha row to operand %1.
 *
 * YSCALEYUV2RGB1_ALPHA is the expansion wrapper, so that the argument is
 * macro-expanded before REAL_YSCALEYUV2RGB1_ALPHA stringifies it.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1094
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
1095
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
1096
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
1097
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
1098
    "packuswb          %%mm1, %%mm7     \n\t"
1099
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1100
1101
/**
1102
 * YV12 to RGB without scaling or interpolating
 *
 * Converts a single luma row (buf0) to 32-bit pixels via WRITEBGR32.
 *
 * uvalpha selects the chroma handling:
 *  - uvalpha < 2048: YSCALEYUV2RGB1, chroma taken from ubuf[0] only
 *    (ubuf1 is set to the same pointer);
 *  - otherwise:      YSCALEYUV2RGB1b, chroma is the average of ubuf[0] and
 *    ubuf[1].
 *
 * asm operands: %0 = buf0, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 *               %5 = &c->redDither.  Operand %1 is abuf0 in the alpha
 * branches (read by YSCALEYUV2RGB1_ALPHA) and buf1 (== buf0) otherwise.
 * Without alpha, pcmpeqd sets mm7 to all ones and that is used as alpha.
 *
 * FF_REG_b (dest) and FF_REG_BP (index) are saved and restored by hand; the
 * loop bound is read from DSTW_OFFSET(%5).  vbuf, dstW and y are not
 * referenced in this body.
1103
 */
1104
static void RENAME(yuv2rgb32_1)(SwsInternal *c, const int16_t *buf0,
1105
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
1106
                                const int16_t *abuf0, uint8_t *dest,
1107
                                int dstW, int uvalpha, int y)
1108
0
{
1109
0
    const int16_t *ubuf0 = ubuf[0];
1110
0
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1111
1112
0
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1113
0
        const int16_t *ubuf1 = ubuf[0];
1114
0
        if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
1115
0
            __asm__ volatile(
1116
0
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
1117
0
                "mov           %4, %%"FF_REG_b"         \n\t"
1118
0
                "push %%"FF_REG_BP"                     \n\t"
1119
0
                YSCALEYUV2RGB1(%%FF_REGBP, %5)
1120
0
                YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
1121
0
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1122
0
                "pop %%"FF_REG_BP"                      \n\t"
1123
0
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1124
0
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1125
0
                   "a" (&c->redDither)
1126
0
            );
1127
0
        } else {
1128
0
            __asm__ volatile(
1129
0
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
1130
0
                "mov           %4, %%"FF_REG_b"         \n\t"
1131
0
                "push %%"FF_REG_BP"                     \n\t"
1132
0
                YSCALEYUV2RGB1(%%FF_REGBP, %5)
1133
0
                "pcmpeqd %%mm7, %%mm7                   \n\t"
1134
0
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1135
0
                "pop %%"FF_REG_BP"                      \n\t"
1136
0
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1137
0
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1138
0
                   "a" (&c->redDither)
1139
0
            );
1140
0
        }
1141
0
    } else {
1142
0
        const int16_t *ubuf1 = ubuf[1];
1143
0
        if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
1144
0
            __asm__ volatile(
1145
0
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
1146
0
                "mov           %4, %%"FF_REG_b"         \n\t"
1147
0
                "push %%"FF_REG_BP"                     \n\t"
1148
0
                YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1149
0
                YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
1150
0
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1151
0
                "pop %%"FF_REG_BP"                      \n\t"
1152
0
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1153
0
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1154
0
                   "a" (&c->redDither)
1155
0
            );
1156
0
        } else {
1157
0
            __asm__ volatile(
1158
0
                "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
1159
0
                "mov           %4, %%"FF_REG_b"         \n\t"
1160
0
                "push %%"FF_REG_BP"                     \n\t"
1161
0
                YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1162
0
                "pcmpeqd %%mm7, %%mm7                   \n\t"
1163
0
                WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1164
0
                "pop %%"FF_REG_BP"                      \n\t"
1165
0
                "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1166
0
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1167
0
                   "a" (&c->redDither)
1168
0
            );
1169
0
        }
1170
0
    }
1171
0
}
1172
1173
/*
 * Single-row conversion to packed 24-bit output via WRITEBGR24.
 *
 * uvalpha selects the chroma handling:
 *  - uvalpha < 2048: YSCALEYUV2RGB1, chroma taken from ubuf[0] only
 *    (ubuf1 is set to the same pointer);
 *  - otherwise:      YSCALEYUV2RGB1b, chroma is the average of ubuf[0] and
 *    ubuf[1].
 *
 * asm operands: %0 = buf0, %1 = buf1 (== buf0), %2 = ubuf0, %3 = ubuf1,
 *               %4 = dest, %5 = &c->redDither.
 * FF_REG_b (dest) and FF_REG_BP (index) are saved and restored by hand; mm7
 * is zeroed before WRITEBGR24 and the loop bound is read from
 * DSTW_OFFSET(%5).  vbuf, abuf0, dstW and y are not referenced in this body.
 */
static void RENAME(yuv2bgr24_1)(SwsInternal *c, const int16_t *buf0,
1174
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
1175
                                const int16_t *abuf0, uint8_t *dest,
1176
                                int dstW, int uvalpha, int y)
1177
0
{
1178
0
    const int16_t *ubuf0 = ubuf[0];
1179
0
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1180
1181
0
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1182
0
        const int16_t *ubuf1 = ubuf[0];
1183
0
        __asm__ volatile(
1184
0
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
1185
0
            "mov           %4, %%"FF_REG_b"         \n\t"
1186
0
            "push %%"FF_REG_BP"                     \n\t"
1187
0
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
1188
0
            "pxor    %%mm7, %%mm7                   \n\t"
1189
0
            WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1190
0
            "pop %%"FF_REG_BP"                      \n\t"
1191
0
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1192
0
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1193
0
               "a" (&c->redDither)
1194
0
               NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B)
1195
0
        );
1196
0
    } else {
1197
0
        const int16_t *ubuf1 = ubuf[1];
1198
0
        __asm__ volatile(
1199
0
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
1200
0
            "mov           %4, %%"FF_REG_b"         \n\t"
1201
0
            "push %%"FF_REG_BP"                     \n\t"
1202
0
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1203
0
            "pxor    %%mm7, %%mm7                   \n\t"
1204
0
            WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1205
0
            "pop %%"FF_REG_BP"                      \n\t"
1206
0
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1207
0
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1208
0
               "a" (&c->redDither)
1209
0
               NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B)
1210
0
        );
1211
0
    }
1212
0
}
1213
1214
/**
 * Inline-asm packed output routine that sws_init_swscale() installs as
 * c->yuv2packed1 for AV_PIX_FMT_RGB555.
 *
 * The C wrapper only selects the chroma line pointers and picks one of two
 * asm sequences; the conversion and the store are done by the
 * YSCALEYUV2RGB1 / YSCALEYUV2RGB1b and WRITERGB15 macros, which are defined
 * outside this function.
 *
 * Asm operands: %0 = buf0 (c register), %1 = buf1 (d register),
 * %2 = ubuf0 (S register), %3 = ubuf1 (D register), %4 = dest (memory),
 * %5 = &c->redDither (a register).
 *
 * @param c       context; &c->redDither is the base for the ESP_OFFSET,
 *                DSTW_OFFSET and *_DITHER context-relative accesses
 * @param buf0    first source line (presumably luma); also passed a second
 *                time as buf1
 * @param ubuf    chroma line pointers; ubuf[0] is always used, ubuf[1] only
 *                when uvalpha >= 2048
 * @param vbuf    not referenced in this function
 * @param abuf0   not referenced in this function
 * @param dest    output pointer, handed to the asm as a memory operand
 * @param dstW    not referenced directly; DSTW_OFFSET(%5) is what gets
 *                passed to WRITERGB15 (presumably holds the same width --
 *                confirm against the caller)
 * @param uvalpha selects the asm variant: < 2048 uses YSCALEYUV2RGB1,
 *                otherwise YSCALEYUV2RGB1b
 * @param y       not referenced in this function
 *
 * NOTE(review): no clobber list is visible on either asm statement although
 * MMX registers and *dest are written -- confirm this is intentional.
 */
static void RENAME(yuv2rgb555_1)(SwsInternal *c, const int16_t *buf0,
1215
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
1216
                                 const int16_t *abuf0, uint8_t *dest,
1217
                                 int dstW, int uvalpha, int y)
1218
0
{
1219
0
    const int16_t *ubuf0 = ubuf[0];
1220
0
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1221
1222
0
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1223
0
        const int16_t *ubuf1 = ubuf[0];
1224
0
        __asm__ volatile(
1225
0
            /* Park the b register in the context slot at ESP_OFFSET, then
             * reuse it to hold the destination pointer (operand %4). */
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
1226
0
            "mov           %4, %%"FF_REG_b"         \n\t"
1227
0
            /* BP is handed to the macros below as their first argument, so
             * it is saved on the stack around them. */
            "push %%"FF_REG_BP"                     \n\t"
1228
0
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
1229
0
            "pxor    %%mm7, %%mm7                   \n\t"
1230
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1231
0
            /* Saturating unsigned byte adds of the per-channel dither values
             * read relative to the context pointer. */
            "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
1232
0
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1233
0
            "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
1234
0
            WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1235
0
            /* Undo the two saves made at the top of the sequence. */
            "pop %%"FF_REG_BP"                      \n\t"
1236
0
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1237
0
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1238
0
               "a" (&c->redDither)
1239
0
               NAMED_CONSTRAINTS_ADD(bF8)
1240
0
        );
1241
0
    } else {
1242
0
        /* Same sequence as above, but with the second chroma line in %3 and
         * the "1b" conversion macro. */
        const int16_t *ubuf1 = ubuf[1];
1243
0
        __asm__ volatile(
1244
0
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
1245
0
            "mov           %4, %%"FF_REG_b"         \n\t"
1246
0
            "push %%"FF_REG_BP"                     \n\t"
1247
0
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1248
0
            "pxor    %%mm7, %%mm7                   \n\t"
1249
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1250
0
            "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
1251
0
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1252
0
            "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
1253
0
            WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1254
0
            "pop %%"FF_REG_BP"                      \n\t"
1255
0
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1256
0
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1257
0
               "a" (&c->redDither)
1258
0
               NAMED_CONSTRAINTS_ADD(bF8)
1259
0
        );
1260
0
    }
1261
0
}
1262
1263
/**
 * Inline-asm packed output routine that sws_init_swscale() installs as
 * c->yuv2packed1 for AV_PIX_FMT_RGB565.
 *
 * The C wrapper only selects the chroma line pointers and picks one of two
 * asm sequences; the conversion and the store are done by the
 * YSCALEYUV2RGB1 / YSCALEYUV2RGB1b and WRITERGB16 macros, which are defined
 * outside this function.
 *
 * Asm operands: %0 = buf0 (c register), %1 = buf1 (d register),
 * %2 = ubuf0 (S register), %3 = ubuf1 (D register), %4 = dest (memory),
 * %5 = &c->redDither (a register).
 *
 * @param c       context; &c->redDither is the base for the ESP_OFFSET,
 *                DSTW_OFFSET and *_DITHER context-relative accesses
 * @param buf0    first source line (presumably luma); also passed a second
 *                time as buf1
 * @param ubuf    chroma line pointers; ubuf[0] is always used, ubuf[1] only
 *                when uvalpha >= 2048
 * @param vbuf    not referenced in this function
 * @param abuf0   not referenced in this function
 * @param dest    output pointer, handed to the asm as a memory operand
 * @param dstW    not referenced directly; DSTW_OFFSET(%5) is what gets
 *                passed to WRITERGB16 (presumably holds the same width --
 *                confirm against the caller)
 * @param uvalpha selects the asm variant: < 2048 uses YSCALEYUV2RGB1,
 *                otherwise YSCALEYUV2RGB1b
 * @param y       not referenced in this function
 *
 * NOTE(review): no clobber list is visible on either asm statement although
 * MMX registers and *dest are written -- confirm this is intentional.
 */
static void RENAME(yuv2rgb565_1)(SwsInternal *c, const int16_t *buf0,
1264
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
1265
                                 const int16_t *abuf0, uint8_t *dest,
1266
                                 int dstW, int uvalpha, int y)
1267
0
{
1268
0
    const int16_t *ubuf0 = ubuf[0];
1269
0
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1270
1271
0
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1272
0
        const int16_t *ubuf1 = ubuf[0];
1273
0
        __asm__ volatile(
1274
0
            /* Park the b register in the context slot at ESP_OFFSET, then
             * reuse it to hold the destination pointer (operand %4). */
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
1275
0
            "mov           %4, %%"FF_REG_b"         \n\t"
1276
0
            /* BP is handed to the macros below as their first argument, so
             * it is saved on the stack around them. */
            "push %%"FF_REG_BP"                     \n\t"
1277
0
            YSCALEYUV2RGB1(%%FF_REGBP, %5)
1278
0
            "pxor    %%mm7, %%mm7                   \n\t"
1279
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1280
0
            /* Saturating unsigned byte adds of the per-channel dither values
             * read relative to the context pointer. */
            "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
1281
0
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1282
0
            "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
1283
0
            WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1284
0
            /* Undo the two saves made at the top of the sequence. */
            "pop %%"FF_REG_BP"                      \n\t"
1285
0
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1286
0
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1287
0
               "a" (&c->redDither)
1288
0
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
1289
0
        );
1290
0
    } else {
1291
0
        /* Same sequence as above, but with the second chroma line in %3 and
         * the "1b" conversion macro. */
        const int16_t *ubuf1 = ubuf[1];
1292
0
        __asm__ volatile(
1293
0
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
1294
0
            "mov           %4, %%"FF_REG_b"         \n\t"
1295
0
            "push %%"FF_REG_BP"                     \n\t"
1296
0
            YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1297
0
            "pxor    %%mm7, %%mm7                   \n\t"
1298
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1299
0
            "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
1300
0
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1301
0
            "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
1302
0
            WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1303
0
            "pop %%"FF_REG_BP"                      \n\t"
1304
0
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1305
0
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1306
0
               "a" (&c->redDither)
1307
0
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
1308
0
        );
1309
0
    }
1310
0
}
1311
1312
/*
 * Asm loop prologue that loads and scales one chroma line and one luma line.
 *
 * index: register used as the loop counter; it is zeroed here and the
 *        local label "1:" marks the loop head.
 * c:     base register for the UV_OFF_BYTE context-relative access.
 *
 * Per iteration it loads one quadword from (%2 + index) into mm3, then adds
 * the value stored at UV_OFF_BYTE(c) to index, loads a second quadword from
 * (%2 + index) into mm4 and subtracts the same value again so index is
 * unchanged. Two consecutive quadwords from (%0 + 2*index) go to mm1 and
 * mm7. All four registers are arithmetically shifted right by 7 (psraw).
 * Operands %1 and %3 are not referenced.
 *
 * The macro does not close the loop; presumably the WRITE* macro that
 * follows it supplies the store, the index increment and the backward jump
 * -- confirm there.
 *
 * YSCALEYUV2PACKED1 is the wrapper to use: the extra macro level lets the
 * arguments be macro-expanded before REAL_... stringifies them with '#'.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
1313
    "xor            "#index", "#index"  \n\t"\
1314
    ".p2align              4            \n\t"\
1315
    "1:                                 \n\t"\
1316
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
1317
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1318
    "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
1319
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1320
    "psraw                $7, %%mm3     \n\t" \
1321
    "psraw                $7, %%mm4     \n\t" \
1322
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
1323
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
1324
    "psraw                $7, %%mm1     \n\t" \
1325
    "psraw                $7, %%mm7     \n\t" \
1326
1327
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
1328
1329
/*
 * Asm loop prologue that loads two chroma lines, sums them, and loads one
 * luma line.
 *
 * index: register used as the loop counter; it is zeroed here and the
 *        local label "1:" marks the loop head.
 * c:     base register for the UV_OFF_BYTE context-relative access.
 *
 * Per iteration it loads one quadword each from (%2 + index) and
 * (%3 + index) into mm2 and mm3, adds the value stored at UV_OFF_BYTE(c) to
 * index, loads the corresponding pair into mm5 and mm4, and subtracts the
 * same value again so index is unchanged. The pairs are added word-wise
 * (paddw) into mm3 and mm4, which are then shifted right by 8 with psrlw.
 * Two consecutive quadwords from (%0 + 2*index) go to mm1 and mm7 and are
 * arithmetically shifted right by 7 (psraw). Operand %1 is not referenced.
 *
 * NOTE(review): the chroma shift is logical (psrlw) while the luma shift is
 * arithmetic (psraw); presumably the chroma sums are never negative --
 * confirm.
 *
 * The macro does not close the loop; presumably the WRITE* macro that
 * follows it supplies the store, the index increment and the backward jump
 * -- confirm there.
 *
 * YSCALEYUV2PACKED1b is the wrapper to use: the extra macro level lets the
 * arguments be macro-expanded before REAL_... stringifies them with '#'.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
1330
    "xor "#index", "#index"             \n\t"\
1331
    ".p2align              4            \n\t"\
1332
    "1:                                 \n\t"\
1333
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
1334
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
1335
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1336
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
1337
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
1338
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
1339
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1340
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1341
    "psrlw                $8, %%mm3     \n\t" \
1342
    "psrlw                $8, %%mm4     \n\t" \
1343
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
1344
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
1345
    "psraw                $7, %%mm1     \n\t" \
1346
    "psraw                $7, %%mm7     \n\t"
1347
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
1348
1349
/**
 * Inline-asm packed output routine that sws_init_swscale() installs as
 * c->yuv2packed1 for AV_PIX_FMT_YUYV422.
 *
 * The C wrapper only selects the chroma line pointers and picks one of two
 * asm sequences. YSCALEYUV2PACKED1 reads chroma from operand %2 only;
 * YSCALEYUV2PACKED1b also reads operand %3 and sums the two lines. The store
 * is done by WRITEYUY2, which is defined outside this function.
 *
 * Asm operands: %0 = buf0 (c register), %1 = buf1 (d register),
 * %2 = ubuf0 (S register), %3 = ubuf1 (D register), %4 = dest (memory),
 * %5 = &c->redDither (a register).
 *
 * @param c       context; &c->redDither is used only as the base for the
 *                ESP_OFFSET, DSTW_OFFSET and UV_OFF_BYTE context-relative
 *                accesses
 * @param buf0    first source line (presumably luma); also passed a second
 *                time as buf1
 * @param ubuf    chroma line pointers; ubuf[0] is always used, ubuf[1] only
 *                when uvalpha >= 2048
 * @param vbuf    not referenced in this function
 * @param abuf0   not referenced in this function
 * @param dest    output pointer, handed to the asm as a memory operand
 * @param dstW    not referenced directly; DSTW_OFFSET(%5) is what gets
 *                passed to WRITEYUY2 (presumably holds the same width --
 *                confirm against the caller)
 * @param uvalpha selects the asm variant: < 2048 uses YSCALEYUV2PACKED1,
 *                otherwise YSCALEYUV2PACKED1b
 * @param y       not referenced in this function
 *
 * NOTE(review): no clobber list is visible on either asm statement although
 * MMX registers and *dest are written -- confirm this is intentional.
 */
static void RENAME(yuv2yuyv422_1)(SwsInternal *c, const int16_t *buf0,
1350
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
1351
                                  const int16_t *abuf0, uint8_t *dest,
1352
                                  int dstW, int uvalpha, int y)
1353
0
{
1354
0
    const int16_t *ubuf0 = ubuf[0];
1355
0
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1356
1357
0
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1358
0
        const int16_t *ubuf1 = ubuf[0];
1359
0
        __asm__ volatile(
1360
0
            /* Park the b register in the context slot at ESP_OFFSET, then
             * reuse it to hold the destination pointer (operand %4). */
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
1361
0
            "mov           %4, %%"FF_REG_b"         \n\t"
1362
0
            /* BP becomes the loop index of the macro below, so it is saved
             * on the stack around the loop. */
            "push %%"FF_REG_BP"                     \n\t"
1363
0
            YSCALEYUV2PACKED1(%%FF_REGBP, %5)
1364
0
            WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1365
0
            /* Undo the two saves made at the top of the sequence. */
            "pop %%"FF_REG_BP"                      \n\t"
1366
0
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1367
0
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1368
0
               "a" (&c->redDither)
1369
0
        );
1370
0
    } else {
1371
0
        /* Same sequence as above, but with the second chroma line in %3 and
         * the "1b" load macro that sums both chroma lines. */
        const int16_t *ubuf1 = ubuf[1];
1372
0
        __asm__ volatile(
1373
0
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
1374
0
            "mov           %4, %%"FF_REG_b"         \n\t"
1375
0
            "push %%"FF_REG_BP"                     \n\t"
1376
0
            YSCALEYUV2PACKED1b(%%FF_REGBP, %5)
1377
0
            WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1378
0
            "pop %%"FF_REG_BP"                      \n\t"
1379
0
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
1380
0
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1381
0
               "a" (&c->redDither)
1382
0
        );
1383
0
    }
1384
0
}
1385
/**
 * Hook this template's inline-asm output routines and the MMXEXT fast
 * horizontal scalers into the scaler context.
 *
 * Output routines are only considered when the destination format is not
 * matched by is16BPS(), isNBPS() or isSemiPlanarYUV(), is not one of the
 * GRAYF32 formats, and SWS_BITEXACT is not set. Within that:
 *  - use_mmx_vfilter is enabled unless SWS_ACCURATE_RND is set;
 *  - unless SWS_FULL_CHR_H_INT is set, yuv2packedX is chosen from the "_ar"
 *    variants (SWS_ACCURATE_RND) or the plain variants, and
 *    yuv2packed1/yuv2packed2 are chosen by destination format.
 * Formats without a matching case leave the corresponding pointers untouched.
 *
 * Independently, for 8-bit sources with a destination depth of at most 14,
 * hyscale_fast/hcscale_fast are set to the MMXEXT versions when
 * SWS_FAST_BILINEAR is set and canMMXEXTBeUsed is non-zero, and are cleared
 * to NULL otherwise.
 *
 * @param c scaler context to update
 */
static av_cold void RENAME(sws_init_swscale)(SwsInternal *c)
{
    const enum AVPixelFormat dstFormat = c->opts.dst_format;
    const int accurate_rnd      = !!(c->opts.flags & SWS_ACCURATE_RND);
    const int no_full_chr_h_int = !(c->opts.flags & SWS_FULL_CHR_H_INT);
    int asm_output_excluded;

    c->use_mmx_vfilter = 0;

    asm_output_excluded =
        is16BPS(dstFormat) || isNBPS(dstFormat) || isSemiPlanarYUV(dstFormat) ||
        dstFormat == AV_PIX_FMT_GRAYF32BE || dstFormat == AV_PIX_FMT_GRAYF32LE ||
        (c->opts.flags & SWS_BITEXACT);

    if (!asm_output_excluded) {
        if (!accurate_rnd)
            c->use_mmx_vfilter = 1;

        if (no_full_chr_h_int) {
            /* yuv2packedX: accurate-rounding and plain variants. */
            if (accurate_rnd) {
                switch (dstFormat) {
                case AV_PIX_FMT_RGB32:
                    c->yuv2packedX = RENAME(yuv2rgb32_X_ar);
                    break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24:
                    c->yuv2packedX = RENAME(yuv2bgr24_X_ar);
                    break;
#endif
                case AV_PIX_FMT_RGB555:
                    c->yuv2packedX = RENAME(yuv2rgb555_X_ar);
                    break;
                case AV_PIX_FMT_RGB565:
                    c->yuv2packedX = RENAME(yuv2rgb565_X_ar);
                    break;
                case AV_PIX_FMT_YUYV422:
                    c->yuv2packedX = RENAME(yuv2yuyv422_X_ar);
                    break;
                default:
                    break;
                }
            } else {
                switch (dstFormat) {
                case AV_PIX_FMT_RGB32:
                    c->yuv2packedX = RENAME(yuv2rgb32_X);
                    break;
                case AV_PIX_FMT_BGR32:
                    c->yuv2packedX = RENAME(yuv2bgr32_X);
                    break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24:
                    c->yuv2packedX = RENAME(yuv2bgr24_X);
                    break;
#endif
                case AV_PIX_FMT_RGB555:
                    c->yuv2packedX = RENAME(yuv2rgb555_X);
                    break;
                case AV_PIX_FMT_RGB565:
                    c->yuv2packedX = RENAME(yuv2rgb565_X);
                    break;
                case AV_PIX_FMT_YUYV422:
                    c->yuv2packedX = RENAME(yuv2yuyv422_X);
                    break;
                default:
                    break;
                }
            }

            /* yuv2packed1 / yuv2packed2 do not depend on SWS_ACCURATE_RND. */
            switch (dstFormat) {
            case AV_PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case AV_PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case AV_PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case AV_PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case AV_PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }

    /* The fast horizontal scalers are only touched for 8-bit input with a
     * destination depth of at most 14 bits. */
    if (c->srcBpc != 8 || c->dstBpc > 14)
        return;

    if ((c->opts.flags & SWS_FAST_BILINEAR) && c->canMMXEXTBeUsed) {
        c->hyscale_fast = ff_hyscale_fast_mmxext;
        c->hcscale_fast = ff_hcscale_fast_mmxext;
    } else {
        c->hyscale_fast = NULL;
        c->hcscale_fast = NULL;
    }
}