/src/ffmpeg/libswscale/x86/swscale_template.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at> |
3 | | * |
4 | | * This file is part of FFmpeg. |
5 | | * |
6 | | * FFmpeg is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * FFmpeg is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with FFmpeg; if not, write to the Free Software |
18 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 | | */ |
20 | | |
21 | | #include <stdint.h> |
22 | | |
23 | | #include "libavutil/x86/asm.h" |
24 | | #include "libswscale/swscale_internal.h" |
25 | | |
26 | | #undef REAL_MOVNTQ |
27 | | #undef MOVNTQ |
28 | | #undef MOVNTQ2 |
29 | | #undef PREFETCH |
30 | | |
31 | | |
32 | | #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" |
33 | | #define MOVNTQ2 "movntq " |
34 | | #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) |
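
/* Editorial note (not part of the original file): MOVNTQ expands to the MMX
 * non-temporal store "movntq", which writes a 64-bit register to memory while
 * bypassing the cache hierarchy; the write-back loops below use it so that
 * output pixels, which are not re-read, do not evict useful cache lines.
 * A hedged intrinsics sketch of the same store (a matching sfence after the
 * loop is the caller's responsibility): */
#include <xmmintrin.h>
static inline void movntq_sketch(__m64 *dst, __m64 v)
{
    _mm_stream_pi(dst, v); /* movntq v, (dst) */
}
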
35 | | |
36 | | #define YSCALEYUV2PACKEDX_UV \ |
37 | 0 | __asm__ volatile(\ |
38 | 0 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\ |
39 | 0 | ".p2align 4 \n\t"\ |
40 | 0 | "nop \n\t"\ |
41 | 0 | "1: \n\t"\ |
42 | 0 | "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\ |
43 | 0 | "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ |
44 | 0 | "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ |
45 | 0 | "movq %%mm3, %%mm4 \n\t"\ |
46 | 0 | ".p2align 4 \n\t"\ |
47 | 0 | "2: \n\t"\ |
48 | 0 | "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\ |
49 | 0 | "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* UsrcData */\ |
50 | 0 | "add %6, %%"FF_REG_S" \n\t" \ |
51 | 0 | "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" /* VsrcData */\ |
52 | 0 | "add $16, %%"FF_REG_d" \n\t"\ |
53 | 0 | "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ |
54 | 0 | "pmulhw %%mm0, %%mm2 \n\t"\ |
55 | 0 | "pmulhw %%mm0, %%mm5 \n\t"\ |
56 | 0 | "paddw %%mm2, %%mm3 \n\t"\ |
57 | 0 | "paddw %%mm5, %%mm4 \n\t"\ |
58 | 0 | "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ |
59 | 0 | " jnz 2b \n\t"\ |
60 | | |
61 | | #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ |
62 | | "lea "offset"(%0), %%"FF_REG_d" \n\t"\ |
63 | | "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ |
64 | | "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\ |
65 | | "movq "#dst1", "#dst2" \n\t"\ |
66 | | ".p2align 4 \n\t"\ |
67 | | "2: \n\t"\ |
68 | | "movq 8(%%"FF_REG_d"), "#coeff" \n\t" /* filterCoeff */\ |
69 | | "movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" /* Y1srcData */\ |
70 | | "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\ |
71 | | "add $16, %%"FF_REG_d" \n\t"\ |
72 | | "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ |
73 | | "pmulhw "#coeff", "#src1" \n\t"\ |
74 | | "pmulhw "#coeff", "#src2" \n\t"\ |
75 | | "paddw "#src1", "#dst1" \n\t"\ |
76 | | "paddw "#src2", "#dst2" \n\t"\ |
77 | | "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ |
78 | | " jnz 2b \n\t"\ |
79 | | |
80 | | #define YSCALEYUV2PACKEDX \ |
81 | 0 | YSCALEYUV2PACKEDX_UV \ |
82 | 0 | YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ |
83 | | |
84 | | #define YSCALEYUV2PACKEDX_END \ |
85 | 0 | :: "r" (&c->redDither), \ |
86 | 0 | "m" (dummy), "m" (dummy), "m" (dummy),\ |
87 | 0 | "r" (dest), "m" (dstW_reg), "m"(uv_off) \ |
88 | 0 | NAMED_CONSTRAINTS_ADD(bF8,bFC) \ |
89 | 0 | : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \ |
90 | 0 | ); |
91 | | |
92 | | #define YSCALEYUV2PACKEDX_ACCURATE_UV \ |
93 | 0 | __asm__ volatile(\ |
94 | 0 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\ |
95 | 0 | ".p2align 4 \n\t"\ |
96 | 0 | "nop \n\t"\ |
97 | 0 | "1: \n\t"\ |
98 | 0 | "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\ |
99 | 0 | "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ |
100 | 0 | "pxor %%mm4, %%mm4 \n\t"\ |
101 | 0 | "pxor %%mm5, %%mm5 \n\t"\ |
102 | 0 | "pxor %%mm6, %%mm6 \n\t"\ |
103 | 0 | "pxor %%mm7, %%mm7 \n\t"\ |
104 | 0 | ".p2align 4 \n\t"\ |
105 | 0 | "2: \n\t"\ |
106 | 0 | "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" /* UsrcData */\ |
107 | 0 | "add %6, %%"FF_REG_S" \n\t" \ |
108 | 0 | "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* VsrcData */\ |
109 | 0 | "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ |
110 | 0 | "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" /* UsrcData */\ |
111 | 0 | "movq %%mm0, %%mm3 \n\t"\ |
112 | 0 | "punpcklwd %%mm1, %%mm0 \n\t"\ |
113 | 0 | "punpckhwd %%mm1, %%mm3 \n\t"\ |
114 | 0 | "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1 \n\t" /* filterCoeff */\ |
115 | 0 | "pmaddwd %%mm1, %%mm0 \n\t"\ |
116 | 0 | "pmaddwd %%mm1, %%mm3 \n\t"\ |
117 | 0 | "paddd %%mm0, %%mm4 \n\t"\ |
118 | 0 | "paddd %%mm3, %%mm5 \n\t"\ |
119 | 0 | "add %6, %%"FF_REG_S" \n\t" \ |
120 | 0 | "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" /* VsrcData */\ |
121 | 0 | "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ |
122 | 0 | "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\ |
123 | 0 | "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ |
124 | 0 | "movq %%mm2, %%mm0 \n\t"\ |
125 | 0 | "punpcklwd %%mm3, %%mm2 \n\t"\ |
126 | 0 | "punpckhwd %%mm3, %%mm0 \n\t"\ |
127 | 0 | "pmaddwd %%mm1, %%mm2 \n\t"\ |
128 | 0 | "pmaddwd %%mm1, %%mm0 \n\t"\ |
129 | 0 | "paddd %%mm2, %%mm6 \n\t"\ |
130 | 0 | "paddd %%mm0, %%mm7 \n\t"\ |
131 | 0 | " jnz 2b \n\t"\ |
132 | 0 | "psrad $16, %%mm4 \n\t"\ |
133 | 0 | "psrad $16, %%mm5 \n\t"\ |
134 | 0 | "psrad $16, %%mm6 \n\t"\ |
135 | 0 | "psrad $16, %%mm7 \n\t"\ |
136 | 0 | "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ |
137 | 0 | "packssdw %%mm5, %%mm4 \n\t"\ |
138 | 0 | "packssdw %%mm7, %%mm6 \n\t"\ |
139 | 0 | "paddw %%mm0, %%mm4 \n\t"\ |
140 | 0 | "paddw %%mm0, %%mm6 \n\t"\ |
141 | 0 | "movq %%mm4, "U_TEMP"(%0) \n\t"\ |
142 | 0 | "movq %%mm6, "V_TEMP"(%0) \n\t"\ |
143 | | |
144 | | #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ |
145 | | "lea "offset"(%0), %%"FF_REG_d" \n\t"\ |
146 | | "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ |
147 | | "pxor %%mm1, %%mm1 \n\t"\ |
148 | | "pxor %%mm5, %%mm5 \n\t"\ |
149 | | "pxor %%mm7, %%mm7 \n\t"\ |
150 | | "pxor %%mm6, %%mm6 \n\t"\ |
151 | | ".p2align 4 \n\t"\ |
152 | | "2: \n\t"\ |
153 | | "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ |
154 | | "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ |
155 | | "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ |
156 | | "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ |
157 | | "movq %%mm0, %%mm3 \n\t"\ |
158 | | "punpcklwd %%mm4, %%mm0 \n\t"\ |
159 | | "punpckhwd %%mm4, %%mm3 \n\t"\ |
160 | | "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" /* filterCoeff */\ |
161 | | "pmaddwd %%mm4, %%mm0 \n\t"\ |
162 | | "pmaddwd %%mm4, %%mm3 \n\t"\ |
163 | | "paddd %%mm0, %%mm1 \n\t"\ |
164 | | "paddd %%mm3, %%mm5 \n\t"\ |
165 | | "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ |
166 | | "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ |
167 | | "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\ |
168 | | "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ |
169 | | "movq %%mm2, %%mm0 \n\t"\ |
170 | | "punpcklwd %%mm3, %%mm2 \n\t"\ |
171 | | "punpckhwd %%mm3, %%mm0 \n\t"\ |
172 | | "pmaddwd %%mm4, %%mm2 \n\t"\ |
173 | | "pmaddwd %%mm4, %%mm0 \n\t"\ |
174 | | "paddd %%mm2, %%mm7 \n\t"\ |
175 | | "paddd %%mm0, %%mm6 \n\t"\ |
176 | | " jnz 2b \n\t"\ |
177 | | "psrad $16, %%mm1 \n\t"\ |
178 | | "psrad $16, %%mm5 \n\t"\ |
179 | | "psrad $16, %%mm7 \n\t"\ |
180 | | "psrad $16, %%mm6 \n\t"\ |
181 | | "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ |
182 | | "packssdw %%mm5, %%mm1 \n\t"\ |
183 | | "packssdw %%mm6, %%mm7 \n\t"\ |
184 | | "paddw %%mm0, %%mm1 \n\t"\ |
185 | | "paddw %%mm0, %%mm7 \n\t"\ |
186 | | "movq "U_TEMP"(%0), %%mm3 \n\t"\ |
187 | | "movq "V_TEMP"(%0), %%mm4 \n\t"\ |
188 | | |
189 | | #define YSCALEYUV2PACKEDX_ACCURATE \ |
190 | 0 | YSCALEYUV2PACKEDX_ACCURATE_UV \ |
191 | 0 | YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) |
192 | | |
193 | | #define YSCALEYUV2RGBX \ |
194 | | "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\ |
195 | | "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\ |
196 | | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
197 | | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
198 | | "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ |
199 | | "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ |
200 | | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
201 | | "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ |
202 | | "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ |
203 | | "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ |
204 | | "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ |
205 | | "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ |
206 | | "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ |
207 | | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
208 | | "paddw %%mm3, %%mm4 \n\t"\ |
209 | | "movq %%mm2, %%mm0 \n\t"\ |
210 | | "movq %%mm5, %%mm6 \n\t"\ |
211 | | "movq %%mm4, %%mm3 \n\t"\ |
212 | | "punpcklwd %%mm2, %%mm2 \n\t"\ |
213 | | "punpcklwd %%mm5, %%mm5 \n\t"\ |
214 | | "punpcklwd %%mm4, %%mm4 \n\t"\ |
215 | | "paddw %%mm1, %%mm2 \n\t"\ |
216 | | "paddw %%mm1, %%mm5 \n\t"\ |
217 | | "paddw %%mm1, %%mm4 \n\t"\ |
218 | | "punpckhwd %%mm0, %%mm0 \n\t"\ |
219 | | "punpckhwd %%mm6, %%mm6 \n\t"\ |
220 | | "punpckhwd %%mm3, %%mm3 \n\t"\ |
221 | | "paddw %%mm7, %%mm0 \n\t"\ |
222 | | "paddw %%mm7, %%mm6 \n\t"\ |
223 | | "paddw %%mm7, %%mm3 \n\t"\ |
224 | | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
225 | | "packuswb %%mm0, %%mm2 \n\t"\ |
226 | | "packuswb %%mm6, %%mm5 \n\t"\ |
227 | | "packuswb %%mm3, %%mm4 \n\t"\ |
228 | | |
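
/* Editorial sketch (not part of the original file): the per-pixel arithmetic
 * that YSCALEYUV2RGBX performs with pmulhw/paddw, written out in scalar C.
 * The coefficient names mirror the asm offsets; treating every product as a
 * 16.16 fixed-point multiply (pmulhw keeps the high 16 bits) is an assumption
 * about the intermediate scaling. av_clip_uint8() comes in via
 * swscale_internal.h. */
static inline void yuv2rgb_sketch(int y, int u, int v,
                                  int y_coeff, int y_off,
                                  int ub, int ug, int vg, int vr,
                                  uint8_t *r, uint8_t *g, uint8_t *b)
{
    int luma = ((y - y_off) * y_coeff) >> 16; /* Y_OFFSET / Y_COEFF */
    int du = u - 128, dv = v - 128;           /* U_OFFSET / V_OFFSET */
    *b = av_clip_uint8(luma + ((du * ub) >> 16));
    *g = av_clip_uint8(luma + ((du * ug) >> 16) + ((dv * vg) >> 16));
    *r = av_clip_uint8(luma + ((dv * vr) >> 16));
}
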
229 | | #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ |
230 | | "movq "#b", "#q2" \n\t" /* B */\ |
231 | | "movq "#r", "#t" \n\t" /* R */\ |
232 | | "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\ |
233 | | "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\ |
234 | | "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\ |
235 | | "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\ |
236 | | "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\ |
237 | | "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\ |
238 | | "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\ |
239 | | "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\ |
240 | | "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\ |
241 | | "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\ |
242 | | \ |
243 | | MOVNTQ( q0, (dst, index, 4))\ |
244 | | MOVNTQ( b, 8(dst, index, 4))\ |
245 | | MOVNTQ( q2, 16(dst, index, 4))\ |
246 | | MOVNTQ( q3, 24(dst, index, 4))\ |
247 | | \ |
248 | | "add $8, "#index" \n\t"\ |
249 | | "cmp "dstw", "#index" \n\t"\ |
250 | | " jb 1b \n\t" |
251 | | #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) |
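
/* Editorial sketch (not part of the original file): the byte order produced
 * by the punpcklbw/punpcklwd ladder in REAL_WRITEBGR32 for one pixel,
 * assuming little-endian memory: B, G, R, A from low address to high. */
static inline uint32_t pack_bgra32_sketch(uint8_t b, uint8_t g, uint8_t r, uint8_t a)
{
    return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
}
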
252 | | |
253 | | static void RENAME(yuv2rgb32_X_ar)(SwsInternal *c, const int16_t *lumFilter, |
254 | | const int16_t **lumSrc, int lumFilterSize, |
255 | | const int16_t *chrFilter, const int16_t **chrUSrc, |
256 | | const int16_t **chrVSrc, |
257 | | int chrFilterSize, const int16_t **alpSrc, |
258 | | uint8_t *dest, int dstW, int dstY) |
259 | 0 | { |
260 | 0 | x86_reg dummy=0; |
261 | 0 | x86_reg dstW_reg = dstW; |
262 | 0 | x86_reg uv_off = c->uv_offx2; |
  263 |      |
264 | 0 | if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { |
265 | 0 | YSCALEYUV2PACKEDX_ACCURATE |
266 | 0 | YSCALEYUV2RGBX |
267 | 0 | "movq %%mm2, "U_TEMP"(%0) \n\t" |
268 | 0 | "movq %%mm4, "V_TEMP"(%0) \n\t" |
269 | 0 | "movq %%mm5, "Y_TEMP"(%0) \n\t" |
270 | 0 | YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET) |
271 | 0 | "movq "Y_TEMP"(%0), %%mm5 \n\t" |
272 | 0 | "psraw $3, %%mm1 \n\t" |
273 | 0 | "psraw $3, %%mm7 \n\t" |
274 | 0 | "packuswb %%mm7, %%mm1 \n\t" |
275 | 0 | WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) |
276 | 0 | YSCALEYUV2PACKEDX_END |
277 | 0 | } else { |
278 | 0 | YSCALEYUV2PACKEDX_ACCURATE |
279 | 0 | YSCALEYUV2RGBX |
280 | 0 | "pcmpeqd %%mm7, %%mm7 \n\t" |
281 | 0 | WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) |
282 | 0 | YSCALEYUV2PACKEDX_END |
283 | 0 | } |
284 | 0 | } |
285 | | |
286 | | static void RENAME(yuv2rgb32_X)(SwsInternal *c, const int16_t *lumFilter, |
287 | | const int16_t **lumSrc, int lumFilterSize, |
288 | | const int16_t *chrFilter, const int16_t **chrUSrc, |
289 | | const int16_t **chrVSrc, |
290 | | int chrFilterSize, const int16_t **alpSrc, |
291 | | uint8_t *dest, int dstW, int dstY) |
292 | 0 | { |
293 | 0 | x86_reg dummy=0; |
294 | 0 | x86_reg dstW_reg = dstW; |
295 | 0 | x86_reg uv_off = c->uv_offx2; |
  296 |      |
297 | 0 | if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { |
298 | 0 | YSCALEYUV2PACKEDX |
299 | 0 | YSCALEYUV2RGBX |
300 | 0 | YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) |
301 | 0 | "psraw $3, %%mm1 \n\t" |
302 | 0 | "psraw $3, %%mm7 \n\t" |
303 | 0 | "packuswb %%mm7, %%mm1 \n\t" |
304 | 0 | WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) |
305 | 0 | YSCALEYUV2PACKEDX_END |
306 | 0 | } else { |
307 | 0 | YSCALEYUV2PACKEDX |
308 | 0 | YSCALEYUV2RGBX |
309 | 0 | "pcmpeqd %%mm7, %%mm7 \n\t" |
310 | 0 | WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) |
311 | 0 | YSCALEYUV2PACKEDX_END |
312 | 0 | } |
313 | 0 | } |
314 | | |
315 | | static void RENAME(yuv2bgr32_X)(SwsInternal *c, const int16_t *lumFilter, |
316 | | const int16_t **lumSrc, int lumFilterSize, |
317 | | const int16_t *chrFilter, const int16_t **chrUSrc, |
318 | | const int16_t **chrVSrc, |
319 | | int chrFilterSize, const int16_t **alpSrc, |
320 | | uint8_t *dest, int dstW, int dstY) |
321 | 0 | { |
322 | 0 | x86_reg dummy=0; |
323 | 0 | x86_reg dstW_reg = dstW; |
324 | 0 | x86_reg uv_off = c->uv_offx2; |
  325 |      |
326 | 0 | if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { |
327 | 0 | YSCALEYUV2PACKEDX |
328 | 0 | YSCALEYUV2RGBX |
329 | 0 | YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) |
330 | 0 | "psraw $3, %%mm1 \n\t" |
331 | 0 | "psraw $3, %%mm7 \n\t" |
332 | 0 | "packuswb %%mm7, %%mm1 \n\t" |
333 | 0 | WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) |
334 | 0 | YSCALEYUV2PACKEDX_END |
335 | 0 | } else { |
336 | 0 | YSCALEYUV2PACKEDX |
337 | 0 | YSCALEYUV2RGBX |
338 | 0 | "pcmpeqd %%mm7, %%mm7 \n\t" |
339 | 0 | WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) |
340 | 0 | YSCALEYUV2PACKEDX_END |
341 | 0 | } |
342 | 0 | } |
343 | | |
344 | | #define REAL_WRITERGB16(dst, dstw, index) \ |
345 | | "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ |
346 | | "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\ |
347 | | "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ |
348 | | "psrlq $3, %%mm2 \n\t"\ |
349 | | \ |
350 | | "movq %%mm2, %%mm1 \n\t"\ |
351 | | "movq %%mm4, %%mm3 \n\t"\ |
352 | | \ |
353 | | "punpcklbw %%mm7, %%mm3 \n\t"\ |
354 | | "punpcklbw %%mm5, %%mm2 \n\t"\ |
355 | | "punpckhbw %%mm7, %%mm4 \n\t"\ |
356 | | "punpckhbw %%mm5, %%mm1 \n\t"\ |
357 | | \ |
358 | | "psllq $3, %%mm3 \n\t"\ |
359 | | "psllq $3, %%mm4 \n\t"\ |
360 | | \ |
361 | | "por %%mm3, %%mm2 \n\t"\ |
362 | | "por %%mm4, %%mm1 \n\t"\ |
363 | | \ |
364 | | MOVNTQ(%%mm2, (dst, index, 2))\ |
365 | | MOVNTQ(%%mm1, 8(dst, index, 2))\ |
366 | | \ |
367 | | "add $8, "#index" \n\t"\ |
368 | | "cmp "dstw", "#index" \n\t"\ |
369 | | " jb 1b \n\t" |
370 | | #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index) |
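
/* Editorial sketch (not part of the original file): scalar equivalent of
 * REAL_WRITERGB16 for one pixel. The bF8/bFC masks keep the 5 significant
 * blue/red bits and the 6 green bits before the fields are shifted into
 * RGB565 position. */
static inline uint16_t pack_rgb565_sketch(uint8_t b, uint8_t g, uint8_t r)
{
    return (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3));
}
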
371 | | |
372 | | static void RENAME(yuv2rgb565_X_ar)(SwsInternal *c, const int16_t *lumFilter, |
373 | | const int16_t **lumSrc, int lumFilterSize, |
374 | | const int16_t *chrFilter, const int16_t **chrUSrc, |
375 | | const int16_t **chrVSrc, |
376 | | int chrFilterSize, const int16_t **alpSrc, |
377 | | uint8_t *dest, int dstW, int dstY) |
378 | 0 | { |
379 | 0 | x86_reg dummy=0; |
380 | 0 | x86_reg dstW_reg = dstW; |
381 | 0 | x86_reg uv_off = c->uv_offx2; |
  382 |      |
383 | 0 | YSCALEYUV2PACKEDX_ACCURATE |
384 | 0 | YSCALEYUV2RGBX |
385 | 0 | "pxor %%mm7, %%mm7 \n\t" |
386 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
387 | 0 | "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" |
388 | 0 | "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" |
389 | 0 | "paddusb "RED_DITHER"(%0), %%mm5\n\t" |
390 | 0 | WRITERGB16(%4, "%5", %%FF_REGa) |
391 | 0 | YSCALEYUV2PACKEDX_END |
392 | 0 | } |
393 | | |
394 | | static void RENAME(yuv2rgb565_X)(SwsInternal *c, const int16_t *lumFilter, |
395 | | const int16_t **lumSrc, int lumFilterSize, |
396 | | const int16_t *chrFilter, const int16_t **chrUSrc, |
397 | | const int16_t **chrVSrc, |
398 | | int chrFilterSize, const int16_t **alpSrc, |
399 | | uint8_t *dest, int dstW, int dstY) |
400 | 0 | { |
401 | 0 | x86_reg dummy=0; |
402 | 0 | x86_reg dstW_reg = dstW; |
403 | 0 | x86_reg uv_off = c->uv_offx2; |
  404 |      |
405 | 0 | YSCALEYUV2PACKEDX |
406 | 0 | YSCALEYUV2RGBX |
407 | 0 | "pxor %%mm7, %%mm7 \n\t" |
408 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
409 | 0 | "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" |
410 | 0 | "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" |
411 | 0 | "paddusb "RED_DITHER"(%0), %%mm5 \n\t" |
412 | 0 | WRITERGB16(%4, "%5", %%FF_REGa) |
413 | 0 | YSCALEYUV2PACKEDX_END |
414 | 0 | } |
415 | | |
416 | | #define REAL_WRITERGB15(dst, dstw, index) \ |
417 | | "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\ |
418 | | "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\ |
419 | | "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\ |
420 | | "psrlq $3, %%mm2 \n\t"\ |
421 | | "psrlq $1, %%mm5 \n\t"\ |
422 | | \ |
423 | | "movq %%mm2, %%mm1 \n\t"\ |
424 | | "movq %%mm4, %%mm3 \n\t"\ |
425 | | \ |
426 | | "punpcklbw %%mm7, %%mm3 \n\t"\ |
427 | | "punpcklbw %%mm5, %%mm2 \n\t"\ |
428 | | "punpckhbw %%mm7, %%mm4 \n\t"\ |
429 | | "punpckhbw %%mm5, %%mm1 \n\t"\ |
430 | | \ |
431 | | "psllq $2, %%mm3 \n\t"\ |
432 | | "psllq $2, %%mm4 \n\t"\ |
433 | | \ |
434 | | "por %%mm3, %%mm2 \n\t"\ |
435 | | "por %%mm4, %%mm1 \n\t"\ |
436 | | \ |
437 | | MOVNTQ(%%mm2, (dst, index, 2))\ |
438 | | MOVNTQ(%%mm1, 8(dst, index, 2))\ |
439 | | \ |
440 | | "add $8, "#index" \n\t"\ |
441 | | "cmp "dstw", "#index" \n\t"\ |
442 | | " jb 1b \n\t" |
443 | | #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) |
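
/* Editorial sketch (not part of the original file): scalar equivalent of
 * REAL_WRITERGB15; all three channels keep 5 bits and bit 15 stays clear. */
static inline uint16_t pack_rgb555_sketch(uint8_t b, uint8_t g, uint8_t r)
{
    return (uint16_t)(((r & 0xF8) << 7) | ((g & 0xF8) << 2) | (b >> 3));
}
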
444 | | |
445 | | static void RENAME(yuv2rgb555_X_ar)(SwsInternal *c, const int16_t *lumFilter, |
446 | | const int16_t **lumSrc, int lumFilterSize, |
447 | | const int16_t *chrFilter, const int16_t **chrUSrc, |
448 | | const int16_t **chrVSrc, |
449 | | int chrFilterSize, const int16_t **alpSrc, |
450 | | uint8_t *dest, int dstW, int dstY) |
451 | 0 | { |
452 | 0 | x86_reg dummy=0; |
453 | 0 | x86_reg dstW_reg = dstW; |
454 | 0 | x86_reg uv_off = c->uv_offx2; |
  455 |      |
456 | 0 | YSCALEYUV2PACKEDX_ACCURATE |
457 | 0 | YSCALEYUV2RGBX |
458 | 0 | "pxor %%mm7, %%mm7 \n\t" |
459 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
460 | 0 | "paddusb "BLUE_DITHER"(%0), %%mm2\n\t" |
461 | 0 | "paddusb "GREEN_DITHER"(%0), %%mm4\n\t" |
462 | 0 | "paddusb "RED_DITHER"(%0), %%mm5\n\t" |
463 | 0 | WRITERGB15(%4, "%5", %%FF_REGa) |
464 | 0 | YSCALEYUV2PACKEDX_END |
465 | 0 | } |
466 | | |
467 | | static void RENAME(yuv2rgb555_X)(SwsInternal *c, const int16_t *lumFilter, |
468 | | const int16_t **lumSrc, int lumFilterSize, |
469 | | const int16_t *chrFilter, const int16_t **chrUSrc, |
470 | | const int16_t **chrVSrc, |
471 | | int chrFilterSize, const int16_t **alpSrc, |
472 | | uint8_t *dest, int dstW, int dstY) |
473 | 0 | { |
474 | 0 | x86_reg dummy=0; |
475 | 0 | x86_reg dstW_reg = dstW; |
476 | 0 | x86_reg uv_off = c->uv_offx2; |
  477 |      |
478 | 0 | YSCALEYUV2PACKEDX |
479 | 0 | YSCALEYUV2RGBX |
480 | 0 | "pxor %%mm7, %%mm7 \n\t" |
481 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
482 | 0 | "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t" |
483 | 0 | "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" |
484 | 0 | "paddusb "RED_DITHER"(%0), %%mm5 \n\t" |
485 | 0 | WRITERGB15(%4, "%5", %%FF_REGa) |
486 | 0 | YSCALEYUV2PACKEDX_END |
487 | 0 | } |
488 | | |
489 | | #define WRITEBGR24MMX(dst, dstw, index) \ |
490 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
491 | | "movq %%mm2, %%mm1 \n\t" /* B */\ |
492 | | "movq %%mm5, %%mm6 \n\t" /* R */\ |
493 | | "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ |
494 | | "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ |
495 | | "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ |
496 | | "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ |
497 | | "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ |
498 | | "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ |
499 | | "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ |
500 | | "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ |
501 | | "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ |
502 | | "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ |
503 | | \ |
504 | | "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ |
505 | | "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\ |
506 | | "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\ |
507 | | "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\ |
508 | | \ |
509 | | "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\ |
510 | | "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\ |
511 | | "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\ |
512 | | "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\ |
513 | | \ |
514 | | "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\ |
515 | | "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\ |
516 | | "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\ |
517 | | "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\ |
518 | | \ |
519 | | "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\ |
520 | | "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\ |
521 | | "psllq $40, %%mm2 \n\t" /* GB000000 1 */\ |
522 | | "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ |
523 | | MOVNTQ(%%mm0, (dst))\ |
524 | | \ |
525 | | "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\ |
526 | | "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\ |
527 | | "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\ |
528 | | "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\ |
529 | | MOVNTQ(%%mm6, 8(dst))\ |
530 | | \ |
531 | | "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\ |
532 | | "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\ |
533 | | "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\ |
534 | | MOVNTQ(%%mm5, 16(dst))\ |
535 | | \ |
536 | | "add $24, "#dst" \n\t"\ |
537 | | \ |
538 | | "add $8, "#index" \n\t"\ |
539 | | "cmp "dstw", "#index" \n\t"\ |
540 | | " jb 1b \n\t" |
541 | | |
542 | | #define WRITEBGR24MMXEXT(dst, dstw, index) \ |
543 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ |
544 | | "movq "MANGLE(M24A)", %%mm0 \n\t"\ |
545 | | "movq "MANGLE(M24C)", %%mm7 \n\t"\ |
546 | | "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\ |
547 | | "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\ |
548 | | "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\ |
549 | | \ |
550 | | "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\ |
551 | | "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\ |
552 | | "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\ |
553 | | \ |
554 | | "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\ |
555 | | "por %%mm1, %%mm6 \n\t"\ |
556 | | "por %%mm3, %%mm6 \n\t"\ |
557 | | MOVNTQ(%%mm6, (dst))\ |
558 | | \ |
559 | | "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\ |
560 | | "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\ |
561 | | "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\ |
562 | | "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\ |
563 | | \ |
564 | | "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\ |
565 | | "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\ |
566 | | "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\ |
567 | | \ |
568 | | "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\ |
569 | | "por %%mm3, %%mm6 \n\t"\ |
570 | | MOVNTQ(%%mm6, 8(dst))\ |
571 | | \ |
  572 |      |     "pshufw $0xFF, %%mm2, %%mm1     \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
573 | | "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\ |
574 | | "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\ |
575 | | \ |
576 | | "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\ |
577 | | "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\ |
578 | | "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\ |
579 | | \ |
580 | | "por %%mm1, %%mm3 \n\t"\ |
581 | | "por %%mm3, %%mm6 \n\t"\ |
582 | | MOVNTQ(%%mm6, 16(dst))\ |
583 | | \ |
584 | | "add $24, "#dst" \n\t"\ |
585 | | \ |
586 | | "add $8, "#index" \n\t"\ |
587 | | "cmp "dstw", "#index" \n\t"\ |
588 | | " jb 1b \n\t" |
589 | | |
590 | | #undef WRITEBGR24 |
591 | | #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index) |
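
/* Editorial sketch (not part of the original file): both WRITEBGR24 variants
 * squeeze the zero byte out of each 0RGB dword so that memory ends up as
 * packed 3-byte pixels, blue at the lowest address; scalar form: */
static inline void pack_bgr24_sketch(uint8_t *dst, uint8_t b, uint8_t g, uint8_t r)
{
    dst[0] = b;
    dst[1] = g;
    dst[2] = r;
}
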
592 | | |
593 | | #if HAVE_6REGS |
594 | | static void RENAME(yuv2bgr24_X_ar)(SwsInternal *c, const int16_t *lumFilter, |
595 | | const int16_t **lumSrc, int lumFilterSize, |
596 | | const int16_t *chrFilter, const int16_t **chrUSrc, |
597 | | const int16_t **chrVSrc, |
598 | | int chrFilterSize, const int16_t **alpSrc, |
599 | | uint8_t *dest, int dstW, int dstY) |
600 | 0 | { |
601 | 0 | x86_reg dummy=0; |
602 | 0 | x86_reg dstW_reg = dstW; |
603 | 0 | x86_reg uv_off = c->uv_offx2; |
  604 |      |
605 | 0 | YSCALEYUV2PACKEDX_ACCURATE |
606 | 0 | YSCALEYUV2RGBX |
607 | 0 | "pxor %%mm7, %%mm7 \n\t" |
608 | 0 | "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" //FIXME optimize |
609 | 0 | "add %4, %%"FF_REG_c" \n\t" |
610 | 0 | WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa) |
611 | 0 | :: "r" (&c->redDither), |
612 | 0 | "m" (dummy), "m" (dummy), "m" (dummy), |
613 | 0 | "r" (dest), "m" (dstW_reg), "m"(uv_off) |
614 | 0 | NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B) |
615 | 0 | : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S |
616 | 0 | ); |
617 | 0 | } |
618 | | |
619 | | static void RENAME(yuv2bgr24_X)(SwsInternal *c, const int16_t *lumFilter, |
620 | | const int16_t **lumSrc, int lumFilterSize, |
621 | | const int16_t *chrFilter, const int16_t **chrUSrc, |
622 | | const int16_t **chrVSrc, |
623 | | int chrFilterSize, const int16_t **alpSrc, |
624 | | uint8_t *dest, int dstW, int dstY) |
625 | 0 | { |
626 | 0 | x86_reg dummy=0; |
627 | 0 | x86_reg dstW_reg = dstW; |
628 | 0 | x86_reg uv_off = c->uv_offx2; |
  629 |      |
630 | 0 | YSCALEYUV2PACKEDX |
631 | 0 | YSCALEYUV2RGBX |
632 | 0 | "pxor %%mm7, %%mm7 \n\t" |
633 | 0 | "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" //FIXME optimize |
634 | 0 | "add %4, %%"FF_REG_c" \n\t" |
635 | 0 | WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa) |
636 | 0 | :: "r" (&c->redDither), |
637 | 0 | "m" (dummy), "m" (dummy), "m" (dummy), |
638 | 0 | "r" (dest), "m" (dstW_reg), "m"(uv_off) |
639 | 0 | NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B) |
640 | 0 | : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S |
641 | 0 | ); |
642 | 0 | } |
643 | | #endif /* HAVE_6REGS */ |
644 | | |
645 | | #define REAL_WRITEYUY2(dst, dstw, index) \ |
646 | | "packuswb %%mm3, %%mm3 \n\t"\ |
647 | | "packuswb %%mm4, %%mm4 \n\t"\ |
648 | | "packuswb %%mm7, %%mm1 \n\t"\ |
649 | | "punpcklbw %%mm4, %%mm3 \n\t"\ |
650 | | "movq %%mm1, %%mm7 \n\t"\ |
651 | | "punpcklbw %%mm3, %%mm1 \n\t"\ |
652 | | "punpckhbw %%mm3, %%mm7 \n\t"\ |
653 | | \ |
654 | | MOVNTQ(%%mm1, (dst, index, 2))\ |
655 | | MOVNTQ(%%mm7, 8(dst, index, 2))\ |
656 | | \ |
657 | | "add $8, "#index" \n\t"\ |
658 | | "cmp "dstw", "#index" \n\t"\ |
659 | | " jb 1b \n\t" |
660 | | #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index) |
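
/* Editorial sketch (not part of the original file): REAL_WRITEYUY2 interleaves
 * luma and chroma into the YUYV (YUY2) layout; scalar form for one 2-pixel
 * group: */
static inline void pack_yuyv_sketch(uint8_t *dst, uint8_t y0, uint8_t y1,
                                    uint8_t u, uint8_t v)
{
    dst[0] = y0;
    dst[1] = u;
    dst[2] = y1;
    dst[3] = v;
}
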
661 | | |
662 | | static void RENAME(yuv2yuyv422_X_ar)(SwsInternal *c, const int16_t *lumFilter, |
663 | | const int16_t **lumSrc, int lumFilterSize, |
664 | | const int16_t *chrFilter, const int16_t **chrUSrc, |
665 | | const int16_t **chrVSrc, |
666 | | int chrFilterSize, const int16_t **alpSrc, |
667 | | uint8_t *dest, int dstW, int dstY) |
668 | 0 | { |
669 | 0 | x86_reg dummy=0; |
670 | 0 | x86_reg dstW_reg = dstW; |
671 | 0 | x86_reg uv_off = c->uv_offx2; |
  672 |      |
673 | 0 | YSCALEYUV2PACKEDX_ACCURATE |
674 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
675 | 0 | "psraw $3, %%mm3 \n\t" |
676 | 0 | "psraw $3, %%mm4 \n\t" |
677 | 0 | "psraw $3, %%mm1 \n\t" |
678 | 0 | "psraw $3, %%mm7 \n\t" |
679 | 0 | WRITEYUY2(%4, "%5", %%FF_REGa) |
680 | 0 | YSCALEYUV2PACKEDX_END |
681 | 0 | } |
682 | | |
683 | | static void RENAME(yuv2yuyv422_X)(SwsInternal *c, const int16_t *lumFilter, |
684 | | const int16_t **lumSrc, int lumFilterSize, |
685 | | const int16_t *chrFilter, const int16_t **chrUSrc, |
686 | | const int16_t **chrVSrc, |
687 | | int chrFilterSize, const int16_t **alpSrc, |
688 | | uint8_t *dest, int dstW, int dstY) |
689 | 0 | { |
690 | 0 | x86_reg dummy=0; |
691 | 0 | x86_reg dstW_reg = dstW; |
692 | 0 | x86_reg uv_off = c->uv_offx2; |
  693 |      |
694 | 0 | YSCALEYUV2PACKEDX |
695 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
696 | 0 | "psraw $3, %%mm3 \n\t" |
697 | 0 | "psraw $3, %%mm4 \n\t" |
698 | 0 | "psraw $3, %%mm1 \n\t" |
699 | 0 | "psraw $3, %%mm7 \n\t" |
700 | 0 | WRITEYUY2(%4, "%5", %%FF_REGa) |
701 | 0 | YSCALEYUV2PACKEDX_END |
702 | 0 | } |
703 | | |
704 | | #define REAL_YSCALEYUV2RGB_UV(index, c) \ |
705 | | "xor "#index", "#index" \n\t"\ |
706 | | ".p2align 4 \n\t"\ |
707 | | "1: \n\t"\ |
708 | | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
709 | | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ |
710 | | "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ |
711 | | "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
712 | | "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ |
713 | | "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ |
714 | | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
715 | | "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ |
716 | | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ |
717 | | "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ |
718 | | "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ |
719 | | "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\ |
720 | | "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\ |
  721 |      |     "paddw %%mm2, %%mm3                     \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
  722 |      |     "paddw %%mm5, %%mm4                     \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
723 | | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ |
724 | | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ |
725 | | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
726 | | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
727 | | "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ |
728 | | "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ |
729 | | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
730 | | |
731 | | #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ |
732 | | "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ |
733 | | "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ |
734 | | "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ |
735 | | "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ |
736 | | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
737 | | "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ |
738 | | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
739 | | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
740 | | "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
741 | | "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
742 | | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
743 | | "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
744 | | |
745 | | #define REAL_YSCALEYUV2RGB_COEFF(c) \ |
746 | | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ |
747 | | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ |
748 | | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ |
749 | | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ |
750 | | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ |
751 | | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ |
752 | | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
753 | | "paddw %%mm3, %%mm4 \n\t"\ |
754 | | "movq %%mm2, %%mm0 \n\t"\ |
755 | | "movq %%mm5, %%mm6 \n\t"\ |
756 | | "movq %%mm4, %%mm3 \n\t"\ |
757 | | "punpcklwd %%mm2, %%mm2 \n\t"\ |
758 | | "punpcklwd %%mm5, %%mm5 \n\t"\ |
759 | | "punpcklwd %%mm4, %%mm4 \n\t"\ |
760 | | "paddw %%mm1, %%mm2 \n\t"\ |
761 | | "paddw %%mm1, %%mm5 \n\t"\ |
762 | | "paddw %%mm1, %%mm4 \n\t"\ |
763 | | "punpckhwd %%mm0, %%mm0 \n\t"\ |
764 | | "punpckhwd %%mm6, %%mm6 \n\t"\ |
765 | | "punpckhwd %%mm3, %%mm3 \n\t"\ |
766 | | "paddw %%mm7, %%mm0 \n\t"\ |
767 | | "paddw %%mm7, %%mm6 \n\t"\ |
768 | | "paddw %%mm7, %%mm3 \n\t"\ |
769 | | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
770 | | "packuswb %%mm0, %%mm2 \n\t"\ |
771 | | "packuswb %%mm6, %%mm5 \n\t"\ |
772 | | "packuswb %%mm3, %%mm4 \n\t"\ |
773 | | |
774 | | #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) |
775 | | |
776 | | #define YSCALEYUV2RGB(index, c) \ |
777 | | REAL_YSCALEYUV2RGB_UV(index, c) \ |
778 | | REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ |
779 | | REAL_YSCALEYUV2RGB_COEFF(c) |
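
/* Editorial sketch (not part of the original file): the two-tap vertical
 * blend that REAL_YSCALEYUV2RGB_YA computes per 16-bit sample, with yalpha
 * the 16-bit fixed-point weight stored at LUM_MMX_FILTER_OFFSET+8: */
static inline int vblend_sketch(int buf0, int buf1, int yalpha)
{
    /* psubw + pmulhw + psraw $4 + paddw */
    return (buf1 >> 4) + (((buf0 - buf1) * yalpha) >> 16);
}
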
780 | | |
781 | | /** |
782 | | * vertical bilinear scale YV12 to RGB |
783 | | */ |
784 | | static void RENAME(yuv2rgb32_2)(SwsInternal *c, const int16_t *buf[2], |
785 | | const int16_t *ubuf[2], const int16_t *vbuf[2], |
786 | | const int16_t *abuf[2], uint8_t *dest, |
787 | | int dstW, int yalpha, int uvalpha, int y) |
788 | 0 | { |
789 | 0 | const int16_t *buf0 = buf[0], *buf1 = buf[1], |
790 | 0 | *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; |
  791 |      |
792 | 0 | if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { |
793 | 0 | const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1]; |
794 | 0 | #if ARCH_X86_64 |
795 | 0 | __asm__ volatile( |
796 | 0 | YSCALEYUV2RGB(%%r8, %5) |
797 | 0 | YSCALEYUV2RGB_YA(%%r8, %5, %6, %7) |
798 | 0 | "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ |
799 | 0 | "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ |
800 | 0 | "packuswb %%mm7, %%mm1 \n\t" |
801 | 0 | WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) |
802 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest), |
803 | 0 | "a" (&c->redDither), |
804 | 0 | "r" (abuf0), "r" (abuf1) |
805 | 0 | : "%r8" |
806 | 0 | ); |
807 | | #else |
808 | | c->u_temp=(intptr_t)abuf0; |
809 | | c->v_temp=(intptr_t)abuf1; |
810 | | __asm__ volatile( |
811 | | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
812 | | "mov %4, %%"FF_REG_b" \n\t" |
813 | | "push %%"FF_REG_BP" \n\t" |
814 | | YSCALEYUV2RGB(%%FF_REGBP, %5) |
815 | | "push %0 \n\t" |
816 | | "push %1 \n\t" |
817 | | "mov "U_TEMP"(%5), %0 \n\t" |
818 | | "mov "V_TEMP"(%5), %1 \n\t" |
819 | | YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1) |
820 | | "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ |
821 | | "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ |
822 | | "packuswb %%mm7, %%mm1 \n\t" |
823 | | "pop %1 \n\t" |
824 | | "pop %0 \n\t" |
825 | | WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) |
826 | | "pop %%"FF_REG_BP" \n\t" |
827 | | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
828 | | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
829 | | "a" (&c->redDither) |
830 | | ); |
831 | | #endif |
832 | 0 | } else { |
833 | 0 | __asm__ volatile( |
834 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
835 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
836 | 0 | "push %%"FF_REG_BP" \n\t" |
837 | 0 | YSCALEYUV2RGB(%%FF_REGBP, %5) |
838 | 0 | "pcmpeqd %%mm7, %%mm7 \n\t" |
839 | 0 | WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) |
840 | 0 | "pop %%"FF_REG_BP" \n\t" |
841 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
842 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
843 | 0 | "a" (&c->redDither) |
844 | 0 | ); |
845 | 0 | } |
846 | 0 | } |
847 | | |
848 | | static void RENAME(yuv2bgr24_2)(SwsInternal *c, const int16_t *buf[2], |
849 | | const int16_t *ubuf[2], const int16_t *vbuf[2], |
850 | | const int16_t *abuf[2], uint8_t *dest, |
851 | | int dstW, int yalpha, int uvalpha, int y) |
852 | 0 | { |
853 | 0 | const int16_t *buf0 = buf[0], *buf1 = buf[1], |
854 | 0 | *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; |
  855 |      |
856 | 0 | __asm__ volatile( |
857 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
858 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
859 | 0 | "push %%"FF_REG_BP" \n\t" |
860 | 0 | YSCALEYUV2RGB(%%FF_REGBP, %5) |
861 | 0 | "pxor %%mm7, %%mm7 \n\t" |
862 | 0 | WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) |
863 | 0 | "pop %%"FF_REG_BP" \n\t" |
864 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
865 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
866 | 0 | "a" (&c->redDither) |
867 | 0 | NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B) |
868 | 0 | ); |
869 | 0 | } |
870 | | |
871 | | static void RENAME(yuv2rgb555_2)(SwsInternal *c, const int16_t *buf[2], |
872 | | const int16_t *ubuf[2], const int16_t *vbuf[2], |
873 | | const int16_t *abuf[2], uint8_t *dest, |
874 | | int dstW, int yalpha, int uvalpha, int y) |
875 | 0 | { |
876 | 0 | const int16_t *buf0 = buf[0], *buf1 = buf[1], |
877 | 0 | *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; |
  878 |      |
879 | 0 | __asm__ volatile( |
880 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
881 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
882 | 0 | "push %%"FF_REG_BP" \n\t" |
883 | 0 | YSCALEYUV2RGB(%%FF_REGBP, %5) |
884 | 0 | "pxor %%mm7, %%mm7 \n\t" |
885 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
886 | 0 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" |
887 | 0 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" |
888 | 0 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" |
889 | 0 | WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) |
890 | 0 | "pop %%"FF_REG_BP" \n\t" |
891 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
892 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
893 | 0 | "a" (&c->redDither) |
894 | 0 | NAMED_CONSTRAINTS_ADD(bF8) |
895 | 0 | ); |
896 | 0 | } |
897 | | |
898 | | static void RENAME(yuv2rgb565_2)(SwsInternal *c, const int16_t *buf[2], |
899 | | const int16_t *ubuf[2], const int16_t *vbuf[2], |
900 | | const int16_t *abuf[2], uint8_t *dest, |
901 | | int dstW, int yalpha, int uvalpha, int y) |
902 | 0 | { |
903 | 0 | const int16_t *buf0 = buf[0], *buf1 = buf[1], |
904 | 0 | *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; |
  905 |      |
906 | 0 | __asm__ volatile( |
907 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
908 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
909 | 0 | "push %%"FF_REG_BP" \n\t" |
910 | 0 | YSCALEYUV2RGB(%%FF_REGBP, %5) |
911 | 0 | "pxor %%mm7, %%mm7 \n\t" |
912 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
913 | 0 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" |
914 | 0 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" |
915 | 0 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" |
916 | 0 | WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) |
917 | 0 | "pop %%"FF_REG_BP" \n\t" |
918 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
919 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
920 | 0 | "a" (&c->redDither) |
921 | 0 | NAMED_CONSTRAINTS_ADD(bF8,bFC) |
922 | 0 | ); |
923 | 0 | } |
924 | | |
925 | | #define REAL_YSCALEYUV2PACKED(index, c) \ |
926 | | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ |
927 | | "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ |
928 | | "psraw $3, %%mm0 \n\t"\ |
929 | | "psraw $3, %%mm1 \n\t"\ |
930 | | "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ |
931 | | "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ |
932 | | "xor "#index", "#index" \n\t"\ |
933 | | ".p2align 4 \n\t"\ |
934 | | "1: \n\t"\ |
935 | | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
936 | | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ |
937 | | "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ |
938 | | "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
939 | | "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ |
940 | | "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ |
941 | | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ |
942 | | "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ |
943 | | "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ |
944 | | "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ |
945 | | "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ |
  946 |      |     "psraw $7, %%mm3                        \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
  947 |      |     "psraw $7, %%mm4                        \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
  948 |      |     "paddw %%mm2, %%mm3                     \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
  949 |      |     "paddw %%mm5, %%mm4                     \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
950 | | "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ |
951 | | "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ |
952 | | "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ |
953 | | "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ |
954 | | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
955 | | "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ |
956 | | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
957 | | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
  958 |      |     "psraw $7, %%mm1                        \n\t" /* buf0[eax] - buf1[eax] >>7*/\
  959 |      |     "psraw $7, %%mm7                        \n\t" /* buf0[eax] - buf1[eax] >>7*/\
960 | | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
961 | | "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
962 | | |
963 | | #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) |
964 | | |
965 | | static void RENAME(yuv2yuyv422_2)(SwsInternal *c, const int16_t *buf[2], |
966 | | const int16_t *ubuf[2], const int16_t *vbuf[2], |
967 | | const int16_t *abuf[2], uint8_t *dest, |
968 | | int dstW, int yalpha, int uvalpha, int y) |
969 | 0 | { |
970 | 0 | const int16_t *buf0 = buf[0], *buf1 = buf[1], |
971 | 0 | *ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; |
  972 |      |
973 | 0 | __asm__ volatile( |
974 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
975 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
976 | 0 | "push %%"FF_REG_BP" \n\t" |
977 | 0 | YSCALEYUV2PACKED(%%FF_REGBP, %5) |
978 | 0 | WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) |
979 | 0 | "pop %%"FF_REG_BP" \n\t" |
980 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
981 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
982 | 0 | "a" (&c->redDither) |
983 | 0 | ); |
984 | 0 | } |
985 | | |
986 | | #define REAL_YSCALEYUV2RGB1(index, c) \ |
987 | | "xor "#index", "#index" \n\t"\ |
988 | | ".p2align 4 \n\t"\ |
989 | | "1: \n\t"\ |
990 | | "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ |
991 | | "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ |
992 | | "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ |
993 | | "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ |
  994 |      |     "psraw $4, %%mm3                        \n\t" /* uvbuf0[eax] >>4*/\
  995 |      |     "psraw $4, %%mm4                        \n\t" /* uvbuf0[eax+2048] >>4*/\
996 | | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ |
997 | | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ |
998 | | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
999 | | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
1000 | | "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ |
1001 | | "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ |
1002 | | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
1003 | | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ |
1004 | | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ |
 1005 |      |     "psraw $4, %%mm1                        \n\t" /* buf0[eax] >>4*/\
 1006 |      |     "psraw $4, %%mm7                        \n\t" /* buf0[eax] >>4*/\
1007 | | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ |
1008 | | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ |
1009 | | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ |
1010 | | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ |
1011 | | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ |
1012 | | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ |
1013 | | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
1014 | | "paddw %%mm3, %%mm4 \n\t"\ |
1015 | | "movq %%mm2, %%mm0 \n\t"\ |
1016 | | "movq %%mm5, %%mm6 \n\t"\ |
1017 | | "movq %%mm4, %%mm3 \n\t"\ |
1018 | | "punpcklwd %%mm2, %%mm2 \n\t"\ |
1019 | | "punpcklwd %%mm5, %%mm5 \n\t"\ |
1020 | | "punpcklwd %%mm4, %%mm4 \n\t"\ |
1021 | | "paddw %%mm1, %%mm2 \n\t"\ |
1022 | | "paddw %%mm1, %%mm5 \n\t"\ |
1023 | | "paddw %%mm1, %%mm4 \n\t"\ |
1024 | | "punpckhwd %%mm0, %%mm0 \n\t"\ |
1025 | | "punpckhwd %%mm6, %%mm6 \n\t"\ |
1026 | | "punpckhwd %%mm3, %%mm3 \n\t"\ |
1027 | | "paddw %%mm7, %%mm0 \n\t"\ |
1028 | | "paddw %%mm7, %%mm6 \n\t"\ |
1029 | | "paddw %%mm7, %%mm3 \n\t"\ |
1030 | | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
1031 | | "packuswb %%mm0, %%mm2 \n\t"\ |
1032 | | "packuswb %%mm6, %%mm5 \n\t"\ |
1033 | | "packuswb %%mm3, %%mm4 \n\t"\ |
1034 | | |
1035 | | #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) |
1036 | | |
1037 | | // do vertical chrominance interpolation |
1038 | | #define REAL_YSCALEYUV2RGB1b(index, c) \ |
1039 | | "xor "#index", "#index" \n\t"\ |
1040 | | ".p2align 4 \n\t"\ |
1041 | | "1: \n\t"\ |
1042 | | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
1043 | | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ |
1044 | | "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ |
1045 | | "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
1046 | | "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ |
1047 | | "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ |
1048 | | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ |
1049 | | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ |
1050 | | "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\ |
1051 | | "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\ |
1052 | | "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\ |
1053 | | "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\ |
1054 | | "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ |
1055 | | "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ |
1056 | | "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ |
1057 | | "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ |
1058 | | /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ |
1059 | | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ |
1060 | | "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\ |
 1061 |      |     "psraw $4, %%mm1                        \n\t" /* buf0[eax] >>4*/\
 1062 |      |     "psraw $4, %%mm7                        \n\t" /* buf0[eax] >>4*/\
1063 | | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ |
1064 | | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ |
1065 | | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ |
1066 | | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ |
1067 | | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ |
1068 | | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ |
1069 | | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
1070 | | "paddw %%mm3, %%mm4 \n\t"\ |
1071 | | "movq %%mm2, %%mm0 \n\t"\ |
1072 | | "movq %%mm5, %%mm6 \n\t"\ |
1073 | | "movq %%mm4, %%mm3 \n\t"\ |
1074 | | "punpcklwd %%mm2, %%mm2 \n\t"\ |
1075 | | "punpcklwd %%mm5, %%mm5 \n\t"\ |
1076 | | "punpcklwd %%mm4, %%mm4 \n\t"\ |
1077 | | "paddw %%mm1, %%mm2 \n\t"\ |
1078 | | "paddw %%mm1, %%mm5 \n\t"\ |
1079 | | "paddw %%mm1, %%mm4 \n\t"\ |
1080 | | "punpckhwd %%mm0, %%mm0 \n\t"\ |
1081 | | "punpckhwd %%mm6, %%mm6 \n\t"\ |
1082 | | "punpckhwd %%mm3, %%mm3 \n\t"\ |
1083 | | "paddw %%mm7, %%mm0 \n\t"\ |
1084 | | "paddw %%mm7, %%mm6 \n\t"\ |
1085 | | "paddw %%mm7, %%mm3 \n\t"\ |
1086 | | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
1087 | | "packuswb %%mm0, %%mm2 \n\t"\ |
1088 | | "packuswb %%mm6, %%mm5 \n\t"\ |
1089 | | "packuswb %%mm3, %%mm4 \n\t"\ |
1090 | | |
1091 | | #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) |
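
/* Editorial note (not part of the original file): the "1b" variant averages
 * the two chroma rows instead of interpolating between them; (uv0 + uv1) >> 5
 * matches the single-row uv0 >> 4 scaling, i.e. an unweighted mean of the two
 * rows. The logical psrlw is what the FIXME about possible overflow refers
 * to. Scalar form: */
static inline int chroma_avg_sketch(int uv0, int uv1)
{
    return (uv0 + uv1) >> 5; /* paddw + psrlw $5 */
}
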
1092 | | |
1093 | | #define REAL_YSCALEYUV2RGB1_ALPHA(index) \ |
1094 | | "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\ |
1095 | | "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\ |
1096 | | "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\ |
1097 | | "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\ |
1098 | | "packuswb %%mm1, %%mm7 \n\t" |
1099 | | #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) |
1100 | | |
1101 | | /** |
1102 | | * YV12 to RGB without scaling or interpolating |
1103 | | */ |
1104 | | static void RENAME(yuv2rgb32_1)(SwsInternal *c, const int16_t *buf0, |
1105 | | const int16_t *ubuf[2], const int16_t *vbuf[2], |
1106 | | const int16_t *abuf0, uint8_t *dest, |
1107 | | int dstW, int uvalpha, int y) |
1108 | 0 | { |
1109 | 0 | const int16_t *ubuf0 = ubuf[0]; |
1110 | 0 | const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 |
 1111 |      |
1112 | 0 | if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster |
1113 | 0 | const int16_t *ubuf1 = ubuf[0]; |
1114 | 0 | if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { |
1115 | 0 | __asm__ volatile( |
1116 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
1117 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
1118 | 0 | "push %%"FF_REG_BP" \n\t" |
1119 | 0 | YSCALEYUV2RGB1(%%FF_REGBP, %5) |
1120 | 0 | YSCALEYUV2RGB1_ALPHA(%%FF_REGBP) |
1121 | 0 | WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) |
1122 | 0 | "pop %%"FF_REG_BP" \n\t" |
1123 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
1124 | 0 | :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
1125 | 0 | "a" (&c->redDither) |
1126 | 0 | ); |
1127 | 0 | } else { |
1128 | 0 | __asm__ volatile( |
1129 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
1130 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
1131 | 0 | "push %%"FF_REG_BP" \n\t" |
1132 | 0 | YSCALEYUV2RGB1(%%FF_REGBP, %5) |
1133 | 0 | "pcmpeqd %%mm7, %%mm7 \n\t" |
1134 | 0 | WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) |
1135 | 0 | "pop %%"FF_REG_BP" \n\t" |
1136 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
1137 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
1138 | 0 | "a" (&c->redDither) |
1139 | 0 | ); |
1140 | 0 | } |
1141 | 0 | } else { |
1142 | 0 | const int16_t *ubuf1 = ubuf[1]; |
1143 | 0 | if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { |
1144 | 0 | __asm__ volatile( |
1145 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
1146 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
1147 | 0 | "push %%"FF_REG_BP" \n\t" |
1148 | 0 | YSCALEYUV2RGB1b(%%FF_REGBP, %5) |
1149 | 0 | YSCALEYUV2RGB1_ALPHA(%%FF_REGBP) |
1150 | 0 | WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) |
1151 | 0 | "pop %%"FF_REG_BP" \n\t" |
1152 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
1153 | 0 | :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
1154 | 0 | "a" (&c->redDither) |
1155 | 0 | ); |
1156 | 0 | } else { |
1157 | 0 | __asm__ volatile( |
1158 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
1159 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
1160 | 0 | "push %%"FF_REG_BP" \n\t" |
1161 | 0 | YSCALEYUV2RGB1b(%%FF_REGBP, %5) |
1162 | 0 | "pcmpeqd %%mm7, %%mm7 \n\t" |
1163 | 0 | WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) |
1164 | 0 | "pop %%"FF_REG_BP" \n\t" |
1165 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
1166 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
1167 | 0 | "a" (&c->redDither) |
1168 | 0 | ); |
1169 | 0 | } |
1170 | 0 | } |
1171 | 0 | } |
1172 | | |
1173 | | static void RENAME(yuv2bgr24_1)(SwsInternal *c, const int16_t *buf0, |
1174 | | const int16_t *ubuf[2], const int16_t *vbuf[2], |
1175 | | const int16_t *abuf0, uint8_t *dest, |
1176 | | int dstW, int uvalpha, int y) |
1177 | 0 | { |
1178 | 0 | const int16_t *ubuf0 = ubuf[0]; |
1179 | 0 | const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 |
 1180 |      |
1181 | 0 | if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster |
1182 | 0 | const int16_t *ubuf1 = ubuf[0]; |
1183 | 0 | __asm__ volatile( |
1184 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
1185 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
1186 | 0 | "push %%"FF_REG_BP" \n\t" |
1187 | 0 | YSCALEYUV2RGB1(%%FF_REGBP, %5) |
1188 | 0 | "pxor %%mm7, %%mm7 \n\t" |
1189 | 0 | WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) |
1190 | 0 | "pop %%"FF_REG_BP" \n\t" |
1191 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
1192 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
1193 | 0 | "a" (&c->redDither) |
1194 | 0 | NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B) |
1195 | 0 | ); |
1196 | 0 | } else { |
1197 | 0 | const int16_t *ubuf1 = ubuf[1]; |
1198 | 0 | __asm__ volatile( |
1199 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
1200 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
1201 | 0 | "push %%"FF_REG_BP" \n\t" |
1202 | 0 | YSCALEYUV2RGB1b(%%FF_REGBP, %5) |
1203 | 0 | "pxor %%mm7, %%mm7 \n\t" |
1204 | 0 | WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) |
1205 | 0 | "pop %%"FF_REG_BP" \n\t" |
1206 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
1207 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
1208 | 0 | "a" (&c->redDither) |
1209 | 0 | NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B) |
1210 | 0 | ); |
1211 | 0 | } |
1212 | 0 | } |
1213 | | |
1214 | | static void RENAME(yuv2rgb555_1)(SwsInternal *c, const int16_t *buf0, |
1215 | | const int16_t *ubuf[2], const int16_t *vbuf[2], |
1216 | | const int16_t *abuf0, uint8_t *dest, |
1217 | | int dstW, int uvalpha, int y) |
1218 | 0 | { |
1219 | 0 | const int16_t *ubuf0 = ubuf[0]; |
1220 | 0 | const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 |
 1221 |      |
1222 | 0 | if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster |
1223 | 0 | const int16_t *ubuf1 = ubuf[0]; |
1224 | 0 | __asm__ volatile( |
1225 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
1226 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
1227 | 0 | "push %%"FF_REG_BP" \n\t" |
1228 | 0 | YSCALEYUV2RGB1(%%FF_REGBP, %5) |
1229 | 0 | "pxor %%mm7, %%mm7 \n\t" |
1230 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1231 | 0 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" |
1232 | 0 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" |
1233 | 0 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" |
1234 | 0 | WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) |
1235 | 0 | "pop %%"FF_REG_BP" \n\t" |
1236 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
1237 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
1238 | 0 | "a" (&c->redDither) |
1239 | 0 | NAMED_CONSTRAINTS_ADD(bF8) |
1240 | 0 | ); |
1241 | 0 | } else { |
1242 | 0 | const int16_t *ubuf1 = ubuf[1]; |
1243 | 0 | __asm__ volatile( |
1244 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
1245 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
1246 | 0 | "push %%"FF_REG_BP" \n\t" |
1247 | 0 | YSCALEYUV2RGB1b(%%FF_REGBP, %5) |
1248 | 0 | "pxor %%mm7, %%mm7 \n\t" |
1249 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1250 | 0 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" |
1251 | 0 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" |
1252 | 0 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" |
1253 | 0 | WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) |
1254 | 0 | "pop %%"FF_REG_BP" \n\t" |
1255 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
1256 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
1257 | 0 | "a" (&c->redDither) |
1258 | 0 | NAMED_CONSTRAINTS_ADD(bF8) |
1259 | 0 | ); |
1260 | 0 | } |
1261 | 0 | } |
1262 | | |
1263 | | static void RENAME(yuv2rgb565_1)(SwsInternal *c, const int16_t *buf0, |
1264 | | const int16_t *ubuf[2], const int16_t *vbuf[2], |
1265 | | const int16_t *abuf0, uint8_t *dest, |
1266 | | int dstW, int uvalpha, int y) |
1267 | 0 | { |
1268 | 0 | const int16_t *ubuf0 = ubuf[0]; |
1269 | 0 | const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 |
 1270 |      |
1271 | 0 | if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster |
1272 | 0 | const int16_t *ubuf1 = ubuf[0]; |
1273 | 0 | __asm__ volatile( |
1274 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
1275 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
1276 | 0 | "push %%"FF_REG_BP" \n\t" |
1277 | 0 | YSCALEYUV2RGB1(%%FF_REGBP, %5) |
1278 | 0 | "pxor %%mm7, %%mm7 \n\t" |
1279 | | /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ |
1280 | 0 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" |
1281 | 0 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" |
1282 | 0 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" |
1283 | 0 | WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) |
1284 | 0 | "pop %%"FF_REG_BP" \n\t" |
1285 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
1286 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
1287 | 0 | "a" (&c->redDither) |
1288 | 0 | NAMED_CONSTRAINTS_ADD(bF8,bFC) |
1289 | 0 | ); |
1290 | 0 | } else { |
1291 | 0 | const int16_t *ubuf1 = ubuf[1]; |
1292 | 0 | __asm__ volatile( |
1293 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
1294 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
1295 | 0 | "push %%"FF_REG_BP" \n\t" |
1296 | 0 | YSCALEYUV2RGB1b(%%FF_REGBP, %5) |
1297 | 0 | "pxor %%mm7, %%mm7 \n\t" |
1298 | |             /* %%mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1299 | 0 | "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" |
1300 | 0 | "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" |
1301 | 0 | "paddusb "RED_DITHER"(%5), %%mm5 \n\t" |
1302 | 0 | WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) |
1303 | 0 | "pop %%"FF_REG_BP" \n\t" |
1304 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
1305 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
1306 | 0 | "a" (&c->redDither) |
1307 | 0 | NAMED_CONSTRAINTS_ADD(bF8,bFC) |
1308 | 0 | ); |
1309 | 0 | } |
1310 | 0 | } |
1311 | | |
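/*
 * RGB565 differs from RGB555 only in giving green 6 bits, which is why
 * the asm above needs both the 0xF8 (bF8) and 0xFC (bFC) byte masks.
 * A scalar sketch of the packing (pack_rgb565 is illustrative only):
 */
static inline uint16_t pack_rgb565(uint8_t r, uint8_t g, uint8_t b)
{
    /* 5 bits of red and blue, 6 bits of green */
    return ((r & 0xF8) << 8) | ((g & 0xFC) << 3) | ((b & 0xF8) >> 3);
}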
1312 | | #define REAL_YSCALEYUV2PACKED1(index, c) \ |
1313 | | "xor "#index", "#index" \n\t"\ |
1314 | | ".p2align 4 \n\t"\ |
1315 | | "1: \n\t"\ |
1316 | | "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\ |
1317 | | "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ |
1318 | | "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ |
1319 | | "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ |
1320 | | "psraw $7, %%mm3 \n\t" \ |
1321 | | "psraw $7, %%mm4 \n\t" \ |
1322 | | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ |
1323 | |     "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1324 | | "psraw $7, %%mm1 \n\t" \ |
1325 | | "psraw $7, %%mm7 \n\t" \ |
1326 | | |
1327 | | #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c) |
1328 | | |
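/*
 * The scaled intermediate buffers hold samples multiplied by 2^7, so
 * the "psraw $7" instructions above descale them back to roughly 8-bit
 * range; per sample this is simply:
 *
 *     int y = buf0[i] >> 7;   // arithmetic shift, like psraw $7
 */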
1329 | | #define REAL_YSCALEYUV2PACKED1b(index, c) \ |
1330 | | "xor "#index", "#index" \n\t"\ |
1331 | | ".p2align 4 \n\t"\ |
1332 | | "1: \n\t"\ |
1333 | | "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\ |
1334 | | "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\ |
1335 | | "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ |
1336 | | "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ |
1337 | | "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ |
1338 | | "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ |
1339 | | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\ |
1340 | | "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\ |
1341 | | "psrlw $8, %%mm3 \n\t" \ |
1342 | | "psrlw $8, %%mm4 \n\t" \ |
1343 | | "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\ |
1344 | |     "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1345 | | "psraw $7, %%mm1 \n\t" \ |
1346 | | "psraw $7, %%mm7 \n\t" |
1347 | | #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c) |
1348 | | |
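/*
 * The "1b" variant blends two chroma lines 50/50. Since each input is a
 * sample scaled by 2^7, their sum is the sample scaled by 2^8, so one
 * "psrlw $8" performs both the average and the descale:
 *
 *     int uv = (uvbuf0[i] + uvbuf1[i]) >> 8;   // == ((a >> 7) + (b >> 7)) / 2
 */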
1349 | | static void RENAME(yuv2yuyv422_1)(SwsInternal *c, const int16_t *buf0, |
1350 | | const int16_t *ubuf[2], const int16_t *vbuf[2], |
1351 | | const int16_t *abuf0, uint8_t *dest, |
1352 | | int dstW, int uvalpha, int y) |
1353 | 0 | { |
1354 | 0 | const int16_t *ubuf0 = ubuf[0]; |
1355 | 0 |     const int16_t *buf1 = buf0; // FIXME: needed for RGB1/BGR1
1356 | |
1357 | 0 |     if (uvalpha < 2048) { // note: this is not correct (it shifts chrominance by 0.5 pixels), but it is a bit faster
1358 | 0 | const int16_t *ubuf1 = ubuf[0]; |
1359 | 0 | __asm__ volatile( |
1360 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
1361 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
1362 | 0 | "push %%"FF_REG_BP" \n\t" |
1363 | 0 | YSCALEYUV2PACKED1(%%FF_REGBP, %5) |
1364 | 0 | WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) |
1365 | 0 | "pop %%"FF_REG_BP" \n\t" |
1366 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
1367 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
1368 | 0 | "a" (&c->redDither) |
1369 | 0 | ); |
1370 | 0 | } else { |
1371 | 0 | const int16_t *ubuf1 = ubuf[1]; |
1372 | 0 | __asm__ volatile( |
1373 | 0 | "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" |
1374 | 0 | "mov %4, %%"FF_REG_b" \n\t" |
1375 | 0 | "push %%"FF_REG_BP" \n\t" |
1376 | 0 | YSCALEYUV2PACKED1b(%%FF_REGBP, %5) |
1377 | 0 | WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) |
1378 | 0 | "pop %%"FF_REG_BP" \n\t" |
1379 | 0 | "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" |
1380 | 0 | :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), |
1381 | 0 | "a" (&c->redDither) |
1382 | 0 | ); |
1383 | 0 | } |
1384 | 0 | } |
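/*
 * WRITEYUY2 above interleaves luma and chroma as Y0 U Y1 V, one chroma
 * pair per two pixels. A scalar sketch of the layout it emits
 * (write_yuy2_pair is illustrative only):
 */
static inline void write_yuy2_pair(uint8_t *dst, uint8_t y0, uint8_t y1,
                                   uint8_t u, uint8_t v)
{
    dst[0] = y0;  /* luma, left pixel   */
    dst[1] = u;   /* shared chroma (U)  */
    dst[2] = y1;  /* luma, right pixel  */
    dst[3] = v;   /* shared chroma (V)  */
}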
1385 | | static av_cold void RENAME(sws_init_swscale)(SwsInternal *c) |
1386 | 0 | { |
1387 | 0 | enum AVPixelFormat dstFormat = c->opts.dst_format; |
1388 | |
1389 | 0 |     c->use_mmx_vfilter = 0;
1390 | 0 | if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) |
1391 | 0 | && dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE |
1392 | 0 | && !(c->opts.flags & SWS_BITEXACT)) { |
1393 | 0 | if (c->opts.flags & SWS_ACCURATE_RND) { |
1394 | 0 | if (!(c->opts.flags & SWS_FULL_CHR_H_INT)) { |
1395 | 0 | switch (c->opts.dst_format) { |
1396 | 0 | case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; |
1397 | 0 | #if HAVE_6REGS |
1398 | 0 | case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break; |
1399 | 0 | #endif |
1400 | 0 | case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break; |
1401 | 0 | case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break; |
1402 | 0 | case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break; |
1403 | 0 | default: break; |
1404 | 0 | } |
1405 | 0 | } |
1406 | 0 | } else { |
1407 | 0 |             c->use_mmx_vfilter = 1;
1408 | 0 | if (!(c->opts.flags & SWS_FULL_CHR_H_INT)) { |
1409 | 0 | switch (c->opts.dst_format) { |
1410 | 0 | case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; |
1411 | 0 | case AV_PIX_FMT_BGR32: c->yuv2packedX = RENAME(yuv2bgr32_X); break; |
1412 | 0 | #if HAVE_6REGS |
1413 | 0 | case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break; |
1414 | 0 | #endif |
1415 | 0 | case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break; |
1416 | 0 | case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break; |
1417 | 0 | case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break; |
1418 | 0 | default: break; |
1419 | 0 | } |
1420 | 0 | } |
1421 | 0 | } |
1422 | 0 | if (!(c->opts.flags & SWS_FULL_CHR_H_INT)) { |
1423 | 0 | switch (c->opts.dst_format) { |
1424 | 0 | case AV_PIX_FMT_RGB32: |
1425 | 0 | c->yuv2packed1 = RENAME(yuv2rgb32_1); |
1426 | 0 | c->yuv2packed2 = RENAME(yuv2rgb32_2); |
1427 | 0 | break; |
1428 | 0 | case AV_PIX_FMT_BGR24: |
1429 | 0 | c->yuv2packed1 = RENAME(yuv2bgr24_1); |
1430 | 0 | c->yuv2packed2 = RENAME(yuv2bgr24_2); |
1431 | 0 | break; |
1432 | 0 | case AV_PIX_FMT_RGB555: |
1433 | 0 | c->yuv2packed1 = RENAME(yuv2rgb555_1); |
1434 | 0 | c->yuv2packed2 = RENAME(yuv2rgb555_2); |
1435 | 0 | break; |
1436 | 0 | case AV_PIX_FMT_RGB565: |
1437 | 0 | c->yuv2packed1 = RENAME(yuv2rgb565_1); |
1438 | 0 | c->yuv2packed2 = RENAME(yuv2rgb565_2); |
1439 | 0 | break; |
1440 | 0 | case AV_PIX_FMT_YUYV422: |
1441 | 0 | c->yuv2packed1 = RENAME(yuv2yuyv422_1); |
1442 | 0 | c->yuv2packed2 = RENAME(yuv2yuyv422_2); |
1443 | 0 | break; |
1444 | 0 | default: |
1445 | 0 | break; |
1446 | 0 | } |
1447 | 0 | } |
1448 | 0 | } |
1449 | | |
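    /*
     * The dispatch above only installs these MMX kernels for the packed
     * formats they support; pointers left NULL fall back to the generic
     * C paths selected elsewhere in libswscale. Callers then pick a
     * variant by how many input lines contribute to the output row,
     * roughly (illustrative, not the actual call sites):
     *
     *     c->yuv2packed1(...);   // one input line   (no vertical blend)
     *     c->yuv2packed2(...);   // two input lines  (bilinear blend)
     *     c->yuv2packedX(...);   // N-tap vertical filter
     */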
1450 | 0 | if (c->srcBpc == 8 && c->dstBpc <= 14) { |
1451 | |         // If the MMXEXT fast-bilinear scaler cannot be used, leave these NULL so the new MMX scaler is used instead (it is faster than the x86 ASM one).
1452 | 0 | if (c->opts.flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { |
1453 | 0 | c->hyscale_fast = ff_hyscale_fast_mmxext; |
1454 | 0 | c->hcscale_fast = ff_hcscale_fast_mmxext; |
1455 | 0 | } else { |
1456 | 0 | c->hyscale_fast = NULL; |
1457 | 0 |             c->hcscale_fast = NULL;
1458 | 0 | } |
1459 | 0 | } |
1460 | 0 | } |