/src/ffmpeg/libswscale/x86/rgb2rgb.c
Line | Count | Source |
1 | | /* |
2 | | * software RGB to RGB converter |
3 | | * plus software PAL8 to RGB converter |
4 | | * software YUV to YUV converter |
5 | | * software YUV to RGB converter |
6 | | * Written by Nick Kurshev. |
7 | | * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) |
8 | | * |
9 | | * This file is part of FFmpeg. |
10 | | * |
11 | | * FFmpeg is free software; you can redistribute it and/or |
12 | | * modify it under the terms of the GNU Lesser General Public |
13 | | * License as published by the Free Software Foundation; either |
14 | | * version 2.1 of the License, or (at your option) any later version. |
15 | | * |
16 | | * FFmpeg is distributed in the hope that it will be useful, |
17 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
18 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
19 | | * Lesser General Public License for more details. |
20 | | * |
21 | | * You should have received a copy of the GNU Lesser General Public |
22 | | * License along with FFmpeg; if not, write to the Free Software |
23 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
24 | | */ |
25 | | |
26 | | #include <stdint.h> |
27 | | |
28 | | #include "config.h" |
29 | | #include "libavutil/attributes.h" |
30 | | #include "libavutil/x86/cpu.h" |
31 | | #include "libavutil/cpu.h" |
32 | | #include "libavutil/bswap.h" |
33 | | #include "libavutil/mem_internal.h" |
34 | | |
35 | | #include "libswscale/rgb2rgb.h" |
36 | | #include "libswscale/swscale.h" |
37 | | #include "libswscale/swscale_internal.h" |
38 | | |
39 | | #if HAVE_INLINE_ASM |
40 | | #include "libavutil/x86/asm.h" |
41 | | |
42 | | DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL; |
43 | | DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL; |
44 | | DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL; |
45 | | DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL; |
46 | | DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL; |
47 | | DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL; |
48 | | DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL; |
49 | | DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL; |
50 | | DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL; |
51 | | DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL; |
52 | | DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL; |
53 | | DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL; |
54 | | DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL; |
55 | | DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ |
56 | | DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ |
57 | | DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL; |
58 | | DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL; |
59 | | DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL; |
60 | 13.5k | #define mask16b mask15b |
61 | | DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL; |
62 | | DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL; |
63 | 1.03k | #define red_16mask mask3215g |
64 | | DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL; |
65 | | DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL; |
66 | | DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL; |
67 | | DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL; |
68 | 6.82k | #define blue_15mask blue_16mask |
69 | | DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL; |
70 | | DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL; |
71 | | DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL; |
72 | | |
73 | | #define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5)) |
74 | | #define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5)) |
75 | | #define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) |
76 | | #define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5)) |
77 | | #define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5)) |
78 | | #define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5)) |
79 | | #define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5)) |
80 | | #define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) |
81 | | #define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5)) |
82 | | |
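A minimal scalar sketch of how these fixed-point BT.601 coefficients are applied (the +16 offset and the rounding term follow the usual limited-range convention and are assumptions here, not taken from this file; RGB2YUV_SHIFT comes from swscale_internal.h):

static int rgb_to_y_sketch(int r, int g, int b)
{
    /* Y = 16 + 0.257*R + 0.504*G + 0.098*B, evaluated in fixed point */
    return 16 + ((RY * r + GY * g + BY * b +
                  (1 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
}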
83 | | // MMXEXT versions |
84 | | #define PREFETCH "prefetchnta" |
85 | | #define PAVGB "pavgb" |
86 | | #define MOVNTQ "movntq" |
87 | | #define SFENCE "sfence" |
88 | | |
89 | | #define EMMS "emms" |
90 | | |
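/* MOVNTQ is a non-temporal (streaming) store that bypasses the cache, so
 * each routine below issues SFENCE afterwards to make the write-combined
 * data globally visible, and EMMS to clear the MMX register state so that
 * x87 floating-point code can run again. */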
91 | | static inline void rgb24tobgr32_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
92 | 1.36k | { |
93 | 1.36k | uint8_t *dest = dst; |
94 | 1.36k | const uint8_t *s = src; |
95 | 1.36k | const uint8_t *end; |
96 | 1.36k | const uint8_t *mm_end; |
97 | 1.36k | end = s + src_size; |
98 | 1.36k | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); |
99 | 1.36k | mm_end = end - 23; |
100 | 1.36k | __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory"); |
101 | 3.34k | while (s < mm_end) { |
102 | 1.97k | __asm__ volatile( |
103 | 1.97k | PREFETCH" 32(%1) \n\t" |
104 | 1.97k | "movd (%1), %%mm0 \n\t" |
105 | 1.97k | "punpckldq 3(%1), %%mm0 \n\t" |
106 | 1.97k | "movd 6(%1), %%mm1 \n\t" |
107 | 1.97k | "punpckldq 9(%1), %%mm1 \n\t" |
108 | 1.97k | "movd 12(%1), %%mm2 \n\t" |
109 | 1.97k | "punpckldq 15(%1), %%mm2 \n\t" |
110 | 1.97k | "movd 18(%1), %%mm3 \n\t" |
111 | 1.97k | "punpckldq 21(%1), %%mm3 \n\t" |
112 | 1.97k | "por %%mm7, %%mm0 \n\t" |
113 | 1.97k | "por %%mm7, %%mm1 \n\t" |
114 | 1.97k | "por %%mm7, %%mm2 \n\t" |
115 | 1.97k | "por %%mm7, %%mm3 \n\t" |
116 | 1.97k | MOVNTQ" %%mm0, (%0) \n\t" |
117 | 1.97k | MOVNTQ" %%mm1, 8(%0) \n\t" |
118 | 1.97k | MOVNTQ" %%mm2, 16(%0) \n\t" |
119 | 1.97k | MOVNTQ" %%mm3, 24(%0)" |
120 | 1.97k | :: "r"(dest), "r"(s) |
121 | 1.97k | :"memory"); |
122 | 1.97k | dest += 32; |
123 | 1.97k | s += 24; |
124 | 1.97k | } |
125 | 1.36k | __asm__ volatile(SFENCE:::"memory"); |
126 | 1.36k | __asm__ volatile(EMMS:::"memory"); |
127 | 3.08k | while (s < end) { |
128 | 1.71k | *dest++ = *s++; |
129 | 1.71k | *dest++ = *s++; |
130 | 1.71k | *dest++ = *s++; |
131 | 1.71k | *dest++ = 255; |
132 | 1.71k | } |
133 | 1.36k | } |
134 | | |
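/* The function above shows the loop pattern used throughout this file:
 * the vector loop runs while a full input group remains (mm_end = end - 23
 * means "at least 24 source bytes left", consumed as 32 output bytes per
 * iteration), and the plain C tail converts the final few pixels. */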
135 | | #define STORE_BGR24_MMX \ |
136 | | "psrlq $8, %%mm2 \n\t" \ |
137 | | "psrlq $8, %%mm3 \n\t" \ |
138 | | "psrlq $8, %%mm6 \n\t" \ |
139 | | "psrlq $8, %%mm7 \n\t" \ |
140 | | "pand "MANGLE(mask24l)", %%mm0\n\t" \ |
141 | | "pand "MANGLE(mask24l)", %%mm1\n\t" \ |
142 | | "pand "MANGLE(mask24l)", %%mm4\n\t" \ |
143 | | "pand "MANGLE(mask24l)", %%mm5\n\t" \ |
144 | | "pand "MANGLE(mask24h)", %%mm2\n\t" \ |
145 | | "pand "MANGLE(mask24h)", %%mm3\n\t" \ |
146 | | "pand "MANGLE(mask24h)", %%mm6\n\t" \ |
147 | | "pand "MANGLE(mask24h)", %%mm7\n\t" \ |
148 | | "por %%mm2, %%mm0 \n\t" \ |
149 | | "por %%mm3, %%mm1 \n\t" \ |
150 | | "por %%mm6, %%mm4 \n\t" \ |
151 | | "por %%mm7, %%mm5 \n\t" \ |
152 | | \ |
153 | | "movq %%mm1, %%mm2 \n\t" \ |
154 | | "movq %%mm4, %%mm3 \n\t" \ |
155 | | "psllq $48, %%mm2 \n\t" \ |
156 | | "psllq $32, %%mm3 \n\t" \ |
157 | | "por %%mm2, %%mm0 \n\t" \ |
158 | | "psrlq $16, %%mm1 \n\t" \ |
159 | | "psrlq $32, %%mm4 \n\t" \ |
160 | | "psllq $16, %%mm5 \n\t" \ |
161 | | "por %%mm3, %%mm1 \n\t" \ |
162 | | "por %%mm5, %%mm4 \n\t" \ |
163 | | \ |
164 | | MOVNTQ" %%mm0, (%0) \n\t" \ |
165 | | MOVNTQ" %%mm1, 8(%0) \n\t" \ |
166 | | MOVNTQ" %%mm4, 16(%0)" |
167 | | |
168 | | |
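STORE_BGR24_MMX splices four registers of 32bpp pixels into 24 packed bytes; per pixel the net effect is dropping the fourth (alpha) byte. A scalar sketch of that effect (the helper name is illustrative, not part of this file):

static void store_bgr24_scalar(uint8_t *d, const uint8_t *s, int pixels)
{
    for (int i = 0; i < pixels; i++) {
        d[3*i + 0] = s[4*i + 0];
        d[3*i + 1] = s[4*i + 1];
        d[3*i + 2] = s[4*i + 2];
        /* s[4*i + 3] (the alpha byte) is discarded */
    }
}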
169 | | static inline void rgb32tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
170 | 1.04k | { |
171 | 1.04k | uint8_t *dest = dst; |
172 | 1.04k | const uint8_t *s = src; |
173 | 1.04k | const uint8_t *end; |
174 | 1.04k | const uint8_t *mm_end; |
175 | 1.04k | end = s + src_size; |
176 | 1.04k | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); |
177 | 1.04k | mm_end = end - 31; |
178 | 6.02k | while (s < mm_end) { |
179 | 4.98k | __asm__ volatile( |
180 | 4.98k | PREFETCH" 32(%1) \n\t" |
181 | 4.98k | "movq (%1), %%mm0 \n\t" |
182 | 4.98k | "movq 8(%1), %%mm1 \n\t" |
183 | 4.98k | "movq 16(%1), %%mm4 \n\t" |
184 | 4.98k | "movq 24(%1), %%mm5 \n\t" |
185 | 4.98k | "movq %%mm0, %%mm2 \n\t" |
186 | 4.98k | "movq %%mm1, %%mm3 \n\t" |
187 | 4.98k | "movq %%mm4, %%mm6 \n\t" |
188 | 4.98k | "movq %%mm5, %%mm7 \n\t" |
189 | 4.98k | STORE_BGR24_MMX |
190 | 4.98k | :: "r"(dest), "r"(s) |
191 | 4.98k | NAMED_CONSTRAINTS_ADD(mask24l,mask24h) |
192 | 4.98k | :"memory"); |
193 | 4.98k | dest += 24; |
194 | 4.98k | s += 32; |
195 | 4.98k | } |
196 | 1.04k | __asm__ volatile(SFENCE:::"memory"); |
197 | 1.04k | __asm__ volatile(EMMS:::"memory"); |
198 | 3.05k | while (s < end) { |
199 | 2.01k | *dest++ = *s++; |
200 | 2.01k | *dest++ = *s++; |
201 | 2.01k | *dest++ = *s++; |
202 | 2.01k | s++; |
203 | 2.01k | } |
204 | 1.04k | } |
205 | | |
206 | | /* |
207 | | original by Strepto/Astral |
208 | | ported to gcc & bugfixed: A'rpi |
209 | | MMXEXT, 3DNOW optimization by Nick Kurshev |
210 | | 32-bit C version, and and&add trick by Michael Niedermayer |
211 | | */ |
212 | | static inline void rgb15to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
213 | 20 | { |
214 | 20 | register const uint8_t* s=src; |
215 | 20 | register uint8_t* d=dst; |
216 | 20 | register const uint8_t *end; |
217 | 20 | const uint8_t *mm_end; |
218 | 20 | end = s + src_size; |
219 | 20 | __asm__ volatile(PREFETCH" %0"::"m"(*s)); |
220 | 20 | __asm__ volatile("movq %0, %%mm4"::"m"(mask15s)); |
221 | 20 | mm_end = end - 15; |
222 | 957 | while (s<mm_end) { |
223 | 937 | __asm__ volatile( |
224 | 937 | PREFETCH" 32(%1) \n\t" |
225 | 937 | "movq (%1), %%mm0 \n\t" |
226 | 937 | "movq 8(%1), %%mm2 \n\t" |
227 | 937 | "movq %%mm0, %%mm1 \n\t" |
228 | 937 | "movq %%mm2, %%mm3 \n\t" |
229 | 937 | "pand %%mm4, %%mm0 \n\t" |
230 | 937 | "pand %%mm4, %%mm2 \n\t" |
231 | 937 | "paddw %%mm1, %%mm0 \n\t" |
232 | 937 | "paddw %%mm3, %%mm2 \n\t" |
233 | 937 | MOVNTQ" %%mm0, (%0) \n\t" |
234 | 937 | MOVNTQ" %%mm2, 8(%0)" |
235 | 937 | :: "r"(d), "r"(s) |
236 | 937 | ); |
237 | 937 | d+=16; |
238 | 937 | s+=16; |
239 | 937 | } |
240 | 20 | __asm__ volatile(SFENCE:::"memory"); |
241 | 20 | __asm__ volatile(EMMS:::"memory"); |
242 | 20 | mm_end = end - 3; |
243 | 50 | while (s < mm_end) { |
244 | 30 | register unsigned x= *((const uint32_t *)s); |
245 | 30 | *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); |
246 | 30 | d+=4; |
247 | 30 | s+=4; |
248 | 30 | } |
249 | 20 | if (s < end) { |
250 | 6 | register unsigned short x= *((const uint16_t *)s); |
251 | 6 | *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); |
252 | 6 | } |
253 | 20 | } |
254 | | |
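The "and&add trick" credited above has a compact scalar form, identical to the word-sized tail of the function; a sketch (helper name illustrative):

/* RGB555 0RRRRRGGGGGBBBBB -> RGB565 RRRRRGGGGG0BBBBB: adding (x & 0x7FE0)
 * doubles the red/green fields, i.e. shifts them up one bit, while the
 * blue field in bits 0-4 is left untouched. */
static uint16_t rgb15to16_scalar(uint16_t x)
{
    return (x & 0x7FFF) + (x & 0x7FE0);
}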
255 | | static inline void rgb16to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
256 | 1.38k | { |
257 | 1.38k | register const uint8_t* s=src; |
258 | 1.38k | register uint8_t* d=dst; |
259 | 1.38k | register const uint8_t *end; |
260 | 1.38k | const uint8_t *mm_end; |
261 | 1.38k | end = s + src_size; |
262 | 1.38k | __asm__ volatile(PREFETCH" %0"::"m"(*s)); |
263 | 1.38k | __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg)); |
264 | 1.38k | __asm__ volatile("movq %0, %%mm6"::"m"(mask15b)); |
265 | 1.38k | mm_end = end - 15; |
266 | 5.28k | while (s<mm_end) { |
267 | 3.90k | __asm__ volatile( |
268 | 3.90k | PREFETCH" 32(%1) \n\t" |
269 | 3.90k | "movq (%1), %%mm0 \n\t" |
270 | 3.90k | "movq 8(%1), %%mm2 \n\t" |
271 | 3.90k | "movq %%mm0, %%mm1 \n\t" |
272 | 3.90k | "movq %%mm2, %%mm3 \n\t" |
273 | 3.90k | "psrlq $1, %%mm0 \n\t" |
274 | 3.90k | "psrlq $1, %%mm2 \n\t" |
275 | 3.90k | "pand %%mm7, %%mm0 \n\t" |
276 | 3.90k | "pand %%mm7, %%mm2 \n\t" |
277 | 3.90k | "pand %%mm6, %%mm1 \n\t" |
278 | 3.90k | "pand %%mm6, %%mm3 \n\t" |
279 | 3.90k | "por %%mm1, %%mm0 \n\t" |
280 | 3.90k | "por %%mm3, %%mm2 \n\t" |
281 | 3.90k | MOVNTQ" %%mm0, (%0) \n\t" |
282 | 3.90k | MOVNTQ" %%mm2, 8(%0)" |
283 | 3.90k | :: "r"(d), "r"(s) |
284 | 3.90k | ); |
285 | 3.90k | d+=16; |
286 | 3.90k | s+=16; |
287 | 3.90k | } |
288 | 1.38k | __asm__ volatile(SFENCE:::"memory"); |
289 | 1.38k | __asm__ volatile(EMMS:::"memory"); |
290 | 1.38k | mm_end = end - 3; |
291 | 1.70k | while (s < mm_end) { |
292 | 324 | register uint32_t x= *((const uint32_t*)s); |
293 | 324 | *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); |
294 | 324 | s+=4; |
295 | 324 | d+=4; |
296 | 324 | } |
297 | 1.38k | if (s < end) { |
298 | 470 | register uint16_t x= *((const uint16_t*)s); |
299 | 470 | *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); |
300 | 470 | } |
301 | 1.38k | } |
302 | | |
303 | | static inline void rgb32to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
304 | 10 | { |
305 | 10 | const uint8_t *s = src; |
306 | 10 | const uint8_t *end; |
307 | 10 | const uint8_t *mm_end; |
308 | 10 | uint16_t *d = (uint16_t *)dst; |
309 | 10 | end = s + src_size; |
310 | 10 | mm_end = end - 15; |
311 | 10 | __asm__ volatile( |
312 | 10 | "movq %3, %%mm5 \n\t" |
313 | 10 | "movq %4, %%mm6 \n\t" |
314 | 10 | "movq %5, %%mm7 \n\t" |
315 | 10 | "jmp 2f \n\t" |
316 | 10 | ".p2align 4 \n\t" |
317 | 10 | "1: \n\t" |
318 | 10 | PREFETCH" 32(%1) \n\t" |
319 | 10 | "movd (%1), %%mm0 \n\t" |
320 | 10 | "movd 4(%1), %%mm3 \n\t" |
321 | 10 | "punpckldq 8(%1), %%mm0 \n\t" |
322 | 10 | "punpckldq 12(%1), %%mm3 \n\t" |
323 | 10 | "movq %%mm0, %%mm1 \n\t" |
324 | 10 | "movq %%mm3, %%mm4 \n\t" |
325 | 10 | "pand %%mm6, %%mm0 \n\t" |
326 | 10 | "pand %%mm6, %%mm3 \n\t" |
327 | 10 | "pmaddwd %%mm7, %%mm0 \n\t" |
328 | 10 | "pmaddwd %%mm7, %%mm3 \n\t" |
329 | 10 | "pand %%mm5, %%mm1 \n\t" |
330 | 10 | "pand %%mm5, %%mm4 \n\t" |
331 | 10 | "por %%mm1, %%mm0 \n\t" |
332 | 10 | "por %%mm4, %%mm3 \n\t" |
333 | 10 | "psrld $5, %%mm0 \n\t" |
334 | 10 | "pslld $11, %%mm3 \n\t" |
335 | 10 | "por %%mm3, %%mm0 \n\t" |
336 | 10 | MOVNTQ" %%mm0, (%0) \n\t" |
337 | 10 | "add $16, %1 \n\t" |
338 | 10 | "add $8, %0 \n\t" |
339 | 10 | "2: \n\t" |
340 | 10 | "cmp %2, %1 \n\t" |
341 | 10 | " jb 1b \n\t" |
342 | 10 | : "+r" (d), "+r"(s) |
343 | 10 | : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) |
344 | 10 | ); |
345 | 10 | __asm__ volatile(SFENCE:::"memory"); |
346 | 10 | __asm__ volatile(EMMS:::"memory"); |
347 | 22 | while (s < end) { |
348 | 12 | register int rgb = *(const uint32_t*)s; s += 4; |
349 | 12 | *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); |
350 | 12 | } |
351 | 10 | } |
352 | | |
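The pmaddwd in the loop above merges blue and red in one instruction: after masking with mask3216br, each 32-bit lane holds B&0xF8 in its low word and R&0xF8 in its high word, and multiplying by mul3216 (4 and 0x2000) places both fields so that one final shift produces the 565 layout. The same arithmetic in scalar form (function name illustrative):

static uint16_t rgb32to16_scalar(uint32_t rgb)
{
    uint32_t br = (rgb & 0xF8) * 4               /* B -> bits 5-9    */
                + ((rgb >> 16) & 0xF8) * 0x2000; /* R -> bits 16-20  */
    uint32_t g  = rgb & 0xFC00;                  /* G stays in 10-15 */
    return (uint16_t)((br | g) >> 5);            /* RRRRRGGGGGGBBBBB */
}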
353 | | static inline void rgb32tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
354 | 28 | { |
355 | 28 | const uint8_t *s = src; |
356 | 28 | const uint8_t *end; |
357 | 28 | const uint8_t *mm_end; |
358 | 28 | uint16_t *d = (uint16_t *)dst; |
359 | 28 | end = s + src_size; |
360 | 28 | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); |
361 | 28 | __asm__ volatile( |
362 | 28 | "movq %0, %%mm7 \n\t" |
363 | 28 | "movq %1, %%mm6 \n\t" |
364 | 28 | ::"m"(red_16mask),"m"(green_16mask)); |
365 | 28 | mm_end = end - 15; |
366 | 3.54k | while (s < mm_end) { |
367 | 3.51k | __asm__ volatile( |
368 | 3.51k | PREFETCH" 32(%1) \n\t" |
369 | 3.51k | "movd (%1), %%mm0 \n\t" |
370 | 3.51k | "movd 4(%1), %%mm3 \n\t" |
371 | 3.51k | "punpckldq 8(%1), %%mm0 \n\t" |
372 | 3.51k | "punpckldq 12(%1), %%mm3 \n\t" |
373 | 3.51k | "movq %%mm0, %%mm1 \n\t" |
374 | 3.51k | "movq %%mm0, %%mm2 \n\t" |
375 | 3.51k | "movq %%mm3, %%mm4 \n\t" |
376 | 3.51k | "movq %%mm3, %%mm5 \n\t" |
377 | 3.51k | "psllq $8, %%mm0 \n\t" |
378 | 3.51k | "psllq $8, %%mm3 \n\t" |
379 | 3.51k | "pand %%mm7, %%mm0 \n\t" |
380 | 3.51k | "pand %%mm7, %%mm3 \n\t" |
381 | 3.51k | "psrlq $5, %%mm1 \n\t" |
382 | 3.51k | "psrlq $5, %%mm4 \n\t" |
383 | 3.51k | "pand %%mm6, %%mm1 \n\t" |
384 | 3.51k | "pand %%mm6, %%mm4 \n\t" |
385 | 3.51k | "psrlq $19, %%mm2 \n\t" |
386 | 3.51k | "psrlq $19, %%mm5 \n\t" |
387 | 3.51k | "pand %2, %%mm2 \n\t" |
388 | 3.51k | "pand %2, %%mm5 \n\t" |
389 | 3.51k | "por %%mm1, %%mm0 \n\t" |
390 | 3.51k | "por %%mm4, %%mm3 \n\t" |
391 | 3.51k | "por %%mm2, %%mm0 \n\t" |
392 | 3.51k | "por %%mm5, %%mm3 \n\t" |
393 | 3.51k | "psllq $16, %%mm3 \n\t" |
394 | 3.51k | "por %%mm3, %%mm0 \n\t" |
395 | 3.51k | MOVNTQ" %%mm0, (%0) \n\t" |
396 | 3.51k | :: "r"(d),"r"(s),"m"(blue_16mask):"memory"); |
397 | 3.51k | d += 4; |
398 | 3.51k | s += 16; |
399 | 3.51k | } |
400 | 28 | __asm__ volatile(SFENCE:::"memory"); |
401 | 28 | __asm__ volatile(EMMS:::"memory"); |
402 | 58 | while (s < end) { |
403 | 30 | register int rgb = *(const uint32_t*)s; s += 4; |
404 | 30 | *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); |
405 | 30 | } |
406 | 28 | } |
407 | | |
408 | | static inline void rgb32to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
409 | 1.70k | { |
410 | 1.70k | const uint8_t *s = src; |
411 | 1.70k | const uint8_t *end; |
412 | 1.70k | const uint8_t *mm_end; |
413 | 1.70k | uint16_t *d = (uint16_t *)dst; |
414 | 1.70k | end = s + src_size; |
415 | 1.70k | mm_end = end - 15; |
416 | 1.70k | __asm__ volatile( |
417 | 1.70k | "movq %3, %%mm5 \n\t" |
418 | 1.70k | "movq %4, %%mm6 \n\t" |
419 | 1.70k | "movq %5, %%mm7 \n\t" |
420 | 1.70k | "jmp 2f \n\t" |
421 | 1.70k | ".p2align 4 \n\t" |
422 | 1.70k | "1: \n\t" |
423 | 1.70k | PREFETCH" 32(%1) \n\t" |
424 | 1.70k | "movd (%1), %%mm0 \n\t" |
425 | 1.70k | "movd 4(%1), %%mm3 \n\t" |
426 | 1.70k | "punpckldq 8(%1), %%mm0 \n\t" |
427 | 1.70k | "punpckldq 12(%1), %%mm3 \n\t" |
428 | 1.70k | "movq %%mm0, %%mm1 \n\t" |
429 | 1.70k | "movq %%mm3, %%mm4 \n\t" |
430 | 1.70k | "pand %%mm6, %%mm0 \n\t" |
431 | 1.70k | "pand %%mm6, %%mm3 \n\t" |
432 | 1.70k | "pmaddwd %%mm7, %%mm0 \n\t" |
433 | 1.70k | "pmaddwd %%mm7, %%mm3 \n\t" |
434 | 1.70k | "pand %%mm5, %%mm1 \n\t" |
435 | 1.70k | "pand %%mm5, %%mm4 \n\t" |
436 | 1.70k | "por %%mm1, %%mm0 \n\t" |
437 | 1.70k | "por %%mm4, %%mm3 \n\t" |
438 | 1.70k | "psrld $6, %%mm0 \n\t" |
439 | 1.70k | "pslld $10, %%mm3 \n\t" |
440 | 1.70k | "por %%mm3, %%mm0 \n\t" |
441 | 1.70k | MOVNTQ" %%mm0, (%0) \n\t" |
442 | 1.70k | "add $16, %1 \n\t" |
443 | 1.70k | "add $8, %0 \n\t" |
444 | 1.70k | "2: \n\t" |
445 | 1.70k | "cmp %2, %1 \n\t" |
446 | 1.70k | " jb 1b \n\t" |
447 | 1.70k | : "+r" (d), "+r"(s) |
448 | 1.70k | : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) |
449 | 1.70k | ); |
450 | 1.70k | __asm__ volatile(SFENCE:::"memory"); |
451 | 1.70k | __asm__ volatile(EMMS:::"memory"); |
452 | 3.14k | while (s < end) { |
453 | 1.43k | register int rgb = *(const uint32_t*)s; s += 4; |
454 | 1.43k | *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); |
455 | 1.43k | } |
456 | 1.70k | } |
457 | | |
458 | | static inline void rgb32tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
459 | 598 | { |
460 | 598 | const uint8_t *s = src; |
461 | 598 | const uint8_t *end; |
462 | 598 | const uint8_t *mm_end; |
463 | 598 | uint16_t *d = (uint16_t *)dst; |
464 | 598 | end = s + src_size; |
465 | 598 | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); |
466 | 598 | __asm__ volatile( |
467 | 598 | "movq %0, %%mm7 \n\t" |
468 | 598 | "movq %1, %%mm6 \n\t" |
469 | 598 | ::"m"(red_15mask),"m"(green_15mask)); |
470 | 598 | mm_end = end - 15; |
471 | 3.32k | while (s < mm_end) { |
472 | 2.72k | __asm__ volatile( |
473 | 2.72k | PREFETCH" 32(%1) \n\t" |
474 | 2.72k | "movd (%1), %%mm0 \n\t" |
475 | 2.72k | "movd 4(%1), %%mm3 \n\t" |
476 | 2.72k | "punpckldq 8(%1), %%mm0 \n\t" |
477 | 2.72k | "punpckldq 12(%1), %%mm3 \n\t" |
478 | 2.72k | "movq %%mm0, %%mm1 \n\t" |
479 | 2.72k | "movq %%mm0, %%mm2 \n\t" |
480 | 2.72k | "movq %%mm3, %%mm4 \n\t" |
481 | 2.72k | "movq %%mm3, %%mm5 \n\t" |
482 | 2.72k | "psllq $7, %%mm0 \n\t" |
483 | 2.72k | "psllq $7, %%mm3 \n\t" |
484 | 2.72k | "pand %%mm7, %%mm0 \n\t" |
485 | 2.72k | "pand %%mm7, %%mm3 \n\t" |
486 | 2.72k | "psrlq $6, %%mm1 \n\t" |
487 | 2.72k | "psrlq $6, %%mm4 \n\t" |
488 | 2.72k | "pand %%mm6, %%mm1 \n\t" |
489 | 2.72k | "pand %%mm6, %%mm4 \n\t" |
490 | 2.72k | "psrlq $19, %%mm2 \n\t" |
491 | 2.72k | "psrlq $19, %%mm5 \n\t" |
492 | 2.72k | "pand %2, %%mm2 \n\t" |
493 | 2.72k | "pand %2, %%mm5 \n\t" |
494 | 2.72k | "por %%mm1, %%mm0 \n\t" |
495 | 2.72k | "por %%mm4, %%mm3 \n\t" |
496 | 2.72k | "por %%mm2, %%mm0 \n\t" |
497 | 2.72k | "por %%mm5, %%mm3 \n\t" |
498 | 2.72k | "psllq $16, %%mm3 \n\t" |
499 | 2.72k | "por %%mm3, %%mm0 \n\t" |
500 | 2.72k | MOVNTQ" %%mm0, (%0) \n\t" |
501 | 2.72k | ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); |
502 | 2.72k | d += 4; |
503 | 2.72k | s += 16; |
504 | 2.72k | } |
505 | 598 | __asm__ volatile(SFENCE:::"memory"); |
506 | 598 | __asm__ volatile(EMMS:::"memory"); |
507 | 954 | while (s < end) { |
508 | 356 | register int rgb = *(const uint32_t*)s; s += 4; |
509 | 356 | *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); |
510 | 356 | } |
511 | 598 | } |
512 | | |
513 | | static inline void rgb24tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
514 | 130 | { |
515 | 130 | const uint8_t *s = src; |
516 | 130 | const uint8_t *end; |
517 | 130 | const uint8_t *mm_end; |
518 | 130 | uint16_t *d = (uint16_t *)dst; |
519 | 130 | end = s + src_size; |
520 | 130 | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); |
521 | 130 | __asm__ volatile( |
522 | 130 | "movq %0, %%mm7 \n\t" |
523 | 130 | "movq %1, %%mm6 \n\t" |
524 | 130 | ::"m"(red_16mask),"m"(green_16mask)); |
525 | 130 | mm_end = end - 11; |
526 | 1.91k | while (s < mm_end) { |
527 | 1.78k | __asm__ volatile( |
528 | 1.78k | PREFETCH" 32(%1) \n\t" |
529 | 1.78k | "movd (%1), %%mm0 \n\t" |
530 | 1.78k | "movd 3(%1), %%mm3 \n\t" |
531 | 1.78k | "punpckldq 6(%1), %%mm0 \n\t" |
532 | 1.78k | "punpckldq 9(%1), %%mm3 \n\t" |
533 | 1.78k | "movq %%mm0, %%mm1 \n\t" |
534 | 1.78k | "movq %%mm0, %%mm2 \n\t" |
535 | 1.78k | "movq %%mm3, %%mm4 \n\t" |
536 | 1.78k | "movq %%mm3, %%mm5 \n\t" |
537 | 1.78k | "psrlq $3, %%mm0 \n\t" |
538 | 1.78k | "psrlq $3, %%mm3 \n\t" |
539 | 1.78k | "pand %2, %%mm0 \n\t" |
540 | 1.78k | "pand %2, %%mm3 \n\t" |
541 | 1.78k | "psrlq $5, %%mm1 \n\t" |
542 | 1.78k | "psrlq $5, %%mm4 \n\t" |
543 | 1.78k | "pand %%mm6, %%mm1 \n\t" |
544 | 1.78k | "pand %%mm6, %%mm4 \n\t" |
545 | 1.78k | "psrlq $8, %%mm2 \n\t" |
546 | 1.78k | "psrlq $8, %%mm5 \n\t" |
547 | 1.78k | "pand %%mm7, %%mm2 \n\t" |
548 | 1.78k | "pand %%mm7, %%mm5 \n\t" |
549 | 1.78k | "por %%mm1, %%mm0 \n\t" |
550 | 1.78k | "por %%mm4, %%mm3 \n\t" |
551 | 1.78k | "por %%mm2, %%mm0 \n\t" |
552 | 1.78k | "por %%mm5, %%mm3 \n\t" |
553 | 1.78k | "psllq $16, %%mm3 \n\t" |
554 | 1.78k | "por %%mm3, %%mm0 \n\t" |
555 | 1.78k | MOVNTQ" %%mm0, (%0) \n\t" |
556 | 1.78k | ::"r"(d),"r"(s),"m"(blue_16mask):"memory"); |
557 | 1.78k | d += 4; |
558 | 1.78k | s += 12; |
559 | 1.78k | } |
560 | 130 | __asm__ volatile(SFENCE:::"memory"); |
561 | 130 | __asm__ volatile(EMMS:::"memory"); |
562 | 478 | while (s < end) { |
563 | 348 | const int b = *s++; |
564 | 348 | const int g = *s++; |
565 | 348 | const int r = *s++; |
566 | 348 | *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); |
567 | 348 | } |
568 | 130 | } |
569 | | |
570 | | static inline void rgb24to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
571 | 872 | { |
572 | 872 | const uint8_t *s = src; |
573 | 872 | const uint8_t *end; |
574 | 872 | const uint8_t *mm_end; |
575 | 872 | uint16_t *d = (uint16_t *)dst; |
576 | 872 | end = s + src_size; |
577 | 872 | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); |
578 | 872 | __asm__ volatile( |
579 | 872 | "movq %0, %%mm7 \n\t" |
580 | 872 | "movq %1, %%mm6 \n\t" |
581 | 872 | ::"m"(red_16mask),"m"(green_16mask)); |
582 | 872 | mm_end = end - 15; |
583 | 6.29k | while (s < mm_end) { |
584 | 5.42k | __asm__ volatile( |
585 | 5.42k | PREFETCH" 32(%1) \n\t" |
586 | 5.42k | "movd (%1), %%mm0 \n\t" |
587 | 5.42k | "movd 3(%1), %%mm3 \n\t" |
588 | 5.42k | "punpckldq 6(%1), %%mm0 \n\t" |
589 | 5.42k | "punpckldq 9(%1), %%mm3 \n\t" |
590 | 5.42k | "movq %%mm0, %%mm1 \n\t" |
591 | 5.42k | "movq %%mm0, %%mm2 \n\t" |
592 | 5.42k | "movq %%mm3, %%mm4 \n\t" |
593 | 5.42k | "movq %%mm3, %%mm5 \n\t" |
594 | 5.42k | "psllq $8, %%mm0 \n\t" |
595 | 5.42k | "psllq $8, %%mm3 \n\t" |
596 | 5.42k | "pand %%mm7, %%mm0 \n\t" |
597 | 5.42k | "pand %%mm7, %%mm3 \n\t" |
598 | 5.42k | "psrlq $5, %%mm1 \n\t" |
599 | 5.42k | "psrlq $5, %%mm4 \n\t" |
600 | 5.42k | "pand %%mm6, %%mm1 \n\t" |
601 | 5.42k | "pand %%mm6, %%mm4 \n\t" |
602 | 5.42k | "psrlq $19, %%mm2 \n\t" |
603 | 5.42k | "psrlq $19, %%mm5 \n\t" |
604 | 5.42k | "pand %2, %%mm2 \n\t" |
605 | 5.42k | "pand %2, %%mm5 \n\t" |
606 | 5.42k | "por %%mm1, %%mm0 \n\t" |
607 | 5.42k | "por %%mm4, %%mm3 \n\t" |
608 | 5.42k | "por %%mm2, %%mm0 \n\t" |
609 | 5.42k | "por %%mm5, %%mm3 \n\t" |
610 | 5.42k | "psllq $16, %%mm3 \n\t" |
611 | 5.42k | "por %%mm3, %%mm0 \n\t" |
612 | 5.42k | MOVNTQ" %%mm0, (%0) \n\t" |
613 | 5.42k | ::"r"(d),"r"(s),"m"(blue_16mask):"memory"); |
614 | 5.42k | d += 4; |
615 | 5.42k | s += 12; |
616 | 5.42k | } |
617 | 872 | __asm__ volatile(SFENCE:::"memory"); |
618 | 872 | __asm__ volatile(EMMS:::"memory"); |
619 | 3.45k | while (s < end) { |
620 | 2.57k | const int r = *s++; |
621 | 2.57k | const int g = *s++; |
622 | 2.57k | const int b = *s++; |
623 | 2.57k | *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); |
624 | 2.57k | } |
625 | 872 | } |
626 | | |
627 | | static inline void rgb24tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
628 | 1.00k | { |
629 | 1.00k | const uint8_t *s = src; |
630 | 1.00k | const uint8_t *end; |
631 | 1.00k | const uint8_t *mm_end; |
632 | 1.00k | uint16_t *d = (uint16_t *)dst; |
633 | 1.00k | end = s + src_size; |
634 | 1.00k | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); |
635 | 1.00k | __asm__ volatile( |
636 | 1.00k | "movq %0, %%mm7 \n\t" |
637 | 1.00k | "movq %1, %%mm6 \n\t" |
638 | 1.00k | ::"m"(red_15mask),"m"(green_15mask)); |
639 | 1.00k | mm_end = end - 11; |
640 | 2.20k | while (s < mm_end) { |
641 | 1.19k | __asm__ volatile( |
642 | 1.19k | PREFETCH" 32(%1) \n\t" |
643 | 1.19k | "movd (%1), %%mm0 \n\t" |
644 | 1.19k | "movd 3(%1), %%mm3 \n\t" |
645 | 1.19k | "punpckldq 6(%1), %%mm0 \n\t" |
646 | 1.19k | "punpckldq 9(%1), %%mm3 \n\t" |
647 | 1.19k | "movq %%mm0, %%mm1 \n\t" |
648 | 1.19k | "movq %%mm0, %%mm2 \n\t" |
649 | 1.19k | "movq %%mm3, %%mm4 \n\t" |
650 | 1.19k | "movq %%mm3, %%mm5 \n\t" |
651 | 1.19k | "psrlq $3, %%mm0 \n\t" |
652 | 1.19k | "psrlq $3, %%mm3 \n\t" |
653 | 1.19k | "pand %2, %%mm0 \n\t" |
654 | 1.19k | "pand %2, %%mm3 \n\t" |
655 | 1.19k | "psrlq $6, %%mm1 \n\t" |
656 | 1.19k | "psrlq $6, %%mm4 \n\t" |
657 | 1.19k | "pand %%mm6, %%mm1 \n\t" |
658 | 1.19k | "pand %%mm6, %%mm4 \n\t" |
659 | 1.19k | "psrlq $9, %%mm2 \n\t" |
660 | 1.19k | "psrlq $9, %%mm5 \n\t" |
661 | 1.19k | "pand %%mm7, %%mm2 \n\t" |
662 | 1.19k | "pand %%mm7, %%mm5 \n\t" |
663 | 1.19k | "por %%mm1, %%mm0 \n\t" |
664 | 1.19k | "por %%mm4, %%mm3 \n\t" |
665 | 1.19k | "por %%mm2, %%mm0 \n\t" |
666 | 1.19k | "por %%mm5, %%mm3 \n\t" |
667 | 1.19k | "psllq $16, %%mm3 \n\t" |
668 | 1.19k | "por %%mm3, %%mm0 \n\t" |
669 | 1.19k | MOVNTQ" %%mm0, (%0) \n\t" |
670 | 1.19k | ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); |
671 | 1.19k | d += 4; |
672 | 1.19k | s += 12; |
673 | 1.19k | } |
674 | 1.00k | __asm__ volatile(SFENCE:::"memory"); |
675 | 1.00k | __asm__ volatile(EMMS:::"memory"); |
676 | 3.66k | while (s < end) { |
677 | 2.65k | const int b = *s++; |
678 | 2.65k | const int g = *s++; |
679 | 2.65k | const int r = *s++; |
680 | 2.65k | *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); |
681 | 2.65k | } |
682 | 1.00k | } |
683 | | |
684 | | static inline void rgb24to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
685 | 406 | { |
686 | 406 | const uint8_t *s = src; |
687 | 406 | const uint8_t *end; |
688 | 406 | const uint8_t *mm_end; |
689 | 406 | uint16_t *d = (uint16_t *)dst; |
690 | 406 | end = s + src_size; |
691 | 406 | __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); |
692 | 406 | __asm__ volatile( |
693 | 406 | "movq %0, %%mm7 \n\t" |
694 | 406 | "movq %1, %%mm6 \n\t" |
695 | 406 | ::"m"(red_15mask),"m"(green_15mask)); |
696 | 406 | mm_end = end - 15; |
697 | 3.30k | while (s < mm_end) { |
698 | 2.90k | __asm__ volatile( |
699 | 2.90k | PREFETCH" 32(%1) \n\t" |
700 | 2.90k | "movd (%1), %%mm0 \n\t" |
701 | 2.90k | "movd 3(%1), %%mm3 \n\t" |
702 | 2.90k | "punpckldq 6(%1), %%mm0 \n\t" |
703 | 2.90k | "punpckldq 9(%1), %%mm3 \n\t" |
704 | 2.90k | "movq %%mm0, %%mm1 \n\t" |
705 | 2.90k | "movq %%mm0, %%mm2 \n\t" |
706 | 2.90k | "movq %%mm3, %%mm4 \n\t" |
707 | 2.90k | "movq %%mm3, %%mm5 \n\t" |
708 | 2.90k | "psllq $7, %%mm0 \n\t" |
709 | 2.90k | "psllq $7, %%mm3 \n\t" |
710 | 2.90k | "pand %%mm7, %%mm0 \n\t" |
711 | 2.90k | "pand %%mm7, %%mm3 \n\t" |
712 | 2.90k | "psrlq $6, %%mm1 \n\t" |
713 | 2.90k | "psrlq $6, %%mm4 \n\t" |
714 | 2.90k | "pand %%mm6, %%mm1 \n\t" |
715 | 2.90k | "pand %%mm6, %%mm4 \n\t" |
716 | 2.90k | "psrlq $19, %%mm2 \n\t" |
717 | 2.90k | "psrlq $19, %%mm5 \n\t" |
718 | 2.90k | "pand %2, %%mm2 \n\t" |
719 | 2.90k | "pand %2, %%mm5 \n\t" |
720 | 2.90k | "por %%mm1, %%mm0 \n\t" |
721 | 2.90k | "por %%mm4, %%mm3 \n\t" |
722 | 2.90k | "por %%mm2, %%mm0 \n\t" |
723 | 2.90k | "por %%mm5, %%mm3 \n\t" |
724 | 2.90k | "psllq $16, %%mm3 \n\t" |
725 | 2.90k | "por %%mm3, %%mm0 \n\t" |
726 | 2.90k | MOVNTQ" %%mm0, (%0) \n\t" |
727 | 2.90k | ::"r"(d),"r"(s),"m"(blue_15mask):"memory"); |
728 | 2.90k | d += 4; |
729 | 2.90k | s += 12; |
730 | 2.90k | } |
731 | 406 | __asm__ volatile(SFENCE:::"memory"); |
732 | 406 | __asm__ volatile(EMMS:::"memory"); |
733 | 1.33k | while (s < end) { |
734 | 928 | const int r = *s++; |
735 | 928 | const int g = *s++; |
736 | 928 | const int b = *s++; |
737 | 928 | *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); |
738 | 928 | } |
739 | 406 | } |
740 | | |
741 | | static inline void rgb15tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
742 | 88 | { |
743 | 88 | const uint16_t *end; |
744 | 88 | const uint16_t *mm_end; |
745 | 88 | uint8_t *d = dst; |
746 | 88 | const uint16_t *s = (const uint16_t*)src; |
747 | 88 | end = s + src_size/2; |
748 | 88 | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); |
749 | 88 | mm_end = end - 7; |
750 | 5.06k | while (s < mm_end) { |
751 | 4.97k | __asm__ volatile( |
752 | 4.97k | PREFETCH" 32(%1) \n\t" |
753 | 4.97k | "movq (%1), %%mm0 \n\t" |
754 | 4.97k | "movq (%1), %%mm1 \n\t" |
755 | 4.97k | "movq (%1), %%mm2 \n\t" |
756 | 4.97k | "pand %2, %%mm0 \n\t" |
757 | 4.97k | "pand %3, %%mm1 \n\t" |
758 | 4.97k | "pand %4, %%mm2 \n\t" |
759 | 4.97k | "psllq $5, %%mm0 \n\t" |
760 | 4.97k | "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" |
761 | 4.97k | "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" |
762 | 4.97k | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" |
763 | 4.97k | "movq %%mm0, %%mm3 \n\t" |
764 | 4.97k | "movq %%mm1, %%mm4 \n\t" |
765 | 4.97k | "movq %%mm2, %%mm5 \n\t" |
766 | 4.97k | "punpcklwd %5, %%mm0 \n\t" |
767 | 4.97k | "punpcklwd %5, %%mm1 \n\t" |
768 | 4.97k | "punpcklwd %5, %%mm2 \n\t" |
769 | 4.97k | "punpckhwd %5, %%mm3 \n\t" |
770 | 4.97k | "punpckhwd %5, %%mm4 \n\t" |
771 | 4.97k | "punpckhwd %5, %%mm5 \n\t" |
772 | 4.97k | "psllq $8, %%mm1 \n\t" |
773 | 4.97k | "psllq $16, %%mm2 \n\t" |
774 | 4.97k | "por %%mm1, %%mm0 \n\t" |
775 | 4.97k | "por %%mm2, %%mm0 \n\t" |
776 | 4.97k | "psllq $8, %%mm4 \n\t" |
777 | 4.97k | "psllq $16, %%mm5 \n\t" |
778 | 4.97k | "por %%mm4, %%mm3 \n\t" |
779 | 4.97k | "por %%mm5, %%mm3 \n\t" |
780 | | |
781 | 4.97k | "movq %%mm0, %%mm6 \n\t" |
782 | 4.97k | "movq %%mm3, %%mm7 \n\t" |
783 | | |
784 | 4.97k | "movq 8(%1), %%mm0 \n\t" |
785 | 4.97k | "movq 8(%1), %%mm1 \n\t" |
786 | 4.97k | "movq 8(%1), %%mm2 \n\t" |
787 | 4.97k | "pand %2, %%mm0 \n\t" |
788 | 4.97k | "pand %3, %%mm1 \n\t" |
789 | 4.97k | "pand %4, %%mm2 \n\t" |
790 | 4.97k | "psllq $5, %%mm0 \n\t" |
791 | 4.97k | "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" |
792 | 4.97k | "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t" |
793 | 4.97k | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" |
794 | 4.97k | "movq %%mm0, %%mm3 \n\t" |
795 | 4.97k | "movq %%mm1, %%mm4 \n\t" |
796 | 4.97k | "movq %%mm2, %%mm5 \n\t" |
797 | 4.97k | "punpcklwd %5, %%mm0 \n\t" |
798 | 4.97k | "punpcklwd %5, %%mm1 \n\t" |
799 | 4.97k | "punpcklwd %5, %%mm2 \n\t" |
800 | 4.97k | "punpckhwd %5, %%mm3 \n\t" |
801 | 4.97k | "punpckhwd %5, %%mm4 \n\t" |
802 | 4.97k | "punpckhwd %5, %%mm5 \n\t" |
803 | 4.97k | "psllq $8, %%mm1 \n\t" |
804 | 4.97k | "psllq $16, %%mm2 \n\t" |
805 | 4.97k | "por %%mm1, %%mm0 \n\t" |
806 | 4.97k | "por %%mm2, %%mm0 \n\t" |
807 | 4.97k | "psllq $8, %%mm4 \n\t" |
808 | 4.97k | "psllq $16, %%mm5 \n\t" |
809 | 4.97k | "por %%mm4, %%mm3 \n\t" |
810 | 4.97k | "por %%mm5, %%mm3 \n\t" |
811 | | |
812 | 4.97k | :"=m"(*d) |
813 | 4.97k | :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) |
814 | 4.97k | NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi) |
815 | 4.97k | :"memory"); |
816 | | /* borrowed 32 to 24 */ |
817 | 4.97k | __asm__ volatile( |
818 | 4.97k | "movq %%mm0, %%mm4 \n\t" |
819 | 4.97k | "movq %%mm3, %%mm5 \n\t" |
820 | 4.97k | "movq %%mm6, %%mm0 \n\t" |
821 | 4.97k | "movq %%mm7, %%mm1 \n\t" |
822 | | |
823 | 4.97k | "movq %%mm4, %%mm6 \n\t" |
824 | 4.97k | "movq %%mm5, %%mm7 \n\t" |
825 | 4.97k | "movq %%mm0, %%mm2 \n\t" |
826 | 4.97k | "movq %%mm1, %%mm3 \n\t" |
827 | | |
828 | 4.97k | STORE_BGR24_MMX |
829 | | |
830 | 4.97k | :: "r"(d), "m"(*s) |
831 | 4.97k | NAMED_CONSTRAINTS_ADD(mask24l,mask24h) |
832 | 4.97k | :"memory"); |
833 | 4.97k | d += 24; |
834 | 4.97k | s += 8; |
835 | 4.97k | } |
836 | 88 | __asm__ volatile(SFENCE:::"memory"); |
837 | 88 | __asm__ volatile(EMMS:::"memory"); |
838 | 507 | while (s < end) { |
839 | 419 | register uint16_t bgr; |
840 | 419 | bgr = *s++; |
841 | 419 | *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); |
842 | 419 | *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); |
843 | 419 | *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); |
844 | 419 | } |
845 | 88 | } |
846 | | |
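The pmulhw constants used above turn the 5-bit-to-8-bit expansion (c<<3)|(c>>2) into a single multiply: with the field pre-shifted so its value is c*32, mul15_mid = 0x4200 yields (c*32*0x4200)>>16 = c*8 + c/4, which equals the bit-replication form for every c in 0..31. A scalar check (helper name illustrative):

static uint8_t expand5_sketch(unsigned c)          /* c in 0..31 */
{
    unsigned mul = ((c << 5) * 0x4200u) >> 16;     /* pmulhw: high product word */
    /* identical to (c << 3) | (c >> 2) for all 32 inputs */
    return (uint8_t)mul;
}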
847 | | static inline void rgb16tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
848 | 72 | { |
849 | 72 | const uint16_t *end; |
850 | 72 | const uint16_t *mm_end; |
851 | 72 | uint8_t *d = (uint8_t *)dst; |
852 | 72 | const uint16_t *s = (const uint16_t *)src; |
853 | 72 | end = s + src_size/2; |
854 | 72 | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); |
855 | 72 | mm_end = end - 7; |
856 | 2.37k | while (s < mm_end) { |
857 | 2.30k | __asm__ volatile( |
858 | 2.30k | PREFETCH" 32(%1) \n\t" |
859 | 2.30k | "movq (%1), %%mm0 \n\t" |
860 | 2.30k | "movq (%1), %%mm1 \n\t" |
861 | 2.30k | "movq (%1), %%mm2 \n\t" |
862 | 2.30k | "pand %2, %%mm0 \n\t" |
863 | 2.30k | "pand %3, %%mm1 \n\t" |
864 | 2.30k | "pand %4, %%mm2 \n\t" |
865 | 2.30k | "psllq $5, %%mm0 \n\t" |
866 | 2.30k | "psrlq $1, %%mm2 \n\t" |
867 | 2.30k | "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" |
868 | 2.30k | "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" |
869 | 2.30k | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" |
870 | 2.30k | "movq %%mm0, %%mm3 \n\t" |
871 | 2.30k | "movq %%mm1, %%mm4 \n\t" |
872 | 2.30k | "movq %%mm2, %%mm5 \n\t" |
873 | 2.30k | "punpcklwd %5, %%mm0 \n\t" |
874 | 2.30k | "punpcklwd %5, %%mm1 \n\t" |
875 | 2.30k | "punpcklwd %5, %%mm2 \n\t" |
876 | 2.30k | "punpckhwd %5, %%mm3 \n\t" |
877 | 2.30k | "punpckhwd %5, %%mm4 \n\t" |
878 | 2.30k | "punpckhwd %5, %%mm5 \n\t" |
879 | 2.30k | "psllq $8, %%mm1 \n\t" |
880 | 2.30k | "psllq $16, %%mm2 \n\t" |
881 | 2.30k | "por %%mm1, %%mm0 \n\t" |
882 | 2.30k | "por %%mm2, %%mm0 \n\t" |
883 | 2.30k | "psllq $8, %%mm4 \n\t" |
884 | 2.30k | "psllq $16, %%mm5 \n\t" |
885 | 2.30k | "por %%mm4, %%mm3 \n\t" |
886 | 2.30k | "por %%mm5, %%mm3 \n\t" |
887 | | |
888 | 2.30k | "movq %%mm0, %%mm6 \n\t" |
889 | 2.30k | "movq %%mm3, %%mm7 \n\t" |
890 | | |
891 | 2.30k | "movq 8(%1), %%mm0 \n\t" |
892 | 2.30k | "movq 8(%1), %%mm1 \n\t" |
893 | 2.30k | "movq 8(%1), %%mm2 \n\t" |
894 | 2.30k | "pand %2, %%mm0 \n\t" |
895 | 2.30k | "pand %3, %%mm1 \n\t" |
896 | 2.30k | "pand %4, %%mm2 \n\t" |
897 | 2.30k | "psllq $5, %%mm0 \n\t" |
898 | 2.30k | "psrlq $1, %%mm2 \n\t" |
899 | 2.30k | "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t" |
900 | 2.30k | "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" |
901 | 2.30k | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" |
902 | 2.30k | "movq %%mm0, %%mm3 \n\t" |
903 | 2.30k | "movq %%mm1, %%mm4 \n\t" |
904 | 2.30k | "movq %%mm2, %%mm5 \n\t" |
905 | 2.30k | "punpcklwd %5, %%mm0 \n\t" |
906 | 2.30k | "punpcklwd %5, %%mm1 \n\t" |
907 | 2.30k | "punpcklwd %5, %%mm2 \n\t" |
908 | 2.30k | "punpckhwd %5, %%mm3 \n\t" |
909 | 2.30k | "punpckhwd %5, %%mm4 \n\t" |
910 | 2.30k | "punpckhwd %5, %%mm5 \n\t" |
911 | 2.30k | "psllq $8, %%mm1 \n\t" |
912 | 2.30k | "psllq $16, %%mm2 \n\t" |
913 | 2.30k | "por %%mm1, %%mm0 \n\t" |
914 | 2.30k | "por %%mm2, %%mm0 \n\t" |
915 | 2.30k | "psllq $8, %%mm4 \n\t" |
916 | 2.30k | "psllq $16, %%mm5 \n\t" |
917 | 2.30k | "por %%mm4, %%mm3 \n\t" |
918 | 2.30k | "por %%mm5, %%mm3 \n\t" |
919 | 2.30k | :"=m"(*d) |
920 | 2.30k | :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) |
921 | 2.30k | NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi) |
922 | 2.30k | :"memory"); |
923 | | /* borrowed 32 to 24 */ |
924 | 2.30k | __asm__ volatile( |
925 | 2.30k | "movq %%mm0, %%mm4 \n\t" |
926 | 2.30k | "movq %%mm3, %%mm5 \n\t" |
927 | 2.30k | "movq %%mm6, %%mm0 \n\t" |
928 | 2.30k | "movq %%mm7, %%mm1 \n\t" |
929 | | |
930 | 2.30k | "movq %%mm4, %%mm6 \n\t" |
931 | 2.30k | "movq %%mm5, %%mm7 \n\t" |
932 | 2.30k | "movq %%mm0, %%mm2 \n\t" |
933 | 2.30k | "movq %%mm1, %%mm3 \n\t" |
934 | | |
935 | 2.30k | STORE_BGR24_MMX |
936 | | |
937 | 2.30k | :: "r"(d), "m"(*s) |
938 | 2.30k | NAMED_CONSTRAINTS_ADD(mask24l,mask24h) |
939 | 2.30k | :"memory"); |
940 | 2.30k | d += 24; |
941 | 2.30k | s += 8; |
942 | 2.30k | } |
943 | 72 | __asm__ volatile(SFENCE:::"memory"); |
944 | 72 | __asm__ volatile(EMMS:::"memory"); |
945 | 164 | while (s < end) { |
946 | 92 | register uint16_t bgr; |
947 | 92 | bgr = *s++; |
948 | 92 | *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); |
949 | 92 | *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); |
950 | 92 | *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); |
951 | 92 | } |
952 | 72 | } |
953 | | |
954 | | /* |
955 | | * mm0 = 00 B3 00 B2 00 B1 00 B0 |
956 | | * mm1 = 00 G3 00 G2 00 G1 00 G0 |
957 | | * mm2 = 00 R3 00 R2 00 R1 00 R0 |
958 | | * mm6 = FF FF FF FF FF FF FF FF |
959 | | * mm7 = 00 00 00 00 00 00 00 00 |
960 | | */ |
961 | | #define PACK_RGB32 \ |
962 | | "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \ |
963 | | "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \ |
964 | | "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \ |
965 | | "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \ |
966 | | "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \ |
967 | | "movq %%mm0, %%mm3 \n\t" \ |
968 | | "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \ |
969 | | "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \ |
970 | | MOVNTQ" %%mm0, (%0) \n\t" \ |
971 | | MOVNTQ" %%mm3, 8(%0) \n\t" \ |
972 | | |
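Per the register comments above, PACK_RGB32 narrows the three channel vectors to bytes and interleaves them with an alpha of 0xFF taken from mm6; the per-pixel memory layout it produces is simply (a sketch):

static void pack_rgb32_scalar(uint8_t *d, uint8_t b, uint8_t g, uint8_t r)
{
    d[0] = b;    /* punpcklbw byte order: B, G, R, alpha */
    d[1] = g;
    d[2] = r;
    d[3] = 0xFF; /* mm6 = all ones supplies the alpha byte */
}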
973 | | static inline void rgb15to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
974 | 275 | { |
975 | 275 | const uint16_t *end; |
976 | 275 | const uint16_t *mm_end; |
977 | 275 | uint8_t *d = dst; |
978 | 275 | const uint16_t *s = (const uint16_t *)src; |
979 | 275 | end = s + src_size/2; |
980 | 275 | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); |
981 | 275 | __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); |
982 | 275 | __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); |
983 | 275 | mm_end = end - 3; |
984 | 1.62k | while (s < mm_end) { |
985 | 1.35k | __asm__ volatile( |
986 | 1.35k | PREFETCH" 32(%1) \n\t" |
987 | 1.35k | "movq (%1), %%mm0 \n\t" |
988 | 1.35k | "movq (%1), %%mm1 \n\t" |
989 | 1.35k | "movq (%1), %%mm2 \n\t" |
990 | 1.35k | "pand %2, %%mm0 \n\t" |
991 | 1.35k | "pand %3, %%mm1 \n\t" |
992 | 1.35k | "pand %4, %%mm2 \n\t" |
993 | 1.35k | "psllq $5, %%mm0 \n\t" |
994 | 1.35k | "pmulhw %5, %%mm0 \n\t" |
995 | 1.35k | "pmulhw %5, %%mm1 \n\t" |
996 | 1.35k | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" |
997 | 1.35k | PACK_RGB32 |
998 | 1.35k | ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid) |
999 | 1.35k | NAMED_CONSTRAINTS_ADD(mul15_hi) |
1000 | 1.35k | :"memory"); |
1001 | 1.35k | d += 16; |
1002 | 1.35k | s += 4; |
1003 | 1.35k | } |
1004 | 275 | __asm__ volatile(SFENCE:::"memory"); |
1005 | 275 | __asm__ volatile(EMMS:::"memory"); |
1006 | 281 | while (s < end) { |
1007 | 6 | register uint16_t bgr; |
1008 | 6 | bgr = *s++; |
1009 | 6 | *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); |
1010 | 6 | *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7); |
1011 | 6 | *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12); |
1012 | 6 | *d++ = 255; |
1013 | 6 | } |
1014 | 275 | } |
1015 | | |
1016 | | static inline void rgb16to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
1017 | 292 | { |
1018 | 292 | const uint16_t *end; |
1019 | 292 | const uint16_t *mm_end; |
1020 | 292 | uint8_t *d = dst; |
1021 | 292 | const uint16_t *s = (const uint16_t*)src; |
1022 | 292 | end = s + src_size/2; |
1023 | 292 | __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); |
1024 | 292 | __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); |
1025 | 292 | __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); |
1026 | 292 | mm_end = end - 3; |
1027 | 11.5k | while (s < mm_end) { |
1028 | 11.2k | __asm__ volatile( |
1029 | 11.2k | PREFETCH" 32(%1) \n\t" |
1030 | 11.2k | "movq (%1), %%mm0 \n\t" |
1031 | 11.2k | "movq (%1), %%mm1 \n\t" |
1032 | 11.2k | "movq (%1), %%mm2 \n\t" |
1033 | 11.2k | "pand %2, %%mm0 \n\t" |
1034 | 11.2k | "pand %3, %%mm1 \n\t" |
1035 | 11.2k | "pand %4, %%mm2 \n\t" |
1036 | 11.2k | "psllq $5, %%mm0 \n\t" |
1037 | 11.2k | "psrlq $1, %%mm2 \n\t" |
1038 | 11.2k | "pmulhw %5, %%mm0 \n\t" |
1039 | 11.2k | "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t" |
1040 | 11.2k | "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t" |
1041 | 11.2k | PACK_RGB32 |
1042 | 11.2k | ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid) |
1043 | 11.2k | NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi) |
1044 | 11.2k | :"memory"); |
1045 | 11.2k | d += 16; |
1046 | 11.2k | s += 4; |
1047 | 11.2k | } |
1048 | 292 | __asm__ volatile(SFENCE:::"memory"); |
1049 | 292 | __asm__ volatile(EMMS:::"memory"); |
1050 | 991 | while (s < end) { |
1051 | 699 | register uint16_t bgr; |
1052 | 699 | bgr = *s++; |
1053 | 699 | *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2); |
1054 | 699 | *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9); |
1055 | 699 | *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13); |
1056 | 699 | *d++ = 255; |
1057 | 699 | } |
1058 | 292 | } |
1059 | | |
1060 | | static inline void rgb24tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size) |
1061 | 871 | { |
1062 | 871 | x86_reg mmx_size= 23 - src_size; |
1063 | 871 | __asm__ volatile ( |
1064 | 871 | "test %%"FF_REG_a", %%"FF_REG_a" \n\t" |
1065 | 871 | "jns 2f \n\t" |
1066 | 871 | "movq "MANGLE(mask24r)", %%mm5 \n\t" |
1067 | 871 | "movq "MANGLE(mask24g)", %%mm6 \n\t" |
1068 | 871 | "movq "MANGLE(mask24b)", %%mm7 \n\t" |
1069 | 871 | ".p2align 4 \n\t" |
1070 | 871 | "1: \n\t" |
1071 | 871 | PREFETCH" 32(%1, %%"FF_REG_a") \n\t" |
1072 | 871 | "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG |
1073 | 871 | "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" // BGR BGR BG |
1074 | 871 | "movq 2(%1, %%"FF_REG_a"), %%mm2 \n\t" // R BGR BGR B |
1075 | 871 | "psllq $16, %%mm0 \n\t" // 00 BGR BGR |
1076 | 871 | "pand %%mm5, %%mm0 \n\t" |
1077 | 871 | "pand %%mm6, %%mm1 \n\t" |
1078 | 871 | "pand %%mm7, %%mm2 \n\t" |
1079 | 871 | "por %%mm0, %%mm1 \n\t" |
1080 | 871 | "por %%mm2, %%mm1 \n\t" |
1081 | 871 | "movq 6(%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG |
1082 | 871 | MOVNTQ" %%mm1,(%2, %%"FF_REG_a") \n\t" // RGB RGB RG |
1083 | 871 | "movq 8(%1, %%"FF_REG_a"), %%mm1 \n\t" // R BGR BGR B |
1084 | 871 | "movq 10(%1, %%"FF_REG_a"), %%mm2 \n\t" // GR BGR BGR |
1085 | 871 | "pand %%mm7, %%mm0 \n\t" |
1086 | 871 | "pand %%mm5, %%mm1 \n\t" |
1087 | 871 | "pand %%mm6, %%mm2 \n\t" |
1088 | 871 | "por %%mm0, %%mm1 \n\t" |
1089 | 871 | "por %%mm2, %%mm1 \n\t" |
1090 | 871 | "movq 14(%1, %%"FF_REG_a"), %%mm0 \n\t" // R BGR BGR B |
1091 | 871 | MOVNTQ" %%mm1, 8(%2, %%"FF_REG_a")\n\t" // B RGB RGB R |
1092 | 871 | "movq 16(%1, %%"FF_REG_a"), %%mm1 \n\t" // GR BGR BGR |
1093 | 871 | "movq 18(%1, %%"FF_REG_a"), %%mm2 \n\t" // BGR BGR BG |
1094 | 871 | "pand %%mm6, %%mm0 \n\t" |
1095 | 871 | "pand %%mm7, %%mm1 \n\t" |
1096 | 871 | "pand %%mm5, %%mm2 \n\t" |
1097 | 871 | "por %%mm0, %%mm1 \n\t" |
1098 | 871 | "por %%mm2, %%mm1 \n\t" |
1099 | 871 | MOVNTQ" %%mm1, 16(%2, %%"FF_REG_a") \n\t" |
1100 | 871 | "add $24, %%"FF_REG_a" \n\t" |
1101 | 871 | " js 1b \n\t" |
1102 | 871 | "2: \n\t" |
1103 | 871 | : "+a" (mmx_size) |
1104 | 871 | : "r" (src-mmx_size), "r"(dst-mmx_size) |
1105 | 871 | NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b) |
1106 | 871 | ); |
1107 | | |
1108 | 871 | __asm__ volatile(SFENCE:::"memory"); |
1109 | 871 | __asm__ volatile(EMMS:::"memory"); |
1110 | | |
1111 | 871 | if (mmx_size==23) return; //finished, was multiple of 8 |
1112 | | |
1113 | 869 | src+= src_size; |
1114 | 869 | dst+= src_size; |
1115 | 869 | src_size= 23-mmx_size; |
1116 | 869 | src-= src_size; |
1117 | 869 | dst-= src_size; |
1118 | 4.86k | for (unsigned i = 0; i < src_size; i +=3) { |
1119 | 3.99k | register uint8_t x; |
1120 | 3.99k | x = src[i + 2]; |
1121 | 3.99k | dst[i + 1] = src[i + 1]; |
1122 | 3.99k | dst[i + 2] = src[i + 0]; |
1123 | 3.99k | dst[i + 0] = x; |
1124 | 3.99k | } |
1125 | 869 | } |
1126 | | |
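The asm loop above uses the count-up-to-zero idiom: mmx_size starts negative (23 - src_size), the pointer operands are pre-biased by it, and a single js both tests and continues the loop. The equivalent control flow in C (a sketch, not part of this file):

static void biased_loop_sketch(const uint8_t *src, uint8_t *dst, int src_size)
{
    long i = 23 - src_size;      /* negative while >= 24 bytes remain */
    const uint8_t *s = src - i;  /* biased so s[i] == src[0] at entry */
    uint8_t *d = dst - i;
    for (; i < 0; i += 24) {
        /* convert 24 bytes at s + i into 24 bytes at d + i */
    }
}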
1127 | | static inline void yuvPlanartoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1128 | | int width, int height, |
1129 | | int lumStride, int chromStride, int dstStride, int vertLumPerChroma) |
1130 | 13 | { |
1131 | 13 | const x86_reg chromWidth= width>>1; |
1132 | 1.11k | for (int y = 0; y < height; y++) { |
1133 | | //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) |
1134 | 1.09k | __asm__ volatile( |
1135 | 1.09k | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" |
1136 | 1.09k | ".p2align 4 \n\t" |
1137 | 1.09k | "1: \n\t" |
1138 | 1.09k | PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t" |
1139 | 1.09k | PREFETCH" 32(%2, %%"FF_REG_a") \n\t" |
1140 | 1.09k | PREFETCH" 32(%3, %%"FF_REG_a") \n\t" |
1141 | 1.09k | "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0) |
1142 | 1.09k | "movq %%mm0, %%mm2 \n\t" // U(0) |
1143 | 1.09k | "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0) |
1144 | 1.09k | "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1145 | 1.09k | "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) |
1146 | | |
1147 | 1.09k | "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0) |
1148 | 1.09k | "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8) |
1149 | 1.09k | "movq %%mm3, %%mm4 \n\t" // Y(0) |
1150 | 1.09k | "movq %%mm5, %%mm6 \n\t" // Y(8) |
1151 | 1.09k | "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) |
1152 | 1.09k | "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) |
1153 | 1.09k | "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) |
1154 | 1.09k | "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) |
1155 | | |
1156 | 1.09k | MOVNTQ" %%mm3, (%0, %%"FF_REG_a", 4) \n\t" |
1157 | 1.09k | MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t" |
1158 | 1.09k | MOVNTQ" %%mm5, 16(%0, %%"FF_REG_a", 4) \n\t" |
1159 | 1.09k | MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t" |
1160 | | |
1161 | 1.09k | "add $8, %%"FF_REG_a" \n\t" |
1162 | 1.09k | "cmp %4, %%"FF_REG_a" \n\t" |
1163 | 1.09k | " jb 1b \n\t" |
1164 | 1.09k | ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) |
1165 | 1.09k | : "%"FF_REG_a |
1166 | 1.09k | ); |
1167 | 1.09k | if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { |
1168 | 808 | usrc += chromStride; |
1169 | 808 | vsrc += chromStride; |
1170 | 808 | } |
1171 | 1.09k | ysrc += lumStride; |
1172 | 1.09k | dst += dstStride; |
1173 | 1.09k | } |
1174 | 13 | __asm__(EMMS" \n\t" |
1175 | 13 | SFENCE" \n\t" |
1176 | 13 | :::"memory"); |
1177 | 13 | } |
1178 | | |
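A scalar reference for the per-line interleave done above (a sketch, not part of this file): two horizontally adjacent luma samples share one chroma pair, giving the Y0 U Y1 V pattern of YUY2:

static void interleave_yuy2_line(uint8_t *dst, const uint8_t *ysrc,
                                 const uint8_t *usrc, const uint8_t *vsrc,
                                 int chromWidth)
{
    for (int i = 0; i < chromWidth; i++) {
        dst[4*i + 0] = ysrc[2*i];     /* Y0 */
        dst[4*i + 1] = usrc[i];       /* U  */
        dst[4*i + 2] = ysrc[2*i + 1]; /* Y1 */
        dst[4*i + 3] = vsrc[i];       /* V  */
    }
}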
1179 | | /** |
1180 | | * Height should be a multiple of 2 and width should be a multiple of 16. |
1181 | | * (If this is a problem for anyone then tell me, and I will fix it.) |
1182 | | */ |
1183 | | static inline void yv12toyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1184 | | int width, int height, |
1185 | | int lumStride, int chromStride, int dstStride) |
1186 | 7 | { |
1187 | | //FIXME interpolate chroma |
1188 | 7 | yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); |
1189 | 7 | } |
1190 | | |
1191 | | static inline void yuvPlanartouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1192 | | int width, int height, |
1193 | | int lumStride, int chromStride, int dstStride, int vertLumPerChroma) |
1194 | 8 | { |
1195 | 8 | const x86_reg chromWidth= width>>1; |
1196 | 206 | for (int y = 0; y < height; y++) { |
1197 | | //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) |
1198 | 198 | __asm__ volatile( |
1199 | 198 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" |
1200 | 198 | ".p2align 4 \n\t" |
1201 | 198 | "1: \n\t" |
1202 | 198 | PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t" |
1203 | 198 | PREFETCH" 32(%2, %%"FF_REG_a") \n\t" |
1204 | 198 | PREFETCH" 32(%3, %%"FF_REG_a") \n\t" |
1205 | 198 | "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0) |
1206 | 198 | "movq %%mm0, %%mm2 \n\t" // U(0) |
1207 | 198 | "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0) |
1208 | 198 | "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1209 | 198 | "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) |
1210 | | |
1211 | 198 | "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0) |
1212 | 198 | "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8) |
1213 | 198 | "movq %%mm0, %%mm4 \n\t" // Y(0) |
1214 | 198 | "movq %%mm2, %%mm6 \n\t" // Y(8) |
1215 | 198 | "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) |
1216 | 198 | "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4) |
1217 | 198 | "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) |
1218 | 198 | "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) |
1219 | | |
1220 | 198 | MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 4) \n\t" |
1221 | 198 | MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t" |
1222 | 198 | MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 4) \n\t" |
1223 | 198 | MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t" |
1224 | | |
1225 | 198 | "add $8, %%"FF_REG_a" \n\t" |
1226 | 198 | "cmp %4, %%"FF_REG_a" \n\t" |
1227 | 198 | " jb 1b \n\t" |
1228 | 198 | ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) |
1229 | 198 | : "%"FF_REG_a |
1230 | 198 | ); |
1231 | 198 | if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { |
1232 | 114 | usrc += chromStride; |
1233 | 114 | vsrc += chromStride; |
1234 | 114 | } |
1235 | 198 | ysrc += lumStride; |
1236 | 198 | dst += dstStride; |
1237 | 198 | } |
1238 | 8 | __asm__(EMMS" \n\t" |
1239 | 8 | SFENCE" \n\t" |
1240 | 8 | :::"memory"); |
1241 | 8 | } |
1242 | | |
1243 | | /** |
1244 | | * Height should be a multiple of 2 and width should be a multiple of 16 |
1245 | | * (If this is a problem for anyone then tell me, and I will fix it.) |
1246 | | */ |
1247 | | static inline void yv12touyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1248 | | int width, int height, |
1249 | | int lumStride, int chromStride, int dstStride) |
1250 | 5 | { |
1251 | | //FIXME interpolate chroma |
1252 | 5 | yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2); |
1253 | 5 | } |
1254 | | |
1255 | | /** |
1256 | | * Width should be a multiple of 16. |
1257 | | */ |
1258 | | static inline void yuv422ptouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1259 | | int width, int height, |
1260 | | int lumStride, int chromStride, int dstStride) |
1261 | 3 | { |
1262 | 3 | yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); |
1263 | 3 | } |
1264 | | |
1265 | | /** |
1266 | | * Width should be a multiple of 16. |
1267 | | */ |
1268 | | static inline void yuv422ptoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1269 | | int width, int height, |
1270 | | int lumStride, int chromStride, int dstStride) |
1271 | 6 | { |
1272 | 6 | yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1); |
1273 | 6 | } |
1274 | | |
1275 | | /** |
1276 | | * Height should be a multiple of 2 and width should be a multiple of 16. |
1277 | | * (If this is a problem for anyone then tell me, and I will fix it.) |
1278 | | */ |
1279 | | static inline void yuy2toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
1280 | | int width, int height, |
1281 | | int lumStride, int chromStride, int srcStride) |
1282 | 0 | { |
1283 | 0 | const x86_reg chromWidth= width>>1; |
1284 | 0 | for (int y = 0; y < height; y += 2) { |
1285 | 0 | __asm__ volatile( |
1286 | 0 | "xor %%"FF_REG_a", %%"FF_REG_a"\n\t" |
1287 | 0 | "pcmpeqw %%mm7, %%mm7 \n\t" |
1288 | 0 | "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... |
1289 | 0 | ".p2align 4 \n\t" |
1290 | 0 | "1: \n\t" |
1291 | 0 | PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" |
1292 | 0 | "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) |
1293 | 0 | "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) |
1294 | 0 | "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) |
1295 | 0 | "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) |
1296 | 0 | "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) |
1297 | 0 | "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4) |
1298 | 0 | "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0) |
1299 | 0 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4) |
1300 | 0 | "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) |
1301 | 0 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) |
1302 | |
1303 | 0 | MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t" |
1304 | |
1305 | 0 | "movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) |
1306 | 0 | "movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) |
1307 | 0 | "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) |
1308 | 0 | "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) |
1309 | 0 | "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) |
1310 | 0 | "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12) |
1311 | 0 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8) |
1312 | 0 | "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12) |
1313 | 0 | "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) |
1314 | 0 | "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) |
1315 | |
1316 | 0 | MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t" |
1317 | |
1318 | 0 | "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) |
1319 | 0 | "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) |
1320 | 0 | "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0) |
1321 | 0 | "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8) |
1322 | 0 | "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0) |
1323 | 0 | "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8) |
1324 | 0 | "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) |
1325 | 0 | "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) |
1326 | |
1327 | 0 | MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t" |
1328 | 0 | MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t" |
1329 | |
1330 | 0 | "add $8, %%"FF_REG_a" \n\t" |
1331 | 0 | "cmp %4, %%"FF_REG_a" \n\t" |
1332 | 0 | " jb 1b \n\t" |
1333 | 0 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
1334 | 0 | : "memory", "%"FF_REG_a |
1335 | 0 | ); |
1336 | |
1337 | 0 | ydst += lumStride; |
1338 | 0 | src += srcStride; |
1339 | |
1340 | 0 | __asm__ volatile( |
1341 | 0 | "xor %%"FF_REG_a", %%"FF_REG_a"\n\t" |
1342 | 0 | ".p2align 4 \n\t" |
1343 | 0 | "1: \n\t" |
1344 | 0 | PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" |
1345 | 0 | "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) |
1346 | 0 | "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) |
1347 | 0 | "movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) |
1348 | 0 | "movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) |
1349 | 0 | "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) |
1350 | 0 | "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) |
1351 | 0 | "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) |
1352 | 0 | "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12) |
1353 | 0 | "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) |
1354 | 0 | "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) |
1355 | |
1356 | 0 | MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t" |
1357 | 0 | MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t" |
1358 | |
1359 | 0 | "add $8, %%"FF_REG_a"\n\t" |
1360 | 0 | "cmp %4, %%"FF_REG_a"\n\t" |
1361 | 0 | " jb 1b \n\t" |
1362 | |
1363 | 0 | ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) |
1364 | 0 | : "memory", "%"FF_REG_a |
1365 | 0 | ); |
1366 | 0 | udst += chromStride; |
1367 | 0 | vdst += chromStride; |
1368 | 0 | ydst += lumStride; |
1369 | 0 | src += srcStride; |
1370 | 0 | } |
1371 | 0 | __asm__ volatile(EMMS" \n\t" |
1372 | 0 | SFENCE" \n\t" |
1373 | 0 | :::"memory"); |
1374 | 0 | } |
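/* Editor's note: the two loops above split packed YUYV into planar YV12.
 * As a minimal scalar sketch (not part of FFmpeg; it assumes the usual
 * Y0 U Y1 V byte order), one even source line is handled as:
 *
 *     for (int i = 0; i < chromWidth; i++) {
 *         ydst[2*i + 0] = src[4*i + 0];   // Y0
 *         udst[i]       = src[4*i + 1];   // U, even lines only
 *         ydst[2*i + 1] = src[4*i + 2];   // Y1
 *         vdst[i]       = src[4*i + 3];   // V, even lines only
 *     }
 *
 * The second __asm__ block then copies only the luma bytes of the
 * following odd line, discarding its chroma samples.
 */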
1375 | | |
1376 | | static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride) |
1377 | 18 | { |
1378 | 18 | dst[0]= src[0]; |
1379 | | |
1380 | | // first line |
1381 | 262 | for (int x = 0; x < srcWidth - 1; x++) { |
1382 | 244 | dst[2*x+1]= (3*src[x] + src[x+1])>>2; |
1383 | 244 | dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; |
1384 | 244 | } |
1385 | 18 | dst[2*srcWidth-1]= src[srcWidth-1]; |
1386 | | |
1387 | 18 | dst+= dstStride; |
1388 | | |
1389 | 2.01k | for (int y = 1; y < srcHeight; y++) { |
1390 | 1.99k | x86_reg mmxSize= srcWidth&~15; |
1391 | | |
1392 | 1.99k | if (mmxSize) { |
1393 | 10 | __asm__ volatile( |
1394 | 10 | "mov %4, %%"FF_REG_a" \n\t" |
1395 | 10 | "movq "MANGLE(mmx_ff)", %%mm0 \n\t" |
1396 | 10 | "movq (%0, %%"FF_REG_a"), %%mm4 \n\t" |
1397 | 10 | "movq %%mm4, %%mm2 \n\t" |
1398 | 10 | "psllq $8, %%mm4 \n\t" |
1399 | 10 | "pand %%mm0, %%mm2 \n\t" |
1400 | 10 | "por %%mm2, %%mm4 \n\t" |
1401 | 10 | "movq (%1, %%"FF_REG_a"), %%mm5 \n\t" |
1402 | 10 | "movq %%mm5, %%mm3 \n\t" |
1403 | 10 | "psllq $8, %%mm5 \n\t" |
1404 | 10 | "pand %%mm0, %%mm3 \n\t" |
1405 | 10 | "por %%mm3, %%mm5 \n\t" |
1406 | 10 | "1: \n\t" |
1407 | 10 | "movq (%0, %%"FF_REG_a"), %%mm0 \n\t" |
1408 | 10 | "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" |
1409 | 10 | "movq 1(%0, %%"FF_REG_a"), %%mm2 \n\t" |
1410 | 10 | "movq 1(%1, %%"FF_REG_a"), %%mm3 \n\t" |
1411 | 10 | PAVGB" %%mm0, %%mm5 \n\t" |
1412 | 10 | PAVGB" %%mm0, %%mm3 \n\t" |
1413 | 10 | PAVGB" %%mm0, %%mm5 \n\t" |
1414 | 10 | PAVGB" %%mm0, %%mm3 \n\t" |
1415 | 10 | PAVGB" %%mm1, %%mm4 \n\t" |
1416 | 10 | PAVGB" %%mm1, %%mm2 \n\t" |
1417 | 10 | PAVGB" %%mm1, %%mm4 \n\t" |
1418 | 10 | PAVGB" %%mm1, %%mm2 \n\t" |
1419 | 10 | "movq %%mm5, %%mm7 \n\t" |
1420 | 10 | "movq %%mm4, %%mm6 \n\t" |
1421 | 10 | "punpcklbw %%mm3, %%mm5 \n\t" |
1422 | 10 | "punpckhbw %%mm3, %%mm7 \n\t" |
1423 | 10 | "punpcklbw %%mm2, %%mm4 \n\t" |
1424 | 10 | "punpckhbw %%mm2, %%mm6 \n\t" |
1425 | 10 | MOVNTQ" %%mm5, (%2, %%"FF_REG_a", 2) \n\t" |
1426 | 10 | MOVNTQ" %%mm7, 8(%2, %%"FF_REG_a", 2) \n\t" |
1427 | 10 | MOVNTQ" %%mm4, (%3, %%"FF_REG_a", 2) \n\t" |
1428 | 10 | MOVNTQ" %%mm6, 8(%3, %%"FF_REG_a", 2) \n\t" |
1429 | 10 | "add $8, %%"FF_REG_a" \n\t" |
1430 | 10 | "movq -1(%0, %%"FF_REG_a"), %%mm4 \n\t" |
1431 | 10 | "movq -1(%1, %%"FF_REG_a"), %%mm5 \n\t" |
1432 | 10 | " js 1b \n\t" |
1433 | 10 | :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), |
1434 | 10 | "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), |
1435 | 10 | "g" (-mmxSize) |
1436 | 10 | NAMED_CONSTRAINTS_ADD(mmx_ff) |
1437 | 10 | : "%"FF_REG_a |
1438 | 10 | ); |
1439 | 1.98k | } else { |
1440 | 1.98k | mmxSize = 1; |
1441 | 1.98k | dst[0] = (src[0] * 3 + src[srcStride]) >> 2; |
1442 | 1.98k | dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2; |
1443 | 1.98k | } |
1444 | | |
1445 | 2.88k | for (int x = mmxSize - 1; x < srcWidth - 1; x++) { |
1446 | 894 | dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2; |
1447 | 894 | dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2; |
1448 | 894 | dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2; |
1449 | 894 | dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2; |
1450 | 894 | } |
1451 | 1.99k | dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2; |
1452 | 1.99k | dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2; |
1453 | | |
1454 | 1.99k | dst+=dstStride*2; |
1455 | 1.99k | src+=srcStride; |
1456 | 1.99k | } |
1457 | | |
1458 | | // last line |
1459 | 18 | dst[0]= src[0]; |
1460 | | |
1461 | 262 | for (int x = 0; x < srcWidth - 1; x++) { |
1462 | 244 | dst[2*x+1]= (3*src[x] + src[x+1])>>2; |
1463 | 244 | dst[2*x+2]= ( src[x] + 3*src[x+1])>>2; |
1464 | 244 | } |
1465 | 18 | dst[2*srcWidth-1]= src[srcWidth-1]; |
1466 | | |
1467 | 18 | __asm__ volatile(EMMS" \n\t" |
1468 | 18 | SFENCE" \n\t" |
1469 | 18 | :::"memory"); |
1470 | 18 | } |
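/* Editor's note: planar2x doubles a plane in both directions with 3:1
 * weighted interpolation. A scalar sketch of one 2x2 output block (not
 * part of FFmpeg; a, b, c, d name src[x], src[x+1], src[x+srcStride] and
 * src[x+srcStride+1], matching the C fallback above):
 *
 *     dst[2*x + 1]             = (3*a +   d) >> 2;
 *     dst[2*x + 2]             = (3*b +   c) >> 2;
 *     dst[2*x + dstStride + 1] = (  b + 3*c) >> 2;
 *     dst[2*x + dstStride + 2] = (  a + 3*d) >> 2;
 *
 * The MMX path reaches the same 3:1 weights by applying PAVGB twice,
 * since avg(a, avg(a, b)) == (3*a + b + 2) >> 2 up to rounding.
 */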
1471 | | |
1472 | | /** |
1473 | | * Height and width should both be multiples of 2.
1474 | | * (If this is a problem for anyone then tell me, and I will fix it.)
1475 | | * Chrominance data is taken only from every second line;
1476 | | * the other lines are ignored in the C version.
1477 | | * FIXME: Write HQ version. |
1478 | | */ |
1479 | | #if ARCH_X86_32 && HAVE_7REGS |
1480 | | DECLARE_ASM_CONST(8, uint64_t, bgr2YOffset) = 0x1010101010101010ULL; |
1481 | | DECLARE_ASM_CONST(8, uint64_t, bgr2UVOffset) = 0x8080808080808080ULL; |
1482 | | DECLARE_ASM_CONST(8, uint64_t, w1111) = 0x0001000100010001ULL; |
1483 | | |
1484 | | static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
1485 | | int width, int height, |
1486 | | int lumStride, int chromStride, int srcStride, |
1487 | | const int32_t *rgb2yuv) |
1488 | | { |
1489 | | #define BGR2Y_IDX "16*4+16*32" |
1490 | | #define BGR2U_IDX "16*4+16*33" |
1491 | | #define BGR2V_IDX "16*4+16*34" |
1492 | | int y; |
1493 | | const x86_reg chromWidth= width>>1; |
1494 | | |
1495 | | if (height > 2) { |
1496 | | ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv); |
1497 | | src += 2*srcStride; |
1498 | | ydst += 2*lumStride; |
1499 | | udst += chromStride; |
1500 | | vdst += chromStride; |
1501 | | height -= 2; |
1502 | | } |
1503 | | |
1504 | | for (y = 0; y < height - 2; y += 2) { |
1505 | | for (int i = 0; i < 2; i++) { |
1506 | | __asm__ volatile( |
1507 | | "mov %2, %%"FF_REG_a"\n\t" |
1508 | | "movq "BGR2Y_IDX"(%3), %%mm6 \n\t" |
1509 | | "movq "MANGLE(w1111)", %%mm5 \n\t" |
1510 | | "pxor %%mm7, %%mm7 \n\t" |
1511 | | "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" |
1512 | | ".p2align 4 \n\t" |
1513 | | "1: \n\t" |
1514 | | PREFETCH" 64(%0, %%"FF_REG_d") \n\t" |
1515 | | "movd (%0, %%"FF_REG_d"), %%mm0 \n\t" |
1516 | | "movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t" |
1517 | | "punpcklbw %%mm7, %%mm0 \n\t" |
1518 | | "punpcklbw %%mm7, %%mm1 \n\t" |
1519 | | "movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t" |
1520 | | "movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t" |
1521 | | "punpcklbw %%mm7, %%mm2 \n\t" |
1522 | | "punpcklbw %%mm7, %%mm3 \n\t" |
1523 | | "pmaddwd %%mm6, %%mm0 \n\t" |
1524 | | "pmaddwd %%mm6, %%mm1 \n\t" |
1525 | | "pmaddwd %%mm6, %%mm2 \n\t" |
1526 | | "pmaddwd %%mm6, %%mm3 \n\t" |
1527 | | "psrad $8, %%mm0 \n\t" |
1528 | | "psrad $8, %%mm1 \n\t" |
1529 | | "psrad $8, %%mm2 \n\t" |
1530 | | "psrad $8, %%mm3 \n\t" |
1531 | | "packssdw %%mm1, %%mm0 \n\t" |
1532 | | "packssdw %%mm3, %%mm2 \n\t" |
1533 | | "pmaddwd %%mm5, %%mm0 \n\t" |
1534 | | "pmaddwd %%mm5, %%mm2 \n\t" |
1535 | | "packssdw %%mm2, %%mm0 \n\t" |
1536 | | "psraw $7, %%mm0 \n\t" |
1537 | | |
1538 | | "movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t" |
1539 | | "movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t" |
1540 | | "punpcklbw %%mm7, %%mm4 \n\t" |
1541 | | "punpcklbw %%mm7, %%mm1 \n\t" |
1542 | | "movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t" |
1543 | | "movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t" |
1544 | | "punpcklbw %%mm7, %%mm2 \n\t" |
1545 | | "punpcklbw %%mm7, %%mm3 \n\t" |
1546 | | "pmaddwd %%mm6, %%mm4 \n\t" |
1547 | | "pmaddwd %%mm6, %%mm1 \n\t" |
1548 | | "pmaddwd %%mm6, %%mm2 \n\t" |
1549 | | "pmaddwd %%mm6, %%mm3 \n\t" |
1550 | | "psrad $8, %%mm4 \n\t" |
1551 | | "psrad $8, %%mm1 \n\t" |
1552 | | "psrad $8, %%mm2 \n\t" |
1553 | | "psrad $8, %%mm3 \n\t" |
1554 | | "packssdw %%mm1, %%mm4 \n\t" |
1555 | | "packssdw %%mm3, %%mm2 \n\t" |
1556 | | "pmaddwd %%mm5, %%mm4 \n\t" |
1557 | | "pmaddwd %%mm5, %%mm2 \n\t" |
1558 | | "add $24, %%"FF_REG_d"\n\t" |
1559 | | "packssdw %%mm2, %%mm4 \n\t" |
1560 | | "psraw $7, %%mm4 \n\t" |
1561 | | |
1562 | | "packuswb %%mm4, %%mm0 \n\t" |
1563 | | "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t" |
1564 | | |
1565 | | MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t" |
1566 | | "add $8, %%"FF_REG_a" \n\t" |
1567 | | " js 1b \n\t" |
1568 | | : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv) |
1569 | | NAMED_CONSTRAINTS_ADD(w1111,bgr2YOffset) |
1570 | | : "%"FF_REG_a, "%"FF_REG_d |
1571 | | ); |
1572 | | ydst += lumStride; |
1573 | | src += srcStride; |
1574 | | } |
1575 | | src -= srcStride*2; |
1576 | | __asm__ volatile( |
1577 | | "mov %4, %%"FF_REG_a"\n\t" |
1578 | | "movq "MANGLE(w1111)", %%mm5 \n\t" |
1579 | | "movq "BGR2U_IDX"(%5), %%mm6 \n\t" |
1580 | | "pxor %%mm7, %%mm7 \n\t" |
1581 | | "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" |
1582 | | "add %%"FF_REG_d", %%"FF_REG_d"\n\t" |
1583 | | ".p2align 4 \n\t" |
1584 | | "1: \n\t" |
1585 | | PREFETCH" 64(%0, %%"FF_REG_d") \n\t" |
1586 | | PREFETCH" 64(%1, %%"FF_REG_d") \n\t" |
1587 | | "movq (%0, %%"FF_REG_d"), %%mm0 \n\t" |
1588 | | "movq (%1, %%"FF_REG_d"), %%mm1 \n\t" |
1589 | | "movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t" |
1590 | | "movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t" |
1591 | | PAVGB" %%mm1, %%mm0 \n\t" |
1592 | | PAVGB" %%mm3, %%mm2 \n\t" |
1593 | | "movq %%mm0, %%mm1 \n\t" |
1594 | | "movq %%mm2, %%mm3 \n\t" |
1595 | | "psrlq $24, %%mm0 \n\t" |
1596 | | "psrlq $24, %%mm2 \n\t" |
1597 | | PAVGB" %%mm1, %%mm0 \n\t" |
1598 | | PAVGB" %%mm3, %%mm2 \n\t" |
1599 | | "punpcklbw %%mm7, %%mm0 \n\t" |
1600 | | "punpcklbw %%mm7, %%mm2 \n\t" |
1601 | | "movq "BGR2V_IDX"(%5), %%mm1 \n\t" |
1602 | | "movq "BGR2V_IDX"(%5), %%mm3 \n\t" |
1603 | | |
1604 | | "pmaddwd %%mm0, %%mm1 \n\t" |
1605 | | "pmaddwd %%mm2, %%mm3 \n\t" |
1606 | | "pmaddwd %%mm6, %%mm0 \n\t" |
1607 | | "pmaddwd %%mm6, %%mm2 \n\t" |
1608 | | "psrad $8, %%mm0 \n\t" |
1609 | | "psrad $8, %%mm1 \n\t" |
1610 | | "psrad $8, %%mm2 \n\t" |
1611 | | "psrad $8, %%mm3 \n\t" |
1612 | | "packssdw %%mm2, %%mm0 \n\t" |
1613 | | "packssdw %%mm3, %%mm1 \n\t" |
1614 | | "pmaddwd %%mm5, %%mm0 \n\t" |
1615 | | "pmaddwd %%mm5, %%mm1 \n\t" |
1616 | | "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0 |
1617 | | "psraw $7, %%mm0 \n\t" |
1618 | | |
1619 | | "movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t" |
1620 | | "movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t" |
1621 | | "movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t" |
1622 | | "movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t" |
1623 | | PAVGB" %%mm1, %%mm4 \n\t" |
1624 | | PAVGB" %%mm3, %%mm2 \n\t" |
1625 | | "movq %%mm4, %%mm1 \n\t" |
1626 | | "movq %%mm2, %%mm3 \n\t" |
1627 | | "psrlq $24, %%mm4 \n\t" |
1628 | | "psrlq $24, %%mm2 \n\t" |
1629 | | PAVGB" %%mm1, %%mm4 \n\t" |
1630 | | PAVGB" %%mm3, %%mm2 \n\t" |
1631 | | "punpcklbw %%mm7, %%mm4 \n\t" |
1632 | | "punpcklbw %%mm7, %%mm2 \n\t" |
1633 | | "movq "BGR2V_IDX"(%5), %%mm1 \n\t" |
1634 | | "movq "BGR2V_IDX"(%5), %%mm3 \n\t" |
1635 | | |
1636 | | "pmaddwd %%mm4, %%mm1 \n\t" |
1637 | | "pmaddwd %%mm2, %%mm3 \n\t" |
1638 | | "pmaddwd %%mm6, %%mm4 \n\t" |
1639 | | "pmaddwd %%mm6, %%mm2 \n\t" |
1640 | | "psrad $8, %%mm4 \n\t" |
1641 | | "psrad $8, %%mm1 \n\t" |
1642 | | "psrad $8, %%mm2 \n\t" |
1643 | | "psrad $8, %%mm3 \n\t" |
1644 | | "packssdw %%mm2, %%mm4 \n\t" |
1645 | | "packssdw %%mm3, %%mm1 \n\t" |
1646 | | "pmaddwd %%mm5, %%mm4 \n\t" |
1647 | | "pmaddwd %%mm5, %%mm1 \n\t" |
1648 | | "add $24, %%"FF_REG_d"\n\t" |
1649 | | "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 |
1650 | | "psraw $7, %%mm4 \n\t" |
1651 | | |
1652 | | "movq %%mm0, %%mm1 \n\t" |
1653 | | "punpckldq %%mm4, %%mm0 \n\t" |
1654 | | "punpckhdq %%mm4, %%mm1 \n\t" |
1655 | | "packsswb %%mm1, %%mm0 \n\t" |
1656 | | "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t" |
1657 | | "movd %%mm0, (%2, %%"FF_REG_a") \n\t" |
1658 | | "punpckhdq %%mm0, %%mm0 \n\t" |
1659 | | "movd %%mm0, (%3, %%"FF_REG_a") \n\t" |
1660 | | "add $4, %%"FF_REG_a" \n\t" |
1661 | | " js 1b \n\t" |
1662 | | : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv) |
1663 | | NAMED_CONSTRAINTS_ADD(w1111,bgr2UVOffset) |
1664 | | : "%"FF_REG_a, "%"FF_REG_d |
1665 | | ); |
1666 | | |
1667 | | udst += chromStride; |
1668 | | vdst += chromStride; |
1669 | | src += srcStride*2; |
1670 | | } |
1671 | | |
1672 | | __asm__ volatile(EMMS" \n\t" |
1673 | | SFENCE" \n\t" |
1674 | | :::"memory"); |
1675 | | |
1676 | | ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv); |
1677 | | } |
1678 | | #endif /* ARCH_X86_32 && HAVE_7REGS */
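/* Editor's note: per output pixel the luma loop above computes a fixed-point
 * dot product of the B, G, R bytes with 16-bit coefficients from the rgb2yuv
 * table, then adds the offset of 16. A scalar sketch (not part of FFmpeg;
 * by/gy/ry stand for the coefficients at BGR2Y_IDX, and the single shift
 * folds together the psrad $8 / psraw $7 pair, up to intermediate
 * truncation):
 *
 *     int sum = by*b + gy*g + ry*r;
 *     ydst[i] = (sum >> 15) + 16;
 *
 * The chroma loop works the same way with the BGR2U_IDX/BGR2V_IDX rows, a
 * bias of 128, and inputs averaged over 2x2 blocks with PAVGB.
 */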
1679 | | |
1680 | | static inline void vu9_to_vu12_mmxext(const uint8_t *src1, const uint8_t *src2, |
1681 | | uint8_t *dst1, uint8_t *dst2, |
1682 | | int width, int height, |
1683 | | int srcStride1, int srcStride2, |
1684 | | int dstStride1, int dstStride2) |
1685 | 0 | { |
1686 | 0 | int w,h; |
1687 | 0 | w=width/2; h=height/2; |
1688 | 0 | __asm__ volatile( |
1689 | 0 | PREFETCH" %0 \n\t" |
1690 | 0 | PREFETCH" %1 \n\t" |
1691 | 0 | ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory"); |
1692 | 0 | for (x86_reg y = 0; y < h; y++) { |
1693 | 0 | const uint8_t* s1=src1+srcStride1*(y>>1); |
1694 | 0 | uint8_t* d=dst1+dstStride1*y; |
1695 | 0 | x86_reg x = 0; |
1696 | 0 | for (;x<w-31;x+=32) { |
1697 | 0 | __asm__ volatile( |
1698 | 0 | PREFETCH" 32(%1,%2) \n\t" |
1699 | 0 | "movq (%1,%2), %%mm0 \n\t" |
1700 | 0 | "movq 8(%1,%2), %%mm2 \n\t" |
1701 | 0 | "movq 16(%1,%2), %%mm4 \n\t" |
1702 | 0 | "movq 24(%1,%2), %%mm6 \n\t" |
1703 | 0 | "movq %%mm0, %%mm1 \n\t" |
1704 | 0 | "movq %%mm2, %%mm3 \n\t" |
1705 | 0 | "movq %%mm4, %%mm5 \n\t" |
1706 | 0 | "movq %%mm6, %%mm7 \n\t" |
1707 | 0 | "punpcklbw %%mm0, %%mm0 \n\t" |
1708 | 0 | "punpckhbw %%mm1, %%mm1 \n\t" |
1709 | 0 | "punpcklbw %%mm2, %%mm2 \n\t" |
1710 | 0 | "punpckhbw %%mm3, %%mm3 \n\t" |
1711 | 0 | "punpcklbw %%mm4, %%mm4 \n\t" |
1712 | 0 | "punpckhbw %%mm5, %%mm5 \n\t" |
1713 | 0 | "punpcklbw %%mm6, %%mm6 \n\t" |
1714 | 0 | "punpckhbw %%mm7, %%mm7 \n\t" |
1715 | 0 | MOVNTQ" %%mm0, (%0,%2,2) \n\t" |
1716 | 0 | MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" |
1717 | 0 | MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" |
1718 | 0 | MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" |
1719 | 0 | MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" |
1720 | 0 | MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" |
1721 | 0 | MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" |
1722 | 0 | MOVNTQ" %%mm7, 56(%0,%2,2)" |
1723 | 0 | :: "r"(d), "r"(s1), "r"(x) |
1724 | 0 | :"memory"); |
1725 | 0 | } |
1726 | 0 | for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x]; |
1727 | 0 | } |
1728 | 0 | for (x86_reg y = 0; y < h; y++) { |
1729 | 0 | const uint8_t* s2=src2+srcStride2*(y>>1); |
1730 | 0 | uint8_t* d=dst2+dstStride2*y; |
1731 | 0 | x86_reg x = 0; |
1732 | 0 | for (;x<w-31;x+=32) { |
1733 | 0 | __asm__ volatile( |
1734 | 0 | PREFETCH" 32(%1,%2) \n\t" |
1735 | 0 | "movq (%1,%2), %%mm0 \n\t" |
1736 | 0 | "movq 8(%1,%2), %%mm2 \n\t" |
1737 | 0 | "movq 16(%1,%2), %%mm4 \n\t" |
1738 | 0 | "movq 24(%1,%2), %%mm6 \n\t" |
1739 | 0 | "movq %%mm0, %%mm1 \n\t" |
1740 | 0 | "movq %%mm2, %%mm3 \n\t" |
1741 | 0 | "movq %%mm4, %%mm5 \n\t" |
1742 | 0 | "movq %%mm6, %%mm7 \n\t" |
1743 | 0 | "punpcklbw %%mm0, %%mm0 \n\t" |
1744 | 0 | "punpckhbw %%mm1, %%mm1 \n\t" |
1745 | 0 | "punpcklbw %%mm2, %%mm2 \n\t" |
1746 | 0 | "punpckhbw %%mm3, %%mm3 \n\t" |
1747 | 0 | "punpcklbw %%mm4, %%mm4 \n\t" |
1748 | 0 | "punpckhbw %%mm5, %%mm5 \n\t" |
1749 | 0 | "punpcklbw %%mm6, %%mm6 \n\t" |
1750 | 0 | "punpckhbw %%mm7, %%mm7 \n\t" |
1751 | 0 | MOVNTQ" %%mm0, (%0,%2,2) \n\t" |
1752 | 0 | MOVNTQ" %%mm1, 8(%0,%2,2) \n\t" |
1753 | 0 | MOVNTQ" %%mm2, 16(%0,%2,2) \n\t" |
1754 | 0 | MOVNTQ" %%mm3, 24(%0,%2,2) \n\t" |
1755 | 0 | MOVNTQ" %%mm4, 32(%0,%2,2) \n\t" |
1756 | 0 | MOVNTQ" %%mm5, 40(%0,%2,2) \n\t" |
1757 | 0 | MOVNTQ" %%mm6, 48(%0,%2,2) \n\t" |
1758 | 0 | MOVNTQ" %%mm7, 56(%0,%2,2)" |
1759 | 0 | :: "r"(d), "r"(s2), "r"(x) |
1760 | 0 | :"memory"); |
1761 | 0 | } |
1762 | 0 | for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x]; |
1763 | 0 | } |
1764 | 0 | __asm__( |
1765 | 0 | EMMS" \n\t" |
1766 | 0 | SFENCE" \n\t" |
1767 | 0 | ::: "memory" |
1768 | 0 | ); |
1769 | 0 | } |
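/* Editor's note: this is a plain 2x chroma upsampler. Each source row feeds
 * two destination rows (the y >> 1 indexing above), and each byte is doubled
 * horizontally by the punpcklbw/punpckhbw instructions, which here unpack a
 * register with itself and so repeat every byte twice. The scalar tail
 * d[2*x] = d[2*x+1] = s[x] performs the same duplication. */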
1770 | | |
1771 | | static inline void yvu9_to_yuy2_mmxext(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, |
1772 | | uint8_t *dst, |
1773 | | int width, int height, |
1774 | | int srcStride1, int srcStride2, |
1775 | | int srcStride3, int dstStride) |
1776 | 0 | { |
1777 | 0 | int w,h; |
1778 | 0 | w=width/2; h=height; |
1779 | 0 | for (int y = 0; y < h; y++) { |
1780 | 0 | const uint8_t* yp=src1+srcStride1*y; |
1781 | 0 | const uint8_t* up=src2+srcStride2*(y>>2); |
1782 | 0 | const uint8_t* vp=src3+srcStride3*(y>>2); |
1783 | 0 | uint8_t* d=dst+dstStride*y; |
1784 | 0 | x86_reg x = 0; |
1785 | 0 | for (;x<w-7;x+=8) { |
1786 | 0 | __asm__ volatile( |
1787 | 0 | PREFETCH" 32(%1, %0) \n\t" |
1788 | 0 | PREFETCH" 32(%2, %0) \n\t" |
1789 | 0 | PREFETCH" 32(%3, %0) \n\t" |
1790 | 0 | "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ |
1791 | 0 | "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */ |
1792 | 0 | "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */ |
1793 | 0 | "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ |
1794 | 0 | "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */ |
1795 | 0 | "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */ |
1796 | 0 | "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */ |
1797 | 0 | "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */ |
1798 | 0 | "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */ |
1799 | 0 | "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */ |
1800 | |
1801 | 0 | "movq %%mm1, %%mm6 \n\t" |
1802 | 0 | "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/ |
1803 | 0 | "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ |
1804 | 0 | "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ |
1805 | 0 | MOVNTQ" %%mm0, (%4, %0, 8) \n\t" |
1806 | 0 | MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" |
1807 | |
1808 | 0 | "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/ |
1809 | 0 | "movq 8(%1, %0, 4), %%mm0 \n\t" |
1810 | 0 | "movq %%mm0, %%mm3 \n\t" |
1811 | 0 | "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/ |
1812 | 0 | "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/ |
1813 | 0 | MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" |
1814 | 0 | MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" |
1815 | |
1816 | 0 | "movq %%mm4, %%mm6 \n\t" |
1817 | 0 | "movq 16(%1, %0, 4), %%mm0 \n\t" |
1818 | 0 | "movq %%mm0, %%mm3 \n\t" |
1819 | 0 | "punpcklbw %%mm5, %%mm4 \n\t" |
1820 | 0 | "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/ |
1821 | 0 | "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/ |
1822 | 0 | MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" |
1823 | 0 | MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" |
1824 | |
1825 | 0 | "punpckhbw %%mm5, %%mm6 \n\t" |
1826 | 0 | "movq 24(%1, %0, 4), %%mm0 \n\t" |
1827 | 0 | "movq %%mm0, %%mm3 \n\t" |
1828 | 0 | "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/ |
1829 | 0 | "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/ |
1830 | 0 | MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" |
1831 | 0 | MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" |
1832 | |
1833 | 0 | : "+r" (x) |
1834 | 0 | : "r"(yp), "r" (up), "r"(vp), "r"(d) |
1835 | 0 | :"memory"); |
1836 | 0 | } |
1837 | 0 | for (; x<w; x++) { |
1838 | 0 | const int x2 = x<<2; |
1839 | 0 | d[8*x+0] = yp[x2]; |
1840 | 0 | d[8*x+1] = up[x]; |
1841 | 0 | d[8*x+2] = yp[x2+1]; |
1842 | 0 | d[8*x+3] = vp[x]; |
1843 | 0 | d[8*x+4] = yp[x2+2]; |
1844 | 0 | d[8*x+5] = up[x]; |
1845 | 0 | d[8*x+6] = yp[x2+3]; |
1846 | 0 | d[8*x+7] = vp[x]; |
1847 | 0 | } |
1848 | 0 | } |
1849 | 0 | __asm__( |
1850 | 0 | EMMS" \n\t" |
1851 | 0 | SFENCE" \n\t" |
1852 | 0 | ::: "memory" |
1853 | 0 | ); |
1854 | 0 | } |
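/* Editor's note: YVU9 carries one chroma sample per 4x4 luma block, hence
 * the (y >> 2) row indexing above. Horizontally, each U/V byte is paired
 * with four luma samples, as the scalar tail shows: up[x] and vp[x] are
 * each written twice per group of four output Y bytes. */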
1855 | | |
1856 | | static void extract_even_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count) |
1857 | 2.90k | { |
1858 | 2.90k | dst += count; |
1859 | 2.90k | src += 2*count; |
1860 | 2.90k | count= - count; |
1861 | | |
1862 | 2.90k | if(count <= -16) { |
1863 | 504 | count += 15; |
1864 | 504 | __asm__ volatile( |
1865 | 504 | "pcmpeqw %%mm7, %%mm7 \n\t" |
1866 | 504 | "psrlw $8, %%mm7 \n\t" |
1867 | 504 | "1: \n\t" |
1868 | 504 | "movq -30(%1, %0, 2), %%mm0 \n\t" |
1869 | 504 | "movq -22(%1, %0, 2), %%mm1 \n\t" |
1870 | 504 | "movq -14(%1, %0, 2), %%mm2 \n\t" |
1871 | 504 | "movq -6(%1, %0, 2), %%mm3 \n\t" |
1872 | 504 | "pand %%mm7, %%mm0 \n\t" |
1873 | 504 | "pand %%mm7, %%mm1 \n\t" |
1874 | 504 | "pand %%mm7, %%mm2 \n\t" |
1875 | 504 | "pand %%mm7, %%mm3 \n\t" |
1876 | 504 | "packuswb %%mm1, %%mm0 \n\t" |
1877 | 504 | "packuswb %%mm3, %%mm2 \n\t" |
1878 | 504 | MOVNTQ" %%mm0,-15(%2, %0) \n\t" |
1879 | 504 | MOVNTQ" %%mm2,- 7(%2, %0) \n\t" |
1880 | 504 | "add $16, %0 \n\t" |
1881 | 504 | " js 1b \n\t" |
1882 | 504 | : "+r"(count) |
1883 | 504 | : "r"(src), "r"(dst) |
1884 | 504 | ); |
1885 | 504 | count -= 15; |
1886 | 504 | } |
1887 | 15.9k | while(count<0) { |
1888 | 13.0k | dst[count]= src[2*count]; |
1889 | 13.0k | count++; |
1890 | 13.0k | } |
1891 | 2.90k | } |
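/* Editor's note: the extract_* helpers in this file all use the same
 * counted-loop idiom: the pointers are advanced past the end of the data,
 * count is negated, and the loop runs the index from -count up to 0 so that
 * a single add/js pair serves as both increment and termination test. A
 * scalar equivalent of extract_even (not part of FFmpeg):
 *
 *     static void extract_even_ref(const uint8_t *src, uint8_t *dst,
 *                                  ptrdiff_t count)
 *     {
 *         for (ptrdiff_t i = 0; i < count; i++)
 *             dst[i] = src[2*i];          // keep bytes 0, 2, 4, ...
 *     }
 *
 * The SIMD block produces 16 bytes per iteration; the trailing while loop
 * handles the remainder (and the whole row when count < 16).
 */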
1892 | | |
1893 | | static void extract_odd_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count) |
1894 | 1.38k | { |
1895 | 1.38k | src ++; |
1896 | 1.38k | dst += count; |
1897 | 1.38k | src += 2*count; |
1898 | 1.38k | count= - count; |
1899 | | |
1900 | 1.38k | if(count < -16) { |
1901 | 279 | count += 16; |
1902 | 279 | __asm__ volatile( |
1903 | 279 | "pcmpeqw %%mm7, %%mm7 \n\t" |
1904 | 279 | "psrlw $8, %%mm7 \n\t" |
1905 | 279 | "1: \n\t" |
1906 | 279 | "movq -32(%1, %0, 2), %%mm0 \n\t" |
1907 | 279 | "movq -24(%1, %0, 2), %%mm1 \n\t" |
1908 | 279 | "movq -16(%1, %0, 2), %%mm2 \n\t" |
1909 | 279 | "movq -8(%1, %0, 2), %%mm3 \n\t" |
1910 | 279 | "pand %%mm7, %%mm0 \n\t" |
1911 | 279 | "pand %%mm7, %%mm1 \n\t" |
1912 | 279 | "pand %%mm7, %%mm2 \n\t" |
1913 | 279 | "pand %%mm7, %%mm3 \n\t" |
1914 | 279 | "packuswb %%mm1, %%mm0 \n\t" |
1915 | 279 | "packuswb %%mm3, %%mm2 \n\t" |
1916 | 279 | MOVNTQ" %%mm0,-16(%2, %0) \n\t" |
1917 | 279 | MOVNTQ" %%mm2,- 8(%2, %0) \n\t" |
1918 | 279 | "add $16, %0 \n\t" |
1919 | 279 | " js 1b \n\t" |
1920 | 279 | : "+r"(count) |
1921 | 279 | : "r"(src), "r"(dst) |
1922 | 279 | ); |
1923 | 279 | count -= 16; |
1924 | 279 | } |
1925 | 12.6k | while(count<0) { |
1926 | 11.2k | dst[count]= src[2*count]; |
1927 | 11.2k | count++; |
1928 | 11.2k | } |
1929 | 1.38k | } |
1930 | | |
1931 | | #if ARCH_X86_32 |
1932 | | static void extract_even2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) |
1933 | | { |
1934 | | dst0+= count; |
1935 | | dst1+= count; |
1936 | | src += 4*count; |
1937 | | count= - count; |
1938 | | if(count <= -8) { |
1939 | | count += 7; |
1940 | | __asm__ volatile( |
1941 | | "pcmpeqw %%mm7, %%mm7 \n\t" |
1942 | | "psrlw $8, %%mm7 \n\t" |
1943 | | "1: \n\t" |
1944 | | "movq -28(%1, %0, 4), %%mm0 \n\t" |
1945 | | "movq -20(%1, %0, 4), %%mm1 \n\t" |
1946 | | "movq -12(%1, %0, 4), %%mm2 \n\t" |
1947 | | "movq -4(%1, %0, 4), %%mm3 \n\t" |
1948 | | "pand %%mm7, %%mm0 \n\t" |
1949 | | "pand %%mm7, %%mm1 \n\t" |
1950 | | "pand %%mm7, %%mm2 \n\t" |
1951 | | "pand %%mm7, %%mm3 \n\t" |
1952 | | "packuswb %%mm1, %%mm0 \n\t" |
1953 | | "packuswb %%mm3, %%mm2 \n\t" |
1954 | | "movq %%mm0, %%mm1 \n\t" |
1955 | | "movq %%mm2, %%mm3 \n\t" |
1956 | | "psrlw $8, %%mm0 \n\t" |
1957 | | "psrlw $8, %%mm2 \n\t" |
1958 | | "pand %%mm7, %%mm1 \n\t" |
1959 | | "pand %%mm7, %%mm3 \n\t" |
1960 | | "packuswb %%mm2, %%mm0 \n\t" |
1961 | | "packuswb %%mm3, %%mm1 \n\t" |
1962 | | MOVNTQ" %%mm0,- 7(%3, %0) \n\t" |
1963 | | MOVNTQ" %%mm1,- 7(%2, %0) \n\t" |
1964 | | "add $8, %0 \n\t" |
1965 | | " js 1b \n\t" |
1966 | | : "+r"(count) |
1967 | | : "r"(src), "r"(dst0), "r"(dst1) |
1968 | | ); |
1969 | | count -= 7; |
1970 | | } |
1971 | | while(count<0) { |
1972 | | dst0[count]= src[4*count+0]; |
1973 | | dst1[count]= src[4*count+2]; |
1974 | | count++; |
1975 | | } |
1976 | | } |
1977 | | #endif /* ARCH_X86_32 */ |
1978 | | |
1979 | | static void extract_even2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) |
1980 | 690 | { |
1981 | 690 | dst0 += count; |
1982 | 690 | dst1 += count; |
1983 | 690 | src0 += 4*count; |
1984 | 690 | src1 += 4*count; |
1985 | 690 | count= - count; |
1986 | 690 | #ifdef PAVGB |
1987 | 690 | if(count <= -8) { |
1988 | 138 | count += 7; |
1989 | 138 | __asm__ volatile( |
1990 | 138 | "pcmpeqw %%mm7, %%mm7 \n\t" |
1991 | 138 | "psrlw $8, %%mm7 \n\t" |
1992 | 138 | "1: \n\t" |
1993 | 138 | "movq -28(%1, %0, 4), %%mm0 \n\t" |
1994 | 138 | "movq -20(%1, %0, 4), %%mm1 \n\t" |
1995 | 138 | "movq -12(%1, %0, 4), %%mm2 \n\t" |
1996 | 138 | "movq -4(%1, %0, 4), %%mm3 \n\t" |
1997 | 138 | PAVGB" -28(%2, %0, 4), %%mm0 \n\t" |
1998 | 138 | PAVGB" -20(%2, %0, 4), %%mm1 \n\t" |
1999 | 138 | PAVGB" -12(%2, %0, 4), %%mm2 \n\t" |
2000 | 138 | PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" |
2001 | 138 | "pand %%mm7, %%mm0 \n\t" |
2002 | 138 | "pand %%mm7, %%mm1 \n\t" |
2003 | 138 | "pand %%mm7, %%mm2 \n\t" |
2004 | 138 | "pand %%mm7, %%mm3 \n\t" |
2005 | 138 | "packuswb %%mm1, %%mm0 \n\t" |
2006 | 138 | "packuswb %%mm3, %%mm2 \n\t" |
2007 | 138 | "movq %%mm0, %%mm1 \n\t" |
2008 | 138 | "movq %%mm2, %%mm3 \n\t" |
2009 | 138 | "psrlw $8, %%mm0 \n\t" |
2010 | 138 | "psrlw $8, %%mm2 \n\t" |
2011 | 138 | "pand %%mm7, %%mm1 \n\t" |
2012 | 138 | "pand %%mm7, %%mm3 \n\t" |
2013 | 138 | "packuswb %%mm2, %%mm0 \n\t" |
2014 | 138 | "packuswb %%mm3, %%mm1 \n\t" |
2015 | 138 | MOVNTQ" %%mm0,- 7(%4, %0) \n\t" |
2016 | 138 | MOVNTQ" %%mm1,- 7(%3, %0) \n\t" |
2017 | 138 | "add $8, %0 \n\t" |
2018 | 138 | " js 1b \n\t" |
2019 | 138 | : "+r"(count) |
2020 | 138 | : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) |
2021 | 138 | ); |
2022 | 138 | count -= 7; |
2023 | 138 | } |
2024 | 690 | #endif |
2025 | 3.57k | while(count<0) { |
2026 | 2.88k | dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; |
2027 | 2.88k | dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; |
2028 | 2.88k | count++; |
2029 | 2.88k | } |
2030 | 690 | } |
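/* Editor's note: PAVGB computes the rounded average (a + b + 1) >> 1, while
 * the C tail above uses the truncating (a + b) >> 1, so for odd sums the
 * SIMD-processed and scalar-processed parts of a row may differ by one LSB.
 * For example, averaging 1 and 2 yields 2 via PAVGB but 1 in the C code. */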
2031 | | |
2032 | | static void extract_odd2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) |
2033 | 1.22k | { |
2034 | 1.22k | dst0+= count; |
2035 | 1.22k | dst1+= count; |
2036 | 1.22k | src += 4*count; |
2037 | 1.22k | count= - count; |
2038 | 1.22k | if(count <= -8) { |
2039 | 8 | count += 7; |
2040 | 8 | __asm__ volatile( |
2041 | 8 | "pcmpeqw %%mm7, %%mm7 \n\t" |
2042 | 8 | "psrlw $8, %%mm7 \n\t" |
2043 | 8 | "1: \n\t" |
2044 | 8 | "movq -28(%1, %0, 4), %%mm0 \n\t" |
2045 | 8 | "movq -20(%1, %0, 4), %%mm1 \n\t" |
2046 | 8 | "movq -12(%1, %0, 4), %%mm2 \n\t" |
2047 | 8 | "movq -4(%1, %0, 4), %%mm3 \n\t" |
2048 | 8 | "psrlw $8, %%mm0 \n\t" |
2049 | 8 | "psrlw $8, %%mm1 \n\t" |
2050 | 8 | "psrlw $8, %%mm2 \n\t" |
2051 | 8 | "psrlw $8, %%mm3 \n\t" |
2052 | 8 | "packuswb %%mm1, %%mm0 \n\t" |
2053 | 8 | "packuswb %%mm3, %%mm2 \n\t" |
2054 | 8 | "movq %%mm0, %%mm1 \n\t" |
2055 | 8 | "movq %%mm2, %%mm3 \n\t" |
2056 | 8 | "psrlw $8, %%mm0 \n\t" |
2057 | 8 | "psrlw $8, %%mm2 \n\t" |
2058 | 8 | "pand %%mm7, %%mm1 \n\t" |
2059 | 8 | "pand %%mm7, %%mm3 \n\t" |
2060 | 8 | "packuswb %%mm2, %%mm0 \n\t" |
2061 | 8 | "packuswb %%mm3, %%mm1 \n\t" |
2062 | 8 | MOVNTQ" %%mm0,- 7(%3, %0) \n\t" |
2063 | 8 | MOVNTQ" %%mm1,- 7(%2, %0) \n\t" |
2064 | 8 | "add $8, %0 \n\t" |
2065 | 8 | " js 1b \n\t" |
2066 | 8 | : "+r"(count) |
2067 | 8 | : "r"(src), "r"(dst0), "r"(dst1) |
2068 | 8 | ); |
2069 | 8 | count -= 7; |
2070 | 8 | } |
2071 | 1.22k | src++; |
2072 | 3.81k | while(count<0) { |
2073 | 2.59k | dst0[count]= src[4*count+0]; |
2074 | 2.59k | dst1[count]= src[4*count+2]; |
2075 | 2.59k | count++; |
2076 | 2.59k | } |
2077 | 1.22k | } |
2078 | | |
2079 | | static void extract_odd2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) |
2080 | 832 | { |
2081 | 832 | dst0 += count; |
2082 | 832 | dst1 += count; |
2083 | 832 | src0 += 4*count; |
2084 | 832 | src1 += 4*count; |
2085 | 832 | count= - count; |
2086 | 832 | #ifdef PAVGB |
2087 | 832 | if(count <= -8) { |
2088 | 246 | count += 7; |
2089 | 246 | __asm__ volatile( |
2090 | 246 | "pcmpeqw %%mm7, %%mm7 \n\t" |
2091 | 246 | "psrlw $8, %%mm7 \n\t" |
2092 | 246 | "1: \n\t" |
2093 | 246 | "movq -28(%1, %0, 4), %%mm0 \n\t" |
2094 | 246 | "movq -20(%1, %0, 4), %%mm1 \n\t" |
2095 | 246 | "movq -12(%1, %0, 4), %%mm2 \n\t" |
2096 | 246 | "movq -4(%1, %0, 4), %%mm3 \n\t" |
2097 | 246 | PAVGB" -28(%2, %0, 4), %%mm0 \n\t" |
2098 | 246 | PAVGB" -20(%2, %0, 4), %%mm1 \n\t" |
2099 | 246 | PAVGB" -12(%2, %0, 4), %%mm2 \n\t" |
2100 | 246 | PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" |
2101 | 246 | "psrlw $8, %%mm0 \n\t" |
2102 | 246 | "psrlw $8, %%mm1 \n\t" |
2103 | 246 | "psrlw $8, %%mm2 \n\t" |
2104 | 246 | "psrlw $8, %%mm3 \n\t" |
2105 | 246 | "packuswb %%mm1, %%mm0 \n\t" |
2106 | 246 | "packuswb %%mm3, %%mm2 \n\t" |
2107 | 246 | "movq %%mm0, %%mm1 \n\t" |
2108 | 246 | "movq %%mm2, %%mm3 \n\t" |
2109 | 246 | "psrlw $8, %%mm0 \n\t" |
2110 | 246 | "psrlw $8, %%mm2 \n\t" |
2111 | 246 | "pand %%mm7, %%mm1 \n\t" |
2112 | 246 | "pand %%mm7, %%mm3 \n\t" |
2113 | 246 | "packuswb %%mm2, %%mm0 \n\t" |
2114 | 246 | "packuswb %%mm3, %%mm1 \n\t" |
2115 | 246 | MOVNTQ" %%mm0,- 7(%4, %0) \n\t" |
2116 | 246 | MOVNTQ" %%mm1,- 7(%3, %0) \n\t" |
2117 | 246 | "add $8, %0 \n\t" |
2118 | 246 | " js 1b \n\t" |
2119 | 246 | : "+r"(count) |
2120 | 246 | : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) |
2121 | 246 | ); |
2122 | 246 | count -= 7; |
2123 | 246 | } |
2124 | 832 | #endif |
2125 | 832 | src0++; |
2126 | 832 | src1++; |
2127 | 2.65k | while(count<0) { |
2128 | 1.82k | dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; |
2129 | 1.82k | dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; |
2130 | 1.82k | count++; |
2131 | 1.82k | } |
2132 | 832 | } |
2133 | | |
2134 | | static void yuyvtoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, |
2135 | | int width, int height, |
2136 | | int lumStride, int chromStride, int srcStride) |
2137 | 22 | { |
2138 | 22 | const int chromWidth = AV_CEIL_RSHIFT(width, 1); |
2139 | | |
2140 | 1.69k | for (int y = 0; y < height; y++) { |
2141 | 1.67k | extract_even_mmxext(src, ydst, width); |
2142 | 1.67k | if(y&1) { |
2143 | 832 | extract_odd2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth); |
2144 | 832 | udst+= chromStride; |
2145 | 832 | vdst+= chromStride; |
2146 | 832 | } |
2147 | | |
2148 | 1.67k | src += srcStride; |
2149 | 1.67k | ydst+= lumStride; |
2150 | 1.67k | } |
2151 | 22 | __asm__( |
2152 | 22 | EMMS" \n\t" |
2153 | 22 | SFENCE" \n\t" |
2154 | 22 | ::: "memory" |
2155 | 22 | ); |
2156 | 22 | } |
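/* Editor's note: for the 4:2:0 output above, chroma is written only on odd
 * rows, where extract_odd2avg averages the current line with the previous
 * one (src - srcStride); each U/V sample therefore covers a 2x2 block of
 * source pixels. The 4:2:2 variant below keeps chroma from every line via
 * extract_odd2 instead. */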
2157 | | |
2158 | | static void yuyvtoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, |
2159 | | int width, int height, |
2160 | | int lumStride, int chromStride, int srcStride) |
2161 | 13 | { |
2162 | 13 | const int chromWidth = AV_CEIL_RSHIFT(width, 1); |
2163 | | |
2164 | 1.24k | for (int y = 0; y < height; y++) { |
2165 | 1.22k | extract_even_mmxext(src, ydst, width); |
2166 | 1.22k | extract_odd2_mmxext(src, udst, vdst, chromWidth); |
2167 | | |
2168 | 1.22k | src += srcStride; |
2169 | 1.22k | ydst+= lumStride; |
2170 | 1.22k | udst+= chromStride; |
2171 | 1.22k | vdst+= chromStride; |
2172 | 1.22k | } |
2173 | 13 | __asm__( |
2174 | 13 | EMMS" \n\t" |
2175 | 13 | SFENCE" \n\t" |
2176 | 13 | ::: "memory" |
2177 | 13 | ); |
2178 | 13 | } |
2179 | | |
2180 | | static void uyvytoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, |
2181 | | int width, int height, |
2182 | | int lumStride, int chromStride, int srcStride) |
2183 | 14 | { |
2184 | 14 | const int chromWidth = AV_CEIL_RSHIFT(width, 1); |
2185 | | |
2186 | 1.40k | for (int y = 0; y < height; y++) { |
2187 | 1.38k | extract_odd_mmxext(src, ydst, width); |
2188 | 1.38k | if(y&1) { |
2189 | 690 | extract_even2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth); |
2190 | 690 | udst+= chromStride; |
2191 | 690 | vdst+= chromStride; |
2192 | 690 | } |
2193 | | |
2194 | 1.38k | src += srcStride; |
2195 | 1.38k | ydst+= lumStride; |
2196 | 1.38k | } |
2197 | 14 | __asm__( |
2198 | 14 | EMMS" \n\t" |
2199 | 14 | SFENCE" \n\t" |
2200 | 14 | ::: "memory" |
2201 | 14 | ); |
2202 | 14 | } |
2203 | | |
2204 | | #if ARCH_X86_32 |
2205 | | static void uyvytoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, |
2206 | | int width, int height, |
2207 | | int lumStride, int chromStride, int srcStride) |
2208 | | { |
2209 | | const int chromWidth = AV_CEIL_RSHIFT(width, 1); |
2210 | | |
2211 | | for (int y = 0; y < height; y++) { |
2212 | | extract_odd_mmxext(src, ydst, width); |
2213 | | extract_even2_mmxext(src, udst, vdst, chromWidth); |
2214 | | |
2215 | | src += srcStride; |
2216 | | ydst+= lumStride; |
2217 | | udst+= chromStride; |
2218 | | vdst+= chromStride; |
2219 | | } |
2220 | | __asm__( |
2221 | | EMMS" \n\t" |
2222 | | SFENCE" \n\t" |
2223 | | ::: "memory" |
2224 | | ); |
2225 | | } |
2226 | | #endif /* ARCH_X86_32 */ |
2227 | | |
2228 | | static av_cold void rgb2rgb_init_mmxext(void) |
2229 | 1 | { |
2230 | 1 | rgb15to16 = rgb15to16_mmxext; |
2231 | 1 | rgb15tobgr24 = rgb15tobgr24_mmxext; |
2232 | 1 | rgb15to32 = rgb15to32_mmxext; |
2233 | 1 | rgb16tobgr24 = rgb16tobgr24_mmxext; |
2234 | 1 | rgb16to32 = rgb16to32_mmxext; |
2235 | 1 | rgb16to15 = rgb16to15_mmxext; |
2236 | 1 | rgb24tobgr16 = rgb24tobgr16_mmxext; |
2237 | 1 | rgb24tobgr15 = rgb24tobgr15_mmxext; |
2238 | 1 | rgb24tobgr32 = rgb24tobgr32_mmxext; |
2239 | 1 | rgb32to16 = rgb32to16_mmxext; |
2240 | 1 | rgb32to15 = rgb32to15_mmxext; |
2241 | 1 | rgb32tobgr24 = rgb32tobgr24_mmxext; |
2242 | 1 | rgb24to15 = rgb24to15_mmxext; |
2243 | 1 | rgb24to16 = rgb24to16_mmxext; |
2244 | 1 | rgb24tobgr24 = rgb24tobgr24_mmxext; |
2245 | 1 | rgb32tobgr16 = rgb32tobgr16_mmxext; |
2246 | 1 | rgb32tobgr15 = rgb32tobgr15_mmxext; |
2247 | 1 | yv12toyuy2 = yv12toyuy2_mmxext; |
2248 | 1 | yv12touyvy = yv12touyvy_mmxext; |
2249 | 1 | yuv422ptoyuy2 = yuv422ptoyuy2_mmxext; |
2250 | 1 | yuv422ptouyvy = yuv422ptouyvy_mmxext; |
2251 | 1 | yuy2toyv12 = yuy2toyv12_mmxext; |
2252 | 1 | vu9_to_vu12 = vu9_to_vu12_mmxext; |
2253 | 1 | yvu9_to_yuy2 = yvu9_to_yuy2_mmxext; |
2254 | | #if ARCH_X86_32 |
2255 | | uyvytoyuv422 = uyvytoyuv422_mmxext; |
2256 | | #endif |
2257 | 1 | yuyvtoyuv422 = yuyvtoyuv422_mmxext; |
2258 | | |
2259 | 1 | planar2x = planar2x_mmxext; |
2260 | | #if ARCH_X86_32 && HAVE_7REGS |
2261 | | ff_rgb24toyv12 = rgb24toyv12_mmxext; |
2262 | | #endif /* ARCH_X86_32 && HAVE_7REGS */ |
2263 | | |
2264 | 1 | yuyvtoyuv420 = yuyvtoyuv420_mmxext; |
2265 | 1 | uyvytoyuv420 = uyvytoyuv420_mmxext; |
2266 | 1 | } |
2267 | | |
2268 | | //SSE2 versions |
2269 | | static void interleave_bytes_sse2(const uint8_t *src1, const uint8_t *src2, uint8_t *dest, |
2270 | | int width, int height, int src1Stride, |
2271 | | int src2Stride, int dstStride) |
2272 | 7 | { |
2273 | 254 | for (int h = 0; h < height; h++) { |
2274 | 247 | if (width >= 16) { |
2275 | 241 | if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) { |
2276 | 241 | __asm__( |
2277 | 241 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" |
2278 | 241 | "1: \n\t" |
2279 | 241 | PREFETCH" 64(%1, %%"FF_REG_a") \n\t" |
2280 | 241 | PREFETCH" 64(%2, %%"FF_REG_a") \n\t" |
2281 | 241 | "movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t" |
2282 | 241 | "movdqa (%1, %%"FF_REG_a"), %%xmm1 \n\t" |
2283 | 241 | "movdqa (%2, %%"FF_REG_a"), %%xmm2 \n\t" |
2284 | 241 | "punpcklbw %%xmm2, %%xmm0 \n\t" |
2285 | 241 | "punpckhbw %%xmm2, %%xmm1 \n\t" |
2286 | 241 | "movntdq %%xmm0, (%0, %%"FF_REG_a", 2) \n\t" |
2287 | 241 | "movntdq %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t" |
2288 | 241 | "add $16, %%"FF_REG_a" \n\t" |
2289 | 241 | "cmp %3, %%"FF_REG_a" \n\t" |
2290 | 241 | " jb 1b \n\t" |
2291 | 241 | ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) |
2292 | 241 | : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a |
2293 | 241 | ); |
2294 | 241 | } else |
2295 | 0 | __asm__( |
2296 | 0 | "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" |
2297 | 0 | "1: \n\t" |
2298 | 0 | PREFETCH" 64(%1, %%"FF_REG_a") \n\t" |
2299 | 0 | PREFETCH" 64(%2, %%"FF_REG_a") \n\t" |
2300 | 0 | "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" |
2301 | 0 | "movq 8(%1, %%"FF_REG_a"), %%mm2 \n\t" |
2302 | 0 | "movq %%mm0, %%mm1 \n\t" |
2303 | 0 | "movq %%mm2, %%mm3 \n\t" |
2304 | 0 | "movq (%2, %%"FF_REG_a"), %%mm4 \n\t" |
2305 | 0 | "movq 8(%2, %%"FF_REG_a"), %%mm5 \n\t" |
2306 | 0 | "punpcklbw %%mm4, %%mm0 \n\t" |
2307 | 0 | "punpckhbw %%mm4, %%mm1 \n\t" |
2308 | 0 | "punpcklbw %%mm5, %%mm2 \n\t" |
2309 | 0 | "punpckhbw %%mm5, %%mm3 \n\t" |
2310 | 0 | MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 2) \n\t" |
2311 | 0 | MOVNTQ" %%mm1, 8(%0, %%"FF_REG_a", 2) \n\t" |
2312 | 0 | MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t" |
2313 | 0 | MOVNTQ" %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t" |
2314 | 0 | "add $16, %%"FF_REG_a" \n\t" |
2315 | 0 | "cmp %3, %%"FF_REG_a" \n\t" |
2316 | 0 | " jb 1b \n\t" |
2317 | 0 | ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) |
2318 | 0 | : "memory", "%"FF_REG_a |
2319 | 0 | ); |
2320 | | |
2321 | 241 | } |
2322 | 844 | for (int w = (width & (~15)); w < width; w++) { |
2323 | 597 | dest[2*w+0] = src1[w]; |
2324 | 597 | dest[2*w+1] = src2[w]; |
2325 | 597 | } |
2326 | 247 | dest += dstStride; |
2327 | 247 | src1 += src1Stride; |
2328 | 247 | src2 += src2Stride; |
2329 | 247 | } |
2330 | 7 | __asm__( |
2331 | 7 | EMMS" \n\t" |
2332 | 7 | SFENCE" \n\t" |
2333 | 7 | ::: "memory" |
2334 | 7 | ); |
2335 | 7 | } |
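/* Editor's note: the alignment test above ORs the three pointers together
 * and checks the low four bits once, which is cheaper than testing each
 * pointer separately; movdqa/movntdq require 16-byte alignment, so
 * unaligned rows fall back to the 8-byte MMX path. The operation itself is
 * simply (scalar sketch, not part of FFmpeg):
 *
 *     for (int w = 0; w < width; w++) {
 *         dest[2*w + 0] = src1[w];
 *         dest[2*w + 1] = src2[w];
 *     }
 */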
2336 | | |
2337 | | /* |
2338 | | RGB15->RGB16 original by Strepto/Astral |
2339 | | ported to gcc & bugfixed: A'rpi
2340 | | MMXEXT, 3DNOW optimization by Nick Kurshev
2341 | | 32-bit C version and the and&add trick by Michael Niedermayer
2342 | | */ |
2343 | | |
2344 | | #endif /* HAVE_INLINE_ASM */ |
2345 | | |
2346 | | void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size); |
2347 | | void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size); |
2348 | | void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size); |
2349 | | void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size); |
2350 | | void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size); |
2351 | | void ff_shuffle_bytes_3102_ssse3(const uint8_t *src, uint8_t *dst, int src_size); |
2352 | | void ff_shuffle_bytes_2013_ssse3(const uint8_t *src, uint8_t *dst, int src_size); |
2353 | | void ff_shuffle_bytes_2130_ssse3(const uint8_t *src, uint8_t *dst, int src_size); |
2354 | | void ff_shuffle_bytes_1203_ssse3(const uint8_t *src, uint8_t *dst, int src_size); |
2355 | | |
2356 | | #if ARCH_X86_64 |
2357 | | void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size); |
2358 | | void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size); |
2359 | | void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size); |
2360 | | void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size); |
2361 | | void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size); |
2362 | | void ff_shuffle_bytes_3102_avx2(const uint8_t *src, uint8_t *dst, int src_size); |
2363 | | void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size); |
2364 | | void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size); |
2365 | | void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size); |
2366 | | |
2367 | | void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); |
2368 | | void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); |
2369 | | void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); |
2370 | | void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); |
2371 | | void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); |
2372 | | void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); |
2373 | | void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); |
2374 | | void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); |
2375 | | void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size); |
2376 | | |
2377 | | void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
2378 | | const uint8_t *src, int width, int height, |
2379 | | int lumStride, int chromStride, int srcStride); |
2380 | | void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
2381 | | const uint8_t *src, int width, int height, |
2382 | | int lumStride, int chromStride, int srcStride); |
2383 | | void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
2384 | | const uint8_t *src, int width, int height, |
2385 | | int lumStride, int chromStride, int srcStride); |
2386 | | void ff_uyvytoyuv422_avx512icl(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
2387 | | const uint8_t *src, int width, int height, |
2388 | | int lumStride, int chromStride, int srcStride); |
2389 | | #endif |
2390 | | |
2391 | | #define DEINTERLEAVE_BYTES(cpuext) \ |
2392 | | void ff_nv12ToUV_ ## cpuext(uint8_t *dstU, uint8_t *dstV, \ |
2393 | | const uint8_t *unused, \ |
2394 | | const uint8_t *src1, \ |
2395 | | const uint8_t *src2, \ |
2396 | | int w, \ |
2397 | | uint32_t *unused2, \ |
2398 | | void *opq); \ |
2399 | | static void deinterleave_bytes_ ## cpuext(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, \ |
2400 | | int width, int height, int srcStride, \ |
2401 | 9 | int dst1Stride, int dst2Stride) \ |
2402 | 9 | { \ |
2403 | 213 | for (int h = 0; h < height; h++) { \ |
2404 | 204 | if (width >= 16) \ |
2405 | 204 | ff_nv12ToUV_ ## cpuext(dst1, dst2, NULL, src, NULL, width - 15, NULL, NULL); \ |
2406 | 777 | for (int w = (width & (~15)); w < width; w++) { \ |
2407 | 573 | dst1[w] = src[2*w+0]; \ |
2408 | 573 | dst2[w] = src[2*w+1]; \ |
2409 | 573 | } \ |
2410 | 204 | src += srcStride; \ |
2411 | 204 | dst1 += dst1Stride; \ |
2412 | 204 | dst2 += dst2Stride; \ |
2413 | 204 | } \ |
2414 | 9 | } |
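/* Editor's note: rather than carrying its own SIMD loop, this wrapper reuses
 * the existing ff_nv12ToUV_* input routines, which de-interleave one NV12
 * chroma line into separate U and V planes, and then finishes the last
 * width & 15 bytes of each row in plain C. */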
2415 | | |
2416 | | #if HAVE_SSE2_EXTERNAL |
2417 | 0 | DEINTERLEAVE_BYTES(sse2) |
2418 | | #endif |
2419 | | #if HAVE_AVX_EXTERNAL |
2420 | 9 | DEINTERLEAVE_BYTES(avx) |
2421 | | #endif |
2422 | | |
2423 | | av_cold void rgb2rgb_init_x86(void) |
2424 | 1 | { |
2425 | 1 | int cpu_flags = av_get_cpu_flags(); |
2426 | | |
2427 | 1 | #if HAVE_INLINE_ASM |
2428 | 1 | if (INLINE_MMXEXT(cpu_flags)) |
2429 | 1 | rgb2rgb_init_mmxext(); |
2430 | 1 | if (INLINE_SSE2(cpu_flags)) |
2431 | 1 | interleaveBytes = interleave_bytes_sse2; |
2432 | 1 | #endif /* HAVE_INLINE_ASM */ |
2433 | | |
2434 | 1 | #if HAVE_SSE2_EXTERNAL |
2435 | 1 | if (EXTERNAL_SSE2(cpu_flags)) { |
2436 | 1 | #if ARCH_X86_64 |
2437 | 1 | uyvytoyuv422 = ff_uyvytoyuv422_sse2; |
2438 | 1 | #endif |
2439 | 1 | deinterleaveBytes = deinterleave_bytes_sse2; |
2440 | 1 | } |
2441 | 1 | #endif |
2442 | 1 | if (EXTERNAL_SSSE3(cpu_flags)) { |
2443 | 1 | shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3; |
2444 | 1 | shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3; |
2445 | 1 | shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3; |
2446 | 1 | shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3; |
2447 | 1 | shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3; |
2448 | 1 | shuffle_bytes_3102 = ff_shuffle_bytes_3102_ssse3; |
2449 | 1 | shuffle_bytes_2013 = ff_shuffle_bytes_2013_ssse3; |
2450 | 1 | shuffle_bytes_2130 = ff_shuffle_bytes_2130_ssse3; |
2451 | 1 | shuffle_bytes_1203 = ff_shuffle_bytes_1203_ssse3; |
2452 | 1 | } |
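/* Editor's note: the #if ARCH_X86_64 below intentionally spans several if
 * statements: on 32-bit builds everything from the ff_uyvytoyuv422_avx
 * assignment through the AVX512ICL uyvytoyuv422 assignment drops out, and
 * the closing brace after the matching #endif then terminates the
 * EXTERNAL_AVX() block. */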
2453 | 1 | #if HAVE_AVX_EXTERNAL |
2454 | 1 | if (EXTERNAL_AVX(cpu_flags)) { |
2455 | 1 | deinterleaveBytes = deinterleave_bytes_avx; |
2456 | 1 | #if ARCH_X86_64 |
2457 | 1 | uyvytoyuv422 = ff_uyvytoyuv422_avx; |
2458 | 1 | } |
2459 | 1 | if (EXTERNAL_AVX2_FAST(cpu_flags)) { |
2460 | 1 | shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2; |
2461 | 1 | shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2; |
2462 | 1 | shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2; |
2463 | 1 | shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2; |
2464 | 1 | shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2; |
2465 | 1 | shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx2; |
2466 | 1 | shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx2; |
2467 | 1 | shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2; |
2468 | 1 | shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2; |
2469 | 1 | } |
2470 | 1 | if (EXTERNAL_AVX512ICL(cpu_flags)) { |
2471 | 0 | shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl; |
2472 | 0 | shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl; |
2473 | 0 | shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl; |
2474 | 0 | shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl; |
2475 | 0 | shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl; |
2476 | 0 | shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl; |
2477 | 0 | shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl; |
2478 | 0 | shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl; |
2479 | 0 | shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl; |
2480 | 0 | } |
2481 | 1 | if (EXTERNAL_AVX2_FAST(cpu_flags)) { |
2482 | 1 | uyvytoyuv422 = ff_uyvytoyuv422_avx2; |
2483 | 1 | } |
2484 | 1 | if (EXTERNAL_AVX512ICL(cpu_flags)) { |
2485 | 0 | uyvytoyuv422 = ff_uyvytoyuv422_avx512icl; |
2486 | 0 | #endif |
2487 | 0 | } |
2488 | 1 | #endif |
2489 | 1 | } |