/src/ffmpeg/libswscale/x86/hscale_fast_bilinear_simd.c
/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/mem_internal.h"

#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"

av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
                                   int16_t *filter, int32_t *filterPos,
                                   int numSplits)
{
    uint8_t *fragmentA;
    x86_reg imm8OfPShufW1A;
    x86_reg imm8OfPShufW2A;
    x86_reg fragmentLengthA;
    uint8_t *fragmentB;
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;

    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */
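    /* Editor's sketch of the math (not in the original source): each fragment
     * computes, for four neighbouring output pixels,
     *     dst[i] = src[xx] * w + src[xx + 1] * (128 - w)
     * where w is the 7-bit weight stored into filter[] below
     * (w = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9), so every result comes out scaled
     * by 128.  fragmentA needs a second, shifted movd load because five source
     * pixels are involved; fragmentB serves both taps from a single 4-byte load. */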

    // code fragment

    __asm__ volatile (
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3 \n\t"
        "movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0 \n\t"
        "movd 1(%%"FF_REG_c", %%"FF_REG_S"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm1, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"

        "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t"

        "add $8, %%"FF_REG_a" \n\t"
        // End
        "9: \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
        "dec %1 \n\t"
        "dec %2 \n\t"
        "sub %0, %1 \n\t"
        "sub %0, %2 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
        "sub %0, %3 \n\t"


        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
    );
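    /* Editor's note: the fragment above is not executed here (the jmp skips it);
     * the "9:" epilogue only records where the template lives.  %0 receives the
     * address of the fragment ("0:"), %1 and %2 the byte offsets of the two
     * pshufw immediates (label address minus one, relative to the fragment
     * start) and %3 the fragment length, so the loop below can copy and patch
     * the code. */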

    __asm__ volatile (
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3 \n\t"
        "movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm0, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"

        "movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t"

        "add $8, %%"FF_REG_a" \n\t"
        // End
        "9: \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
        "dec %1 \n\t"
        "dec %2 \n\t"
        "sub %0, %1 \n\t"
        "sub %0, %2 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
        "sub %0, %3 \n\t"


        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
    );

    xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;

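    /* Editor's note: for every fourth output pixel the loop below copies one of
     * the two templates into filterCode and patches its two pshufw immediates.
     * a..d are the source indices (relative to xx) of the four output pixels,
     * packed as 2-bit fields (a | b << 2 | c << 4 | d << 6); the "shift" fixup
     * (0x55 * shift adds shift to each 2-bit field) moves the source position
     * back to avoid overreads at the end of the line and to keep the loads
     * aligned where possible. */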
    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16;

        if ((i & 3) == 0) {
            int a = 0;
            int b = ((xpos + xInc) >> 16) - xx;
            int c = ((xpos + xInc * 2) >> 16) - xx;
            int d = ((xpos + xInc * 3) >> 16) - xx;
            int inc = (d + 1 < 4);
            uint8_t *fragment = inc ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
            int maxShift = 3 - (d + inc);
            int shift = 0;

            if (filterCode) {
                filter[i]        = ((xpos              & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1]    = (((xpos + xInc)     & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                filterPos[i / 2] = xx;

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);

                filterCode[fragmentPos + imm8OfPShufW1] = (a + inc)        |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
                                                              (c << 4) |
                                                              (d << 6);

                if (i + 4 - inc >= dstW)
                    shift = maxShift;             // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3; // align

                if (shift && i >= shift) {
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2]                        -= shift;
                }
            }

            fragmentPos += fragmentLength;

            if (filterCode)
                filterCode[fragmentPos] = RET;
        }
        xpos += xInc;
    }
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part

    return fragmentPos + 1;
}
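
/* Editor's illustrative sketch (not part of the original file): a plain-C
 * equivalent of what the generated fragments compute, using the same 7-bit
 * weights as filter[] above; boundary handling and the alignment fixups are
 * omitted.  Note that ff_init_hscaler_mmxext() writes nothing when filterCode
 * is NULL and then only returns the number of code bytes it would emit. */
static av_unused void hscale_fast_bilinear_c_ref(int16_t *dst, const uint8_t *src,
                                                 int dstW, int xInc)
{
    int xpos = 0;
    for (int i = 0; i < dstW; i++) {
        int xx = xpos >> 16;                            // integer source position
        int w  = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;       // 7-bit weight of src[xx]
        dst[i] = src[xx] * w + src[xx + 1] * (128 - w); // 2-tap bilinear, scaled by 128
        xpos  += xInc;
    }
}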

void ff_hyscale_fast_mmxext(SwsInternal *c, int16_t *dst,
                            int dstWidth, const uint8_t *src,
                            int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    void *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;
#if ARCH_X86_64
    uint64_t retsave;
#else
#if !HAVE_EBX_AVAILABLE
    uint64_t ebxsave;
#endif
#endif

    __asm__ volatile(
#if ARCH_X86_64
        "mov -8(%%rsp), %%"FF_REG_a" \n\t"
        "mov %%"FF_REG_a", %5 \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov %%"FF_REG_b", %5 \n\t" // ebxsave
#endif
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"FF_REG_c" \n\t"
        "mov %1, %%"FF_REG_D" \n\t"
        "mov %2, %%"FF_REG_d" \n\t"
        "mov %3, %%"FF_REG_b" \n\t"
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
        PREFETCH" (%%"FF_REG_c") \n\t"
        PREFETCH" 32(%%"FF_REG_c") \n\t"
        PREFETCH" 64(%%"FF_REG_c") \n\t"

#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
    "movl (%%"FF_REG_b"), %%esi \n\t"\
    "call *%4 \n\t"\
    "movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"\
    "add %%"FF_REG_S", %%"FF_REG_c" \n\t"\
    "add %%"FF_REG_a", %%"FF_REG_D" \n\t"\
    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\

#else
#define CALL_MMXEXT_FILTER_CODE \
    "movl (%%"FF_REG_b"), %%esi \n\t"\
    "call *%4 \n\t"\
    "addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c" \n\t"\
    "add %%"FF_REG_a", %%"FF_REG_D" \n\t"\
    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\

#endif /* ARCH_X86_64 */
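    /* Editor's note: register contract for the generated code reached through
     * "call *%4": FF_REG_c = source base, FF_REG_D = destination base,
     * FF_REG_d = 7-bit coefficients, FF_REG_b = filterPos table, FF_REG_S =
     * source offset of the current 4-pixel group, FF_REG_a = running output
     * byte offset, mm7 = 0.  Each CALL_MMXEXT_FILTER_CODE below runs one
     * generated split up to its final RET, then advances the source and
     * destination pointers for the next split and clears FF_REG_a. */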

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov %5, %%"FF_REG_a" \n\t"
        "mov %%"FF_REG_a", -8(%%rsp) \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov %5, %%"FF_REG_b" \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if ARCH_X86_64
        ,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
        ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
        ,"%"FF_REG_b
#endif
    );

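    /* Editor's note: pad trailing output pixels whose source index would reach
     * the last input sample with src[srcW-1], scaled by 128 like the SIMD path. */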
    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
        dst[i] = src[srcW-1]*128;
}

void ff_hcscale_fast_mmxext(SwsInternal *c, int16_t *dst1, int16_t *dst2,
                            int dstWidth, const uint8_t *src1,
                            const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    void *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;
#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#else
#if !HAVE_EBX_AVAILABLE
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#endif
    __asm__ volatile(
#if ARCH_X86_64
        "mov -8(%%rsp), %%"FF_REG_a" \n\t"
        "mov %%"FF_REG_a", %7 \n\t" // retsave
#else
#if !HAVE_EBX_AVAILABLE
        "mov %%"FF_REG_b", %7 \n\t" // ebxsave
#endif
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"FF_REG_c" \n\t"
        "mov %1, %%"FF_REG_D" \n\t"
        "mov %2, %%"FF_REG_d" \n\t"
        "mov %3, %%"FF_REG_b" \n\t"
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
        PREFETCH" (%%"FF_REG_c") \n\t"
        PREFETCH" 32(%%"FF_REG_c") \n\t"
        PREFETCH" 64(%%"FF_REG_c") \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i
        "mov %5, %%"FF_REG_c" \n\t" // src2
        "mov %6, %%"FF_REG_D" \n\t" // dst2
        PREFETCH" (%%"FF_REG_c") \n\t"
        PREFETCH" 32(%%"FF_REG_c") \n\t"
        PREFETCH" 64(%%"FF_REG_c") \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov %7, %%"FF_REG_a" \n\t"
        "mov %%"FF_REG_a", -8(%%rsp) \n\t"
#else
#if !HAVE_EBX_AVAILABLE
        "mov %7, %%"FF_REG_b" \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if ARCH_X86_64
        ,"m"(retsave)
#else
#if !HAVE_EBX_AVAILABLE
        ,"m" (ebxsave)
#endif
#endif
        : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D
#if ARCH_X86_64 || HAVE_EBX_AVAILABLE
        ,"%"FF_REG_b
#endif
    );

    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
        dst1[i] = src1[srcW-1]*128;
        dst2[i] = src2[srcW-1]*128;
    }
}