/src/ffmpeg/libavcodec/x86/videodsp_init.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (C) 2002-2012 Michael Niedermayer |
3 | | * Copyright (C) 2012 Ronald S. Bultje |
4 | | * |
5 | | * This file is part of FFmpeg. |
6 | | * |
7 | | * FFmpeg is free software; you can redistribute it and/or |
8 | | * modify it under the terms of the GNU Lesser General Public |
9 | | * License as published by the Free Software Foundation; either |
10 | | * version 2.1 of the License, or (at your option) any later version. |
11 | | * |
12 | | * FFmpeg is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | | * Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public |
18 | | * License along with FFmpeg; if not, write to the Free Software |
19 | | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
20 | | */ |
21 | | |
22 | | #include "config.h" |
23 | | #include "libavutil/attributes.h" |
24 | | #include "libavutil/avassert.h" |
25 | | #include "libavutil/common.h" |
26 | | #include "libavutil/cpu.h" |
27 | | #include "libavutil/x86/asm.h" |
28 | | #include "libavutil/x86/cpu.h" |
29 | | #include "libavcodec/videodsp.h" |
30 | | |
31 | | #if HAVE_X86ASM |
/*
 * Vertical edge-extension helpers. All functions declared here are defined
 * outside this file (presumably in x86 assembly, given the HAVE_X86ASM
 * guard -- confirm against the build).
 *
 * emu_edge_vfix_func: fixed-width variant; the width is implied by which
 *                     function is picked, so it takes no width argument.
 * emu_edge_vvar_func: variable-width variant; identical arguments plus a
 *                     trailing width `w`.
 *
 * vfixtbl_sse2 holds the 22 fixed-width functions in ascending order, so
 * entry [n - 1] is ff_emu_edge_vfix<n>_sse2. emulated_edge_mc() indexes it
 * with `w - 1` when the copied width is <= 22 and otherwise calls the
 * variable-width function.
 */
32 | | typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride, |
33 | | const uint8_t *src, x86_reg src_stride, |
34 | | x86_reg start_y, x86_reg end_y, x86_reg bh); |
35 | | typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride, |
36 | | const uint8_t *src, x86_reg src_stride, |
37 | | x86_reg start_y, x86_reg end_y, x86_reg bh, |
38 | | x86_reg w); |
39 | | |
40 | | extern emu_edge_vfix_func ff_emu_edge_vfix1_sse2; |
41 | | extern emu_edge_vfix_func ff_emu_edge_vfix2_sse2; |
42 | | extern emu_edge_vfix_func ff_emu_edge_vfix3_sse2; |
43 | | extern emu_edge_vfix_func ff_emu_edge_vfix4_sse2; |
44 | | extern emu_edge_vfix_func ff_emu_edge_vfix5_sse2; |
45 | | extern emu_edge_vfix_func ff_emu_edge_vfix6_sse2; |
46 | | extern emu_edge_vfix_func ff_emu_edge_vfix7_sse2; |
47 | | extern emu_edge_vfix_func ff_emu_edge_vfix8_sse2; |
48 | | extern emu_edge_vfix_func ff_emu_edge_vfix9_sse2; |
49 | | extern emu_edge_vfix_func ff_emu_edge_vfix10_sse2; |
50 | | extern emu_edge_vfix_func ff_emu_edge_vfix11_sse2; |
51 | | extern emu_edge_vfix_func ff_emu_edge_vfix12_sse2; |
52 | | extern emu_edge_vfix_func ff_emu_edge_vfix13_sse2; |
53 | | extern emu_edge_vfix_func ff_emu_edge_vfix14_sse2; |
54 | | extern emu_edge_vfix_func ff_emu_edge_vfix15_sse2; |
55 | | extern emu_edge_vfix_func ff_emu_edge_vfix16_sse2; |
56 | | extern emu_edge_vfix_func ff_emu_edge_vfix17_sse2; |
57 | | extern emu_edge_vfix_func ff_emu_edge_vfix18_sse2; |
58 | | extern emu_edge_vfix_func ff_emu_edge_vfix19_sse2; |
59 | | extern emu_edge_vfix_func ff_emu_edge_vfix20_sse2; |
60 | | extern emu_edge_vfix_func ff_emu_edge_vfix21_sse2; |
61 | | extern emu_edge_vfix_func ff_emu_edge_vfix22_sse2; |
/* Indexed by (width - 1); must stay in ascending width order. */
62 | | static emu_edge_vfix_func * const vfixtbl_sse2[22] = { |
63 | | ff_emu_edge_vfix1_sse2, ff_emu_edge_vfix2_sse2, ff_emu_edge_vfix3_sse2, |
64 | | ff_emu_edge_vfix4_sse2, ff_emu_edge_vfix5_sse2, ff_emu_edge_vfix6_sse2, |
65 | | ff_emu_edge_vfix7_sse2, ff_emu_edge_vfix8_sse2, ff_emu_edge_vfix9_sse2, |
66 | | ff_emu_edge_vfix10_sse2, ff_emu_edge_vfix11_sse2, ff_emu_edge_vfix12_sse2, |
67 | | ff_emu_edge_vfix13_sse2, ff_emu_edge_vfix14_sse2, ff_emu_edge_vfix15_sse2, |
68 | | ff_emu_edge_vfix16_sse2, ff_emu_edge_vfix17_sse2, ff_emu_edge_vfix18_sse2, |
69 | | ff_emu_edge_vfix19_sse2, ff_emu_edge_vfix20_sse2, ff_emu_edge_vfix21_sse2, |
70 | | ff_emu_edge_vfix22_sse2 |
71 | | }; |
/* Variable-width fallback; note the symbol suffix is _sse, not _sse2. */
72 | | extern emu_edge_vvar_func ff_emu_edge_vvar_sse; |
73 | | |
/*
 * Horizontal edge-extension helpers (SSE2), defined outside this file.
 * They take only a destination pointer and stride, not a source pointer.
 *
 * emu_edge_hfix_func: fixed-width variant.
 * emu_edge_hvar_func: variable-width variant; takes an extra `n_words`
 *                     count, which the caller computes as (n + 1) >> 1 for
 *                     n columns to fill.
 *
 * NOTE(review): from the call sites, `start_x` looks like the offset from
 * `dst` of the pixel to replicate -- confirm against the implementation.
 *
 * hfixtbl_sse2 only contains even widths 2..22. The caller indexes it with
 * (n - 1) >> 1 for n <= 22, so each entry serves an odd width and the next
 * even width (e.g. n = 1 and n = 2 both select ff_emu_edge_hfix2_sse2).
 */
74 | | typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride, |
75 | | x86_reg start_x, x86_reg bh); |
76 | | typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride, |
77 | | x86_reg start_x, x86_reg n_words, x86_reg bh); |
78 | | |
79 | | extern emu_edge_hfix_func ff_emu_edge_hfix2_sse2; |
80 | | extern emu_edge_hfix_func ff_emu_edge_hfix4_sse2; |
81 | | extern emu_edge_hfix_func ff_emu_edge_hfix6_sse2; |
82 | | extern emu_edge_hfix_func ff_emu_edge_hfix8_sse2; |
83 | | extern emu_edge_hfix_func ff_emu_edge_hfix10_sse2; |
84 | | extern emu_edge_hfix_func ff_emu_edge_hfix12_sse2; |
85 | | extern emu_edge_hfix_func ff_emu_edge_hfix14_sse2; |
86 | | extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2; |
87 | | extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2; |
88 | | extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2; |
89 | | extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2; |
/* Indexed by (n - 1) >> 1; must stay in ascending width order. */
90 | | static emu_edge_hfix_func * const hfixtbl_sse2[11] = { |
91 | | ff_emu_edge_hfix2_sse2, ff_emu_edge_hfix4_sse2, ff_emu_edge_hfix6_sse2, |
92 | | ff_emu_edge_hfix8_sse2, ff_emu_edge_hfix10_sse2, ff_emu_edge_hfix12_sse2, |
93 | | ff_emu_edge_hfix14_sse2, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2, |
94 | | ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 |
95 | | }; |
/* Variable-width fallback used by the caller when n > 22. */
96 | | extern emu_edge_hvar_func ff_emu_edge_hvar_sse2; |
97 | | #if HAVE_AVX2_EXTERNAL |
/*
 * AVX2 horizontal edge-extension helpers, defined outside this file and only
 * declared when HAVE_AVX2_EXTERNAL is set.
 *
 * AVX2 versions are declared for widths 8..22 only. hfixtbl_avx2 has the
 * same layout and indexing as the SSE2 table, (n - 1) >> 1, and reuses the
 * SSE2 functions for its first three entries (widths 2, 4 and 6).
 */
98 | | extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2; |
99 | | extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2; |
100 | | extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2; |
101 | | extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2; |
102 | | extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2; |
103 | | extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2; |
104 | | extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2; |
105 | | extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2; |
/* First row is intentionally SSE2: no AVX2 variant exists below width 8. */
106 | | static emu_edge_hfix_func * const hfixtbl_avx2[11] = { |
107 | | ff_emu_edge_hfix2_sse2, ff_emu_edge_hfix4_sse2, ff_emu_edge_hfix6_sse2, |
108 | | ff_emu_edge_hfix8_avx2, ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2, |
109 | | ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2, |
110 | | ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2 |
111 | | }; |
/* Variable-width fallback used by the caller when n > 22. */
112 | | extern emu_edge_hvar_func ff_emu_edge_hvar_avx2; |
113 | | #endif |
114 | | |
/*
 * Shared body of the x86 emulated_edge_mc implementations.
 *
 * Produces a block_w x block_h block in dst from a source whose read
 * position (src_x, src_y) may fall partly or entirely outside the valid
 * source coordinates bounded by w and h. The work is done in three stages:
 * a vertical helper fills the to-be-copied columns plus all rows above and
 * below, then horizontal helpers fill the columns to the left and right.
 *
 * Parameters:
 *   dst, dst_stride   destination buffer and stride, forwarded to the helpers
 *   src, src_stride   source pointer and stride
 *   block_w, block_h  size of the block to produce
 *   src_x, src_y      position of the block relative to the source
 *   w, h              source bounds; nothing is done if either is zero
 *   vfix_tbl          fixed-width vertical helpers, indexed by (width - 1)
 *   v_extend_var      vertical helper for widths above 22
 *   hfix_tbl          fixed-width horizontal helpers, indexed by (n - 1) >> 1
 *   h_extend_var      horizontal helper for more than 22 columns
 *
 * NOTE(review): comparisons such as `src_y <= -block_h` need x86_reg to be
 * a signed type; its definition is not visible in this file -- confirm.
 */
115 | | static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src, |
116 | | ptrdiff_t dst_stride, |
117 | | ptrdiff_t src_stride, |
118 | | x86_reg block_w, x86_reg block_h, |
119 | | x86_reg src_x, x86_reg src_y, |
120 | | x86_reg w, x86_reg h, |
121 | | emu_edge_vfix_func * const *vfix_tbl, |
122 | | emu_edge_vvar_func *v_extend_var, |
123 | | emu_edge_hfix_func * const *hfix_tbl, |
124 | | emu_edge_hvar_func *h_extend_var) |
125 | 8.50M | { |
126 | 8.50M | x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p; |
127 | | |
/* An empty source has no pixel to replicate. */
128 | 8.50M | if (!w || !h) |
129 | 3.58k | return; |
130 | | |
131 | 8.50M | av_assert2(block_w <= FFABS(dst_stride)); |
132 | | |
/*
 * Block lies entirely below (src_y >= h) or entirely above
 * (src_y <= -block_h) the source: clamp src_y so that exactly one source row
 * overlaps the block. The pointer is rewound by the old src_y rows here; the
 * clamped row offset is kept in src_y_add and re-applied in the single
 * `src +=` further down.
 */
133 | 8.50M | if (src_y >= h) { |
134 | 2.05M | src -= src_y*src_stride; |
135 | 2.05M | src_y_add = h - 1; |
136 | 2.05M | src_y = h - 1; |
137 | 6.44M | } else if (src_y <= -block_h) { |
138 | 890k | src -= src_y*src_stride; |
139 | 890k | src_y_add = 1 - block_h; |
140 | 890k | src_y = 1 - block_h; |
141 | 890k | } |
/* Same clamping horizontally; here the pointer is adjusted immediately. */
142 | 8.50M | if (src_x >= w) { |
143 | 1.58M | src += w - 1 - src_x; |
144 | 1.58M | src_x = w - 1; |
145 | 6.91M | } else if (src_x <= -block_w) { |
146 | 669k | src += 1 - block_w - src_x; |
147 | 669k | src_x = 1 - block_w; |
148 | 669k | } |
149 | | |
/*
 * [start_x, end_x) x [start_y, end_y) is the part of the block, in block
 * coordinates, that overlaps the source after clamping.
 */
150 | 8.50M | start_y = FFMAX(0, -src_y); |
151 | 8.50M | start_x = FFMAX(0, -src_x); |
152 | 8.50M | end_y = FFMIN(block_h, h-src_y); |
153 | 8.50M | end_x = FFMIN(block_w, w-src_x); |
154 | 8.50M | av_assert2(start_x < end_x && block_w > 0); |
155 | 8.50M | av_assert2(start_y < end_y && block_h > 0); |
156 | | |
157 | | // fill in the to-be-copied part plus all above/below |
158 | 8.50M | src += (src_y_add + start_y) * src_stride + start_x; |
/* From here on w is the width of the copied column range, not the source bound. */
159 | 8.50M | w = end_x - start_x; |
160 | 8.50M | if (w <= 22) { |
161 | 8.07M | vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride, |
162 | 8.07M | start_y, end_y, block_h); |
163 | 8.07M | } else { |
164 | 425k | v_extend_var(dst + start_x, dst_stride, src, src_stride, |
165 | 425k | start_y, end_y, block_h, w); |
166 | 425k | } |
167 | | |
168 | | // fill left |
/* start_x columns are missing on the left; table entries cover two widths each. */
169 | 8.50M | if (start_x) { |
170 | 1.59M | if (start_x <= 22) { |
171 | 1.57M | hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h); |
172 | 1.57M | } else { |
173 | 18.6k | h_extend_var(dst, dst_stride, |
174 | 18.6k | start_x, (start_x + 1) >> 1, block_h); |
175 | 18.6k | } |
176 | 1.59M | } |
177 | | |
178 | | // fill right |
/*
 * p columns are missing on the right. For odd p the destination pointer is
 * moved one byte to the left and 0 is passed as start_x; for even p the
 * pointer is dst + end_x and -1 is passed.
 * NOTE(review): in both cases dst + start_x lands on column end_x - 1, which
 * looks like the pixel being replicated -- confirm against the helpers.
 */
179 | 8.50M | p = block_w - end_x; |
180 | 8.50M | if (p) { |
181 | 3.52M | if (p <= 22) { |
182 | 3.28M | hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride, |
183 | 3.28M | -!(p & 1), block_h); |
184 | 3.28M | } else { |
185 | 233k | h_extend_var(dst + end_x - (p & 1), dst_stride, |
186 | 233k | -!(p & 1), (p + 1) >> 1, block_h); |
187 | 233k | } |
188 | 3.52M | } |
189 | 8.50M | } |
190 | | |
/*
 * SSE2 flavour of emulated_edge_mc, installed into VideoDSPContext by
 * ff_videodsp_init_x86().
 *
 * Forwards all arguments unchanged to the shared emulated_edge_mc() body,
 * binding it to the SSE2 vertical table, ff_emu_edge_vvar_sse, the SSE2
 * horizontal table and ff_emu_edge_hvar_sse2. The int size and position
 * arguments are converted to the x86_reg parameters of the shared body.
 */
191 | | static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src, |
192 | | ptrdiff_t buf_stride, |
193 | | ptrdiff_t src_stride, |
194 | | int block_w, int block_h, |
195 | | int src_x, int src_y, int w, |
196 | | int h) |
197 | 0 | { |
198 | 0 | emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, |
199 | 0 | src_x, src_y, w, h, vfixtbl_sse2, &ff_emu_edge_vvar_sse, |
200 | 0 | hfixtbl_sse2, &ff_emu_edge_hvar_sse2); |
201 | 0 | } |
202 | | |
203 | | #if HAVE_AVX2_EXTERNAL |
/*
 * AVX2 flavour of emulated_edge_mc, installed into VideoDSPContext by
 * ff_videodsp_init_x86() when AVX2 is available.
 *
 * Only the horizontal stage differs from the SSE2 wrapper: it uses
 * hfixtbl_avx2 and ff_emu_edge_hvar_avx2, while the vertical stage still
 * uses vfixtbl_sse2 and ff_emu_edge_vvar_sse. The int size and position
 * arguments are converted to the x86_reg parameters of the shared body.
 */
204 | | static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src, |
205 | | ptrdiff_t buf_stride, |
206 | | ptrdiff_t src_stride, |
207 | | int block_w, int block_h, |
208 | | int src_x, int src_y, int w, |
209 | | int h) |
210 | 8.50M | { |
211 | 8.50M | emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h, |
212 | 8.50M | src_x, src_y, w, h, vfixtbl_sse2, &ff_emu_edge_vvar_sse, |
213 | 8.50M | hfixtbl_avx2, &ff_emu_edge_hvar_avx2); |
214 | 8.50M | } |
215 | | #endif /* HAVE_AVX2_EXTERNAL */ |
216 | | #endif /* HAVE_X86ASM */ |
217 | | |
/* Prefetch routine defined outside this file; installed as ctx->prefetch below. */
218 | | void ff_prefetch_mmxext(const uint8_t *buf, ptrdiff_t stride, int h); |
219 | | |
/*
 * Install the x86-optimised VideoDSPContext function pointers that the
 * running CPU supports.
 *
 * ctx  context whose prefetch / emulated_edge_mc pointers may be overwritten
 * bpc  the emulated_edge_mc overrides are installed only when bpc <= 8
 *      (presumably bits per component -- confirm against the caller)
 *
 * Without HAVE_X86ASM the function body is empty and ctx is left untouched.
 */
220 | | av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) |
221 | 3.53M | { |
222 | 3.53M | #if HAVE_X86ASM |
223 | 3.53M | int cpu_flags = av_get_cpu_flags(); |
224 | | |
225 | 3.53M | if (EXTERNAL_MMXEXT(cpu_flags)) { |
226 | 106k | ctx->prefetch = ff_prefetch_mmxext; |
227 | 106k | } |
228 | 3.53M | if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) { |
229 | 93.7k | ctx->emulated_edge_mc = emulated_edge_mc_sse2; |
230 | 93.7k | } |
231 | 3.53M | #if HAVE_AVX2_EXTERNAL |
/* Checked after SSE2 so that the AVX2 version overrides it when both apply. */
232 | 3.53M | if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) { |
233 | 93.7k | ctx->emulated_edge_mc = emulated_edge_mc_avx2; |
234 | 93.7k | } |
235 | 3.53M | #endif |
236 | 3.53M | #endif /* HAVE_X86ASM */ |
237 | 3.53M | } |