/src/libvpx/vp8/common/x86/vp8_asm_stubs.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #include "vpx_config.h" |
12 | | #include "vp8_rtcd.h" |
13 | | #include "vpx_ports/mem.h" |
14 | | |
/* Six-tap filter coefficient table, defined in another translation unit.
 * The prediction wrappers in this file index its first dimension with the
 * xoffset / yoffset arguments they receive. */
extern const short vp8_six_tap_x86[8][6 * 8];

/* Filter routines declared here and called by the MMX / SSE2 prediction
 * wrappers in this file.
 * NOTE(review): presumably implemented in x86 assembly -- confirm against the
 * build files. */

/* MMX first pass: byte source, unsigned short output. */
extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
                                      unsigned short *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      unsigned int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const short *vp8_filter);
/* MMX second pass: unsigned short source, byte output. */
extern void vp8_filter_block1dc_v6_mmx(
    unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/* SSE2 first pass for the 8-wide and 16-wide wrappers: byte source, unsigned
 * short output. */
extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
                                        unsigned short *output_ptr,
                                        unsigned int src_pixels_per_line,
                                        unsigned int pixel_step,
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const short *vp8_filter);
extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int pixel_step,
                                         unsigned int output_height,
                                         unsigned int output_width,
                                         const short *vp8_filter);

/* SSE2 second pass: unsigned short source, byte output. */
extern void vp8_filter_block1d8_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d16_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/* Used by the 16x16 SSE2 wrapper in place of the first pass when xoffset is
 * zero: byte source, unsigned short output, no filter argument. */
extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int output_height,
                                         unsigned int output_width);

/* Single-pass SSE2 variants: byte source and byte output, no intermediate
 * buffer. */
extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_pitch,
                                             unsigned int output_height,
                                             const short *vp8_filter);
extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
                                              unsigned int src_pixels_per_line,
                                              unsigned char *output_ptr,
                                              int dst_pitch,
                                              unsigned int output_height,
                                              const short *vp8_filter);
extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_pitch,
                                             unsigned int output_height,
                                             const short *vp8_filter);
76 | | |
77 | | #if HAVE_MMX |
78 | | void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, |
79 | | int xoffset, int yoffset, unsigned char *dst_ptr, |
80 | 0 | int dst_pitch) { |
81 | 0 | DECLARE_ALIGNED(16, unsigned short, |
82 | 0 | FData2[16 * 16]); /* Temp data bufffer used in filtering */ |
83 | 0 | const short *HFilter, *VFilter; |
84 | 0 | HFilter = vp8_six_tap_x86[xoffset]; |
85 | 0 | vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, |
86 | 0 | src_pixels_per_line, 1, 9, 8, HFilter); |
87 | 0 | VFilter = vp8_six_tap_x86[yoffset]; |
88 | 0 | vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4, |
89 | 0 | VFilter); |
90 | 0 | } |
91 | | #endif |
92 | | |
93 | | #if HAVE_SSE2 |
94 | | void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, |
95 | | int src_pixels_per_line, int xoffset, |
96 | | int yoffset, unsigned char *dst_ptr, |
97 | 0 | int dst_pitch) { |
98 | 0 | DECLARE_ALIGNED(16, unsigned short, |
99 | 0 | FData2[24 * 24]); /* Temp data bufffer used in filtering */ |
100 | |
|
101 | 0 | const short *HFilter, *VFilter; |
102 | |
|
103 | 0 | if (xoffset) { |
104 | 0 | if (yoffset) { |
105 | 0 | HFilter = vp8_six_tap_x86[xoffset]; |
106 | 0 | vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, |
107 | 0 | src_pixels_per_line, 1, 21, 32, HFilter); |
108 | 0 | VFilter = vp8_six_tap_x86[yoffset]; |
109 | 0 | vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, |
110 | 0 | dst_pitch, VFilter); |
111 | 0 | } else { |
112 | | /* First-pass only */ |
113 | 0 | HFilter = vp8_six_tap_x86[xoffset]; |
114 | 0 | vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, |
115 | 0 | dst_pitch, 16, HFilter); |
116 | 0 | } |
117 | 0 | } else { |
118 | | /* Second-pass only */ |
119 | 0 | VFilter = vp8_six_tap_x86[yoffset]; |
120 | 0 | vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, |
121 | 0 | src_pixels_per_line, 21, 32); |
122 | 0 | vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16, |
123 | 0 | dst_pitch, VFilter); |
124 | 0 | } |
125 | 0 | } |
126 | | |
127 | | void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, |
128 | | int xoffset, int yoffset, |
129 | 0 | unsigned char *dst_ptr, int dst_pitch) { |
130 | 0 | DECLARE_ALIGNED(16, unsigned short, |
131 | 0 | FData2[256]); /* Temp data bufffer used in filtering */ |
132 | 0 | const short *HFilter, *VFilter; |
133 | |
|
134 | 0 | if (xoffset) { |
135 | 0 | if (yoffset) { |
136 | 0 | HFilter = vp8_six_tap_x86[xoffset]; |
137 | 0 | vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, |
138 | 0 | src_pixels_per_line, 1, 13, 16, HFilter); |
139 | 0 | VFilter = vp8_six_tap_x86[yoffset]; |
140 | 0 | vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8, |
141 | 0 | dst_pitch, VFilter); |
142 | 0 | } else { |
143 | | /* First-pass only */ |
144 | 0 | HFilter = vp8_six_tap_x86[xoffset]; |
145 | 0 | vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, |
146 | 0 | dst_pitch, 8, HFilter); |
147 | 0 | } |
148 | 0 | } else { |
149 | | /* Second-pass only */ |
150 | 0 | VFilter = vp8_six_tap_x86[yoffset]; |
151 | 0 | vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), |
152 | 0 | src_pixels_per_line, dst_ptr, dst_pitch, 8, |
153 | 0 | VFilter); |
154 | 0 | } |
155 | 0 | } |
156 | | |
157 | | void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, |
158 | | int xoffset, int yoffset, |
159 | 0 | unsigned char *dst_ptr, int dst_pitch) { |
160 | 0 | DECLARE_ALIGNED(16, unsigned short, |
161 | 0 | FData2[256]); /* Temp data bufffer used in filtering */ |
162 | 0 | const short *HFilter, *VFilter; |
163 | |
|
164 | 0 | if (xoffset) { |
165 | 0 | if (yoffset) { |
166 | 0 | HFilter = vp8_six_tap_x86[xoffset]; |
167 | 0 | vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, |
168 | 0 | src_pixels_per_line, 1, 9, 16, HFilter); |
169 | 0 | VFilter = vp8_six_tap_x86[yoffset]; |
170 | 0 | vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4, |
171 | 0 | dst_pitch, VFilter); |
172 | 0 | } else { |
173 | | /* First-pass only */ |
174 | 0 | HFilter = vp8_six_tap_x86[xoffset]; |
175 | 0 | vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, |
176 | 0 | dst_pitch, 4, HFilter); |
177 | 0 | } |
178 | 0 | } else { |
179 | | /* Second-pass only */ |
180 | 0 | VFilter = vp8_six_tap_x86[yoffset]; |
181 | 0 | vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), |
182 | 0 | src_pixels_per_line, dst_ptr, dst_pitch, 4, |
183 | 0 | VFilter); |
184 | 0 | } |
185 | 0 | } |
186 | | |
187 | | #endif |
188 | | |
189 | | #if HAVE_SSSE3 |
190 | | |
/* SSSE3 filter routines called by the SSSE3 prediction wrappers in this file.
 * All six share one parameter layout: byte source and its stride, byte
 * destination and its pitch, a row count, and a filter index. The wrappers
 * pass their xoffset to the h6 routines and their yoffset to the v6 routines
 * as vp8_filter_index.
 * NOTE(review): presumably implemented in x86 assembly -- confirm against the
 * build files. */

/* First-pass (h6) routines for 8-, 16- and 4-wide blocks. */
void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned char *output_ptr,
                                  unsigned int output_pitch,
                                  unsigned int output_height,
                                  unsigned int vp8_filter_index);

void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
                                   unsigned int src_pixels_per_line,
                                   unsigned char *output_ptr,
                                   unsigned int output_pitch,
                                   unsigned int output_height,
                                   unsigned int vp8_filter_index);

void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned char *output_ptr,
                                  unsigned int output_pitch,
                                  unsigned int output_height,
                                  unsigned int vp8_filter_index);

/* Second-pass (v6) routines for 16-, 8- and 4-wide blocks. */
void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
                                   unsigned int src_pitch,
                                   unsigned char *output_ptr,
                                   unsigned int out_pitch,
                                   unsigned int output_height,
                                   unsigned int vp8_filter_index);

void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
                                  unsigned int src_pitch,
                                  unsigned char *output_ptr,
                                  unsigned int out_pitch,
                                  unsigned int output_height,
                                  unsigned int vp8_filter_index);

void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
                                  unsigned int src_pitch,
                                  unsigned char *output_ptr,
                                  unsigned int out_pitch,
                                  unsigned int output_height,
                                  unsigned int vp8_filter_index);
232 | | |
233 | | void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, |
234 | | int src_pixels_per_line, int xoffset, |
235 | | int yoffset, unsigned char *dst_ptr, |
236 | 709k | int dst_pitch) { |
237 | 709k | DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]); |
238 | | |
239 | 709k | if (xoffset) { |
240 | 494k | if (yoffset) { |
241 | 332k | vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), |
242 | 332k | src_pixels_per_line, FData2, 16, 21, |
243 | 332k | xoffset); |
244 | 332k | vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16, |
245 | 332k | yoffset); |
246 | 332k | } else { |
247 | | /* First-pass only */ |
248 | 162k | vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, |
249 | 162k | dst_pitch, 16, xoffset); |
250 | 162k | } |
251 | 494k | } else { |
252 | 214k | if (yoffset) { |
253 | | /* Second-pass only */ |
254 | 214k | vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), |
255 | 214k | src_pixels_per_line, dst_ptr, dst_pitch, 16, |
256 | 214k | yoffset); |
257 | 214k | } else { |
258 | | /* ssse3 second-pass only function couldn't handle (xoffset==0 && |
259 | | * yoffset==0) case correctly. Add copy function here to guarantee |
260 | | * six-tap function handles all possible offsets. */ |
261 | 0 | vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); |
262 | 0 | } |
263 | 214k | } |
264 | 709k | } |
265 | | |
266 | | void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, |
267 | | int src_pixels_per_line, int xoffset, |
268 | | int yoffset, unsigned char *dst_ptr, |
269 | 2.63M | int dst_pitch) { |
270 | 2.63M | DECLARE_ALIGNED(16, unsigned char, FData2[256]); |
271 | | |
272 | 2.63M | if (xoffset) { |
273 | 1.96M | if (yoffset) { |
274 | 1.32M | vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), |
275 | 1.32M | src_pixels_per_line, FData2, 8, 13, xoffset); |
276 | 1.32M | vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset); |
277 | 1.32M | } else { |
278 | 642k | vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, |
279 | 642k | dst_pitch, 8, xoffset); |
280 | 642k | } |
281 | 1.96M | } else { |
282 | 675k | if (yoffset) { |
283 | | /* Second-pass only */ |
284 | 675k | vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), |
285 | 675k | src_pixels_per_line, dst_ptr, dst_pitch, 8, |
286 | 675k | yoffset); |
287 | 675k | } else { |
288 | | /* ssse3 second-pass only function couldn't handle (xoffset==0 && |
289 | | * yoffset==0) case correctly. Add copy function here to guarantee |
290 | | * six-tap function handles all possible offsets. */ |
291 | 0 | vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); |
292 | 0 | } |
293 | 675k | } |
294 | 2.63M | } |
295 | | |
296 | | void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, |
297 | | int src_pixels_per_line, int xoffset, |
298 | | int yoffset, unsigned char *dst_ptr, |
299 | 584k | int dst_pitch) { |
300 | 584k | DECLARE_ALIGNED(16, unsigned char, FData2[256]); |
301 | | |
302 | 584k | if (xoffset) { |
303 | 435k | if (yoffset) { |
304 | 268k | vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), |
305 | 268k | src_pixels_per_line, FData2, 8, 9, xoffset); |
306 | 268k | vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset); |
307 | 268k | } else { |
308 | | /* First-pass only */ |
309 | 167k | vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, |
310 | 167k | dst_pitch, 4, xoffset); |
311 | 167k | } |
312 | 435k | } else { |
313 | 148k | if (yoffset) { |
314 | | /* Second-pass only */ |
315 | 148k | vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), |
316 | 148k | src_pixels_per_line, dst_ptr, dst_pitch, 4, |
317 | 148k | yoffset); |
318 | 148k | } else { |
319 | | /* ssse3 second-pass only function couldn't handle (xoffset==0 && |
320 | | * yoffset==0) case correctly. Add copy function here to guarantee |
321 | | * six-tap function handles all possible offsets. */ |
322 | 0 | vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); |
323 | 0 | } |
324 | 148k | } |
325 | 584k | } |
326 | | |
327 | | void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, |
328 | | int src_pixels_per_line, int xoffset, |
329 | | int yoffset, unsigned char *dst_ptr, |
330 | 16.0M | int dst_pitch) { |
331 | 16.0M | DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]); |
332 | | |
333 | 16.0M | if (xoffset) { |
334 | 11.8M | if (yoffset) { |
335 | 7.02M | vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), |
336 | 7.02M | src_pixels_per_line, FData2, 4, 9, xoffset); |
337 | 7.02M | vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset); |
338 | 7.02M | } else { |
339 | 4.81M | vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, |
340 | 4.81M | dst_pitch, 4, xoffset); |
341 | 4.81M | } |
342 | 11.8M | } else { |
343 | 4.17M | if (yoffset) { |
344 | 4.17M | vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), |
345 | 4.17M | src_pixels_per_line, dst_ptr, dst_pitch, 4, |
346 | 4.17M | yoffset); |
347 | 4.17M | } else { |
348 | | /* ssse3 second-pass only function couldn't handle (xoffset==0 && |
349 | | * yoffset==0) case correctly. Add copy function here to guarantee |
350 | | * six-tap function handles all possible offsets. */ |
351 | 0 | int r; |
352 | |
|
353 | 0 | for (r = 0; r < 4; ++r) { |
354 | 0 | dst_ptr[0] = src_ptr[0]; |
355 | 0 | dst_ptr[1] = src_ptr[1]; |
356 | 0 | dst_ptr[2] = src_ptr[2]; |
357 | 0 | dst_ptr[3] = src_ptr[3]; |
358 | 0 | dst_ptr += dst_pitch; |
359 | 0 | src_ptr += src_pixels_per_line; |
360 | 0 | } |
361 | 0 | } |
362 | 4.17M | } |
363 | 16.0M | } |
364 | | |
365 | | #endif |