Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vp8/common/x86/vp8_asm_stubs.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#include "vpx_config.h"
12
#include "vp8_rtcd.h"
13
#include "vpx_ports/mem.h"
14
15
/* Six-tap filter coefficient table: 8 rows of 6 * 8 shorts each. The SSE2 and
 * MMX predictors in this file select a row with xoffset / yoffset and hand it
 * to the kernels declared below. Defined outside this file. */
extern const short vp8_six_tap_x86[8][6 * 8];

/*
 * Prototypes for the filter kernels called by the predictors in this file.
 * Only the declarations live here; the definitions are provided elsewhere in
 * the build.
 */

/* MMX kernels: horizontal pass into an unsigned short buffer, then vertical
 * pass from that buffer into the unsigned char destination. */
extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
                                      unsigned short *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      unsigned int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const short *vp8_filter);
extern void vp8_filter_block1dc_v6_mmx(
    unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/* SSE2 horizontal kernels (8- and 16-wide) writing to an unsigned short
 * intermediate buffer. */
extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
                                        unsigned short *output_ptr,
                                        unsigned int src_pixels_per_line,
                                        unsigned int pixel_step,
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const short *vp8_filter);
extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int pixel_step,
                                         unsigned int output_height,
                                         unsigned int output_width,
                                         const short *vp8_filter);

/* SSE2 vertical kernels (8- and 16-wide) reading the unsigned short
 * intermediate buffer and writing unsigned char output with stride dst_pitch. */
extern void vp8_filter_block1d8_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d16_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/* Takes no filter argument; used by the 16x16 SSE2 predictor in place of the
 * horizontal filter when xoffset is zero. */
extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int output_height,
                                         unsigned int output_width);

/* Single-pass SSE2 kernels: unsigned char source straight to unsigned char
 * destination, used when only one of the two offsets needs filtering. */
extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_pitch,
                                             unsigned int output_height,
                                             const short *vp8_filter);
extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
                                              unsigned int src_pixels_per_line,
                                              unsigned char *output_ptr,
                                              int dst_pitch,
                                              unsigned int output_height,
                                              const short *vp8_filter);
extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_pitch,
                                             unsigned int output_height,
                                             const short *vp8_filter);
76
77
#if HAVE_MMX
78
void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
79
                               int xoffset, int yoffset, unsigned char *dst_ptr,
80
0
                               int dst_pitch) {
81
0
  DECLARE_ALIGNED(16, unsigned short,
82
0
                  FData2[16 * 16]); /* Temp data bufffer used in filtering */
83
0
  const short *HFilter, *VFilter;
84
0
  HFilter = vp8_six_tap_x86[xoffset];
85
0
  vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
86
0
                            src_pixels_per_line, 1, 9, 8, HFilter);
87
0
  VFilter = vp8_six_tap_x86[yoffset];
88
0
  vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
89
0
                             VFilter);
90
0
}
91
#endif
92
93
#if HAVE_SSE2
94
void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
95
                                  int src_pixels_per_line, int xoffset,
96
                                  int yoffset, unsigned char *dst_ptr,
97
0
                                  int dst_pitch) {
98
0
  DECLARE_ALIGNED(16, unsigned short,
99
0
                  FData2[24 * 24]); /* Temp data bufffer used in filtering */
100
101
0
  const short *HFilter, *VFilter;
102
103
0
  if (xoffset) {
104
0
    if (yoffset) {
105
0
      HFilter = vp8_six_tap_x86[xoffset];
106
0
      vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
107
0
                                   src_pixels_per_line, 1, 21, 32, HFilter);
108
0
      VFilter = vp8_six_tap_x86[yoffset];
109
0
      vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
110
0
                                   dst_pitch, VFilter);
111
0
    } else {
112
      /* First-pass only */
113
0
      HFilter = vp8_six_tap_x86[xoffset];
114
0
      vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
115
0
                                        dst_pitch, 16, HFilter);
116
0
    }
117
0
  } else {
118
    /* Second-pass only */
119
0
    VFilter = vp8_six_tap_x86[yoffset];
120
0
    vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
121
0
                                 src_pixels_per_line, 21, 32);
122
0
    vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
123
0
                                 dst_pitch, VFilter);
124
0
  }
125
0
}
126
127
void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
128
                                int xoffset, int yoffset,
129
0
                                unsigned char *dst_ptr, int dst_pitch) {
130
0
  DECLARE_ALIGNED(16, unsigned short,
131
0
                  FData2[256]); /* Temp data bufffer used in filtering */
132
0
  const short *HFilter, *VFilter;
133
134
0
  if (xoffset) {
135
0
    if (yoffset) {
136
0
      HFilter = vp8_six_tap_x86[xoffset];
137
0
      vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
138
0
                                  src_pixels_per_line, 1, 13, 16, HFilter);
139
0
      VFilter = vp8_six_tap_x86[yoffset];
140
0
      vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
141
0
                                  dst_pitch, VFilter);
142
0
    } else {
143
      /* First-pass only */
144
0
      HFilter = vp8_six_tap_x86[xoffset];
145
0
      vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
146
0
                                       dst_pitch, 8, HFilter);
147
0
    }
148
0
  } else {
149
    /* Second-pass only */
150
0
    VFilter = vp8_six_tap_x86[yoffset];
151
0
    vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
152
0
                                     src_pixels_per_line, dst_ptr, dst_pitch, 8,
153
0
                                     VFilter);
154
0
  }
155
0
}
156
157
void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
158
                                int xoffset, int yoffset,
159
0
                                unsigned char *dst_ptr, int dst_pitch) {
160
0
  DECLARE_ALIGNED(16, unsigned short,
161
0
                  FData2[256]); /* Temp data bufffer used in filtering */
162
0
  const short *HFilter, *VFilter;
163
164
0
  if (xoffset) {
165
0
    if (yoffset) {
166
0
      HFilter = vp8_six_tap_x86[xoffset];
167
0
      vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
168
0
                                  src_pixels_per_line, 1, 9, 16, HFilter);
169
0
      VFilter = vp8_six_tap_x86[yoffset];
170
0
      vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
171
0
                                  dst_pitch, VFilter);
172
0
    } else {
173
      /* First-pass only */
174
0
      HFilter = vp8_six_tap_x86[xoffset];
175
0
      vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
176
0
                                       dst_pitch, 4, HFilter);
177
0
    }
178
0
  } else {
179
    /* Second-pass only */
180
0
    VFilter = vp8_six_tap_x86[yoffset];
181
0
    vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
182
0
                                     src_pixels_per_line, dst_ptr, dst_pitch, 4,
183
0
                                     VFilter);
184
0
  }
185
0
}
186
187
#endif
188
189
#if HAVE_SSSE3
190
191
/*
 * SSSE3 filter kernels called by the predictors below; defined outside this
 * file. All six share one shape: unsigned char source with its stride,
 * unsigned char destination with its stride, a row count, and a filter index
 * (the predictors pass xoffset / yoffset directly).
 */

/* Horizontal kernels. */
void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned char *output_ptr,
                                  unsigned int output_pitch,
                                  unsigned int output_height,
                                  unsigned int vp8_filter_index);

void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned char *output_ptr,
                                  unsigned int output_pitch,
                                  unsigned int output_height,
                                  unsigned int vp8_filter_index);

void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
                                   unsigned int src_pixels_per_line,
                                   unsigned char *output_ptr,
                                   unsigned int output_pitch,
                                   unsigned int output_height,
                                   unsigned int vp8_filter_index);

/* Vertical kernels. */
void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
                                  unsigned int src_pitch,
                                  unsigned char *output_ptr,
                                  unsigned int out_pitch,
                                  unsigned int output_height,
                                  unsigned int vp8_filter_index);

void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
                                  unsigned int src_pitch,
                                  unsigned char *output_ptr,
                                  unsigned int out_pitch,
                                  unsigned int output_height,
                                  unsigned int vp8_filter_index);

void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
                                   unsigned int src_pitch,
                                   unsigned char *output_ptr,
                                   unsigned int out_pitch,
                                   unsigned int output_height,
                                   unsigned int vp8_filter_index);
232
233
void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
234
                                   int src_pixels_per_line, int xoffset,
235
                                   int yoffset, unsigned char *dst_ptr,
236
709k
                                   int dst_pitch) {
237
709k
  DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
238
239
709k
  if (xoffset) {
240
494k
    if (yoffset) {
241
332k
      vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
242
332k
                                    src_pixels_per_line, FData2, 16, 21,
243
332k
                                    xoffset);
244
332k
      vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16,
245
332k
                                    yoffset);
246
332k
    } else {
247
      /* First-pass only */
248
162k
      vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
249
162k
                                    dst_pitch, 16, xoffset);
250
162k
    }
251
494k
  } else {
252
214k
    if (yoffset) {
253
      /* Second-pass only */
254
214k
      vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
255
214k
                                    src_pixels_per_line, dst_ptr, dst_pitch, 16,
256
214k
                                    yoffset);
257
214k
    } else {
258
      /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
259
       * yoffset==0) case correctly. Add copy function here to guarantee
260
       * six-tap function handles all possible offsets. */
261
0
      vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
262
0
    }
263
214k
  }
264
709k
}
265
266
void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
267
                                 int src_pixels_per_line, int xoffset,
268
                                 int yoffset, unsigned char *dst_ptr,
269
2.63M
                                 int dst_pitch) {
270
2.63M
  DECLARE_ALIGNED(16, unsigned char, FData2[256]);
271
272
2.63M
  if (xoffset) {
273
1.96M
    if (yoffset) {
274
1.32M
      vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
275
1.32M
                                   src_pixels_per_line, FData2, 8, 13, xoffset);
276
1.32M
      vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
277
1.32M
    } else {
278
642k
      vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
279
642k
                                   dst_pitch, 8, xoffset);
280
642k
    }
281
1.96M
  } else {
282
675k
    if (yoffset) {
283
      /* Second-pass only */
284
675k
      vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
285
675k
                                   src_pixels_per_line, dst_ptr, dst_pitch, 8,
286
675k
                                   yoffset);
287
675k
    } else {
288
      /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
289
       * yoffset==0) case correctly. Add copy function here to guarantee
290
       * six-tap function handles all possible offsets. */
291
0
      vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
292
0
    }
293
675k
  }
294
2.63M
}
295
296
void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
297
                                 int src_pixels_per_line, int xoffset,
298
                                 int yoffset, unsigned char *dst_ptr,
299
584k
                                 int dst_pitch) {
300
584k
  DECLARE_ALIGNED(16, unsigned char, FData2[256]);
301
302
584k
  if (xoffset) {
303
435k
    if (yoffset) {
304
268k
      vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
305
268k
                                   src_pixels_per_line, FData2, 8, 9, xoffset);
306
268k
      vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
307
268k
    } else {
308
      /* First-pass only */
309
167k
      vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
310
167k
                                   dst_pitch, 4, xoffset);
311
167k
    }
312
435k
  } else {
313
148k
    if (yoffset) {
314
      /* Second-pass only */
315
148k
      vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
316
148k
                                   src_pixels_per_line, dst_ptr, dst_pitch, 4,
317
148k
                                   yoffset);
318
148k
    } else {
319
      /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
320
       * yoffset==0) case correctly. Add copy function here to guarantee
321
       * six-tap function handles all possible offsets. */
322
0
      vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
323
0
    }
324
148k
  }
325
584k
}
326
327
void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
328
                                 int src_pixels_per_line, int xoffset,
329
                                 int yoffset, unsigned char *dst_ptr,
330
16.0M
                                 int dst_pitch) {
331
16.0M
  DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
332
333
16.0M
  if (xoffset) {
334
11.8M
    if (yoffset) {
335
7.02M
      vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
336
7.02M
                                   src_pixels_per_line, FData2, 4, 9, xoffset);
337
7.02M
      vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
338
7.02M
    } else {
339
4.81M
      vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
340
4.81M
                                   dst_pitch, 4, xoffset);
341
4.81M
    }
342
11.8M
  } else {
343
4.17M
    if (yoffset) {
344
4.17M
      vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
345
4.17M
                                   src_pixels_per_line, dst_ptr, dst_pitch, 4,
346
4.17M
                                   yoffset);
347
4.17M
    } else {
348
      /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
349
       * yoffset==0) case correctly. Add copy function here to guarantee
350
       * six-tap function handles all possible offsets. */
351
0
      int r;
352
353
0
      for (r = 0; r < 4; ++r) {
354
0
        dst_ptr[0] = src_ptr[0];
355
0
        dst_ptr[1] = src_ptr[1];
356
0
        dst_ptr[2] = src_ptr[2];
357
0
        dst_ptr[3] = src_ptr[3];
358
0
        dst_ptr += dst_pitch;
359
0
        src_ptr += src_pixels_per_line;
360
0
      }
361
0
    }
362
4.17M
  }
363
16.0M
}
364
365
#endif