Coverage Report

Created: 2025-08-28 07:12

/src/libvpx/vpx_dsp/x86/convolve.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
#ifndef VPX_VPX_DSP_X86_CONVOLVE_H_
11
#define VPX_VPX_DSP_X86_CONVOLVE_H_
12
13
#include <assert.h>
14
15
#include "./vpx_config.h"
16
#include "vpx/vpx_integer.h"
17
#include "vpx_ports/compiler_attributes.h"
18
19
// TODO(chiyotsai@google.com): Refactor the code here. Currently this is pretty
20
// hacky and awful to read. Note that there is a filter_x[3] == 128 check in
21
// HIGHBD_FUN_CONV_2D to avoid seg fault due to the fact that the c function
22
// assumes the filter is always 8 tap.
23
typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
24
                                uint8_t *output_ptr, ptrdiff_t out_pitch,
25
                                uint32_t output_height, const int16_t *filter);
26
27
// TODO(chiyotsai@google.com): Remove the is_avg argument to the MACROS once we
28
// have 4-tap vert avg filter.
29
#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \
30
  void vpx_convolve8_##name##_##opt(                                         \
31
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
32
      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \
33
249M
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
34
249M
    const int16_t *filter_row = filter[offset];                              \
35
249M
    (void)x0_q4;                                                             \
36
249M
    (void)x_step_q4;                                                         \
37
249M
    (void)y0_q4;                                                             \
38
249M
    (void)y_step_q4;                                                         \
39
249M
    assert(filter_row[3] != 128);                                            \
40
249M
    assert(step_q4 == 16);                                                   \
41
249M
    if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {     \
42
124M
      const int num_taps = 8;                                                \
43
140M
      while (w >= 16) {                                                      \
44
16.1M
        vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
45
16.1M
                                                 dst_stride, h, filter_row); \
46
16.1M
        src += 16;                                                           \
47
16.1M
        dst += 16;                                                           \
48
16.1M
        w -= 16;                                                             \
49
16.1M
      }                                                                      \
50
124M
      if (w == 8) {                                                          \
51
35.3M
        vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
52
35.3M
                                                dst_stride, h, filter_row);  \
53
89.0M
      } else if (w == 4) {                                                   \
54
75.9M
        vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
55
75.9M
                                                dst_stride, h, filter_row);  \
56
75.9M
      }                                                                      \
57
124M
      (void)num_taps;                                                        \
58
125M
    } else if (filter_row[2] | filter_row[5]) {                              \
59
123M
      const int num_taps = is_avg ? 8 : 4;                                   \
60
136M
      while (w >= 16) {                                                      \
61
13.0M
        vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
62
13.0M
                                                 dst_stride, h, filter_row); \
63
13.0M
        src += 16;                                                           \
64
13.0M
        dst += 16;                                                           \
65
13.0M
        w -= 16;                                                             \
66
13.0M
      }                                                                      \
67
123M
      if (w == 8) {                                                          \
68
28.7M
        vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
69
28.7M
                                                dst_stride, h, filter_row);  \
70
95.1M
      } else if (w == 4) {                                                   \
71
85.2M
        vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
72
85.2M
                                                dst_stride, h, filter_row);  \
73
85.2M
      }                                                                      \
74
123M
      (void)num_taps;                                                        \
75
123M
    } else {                                                                 \
76
1.38M
      const int num_taps = 2;                                                \
77
1.98M
      while (w >= 16) {                                                      \
78
596k
        vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
79
596k
                                                 dst_stride, h, filter_row); \
80
596k
        src += 16;                                                           \
81
596k
        dst += 16;                                                           \
82
596k
        w -= 16;                                                             \
83
596k
      }                                                                      \
84
1.38M
      if (w == 8) {                                                          \
85
414k
        vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst,  \
86
414k
                                                dst_stride, h, filter_row);  \
87
969k
      } else if (w == 4) {                                                   \
88
555k
        vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst,  \
89
555k
                                                dst_stride, h, filter_row);  \
90
555k
      }                                                                      \
91
1.38M
      (void)num_taps;                                                        \
92
1.38M
    }                                                                        \
93
249M
  }
Unexecuted instantiation: vpx_convolve8_horiz_sse2
Unexecuted instantiation: vpx_convolve8_vert_sse2
Unexecuted instantiation: vpx_convolve8_avg_horiz_sse2
Unexecuted instantiation: vpx_convolve8_avg_vert_sse2
vpx_convolve8_horiz_avx2
Line
Count
Source
33
123M
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
34
123M
    const int16_t *filter_row = filter[offset];                              \
35
123M
    (void)x0_q4;                                                             \
36
123M
    (void)x_step_q4;                                                         \
37
123M
    (void)y0_q4;                                                             \
38
123M
    (void)y_step_q4;                                                         \
39
123M
    assert(filter_row[3] != 128);                                            \
40
123M
    assert(step_q4 == 16);                                                   \
41
123M
    if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {     \
42
61.0M
      const int num_taps = 8;                                                \
43
68.8M
      while (w >= 16) {                                                      \
44
7.82M
        vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
45
7.82M
                                                 dst_stride, h, filter_row); \
46
7.82M
        src += 16;                                                           \
47
7.82M
        dst += 16;                                                           \
48
7.82M
        w -= 16;                                                             \
49
7.82M
      }                                                                      \
50
61.0M
      if (w == 8) {                                                          \
51
17.4M
        vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
52
17.4M
                                                dst_stride, h, filter_row);  \
53
43.5M
      } else if (w == 4) {                                                   \
54
37.2M
        vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
55
37.2M
                                                dst_stride, h, filter_row);  \
56
37.2M
      }                                                                      \
57
61.0M
      (void)num_taps;                                                        \
58
62.8M
    } else if (filter_row[2] | filter_row[5]) {                              \
59
62.1M
      const int num_taps = is_avg ? 8 : 4;                                   \
60
68.6M
      while (w >= 16) {                                                      \
61
6.51M
        vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
62
6.51M
                                                 dst_stride, h, filter_row); \
63
6.51M
        src += 16;                                                           \
64
6.51M
        dst += 16;                                                           \
65
6.51M
        w -= 16;                                                             \
66
6.51M
      }                                                                      \
67
62.1M
      if (w == 8) {                                                          \
68
14.4M
        vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
69
14.4M
                                                dst_stride, h, filter_row);  \
70
47.7M
      } else if (w == 4) {                                                   \
71
42.7M
        vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
72
42.7M
                                                dst_stride, h, filter_row);  \
73
42.7M
      }                                                                      \
74
62.1M
      (void)num_taps;                                                        \
75
62.1M
    } else {                                                                 \
76
628k
      const int num_taps = 2;                                                \
77
898k
      while (w >= 16) {                                                      \
78
269k
        vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
79
269k
                                                 dst_stride, h, filter_row); \
80
269k
        src += 16;                                                           \
81
269k
        dst += 16;                                                           \
82
269k
        w -= 16;                                                             \
83
269k
      }                                                                      \
84
628k
      if (w == 8) {                                                          \
85
188k
        vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst,  \
86
188k
                                                dst_stride, h, filter_row);  \
87
440k
      } else if (w == 4) {                                                   \
88
253k
        vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst,  \
89
253k
                                                dst_stride, h, filter_row);  \
90
253k
      }                                                                      \
91
628k
      (void)num_taps;                                                        \
92
628k
    }                                                                        \
93
123M
  }
vpx_convolve8_vert_avx2
Line
Count
Source
33
122M
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
34
122M
    const int16_t *filter_row = filter[offset];                              \
35
122M
    (void)x0_q4;                                                             \
36
122M
    (void)x_step_q4;                                                         \
37
122M
    (void)y0_q4;                                                             \
38
122M
    (void)y_step_q4;                                                         \
39
122M
    assert(filter_row[3] != 128);                                            \
40
122M
    assert(step_q4 == 16);                                                   \
41
122M
    if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {     \
42
60.4M
      const int num_taps = 8;                                                \
43
68.0M
      while (w >= 16) {                                                      \
44
7.60M
        vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
45
7.60M
                                                 dst_stride, h, filter_row); \
46
7.60M
        src += 16;                                                           \
47
7.60M
        dst += 16;                                                           \
48
7.60M
        w -= 16;                                                             \
49
7.60M
      }                                                                      \
50
60.4M
      if (w == 8) {                                                          \
51
17.0M
        vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
52
17.0M
                                                dst_stride, h, filter_row);  \
53
43.3M
      } else if (w == 4) {                                                   \
54
37.2M
        vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
55
37.2M
                                                dst_stride, h, filter_row);  \
56
37.2M
      }                                                                      \
57
60.4M
      (void)num_taps;                                                        \
58
62.2M
    } else if (filter_row[2] | filter_row[5]) {                              \
59
61.7M
      const int num_taps = is_avg ? 8 : 4;                                   \
60
68.2M
      while (w >= 16) {                                                      \
61
6.50M
        vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
62
6.50M
                                                 dst_stride, h, filter_row); \
63
6.50M
        src += 16;                                                           \
64
6.50M
        dst += 16;                                                           \
65
6.50M
        w -= 16;                                                             \
66
6.50M
      }                                                                      \
67
61.7M
      if (w == 8) {                                                          \
68
14.3M
        vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
69
14.3M
                                                dst_stride, h, filter_row);  \
70
47.4M
      } else if (w == 4) {                                                   \
71
42.4M
        vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
72
42.4M
                                                dst_stride, h, filter_row);  \
73
42.4M
      }                                                                      \
74
61.7M
      (void)num_taps;                                                        \
75
61.7M
    } else {                                                                 \
76
478k
      const int num_taps = 2;                                                \
77
686k
      while (w >= 16) {                                                      \
78
207k
        vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
79
207k
                                                 dst_stride, h, filter_row); \
80
207k
        src += 16;                                                           \
81
207k
        dst += 16;                                                           \
82
207k
        w -= 16;                                                             \
83
207k
      }                                                                      \
84
478k
      if (w == 8) {                                                          \
85
144k
        vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst,  \
86
144k
                                                dst_stride, h, filter_row);  \
87
334k
      } else if (w == 4) {                                                   \
88
190k
        vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst,  \
89
190k
                                                dst_stride, h, filter_row);  \
90
190k
      }                                                                      \
91
478k
      (void)num_taps;                                                        \
92
478k
    }                                                                        \
93
122M
  }
vpx_convolve8_avg_horiz_avx2
Line
Count
Source
33
548k
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
34
548k
    const int16_t *filter_row = filter[offset];                              \
35
548k
    (void)x0_q4;                                                             \
36
548k
    (void)x_step_q4;                                                         \
37
548k
    (void)y0_q4;                                                             \
38
548k
    (void)y_step_q4;                                                         \
39
548k
    assert(filter_row[3] != 128);                                            \
40
548k
    assert(step_q4 == 16);                                                   \
41
548k
    if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {     \
42
496k
      const int num_taps = 8;                                                \
43
641k
      while (w >= 16) {                                                      \
44
145k
        vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
45
145k
                                                 dst_stride, h, filter_row); \
46
145k
        src += 16;                                                           \
47
145k
        dst += 16;                                                           \
48
145k
        w -= 16;                                                             \
49
145k
      }                                                                      \
50
496k
      if (w == 8) {                                                          \
51
157k
        vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
52
157k
                                                dst_stride, h, filter_row);  \
53
338k
      } else if (w == 4) {                                                   \
54
232k
        vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
55
232k
                                                dst_stride, h, filter_row);  \
56
232k
      }                                                                      \
57
496k
      (void)num_taps;                                                        \
58
496k
    } else if (filter_row[2] | filter_row[5]) {                              \
59
0
      const int num_taps = is_avg ? 8 : 4;                                   \
60
0
      while (w >= 16) {                                                      \
61
0
        vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
62
0
                                                 dst_stride, h, filter_row); \
63
0
        src += 16;                                                           \
64
0
        dst += 16;                                                           \
65
0
        w -= 16;                                                             \
66
0
      }                                                                      \
67
0
      if (w == 8) {                                                          \
68
0
        vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
69
0
                                                dst_stride, h, filter_row);  \
70
0
      } else if (w == 4) {                                                   \
71
0
        vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
72
0
                                                dst_stride, h, filter_row);  \
73
0
      }                                                                      \
74
0
      (void)num_taps;                                                        \
75
52.4k
    } else {                                                                 \
76
52.4k
      const int num_taps = 2;                                                \
77
73.7k
      while (w >= 16) {                                                      \
78
21.3k
        vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
79
21.3k
                                                 dst_stride, h, filter_row); \
80
21.3k
        src += 16;                                                           \
81
21.3k
        dst += 16;                                                           \
82
21.3k
        w -= 16;                                                             \
83
21.3k
      }                                                                      \
84
52.4k
      if (w == 8) {                                                          \
85
16.0k
        vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst,  \
86
16.0k
                                                dst_stride, h, filter_row);  \
87
36.3k
      } else if (w == 4) {                                                   \
88
21.5k
        vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst,  \
89
21.5k
                                                dst_stride, h, filter_row);  \
90
21.5k
      }                                                                      \
91
52.4k
      (void)num_taps;                                                        \
92
52.4k
    }                                                                        \
93
548k
  }
vpx_convolve8_avg_vert_avx2
Line
Count
Source
33
2.61M
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \
34
2.61M
    const int16_t *filter_row = filter[offset];                              \
35
2.61M
    (void)x0_q4;                                                             \
36
2.61M
    (void)x_step_q4;                                                         \
37
2.61M
    (void)y0_q4;                                                             \
38
2.61M
    (void)y_step_q4;                                                         \
39
2.61M
    assert(filter_row[3] != 128);                                            \
40
2.61M
    assert(step_q4 == 16);                                                   \
41
2.61M
    if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {     \
42
2.38M
      const int num_taps = 8;                                                \
43
2.94M
      while (w >= 16) {                                                      \
44
556k
        vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
45
556k
                                                 dst_stride, h, filter_row); \
46
556k
        src += 16;                                                           \
47
556k
        dst += 16;                                                           \
48
556k
        w -= 16;                                                             \
49
556k
      }                                                                      \
50
2.38M
      if (w == 8) {                                                          \
51
706k
        vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
52
706k
                                                dst_stride, h, filter_row);  \
53
1.67M
      } else if (w == 4) {                                                   \
54
1.26M
        vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
55
1.26M
                                                dst_stride, h, filter_row);  \
56
1.26M
      }                                                                      \
57
2.38M
      (void)num_taps;                                                        \
58
2.38M
    } else if (filter_row[2] | filter_row[5]) {                              \
59
0
      const int num_taps = is_avg ? 8 : 4;                                   \
60
0
      while (w >= 16) {                                                      \
61
0
        vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
62
0
                                                 dst_stride, h, filter_row); \
63
0
        src += 16;                                                           \
64
0
        dst += 16;                                                           \
65
0
        w -= 16;                                                             \
66
0
      }                                                                      \
67
0
      if (w == 8) {                                                          \
68
0
        vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
69
0
                                                dst_stride, h, filter_row);  \
70
0
      } else if (w == 4) {                                                   \
71
0
        vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
72
0
                                                dst_stride, h, filter_row);  \
73
0
      }                                                                      \
74
0
      (void)num_taps;                                                        \
75
224k
    } else {                                                                 \
76
224k
      const int num_taps = 2;                                                \
77
322k
      while (w >= 16) {                                                      \
78
98.6k
        vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \
79
98.6k
                                                 dst_stride, h, filter_row); \
80
98.6k
        src += 16;                                                           \
81
98.6k
        dst += 16;                                                           \
82
98.6k
        w -= 16;                                                             \
83
98.6k
      }                                                                      \
84
224k
      if (w == 8) {                                                          \
85
66.4k
        vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst,  \
86
66.4k
                                                dst_stride, h, filter_row);  \
87
157k
      } else if (w == 4) {                                                   \
88
89.8k
        vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst,  \
89
89.8k
                                                dst_stride, h, filter_row);  \
90
89.8k
      }                                                                      \
91
224k
      (void)num_taps;                                                        \
92
224k
    }                                                                        \
93
2.61M
  }
Unexecuted instantiation: vpx_convolve8_horiz_ssse3
Unexecuted instantiation: vpx_convolve8_vert_ssse3
Unexecuted instantiation: vpx_convolve8_avg_horiz_ssse3
Unexecuted instantiation: vpx_convolve8_avg_vert_ssse3
94
95
#define FUN_CONV_2D(avg, opt, is_avg)                                          \
96
  void vpx_convolve8_##avg##opt(                                               \
97
      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                  \
98
      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,             \
99
74.4M
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {                 \
100
74.4M
    const int16_t *filter_x = filter[x0_q4];                                   \
101
74.4M
    const int16_t *filter_y = filter[y0_q4];                                   \
102
74.4M
    (void)filter_y;                                                            \
103
74.4M
    assert(filter_x[3] != 128);                                                \
104
74.4M
    assert(filter_y[3] != 128);                                                \
105
74.4M
    assert(w <= 64);                                                           \
106
74.4M
    assert(h <= 64);                                                           \
107
74.4M
    assert(x_step_q4 == 16);                                                   \
108
74.4M
    assert(y_step_q4 == 16);                                                   \
109
74.4M
    if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) {               \
110
37.8M
      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED);         \
111
37.8M
      vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64,  \
112
37.8M
                                filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
113
37.8M
                                h + 7);                                        \
114
37.8M
      vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride,    \
115
37.8M
                                      filter, x0_q4, x_step_q4, y0_q4,         \
116
37.8M
                                      y_step_q4, w, h);                        \
117
37.8M
    } else if (filter_x[2] | filter_x[5]) {                                    \
118
36.0M
      const int num_taps = is_avg ? 8 : 4;                                     \
119
36.0M
      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED);         \
120
36.0M
      vpx_convolve8_horiz_##opt(                                               \
121
36.0M
          src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64,       \
122
36.0M
          filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1);    \
123
36.0M
      vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64,    \
124
36.0M
                                      dst, dst_stride, filter, x0_q4,          \
125
36.0M
                                      x_step_q4, y0_q4, y_step_q4, w, h);      \
126
36.0M
    } else {                                                                   \
127
523k
      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65] VPX_UNINITIALIZED);         \
128
523k
      vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4,    \
129
523k
                                x_step_q4, y0_q4, y_step_q4, w, h + 1);        \
130
523k
      vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter,     \
131
523k
                                      x0_q4, x_step_q4, y0_q4, y_step_q4, w,   \
132
523k
                                      h);                                      \
133
523k
    }                                                                          \
134
74.4M
  }
Unexecuted instantiation: vpx_convolve8_sse2
Unexecuted instantiation: vpx_convolve8_avg_sse2
vpx_convolve8_avx2
Line
Count
Source
99
72.3M
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {                 \
100
72.3M
    const int16_t *filter_x = filter[x0_q4];                                   \
101
72.3M
    const int16_t *filter_y = filter[y0_q4];                                   \
102
72.3M
    (void)filter_y;                                                            \
103
72.3M
    assert(filter_x[3] != 128);                                                \
104
72.3M
    assert(filter_y[3] != 128);                                                \
105
72.3M
    assert(w <= 64);                                                           \
106
72.3M
    assert(h <= 64);                                                           \
107
72.3M
    assert(x_step_q4 == 16);                                                   \
108
72.3M
    assert(y_step_q4 == 16);                                                   \
109
72.3M
    if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) {               \
110
35.9M
      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED);         \
111
35.9M
      vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64,  \
112
35.9M
                                filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
113
35.9M
                                h + 7);                                        \
114
35.9M
      vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride,    \
115
35.9M
                                      filter, x0_q4, x_step_q4, y0_q4,         \
116
35.9M
                                      y_step_q4, w, h);                        \
117
36.3M
    } else if (filter_x[2] | filter_x[5]) {                                    \
118
36.0M
      const int num_taps = is_avg ? 8 : 4;                                     \
119
36.0M
      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED);         \
120
36.0M
      vpx_convolve8_horiz_##opt(                                               \
121
36.0M
          src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64,       \
122
36.0M
          filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1);    \
123
36.0M
      vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64,    \
124
36.0M
                                      dst, dst_stride, filter, x0_q4,          \
125
36.0M
                                      x_step_q4, y0_q4, y_step_q4, w, h);      \
126
36.0M
    } else {                                                                   \
127
356k
      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65] VPX_UNINITIALIZED);         \
128
356k
      vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4,    \
129
356k
                                x_step_q4, y0_q4, y_step_q4, w, h + 1);        \
130
356k
      vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter,     \
131
356k
                                      x0_q4, x_step_q4, y0_q4, y_step_q4, w,   \
132
356k
                                      h);                                      \
133
356k
    }                                                                          \
134
72.3M
  }
vpx_convolve8_avg_avx2
Line
Count
Source
99
2.03M
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {                 \
100
2.03M
    const int16_t *filter_x = filter[x0_q4];                                   \
101
2.03M
    const int16_t *filter_y = filter[y0_q4];                                   \
102
2.03M
    (void)filter_y;                                                            \
103
2.03M
    assert(filter_x[3] != 128);                                                \
104
2.03M
    assert(filter_y[3] != 128);                                                \
105
2.03M
    assert(w <= 64);                                                           \
106
2.03M
    assert(h <= 64);                                                           \
107
2.03M
    assert(x_step_q4 == 16);                                                   \
108
2.03M
    assert(y_step_q4 == 16);                                                   \
109
2.03M
    if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) {               \
110
1.86M
      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED);         \
111
1.86M
      vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64,  \
112
1.86M
                                filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \
113
1.86M
                                h + 7);                                        \
114
1.86M
      vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride,    \
115
1.86M
                                      filter, x0_q4, x_step_q4, y0_q4,         \
116
1.86M
                                      y_step_q4, w, h);                        \
117
1.86M
    } else if (filter_x[2] | filter_x[5]) {                                    \
118
0
      const int num_taps = is_avg ? 8 : 4;                                     \
119
0
      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED);         \
120
0
      vpx_convolve8_horiz_##opt(                                               \
121
0
          src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64,       \
122
0
          filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1);    \
123
0
      vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64,    \
124
0
                                      dst, dst_stride, filter, x0_q4,          \
125
0
                                      x_step_q4, y0_q4, y_step_q4, w, h);      \
126
167k
    } else {                                                                   \
127
167k
      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65] VPX_UNINITIALIZED);         \
128
167k
      vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4,    \
129
167k
                                x_step_q4, y0_q4, y_step_q4, w, h + 1);        \
130
167k
      vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter,     \
131
167k
                                      x0_q4, x_step_q4, y0_q4, y_step_q4, w,   \
132
167k
                                      h);                                      \
133
167k
    }                                                                          \
134
2.03M
  }
Unexecuted instantiation: vpx_convolve8_ssse3
Unexecuted instantiation: vpx_convolve8_avg_ssse3
135
136
#if CONFIG_VP9_HIGHBITDEPTH
137
138
typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
139
                                       const ptrdiff_t src_pitch,
140
                                       uint16_t *output_ptr,
141
                                       ptrdiff_t out_pitch,
142
                                       unsigned int output_height,
143
                                       const int16_t *filter, int bd);
144
145
#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt,     \
146
                         is_avg)                                              \
147
  void vpx_highbd_convolve8_##name##_##opt(                                   \
148
      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,               \
149
      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,     \
150
3.99M
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \
151
3.99M
    const int16_t *filter_row = filter_kernel[offset];                        \
152
3.99M
    if (step_q4 == 16 && filter_row[3] != 128) {                              \
153
3.86M
      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {    \
154
3.01M
        const int num_taps = 8;                                               \
155
3.78M
        while (w >= 16) {                                                     \
156
767k
          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
157
767k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
158
767k
          src += 16;                                                          \
159
767k
          dst += 16;                                                          \
160
767k
          w -= 16;                                                            \
161
767k
        }                                                                     \
162
4.10M
        while (w >= 8) {                                                      \
163
1.08M
          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \
164
1.08M
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
165
1.08M
          src += 8;                                                           \
166
1.08M
          dst += 8;                                                           \
167
1.08M
          w -= 8;                                                             \
168
1.08M
        }                                                                     \
169
4.42M
        while (w >= 4) {                                                      \
170
1.41M
          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \
171
1.41M
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
172
1.41M
          src += 4;                                                           \
173
1.41M
          dst += 4;                                                           \
174
1.41M
          w -= 4;                                                             \
175
1.41M
        }                                                                     \
176
3.01M
        (void)num_taps;                                                       \
177
3.01M
      } else if (filter_row[2] | filter_row[5]) {                             \
178
0
        const int num_taps = is_avg ? 8 : 4;                                  \
179
0
        while (w >= 16) {                                                     \
180
0
          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                    \
181
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
182
0
          src += 16;                                                          \
183
0
          dst += 16;                                                          \
184
0
          w -= 16;                                                            \
185
0
        }                                                                     \
186
0
        while (w >= 8) {                                                      \
187
0
          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                     \
188
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
189
0
          src += 8;                                                           \
190
0
          dst += 8;                                                           \
191
0
          w -= 8;                                                             \
192
0
        }                                                                     \
193
0
        while (w >= 4) {                                                      \
194
0
          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                     \
195
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
196
0
          src += 4;                                                           \
197
0
          dst += 4;                                                           \
198
0
          w -= 4;                                                             \
199
0
        }                                                                     \
200
0
        (void)num_taps;                                                       \
201
854k
      } else {                                                                \
202
854k
        const int num_taps = 2;                                               \
203
1.08M
        while (w >= 16) {                                                     \
204
229k
          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
205
229k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
206
229k
          src += 16;                                                          \
207
229k
          dst += 16;                                                          \
208
229k
          w -= 16;                                                            \
209
229k
        }                                                                     \
210
1.11M
        while (w >= 8) {                                                      \
211
257k
          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \
212
257k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
213
257k
          src += 8;                                                           \
214
257k
          dst += 8;                                                           \
215
257k
          w -= 8;                                                             \
216
257k
        }                                                                     \
217
1.29M
        while (w >= 4) {                                                      \
218
441k
          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \
219
441k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
220
441k
          src += 4;                                                           \
221
441k
          dst += 4;                                                           \
222
441k
          w -= 4;                                                             \
223
441k
        }                                                                     \
224
854k
        (void)num_taps;                                                       \
225
854k
      }                                                                       \
226
3.86M
    }                                                                         \
227
3.99M
    if (w) {                                                                  \
228
127k
      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \
229
127k
                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
230
127k
                                      y_step_q4, w, h, bd);                   \
231
127k
    }                                                                         \
232
3.99M
  }
Unexecuted instantiation: vpx_highbd_convolve8_horiz_sse2
Unexecuted instantiation: vpx_highbd_convolve8_vert_sse2
Unexecuted instantiation: vpx_highbd_convolve8_avg_horiz_sse2
Unexecuted instantiation: vpx_highbd_convolve8_avg_vert_sse2
vpx_highbd_convolve8_horiz_avx2
Line
Count
Source
150
1.80M
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \
151
1.80M
    const int16_t *filter_row = filter_kernel[offset];                        \
152
1.80M
    if (step_q4 == 16 && filter_row[3] != 128) {                              \
153
1.80M
      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {    \
154
1.40M
        const int num_taps = 8;                                               \
155
1.75M
        while (w >= 16) {                                                     \
156
344k
          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
157
344k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
158
344k
          src += 16;                                                          \
159
344k
          dst += 16;                                                          \
160
344k
          w -= 16;                                                            \
161
344k
        }                                                                     \
162
1.92M
        while (w >= 8) {                                                      \
163
521k
          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \
164
521k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
165
521k
          src += 8;                                                           \
166
521k
          dst += 8;                                                           \
167
521k
          w -= 8;                                                             \
168
521k
        }                                                                     \
169
2.05M
        while (w >= 4) {                                                      \
170
652k
          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \
171
652k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
172
652k
          src += 4;                                                           \
173
652k
          dst += 4;                                                           \
174
652k
          w -= 4;                                                             \
175
652k
        }                                                                     \
176
1.40M
        (void)num_taps;                                                       \
177
1.40M
      } else if (filter_row[2] | filter_row[5]) {                             \
178
0
        const int num_taps = is_avg ? 8 : 4;                                  \
179
0
        while (w >= 16) {                                                     \
180
0
          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                    \
181
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
182
0
          src += 16;                                                          \
183
0
          dst += 16;                                                          \
184
0
          w -= 16;                                                            \
185
0
        }                                                                     \
186
0
        while (w >= 8) {                                                      \
187
0
          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                     \
188
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
189
0
          src += 8;                                                           \
190
0
          dst += 8;                                                           \
191
0
          w -= 8;                                                             \
192
0
        }                                                                     \
193
0
        while (w >= 4) {                                                      \
194
0
          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                     \
195
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
196
0
          src += 4;                                                           \
197
0
          dst += 4;                                                           \
198
0
          w -= 4;                                                             \
199
0
        }                                                                     \
200
0
        (void)num_taps;                                                       \
201
394k
      } else {                                                                \
202
394k
        const int num_taps = 2;                                               \
203
497k
        while (w >= 16) {                                                     \
204
102k
          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
205
102k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
206
102k
          src += 16;                                                          \
207
102k
          dst += 16;                                                          \
208
102k
          w -= 16;                                                            \
209
102k
        }                                                                     \
210
513k
        while (w >= 8) {                                                      \
211
118k
          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \
212
118k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
213
118k
          src += 8;                                                           \
214
118k
          dst += 8;                                                           \
215
118k
          w -= 8;                                                             \
216
118k
        }                                                                     \
217
600k
        while (w >= 4) {                                                      \
218
205k
          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \
219
205k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
220
205k
          src += 4;                                                           \
221
205k
          dst += 4;                                                           \
222
205k
          w -= 4;                                                             \
223
205k
        }                                                                     \
224
394k
        (void)num_taps;                                                       \
225
394k
      }                                                                       \
226
1.80M
    }                                                                         \
227
1.80M
    if (w) {                                                                  \
228
7.48k
      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \
229
7.48k
                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
230
7.48k
                                      y_step_q4, w, h, bd);                   \
231
7.48k
    }                                                                         \
232
1.80M
  }
vpx_highbd_convolve8_vert_avx2
Line
Count
Source
150
1.49M
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \
151
1.49M
    const int16_t *filter_row = filter_kernel[offset];                        \
152
1.49M
    if (step_q4 == 16 && filter_row[3] != 128) {                              \
153
1.40M
      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {    \
154
1.08M
        const int num_taps = 8;                                               \
155
1.35M
        while (w >= 16) {                                                     \
156
266k
          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
157
266k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
158
266k
          src += 16;                                                          \
159
266k
          dst += 16;                                                          \
160
266k
          w -= 16;                                                            \
161
266k
        }                                                                     \
162
1.49M
        while (w >= 8) {                                                      \
163
406k
          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \
164
406k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
165
406k
          src += 8;                                                           \
166
406k
          dst += 8;                                                           \
167
406k
          w -= 8;                                                             \
168
406k
        }                                                                     \
169
1.58M
        while (w >= 4) {                                                      \
170
499k
          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \
171
499k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
172
499k
          src += 4;                                                           \
173
499k
          dst += 4;                                                           \
174
499k
          w -= 4;                                                             \
175
499k
        }                                                                     \
176
1.08M
        (void)num_taps;                                                       \
177
1.08M
      } else if (filter_row[2] | filter_row[5]) {                             \
178
0
        const int num_taps = is_avg ? 8 : 4;                                  \
179
0
        while (w >= 16) {                                                     \
180
0
          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                    \
181
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
182
0
          src += 16;                                                          \
183
0
          dst += 16;                                                          \
184
0
          w -= 16;                                                            \
185
0
        }                                                                     \
186
0
        while (w >= 8) {                                                      \
187
0
          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                     \
188
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
189
0
          src += 8;                                                           \
190
0
          dst += 8;                                                           \
191
0
          w -= 8;                                                             \
192
0
        }                                                                     \
193
0
        while (w >= 4) {                                                      \
194
0
          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                     \
195
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
196
0
          src += 4;                                                           \
197
0
          dst += 4;                                                           \
198
0
          w -= 4;                                                             \
199
0
        }                                                                     \
200
0
        (void)num_taps;                                                       \
201
326k
      } else {                                                                \
202
326k
        const int num_taps = 2;                                               \
203
405k
        while (w >= 16) {                                                     \
204
79.4k
          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
205
79.4k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
206
79.4k
          src += 16;                                                          \
207
79.4k
          dst += 16;                                                          \
208
79.4k
          w -= 16;                                                            \
209
79.4k
        }                                                                     \
210
419k
        while (w >= 8) {                                                      \
211
93.6k
          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \
212
93.6k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
213
93.6k
          src += 8;                                                           \
214
93.6k
          dst += 8;                                                           \
215
93.6k
          w -= 8;                                                             \
216
93.6k
        }                                                                     \
217
504k
        while (w >= 4) {                                                      \
218
178k
          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \
219
178k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
220
178k
          src += 4;                                                           \
221
178k
          dst += 4;                                                           \
222
178k
          w -= 4;                                                             \
223
178k
        }                                                                     \
224
326k
        (void)num_taps;                                                       \
225
326k
      }                                                                       \
226
1.40M
    }                                                                         \
227
1.49M
    if (w) {                                                                  \
228
83.6k
      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \
229
83.6k
                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
230
83.6k
                                      y_step_q4, w, h, bd);                   \
231
83.6k
    }                                                                         \
232
1.49M
  }
vpx_highbd_convolve8_avg_horiz_avx2
Line
Count
Source
150
101k
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \
151
101k
    const int16_t *filter_row = filter_kernel[offset];                        \
152
101k
    if (step_q4 == 16 && filter_row[3] != 128) {                              \
153
95.1k
      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {    \
154
74.1k
        const int num_taps = 8;                                               \
155
100k
        while (w >= 16) {                                                     \
156
26.1k
          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
157
26.1k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
158
26.1k
          src += 16;                                                          \
159
26.1k
          dst += 16;                                                          \
160
26.1k
          w -= 16;                                                            \
161
26.1k
        }                                                                     \
162
96.6k
        while (w >= 8) {                                                      \
163
22.5k
          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \
164
22.5k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
165
22.5k
          src += 8;                                                           \
166
22.5k
          dst += 8;                                                           \
167
22.5k
          w -= 8;                                                             \
168
22.5k
        }                                                                     \
169
108k
        while (w >= 4) {                                                      \
170
34.7k
          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \
171
34.7k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
172
34.7k
          src += 4;                                                           \
173
34.7k
          dst += 4;                                                           \
174
34.7k
          w -= 4;                                                             \
175
34.7k
        }                                                                     \
176
74.1k
        (void)num_taps;                                                       \
177
74.1k
      } else if (filter_row[2] | filter_row[5]) {                             \
178
0
        const int num_taps = is_avg ? 8 : 4;                                  \
179
0
        while (w >= 16) {                                                     \
180
0
          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                    \
181
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
182
0
          src += 16;                                                          \
183
0
          dst += 16;                                                          \
184
0
          w -= 16;                                                            \
185
0
        }                                                                     \
186
0
        while (w >= 8) {                                                      \
187
0
          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                     \
188
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
189
0
          src += 8;                                                           \
190
0
          dst += 8;                                                           \
191
0
          w -= 8;                                                             \
192
0
        }                                                                     \
193
0
        while (w >= 4) {                                                      \
194
0
          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                     \
195
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
196
0
          src += 4;                                                           \
197
0
          dst += 4;                                                           \
198
0
          w -= 4;                                                             \
199
0
        }                                                                     \
200
0
        (void)num_taps;                                                       \
201
21.0k
      } else {                                                                \
202
21.0k
        const int num_taps = 2;                                               \
203
29.6k
        while (w >= 16) {                                                     \
204
8.54k
          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
205
8.54k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
206
8.54k
          src += 16;                                                          \
207
8.54k
          dst += 16;                                                          \
208
8.54k
          w -= 16;                                                            \
209
8.54k
        }                                                                     \
210
27.9k
        while (w >= 8) {                                                      \
211
6.86k
          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \
212
6.86k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
213
6.86k
          src += 8;                                                           \
214
6.86k
          dst += 8;                                                           \
215
6.86k
          w -= 8;                                                             \
216
6.86k
        }                                                                     \
217
29.8k
        while (w >= 4) {                                                      \
218
8.78k
          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \
219
8.78k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
220
8.78k
          src += 4;                                                           \
221
8.78k
          dst += 4;                                                           \
222
8.78k
          w -= 4;                                                             \
223
8.78k
        }                                                                     \
224
21.0k
        (void)num_taps;                                                       \
225
21.0k
      }                                                                       \
226
95.1k
    }                                                                         \
227
101k
    if (w) {                                                                  \
228
6.56k
      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \
229
6.56k
                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
230
6.56k
                                      y_step_q4, w, h, bd);                   \
231
6.56k
    }                                                                         \
232
101k
  }
vpx_highbd_convolve8_avg_vert_avx2
Line
Count
Source
150
594k
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \
151
594k
    const int16_t *filter_row = filter_kernel[offset];                        \
152
594k
    if (step_q4 == 16 && filter_row[3] != 128) {                              \
153
564k
      if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) {    \
154
451k
        const int num_taps = 8;                                               \
155
581k
        while (w >= 16) {                                                     \
156
129k
          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \
157
129k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
158
129k
          src += 16;                                                          \
159
129k
          dst += 16;                                                          \
160
129k
          w -= 16;                                                            \
161
129k
        }                                                                     \
162
589k
        while (w >= 8) {                                                      \
163
138k
          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \
164
138k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
165
138k
          src += 8;                                                           \
166
138k
          dst += 8;                                                           \
167
138k
          w -= 8;                                                             \
168
138k
        }                                                                     \
169
677k
        while (w >= 4) {                                                      \
170
225k
          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \
171
225k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
172
225k
          src += 4;                                                           \
173
225k
          dst += 4;                                                           \
174
225k
          w -= 4;                                                             \
175
225k
        }                                                                     \
176
451k
        (void)num_taps;                                                       \
177
451k
      } else if (filter_row[2] | filter_row[5]) {                             \
178
0
        const int num_taps = is_avg ? 8 : 4;                                  \
179
0
        while (w >= 16) {                                                     \
180
0
          vpx_highbd_filter_block1d16_##dir##4_##avg##opt(                    \
181
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
182
0
          src += 16;                                                          \
183
0
          dst += 16;                                                          \
184
0
          w -= 16;                                                            \
185
0
        }                                                                     \
186
0
        while (w >= 8) {                                                      \
187
0
          vpx_highbd_filter_block1d8_##dir##4_##avg##opt(                     \
188
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
189
0
          src += 8;                                                           \
190
0
          dst += 8;                                                           \
191
0
          w -= 8;                                                             \
192
0
        }                                                                     \
193
0
        while (w >= 4) {                                                      \
194
0
          vpx_highbd_filter_block1d4_##dir##4_##avg##opt(                     \
195
0
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
196
0
          src += 4;                                                           \
197
0
          dst += 4;                                                           \
198
0
          w -= 4;                                                             \
199
0
        }                                                                     \
200
0
        (void)num_taps;                                                       \
201
113k
      } else {                                                                \
202
113k
        const int num_taps = 2;                                               \
203
151k
        while (w >= 16) {                                                     \
204
38.8k
          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \
205
38.8k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
206
38.8k
          src += 16;                                                          \
207
38.8k
          dst += 16;                                                          \
208
38.8k
          w -= 16;                                                            \
209
38.8k
        }                                                                     \
210
151k
        while (w >= 8) {                                                      \
211
38.1k
          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \
212
38.1k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
213
38.1k
          src += 8;                                                           \
214
38.1k
          dst += 8;                                                           \
215
38.1k
          w -= 8;                                                             \
216
38.1k
        }                                                                     \
217
162k
        while (w >= 4) {                                                      \
218
49.0k
          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \
219
49.0k
              src_start, src_stride, dst, dst_stride, h, filter_row, bd);     \
220
49.0k
          src += 4;                                                           \
221
49.0k
          dst += 4;                                                           \
222
49.0k
          w -= 4;                                                             \
223
49.0k
        }                                                                     \
224
113k
        (void)num_taps;                                                       \
225
113k
      }                                                                       \
226
564k
    }                                                                         \
227
594k
    if (w) {                                                                  \
228
29.8k
      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \
229
29.8k
                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \
230
29.8k
                                      y_step_q4, w, h, bd);                   \
231
29.8k
    }                                                                         \
232
594k
  }
233
234
// HIGH_FUN_CONV_2D(avg, opt, is_avg) defines the function
// vpx_highbd_convolve8_<avg><opt>(), which filters a w x h block of uint16_t
// samples in two passes: a horizontal pass into a stack buffer (fdata2,
// row stride 64) followed by a vertical pass from that buffer into dst.
//
// Macro arguments:
//   avg    - token pasted into the generated function name and into the
//            names of the vertical pass and of the C fallback. The
//            instantiations in this file use either an empty token or avg_.
//   opt    - instruction-set suffix pasted into the generated names
//            (the instantiations in this file use sse2 and avx2).
//   is_avg - only read to choose num_taps in the middle branch below
//            (8 when non-zero, otherwise 4).
//
// Behavior of the generated function:
//   * w and h are asserted to be at most 64; the intermediate buffer is
//     sized on that basis.
//   * If x_step_q4 or y_step_q4 is not 16, everything is forwarded to
//     vpx_highbd_convolve8_<avg>c().
//   * Otherwise the branch is chosen from the taps of filter[x0_q4] only:
//       - any of taps 0, 1, 6, 7 non-zero, or tap 3 equal to 128:
//         the horizontal pass starts 3 rows above src and produces h + 7
//         rows; the vertical pass reads from fdata2 + 192 (3 rows * 64).
//       - else taps 2 or 5 non-zero: the horizontal pass starts
//         (num_taps / 2 - 1) rows above src and produces h + num_taps - 1
//         rows; the vertical pass reads from the matching row offset. The
//         buffer keeps the 64 * 71 size of the first branch.
//       - else: the horizontal pass starts at src and produces h + 1 rows
//         into a 64 * 65 buffer; the vertical pass reads from its start.
//
// NOTE(review): the branch selection never inspects filter[y0_q4]; it
// appears to assume the vertical kernel has the same tap layout as the
// horizontal one -- confirm against callers.
#define HIGH_FUN_CONV_2D(avg, opt, is_avg)                                     \
235
  void vpx_highbd_convolve8_##avg##opt(                                        \
236
      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,                \
237
      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,             \
238
1.59M
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {         \
239
1.59M
    const int16_t *filter_x = filter[x0_q4];                                   \
240
1.59M
    assert(w <= 64);                                                           \
241
1.59M
    assert(h <= 64);                                                           \
242
1.59M
    if (x_step_q4 == 16 && y_step_q4 == 16) {                                  \
243
1.50M
      if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) ||           \
244
1.50M
          filter_x[3] == 128) {                                                \
245
1.16M
        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED);      \
246
1.16M
        vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,     \
247
1.16M
                                         fdata2, 64, filter, x0_q4, x_step_q4, \
248
1.16M
                                         y0_q4, y_step_q4, w, h + 7, bd);      \
249
1.16M
        vpx_highbd_convolve8_##avg##vert_##opt(                                \
250
1.16M
            fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4,       \
251
1.16M
            y0_q4, y_step_q4, w, h, bd);                                       \
252
1.16M
      } else if (filter_x[2] | filter_x[5]) {                                  \
253
0
        const int num_taps = is_avg ? 8 : 4;                                   \
254
0
        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED);      \
255
0
        vpx_highbd_convolve8_horiz_##opt(                                      \
256
0
            src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64,     \
257
0
            filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1,   \
258
0
            bd);                                                               \
259
0
        vpx_highbd_convolve8_##avg##vert_##opt(                                \
260
0
            fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter,     \
261
0
            x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);                     \
262
336k
      } else {                                                                 \
263
336k
        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65] VPX_UNINITIALIZED);      \
264
336k
        vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter,  \
265
336k
                                         x0_q4, x_step_q4, y0_q4, y_step_q4,   \
266
336k
                                         w, h + 1, bd);                        \
267
336k
        vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride,    \
268
336k
                                               filter, x0_q4, x_step_q4,       \
269
336k
                                               y0_q4, y_step_q4, w, h, bd);    \
270
336k
      }                                                                        \
271
1.50M
    } else {                                                                   \
272
90.8k
      vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter,  \
273
90.8k
                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,  \
274
90.8k
                                    bd);                                       \
275
90.8k
    }                                                                          \
276
1.59M
  }
Unexecuted instantiation: vpx_highbd_convolve8_sse2
Unexecuted instantiation: vpx_highbd_convolve8_avg_sse2
vpx_highbd_convolve8_avx2
Line
Count
Source
238
1.12M
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {         \
239
1.12M
    const int16_t *filter_x = filter[x0_q4];                                   \
240
1.12M
    assert(w <= 64);                                                           \
241
1.12M
    assert(h <= 64);                                                           \
242
1.12M
    if (x_step_q4 == 16 && y_step_q4 == 16) {                                  \
243
1.06M
      if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) ||           \
244
1.06M
          filter_x[3] == 128) {                                                \
245
818k
        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED);      \
246
818k
        vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,     \
247
818k
                                         fdata2, 64, filter, x0_q4, x_step_q4, \
248
818k
                                         y0_q4, y_step_q4, w, h + 7, bd);      \
249
818k
        vpx_highbd_convolve8_##avg##vert_##opt(                                \
250
818k
            fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4,       \
251
818k
            y0_q4, y_step_q4, w, h, bd);                                       \
252
818k
      } else if (filter_x[2] | filter_x[5]) {                                  \
253
0
        const int num_taps = is_avg ? 8 : 4;                                   \
254
0
        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED);      \
255
0
        vpx_highbd_convolve8_horiz_##opt(                                      \
256
0
            src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64,     \
257
0
            filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1,   \
258
0
            bd);                                                               \
259
0
        vpx_highbd_convolve8_##avg##vert_##opt(                                \
260
0
            fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter,     \
261
0
            x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);                     \
262
248k
      } else {                                                                 \
263
248k
        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65] VPX_UNINITIALIZED);      \
264
248k
        vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter,  \
265
248k
                                         x0_q4, x_step_q4, y0_q4, y_step_q4,   \
266
248k
                                         w, h + 1, bd);                        \
267
248k
        vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride,    \
268
248k
                                               filter, x0_q4, x_step_q4,       \
269
248k
                                               y0_q4, y_step_q4, w, h, bd);    \
270
248k
      }                                                                        \
271
1.06M
    } else {                                                                   \
272
63.0k
      vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter,  \
273
63.0k
                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,  \
274
63.0k
                                    bd);                                       \
275
63.0k
    }                                                                          \
276
1.12M
  }
vpx_highbd_convolve8_avg_avx2
Line
Count
Source
238
463k
      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {         \
239
463k
    const int16_t *filter_x = filter[x0_q4];                                   \
240
463k
    assert(w <= 64);                                                           \
241
463k
    assert(h <= 64);                                                           \
242
463k
    if (x_step_q4 == 16 && y_step_q4 == 16) {                                  \
243
435k
      if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) ||           \
244
435k
          filter_x[3] == 128) {                                                \
245
346k
        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED);      \
246
346k
        vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,     \
247
346k
                                         fdata2, 64, filter, x0_q4, x_step_q4, \
248
346k
                                         y0_q4, y_step_q4, w, h + 7, bd);      \
249
346k
        vpx_highbd_convolve8_##avg##vert_##opt(                                \
250
346k
            fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4,       \
251
346k
            y0_q4, y_step_q4, w, h, bd);                                       \
252
346k
      } else if (filter_x[2] | filter_x[5]) {                                  \
253
0
        const int num_taps = is_avg ? 8 : 4;                                   \
254
0
        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED);      \
255
0
        vpx_highbd_convolve8_horiz_##opt(                                      \
256
0
            src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64,     \
257
0
            filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1,   \
258
0
            bd);                                                               \
259
0
        vpx_highbd_convolve8_##avg##vert_##opt(                                \
260
0
            fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter,     \
261
0
            x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);                     \
262
88.7k
      } else {                                                                 \
263
88.7k
        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65] VPX_UNINITIALIZED);      \
264
88.7k
        vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter,  \
265
88.7k
                                         x0_q4, x_step_q4, y0_q4, y_step_q4,   \
266
88.7k
                                         w, h + 1, bd);                        \
267
88.7k
        vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride,    \
268
88.7k
                                               filter, x0_q4, x_step_q4,       \
269
88.7k
                                               y0_q4, y_step_q4, w, h, bd);    \
270
88.7k
      }                                                                        \
271
435k
    } else {                                                                   \
272
27.7k
      vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter,  \
273
27.7k
                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,  \
274
27.7k
                                    bd);                                       \
275
27.7k
    }                                                                          \
276
463k
  }
277
278
#endif  // CONFIG_VP9_HIGHBITDEPTH
279
#endif  // VPX_VPX_DSP_X86_CONVOLVE_H_