/src/libvpx/vpx_dsp/x86/convolve.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | #ifndef VPX_VPX_DSP_X86_CONVOLVE_H_ |
11 | | #define VPX_VPX_DSP_X86_CONVOLVE_H_ |
12 | | |
13 | | #include <assert.h> |
14 | | |
15 | | #include "./vpx_config.h" |
16 | | #include "vpx/vpx_integer.h" |
17 | | #include "vpx_ports/compiler_attributes.h" |
18 | | |
19 | | // TODO(chiyotsai@google.com): Refactor the code here. Currently this is pretty |
20 | | // hacky and awful to read. Note that there is a filter_x[3] == 128 check in |
21 | | // HIGHBD_FUN_CONV_2D to avoid seg fault due to the fact that the c function |
22 | | // assumes the filter is always 8 tap. |
23 | | typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, |
24 | | uint8_t *output_ptr, ptrdiff_t out_pitch, |
25 | | uint32_t output_height, const int16_t *filter); |
26 | | |
27 | | // TODO(chiyotsai@google.com): Remove the is_avg argument to the MACROS once we |
28 | | // have 4-tap vert avg filter. |
29 | | #define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \ |
30 | | void vpx_convolve8_##name##_##opt( \ |
31 | | const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ |
32 | | ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ |
33 | 249M | int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ |
34 | 249M | const int16_t *filter_row = filter[offset]; \ |
35 | 249M | (void)x0_q4; \ |
36 | 249M | (void)x_step_q4; \ |
37 | 249M | (void)y0_q4; \ |
38 | 249M | (void)y_step_q4; \ |
39 | 249M | assert(filter_row[3] != 128); \ |
40 | 249M | assert(step_q4 == 16); \ |
41 | 249M | if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ |
42 | 124M | const int num_taps = 8; \ |
43 | 140M | while (w >= 16) { \ |
44 | 16.1M | vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ |
45 | 16.1M | dst_stride, h, filter_row); \ |
46 | 16.1M | src += 16; \ |
47 | 16.1M | dst += 16; \ |
48 | 16.1M | w -= 16; \ |
49 | 16.1M | } \ |
50 | 124M | if (w == 8) { \ |
51 | 35.3M | vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ |
52 | 35.3M | dst_stride, h, filter_row); \ |
53 | 89.0M | } else if (w == 4) { \ |
54 | 75.9M | vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ |
55 | 75.9M | dst_stride, h, filter_row); \ |
56 | 75.9M | } \ |
57 | 124M | (void)num_taps; \ |
58 | 125M | } else if (filter_row[2] | filter_row[5]) { \ |
59 | 123M | const int num_taps = is_avg ? 8 : 4; \ |
60 | 136M | while (w >= 16) { \ |
61 | 13.0M | vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ |
62 | 13.0M | dst_stride, h, filter_row); \ |
63 | 13.0M | src += 16; \ |
64 | 13.0M | dst += 16; \ |
65 | 13.0M | w -= 16; \ |
66 | 13.0M | } \ |
67 | 123M | if (w == 8) { \ |
68 | 28.7M | vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ |
69 | 28.7M | dst_stride, h, filter_row); \ |
70 | 95.1M | } else if (w == 4) { \ |
71 | 85.2M | vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ |
72 | 85.2M | dst_stride, h, filter_row); \ |
73 | 85.2M | } \ |
74 | 123M | (void)num_taps; \ |
75 | 123M | } else { \ |
76 | 1.38M | const int num_taps = 2; \ |
77 | 1.98M | while (w >= 16) { \ |
78 | 596k | vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \ |
79 | 596k | dst_stride, h, filter_row); \ |
80 | 596k | src += 16; \ |
81 | 596k | dst += 16; \ |
82 | 596k | w -= 16; \ |
83 | 596k | } \ |
84 | 1.38M | if (w == 8) { \ |
85 | 414k | vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \ |
86 | 414k | dst_stride, h, filter_row); \ |
87 | 969k | } else if (w == 4) { \ |
88 | 555k | vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \ |
89 | 555k | dst_stride, h, filter_row); \ |
90 | 555k | } \ |
91 | 1.38M | (void)num_taps; \ |
92 | 1.38M | } \ |
93 | 249M | } Unexecuted instantiation: vpx_convolve8_horiz_sse2 Unexecuted instantiation: vpx_convolve8_vert_sse2 Unexecuted instantiation: vpx_convolve8_avg_horiz_sse2 Unexecuted instantiation: vpx_convolve8_avg_vert_sse2 Line | Count | Source | 33 | 123M | int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ | 34 | 123M | const int16_t *filter_row = filter[offset]; \ | 35 | 123M | (void)x0_q4; \ | 36 | 123M | (void)x_step_q4; \ | 37 | 123M | (void)y0_q4; \ | 38 | 123M | (void)y_step_q4; \ | 39 | 123M | assert(filter_row[3] != 128); \ | 40 | 123M | assert(step_q4 == 16); \ | 41 | 123M | if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ | 42 | 61.0M | const int num_taps = 8; \ | 43 | 68.8M | while (w >= 16) { \ | 44 | 7.82M | vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ | 45 | 7.82M | dst_stride, h, filter_row); \ | 46 | 7.82M | src += 16; \ | 47 | 7.82M | dst += 16; \ | 48 | 7.82M | w -= 16; \ | 49 | 7.82M | } \ | 50 | 61.0M | if (w == 8) { \ | 51 | 17.4M | vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ | 52 | 17.4M | dst_stride, h, filter_row); \ | 53 | 43.5M | } else if (w == 4) { \ | 54 | 37.2M | vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ | 55 | 37.2M | dst_stride, h, filter_row); \ | 56 | 37.2M | } \ | 57 | 61.0M | (void)num_taps; \ | 58 | 62.8M | } else if (filter_row[2] | filter_row[5]) { \ | 59 | 62.1M | const int num_taps = is_avg ? 
8 : 4; \ | 60 | 68.6M | while (w >= 16) { \ | 61 | 6.51M | vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ | 62 | 6.51M | dst_stride, h, filter_row); \ | 63 | 6.51M | src += 16; \ | 64 | 6.51M | dst += 16; \ | 65 | 6.51M | w -= 16; \ | 66 | 6.51M | } \ | 67 | 62.1M | if (w == 8) { \ | 68 | 14.4M | vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ | 69 | 14.4M | dst_stride, h, filter_row); \ | 70 | 47.7M | } else if (w == 4) { \ | 71 | 42.7M | vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ | 72 | 42.7M | dst_stride, h, filter_row); \ | 73 | 42.7M | } \ | 74 | 62.1M | (void)num_taps; \ | 75 | 62.1M | } else { \ | 76 | 628k | const int num_taps = 2; \ | 77 | 898k | while (w >= 16) { \ | 78 | 269k | vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \ | 79 | 269k | dst_stride, h, filter_row); \ | 80 | 269k | src += 16; \ | 81 | 269k | dst += 16; \ | 82 | 269k | w -= 16; \ | 83 | 269k | } \ | 84 | 628k | if (w == 8) { \ | 85 | 188k | vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \ | 86 | 188k | dst_stride, h, filter_row); \ | 87 | 440k | } else if (w == 4) { \ | 88 | 253k | vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \ | 89 | 253k | dst_stride, h, filter_row); \ | 90 | 253k | } \ | 91 | 628k | (void)num_taps; \ | 92 | 628k | } \ | 93 | 123M | } |
Line | Count | Source | 33 | 122M | int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ | 34 | 122M | const int16_t *filter_row = filter[offset]; \ | 35 | 122M | (void)x0_q4; \ | 36 | 122M | (void)x_step_q4; \ | 37 | 122M | (void)y0_q4; \ | 38 | 122M | (void)y_step_q4; \ | 39 | 122M | assert(filter_row[3] != 128); \ | 40 | 122M | assert(step_q4 == 16); \ | 41 | 122M | if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ | 42 | 60.4M | const int num_taps = 8; \ | 43 | 68.0M | while (w >= 16) { \ | 44 | 7.60M | vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ | 45 | 7.60M | dst_stride, h, filter_row); \ | 46 | 7.60M | src += 16; \ | 47 | 7.60M | dst += 16; \ | 48 | 7.60M | w -= 16; \ | 49 | 7.60M | } \ | 50 | 60.4M | if (w == 8) { \ | 51 | 17.0M | vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ | 52 | 17.0M | dst_stride, h, filter_row); \ | 53 | 43.3M | } else if (w == 4) { \ | 54 | 37.2M | vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ | 55 | 37.2M | dst_stride, h, filter_row); \ | 56 | 37.2M | } \ | 57 | 60.4M | (void)num_taps; \ | 58 | 62.2M | } else if (filter_row[2] | filter_row[5]) { \ | 59 | 61.7M | const int num_taps = is_avg ? 
8 : 4; \ | 60 | 68.2M | while (w >= 16) { \ | 61 | 6.50M | vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ | 62 | 6.50M | dst_stride, h, filter_row); \ | 63 | 6.50M | src += 16; \ | 64 | 6.50M | dst += 16; \ | 65 | 6.50M | w -= 16; \ | 66 | 6.50M | } \ | 67 | 61.7M | if (w == 8) { \ | 68 | 14.3M | vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ | 69 | 14.3M | dst_stride, h, filter_row); \ | 70 | 47.4M | } else if (w == 4) { \ | 71 | 42.4M | vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ | 72 | 42.4M | dst_stride, h, filter_row); \ | 73 | 42.4M | } \ | 74 | 61.7M | (void)num_taps; \ | 75 | 61.7M | } else { \ | 76 | 478k | const int num_taps = 2; \ | 77 | 686k | while (w >= 16) { \ | 78 | 207k | vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \ | 79 | 207k | dst_stride, h, filter_row); \ | 80 | 207k | src += 16; \ | 81 | 207k | dst += 16; \ | 82 | 207k | w -= 16; \ | 83 | 207k | } \ | 84 | 478k | if (w == 8) { \ | 85 | 144k | vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \ | 86 | 144k | dst_stride, h, filter_row); \ | 87 | 334k | } else if (w == 4) { \ | 88 | 190k | vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \ | 89 | 190k | dst_stride, h, filter_row); \ | 90 | 190k | } \ | 91 | 478k | (void)num_taps; \ | 92 | 478k | } \ | 93 | 122M | } |
vpx_convolve8_avg_horiz_avx2 Line | Count | Source | 33 | 548k | int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ | 34 | 548k | const int16_t *filter_row = filter[offset]; \ | 35 | 548k | (void)x0_q4; \ | 36 | 548k | (void)x_step_q4; \ | 37 | 548k | (void)y0_q4; \ | 38 | 548k | (void)y_step_q4; \ | 39 | 548k | assert(filter_row[3] != 128); \ | 40 | 548k | assert(step_q4 == 16); \ | 41 | 548k | if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ | 42 | 496k | const int num_taps = 8; \ | 43 | 641k | while (w >= 16) { \ | 44 | 145k | vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ | 45 | 145k | dst_stride, h, filter_row); \ | 46 | 145k | src += 16; \ | 47 | 145k | dst += 16; \ | 48 | 145k | w -= 16; \ | 49 | 145k | } \ | 50 | 496k | if (w == 8) { \ | 51 | 157k | vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ | 52 | 157k | dst_stride, h, filter_row); \ | 53 | 338k | } else if (w == 4) { \ | 54 | 232k | vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ | 55 | 232k | dst_stride, h, filter_row); \ | 56 | 232k | } \ | 57 | 496k | (void)num_taps; \ | 58 | 496k | } else if (filter_row[2] | filter_row[5]) { \ | 59 | 0 | const int num_taps = is_avg ? 
8 : 4; \ | 60 | 0 | while (w >= 16) { \ | 61 | 0 | vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ | 62 | 0 | dst_stride, h, filter_row); \ | 63 | 0 | src += 16; \ | 64 | 0 | dst += 16; \ | 65 | 0 | w -= 16; \ | 66 | 0 | } \ | 67 | 0 | if (w == 8) { \ | 68 | 0 | vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ | 69 | 0 | dst_stride, h, filter_row); \ | 70 | 0 | } else if (w == 4) { \ | 71 | 0 | vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ | 72 | 0 | dst_stride, h, filter_row); \ | 73 | 0 | } \ | 74 | 0 | (void)num_taps; \ | 75 | 52.4k | } else { \ | 76 | 52.4k | const int num_taps = 2; \ | 77 | 73.7k | while (w >= 16) { \ | 78 | 21.3k | vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \ | 79 | 21.3k | dst_stride, h, filter_row); \ | 80 | 21.3k | src += 16; \ | 81 | 21.3k | dst += 16; \ | 82 | 21.3k | w -= 16; \ | 83 | 21.3k | } \ | 84 | 52.4k | if (w == 8) { \ | 85 | 16.0k | vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \ | 86 | 16.0k | dst_stride, h, filter_row); \ | 87 | 36.3k | } else if (w == 4) { \ | 88 | 21.5k | vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \ | 89 | 21.5k | dst_stride, h, filter_row); \ | 90 | 21.5k | } \ | 91 | 52.4k | (void)num_taps; \ | 92 | 52.4k | } \ | 93 | 548k | } |
vpx_convolve8_avg_vert_avx2 Line | Count | Source | 33 | 2.61M | int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ | 34 | 2.61M | const int16_t *filter_row = filter[offset]; \ | 35 | 2.61M | (void)x0_q4; \ | 36 | 2.61M | (void)x_step_q4; \ | 37 | 2.61M | (void)y0_q4; \ | 38 | 2.61M | (void)y_step_q4; \ | 39 | 2.61M | assert(filter_row[3] != 128); \ | 40 | 2.61M | assert(step_q4 == 16); \ | 41 | 2.61M | if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ | 42 | 2.38M | const int num_taps = 8; \ | 43 | 2.94M | while (w >= 16) { \ | 44 | 556k | vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ | 45 | 556k | dst_stride, h, filter_row); \ | 46 | 556k | src += 16; \ | 47 | 556k | dst += 16; \ | 48 | 556k | w -= 16; \ | 49 | 556k | } \ | 50 | 2.38M | if (w == 8) { \ | 51 | 706k | vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ | 52 | 706k | dst_stride, h, filter_row); \ | 53 | 1.67M | } else if (w == 4) { \ | 54 | 1.26M | vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ | 55 | 1.26M | dst_stride, h, filter_row); \ | 56 | 1.26M | } \ | 57 | 2.38M | (void)num_taps; \ | 58 | 2.38M | } else if (filter_row[2] | filter_row[5]) { \ | 59 | 0 | const int num_taps = is_avg ? 
8 : 4; \ | 60 | 0 | while (w >= 16) { \ | 61 | 0 | vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ | 62 | 0 | dst_stride, h, filter_row); \ | 63 | 0 | src += 16; \ | 64 | 0 | dst += 16; \ | 65 | 0 | w -= 16; \ | 66 | 0 | } \ | 67 | 0 | if (w == 8) { \ | 68 | 0 | vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ | 69 | 0 | dst_stride, h, filter_row); \ | 70 | 0 | } else if (w == 4) { \ | 71 | 0 | vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ | 72 | 0 | dst_stride, h, filter_row); \ | 73 | 0 | } \ | 74 | 0 | (void)num_taps; \ | 75 | 224k | } else { \ | 76 | 224k | const int num_taps = 2; \ | 77 | 322k | while (w >= 16) { \ | 78 | 98.6k | vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \ | 79 | 98.6k | dst_stride, h, filter_row); \ | 80 | 98.6k | src += 16; \ | 81 | 98.6k | dst += 16; \ | 82 | 98.6k | w -= 16; \ | 83 | 98.6k | } \ | 84 | 224k | if (w == 8) { \ | 85 | 66.4k | vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \ | 86 | 66.4k | dst_stride, h, filter_row); \ | 87 | 157k | } else if (w == 4) { \ | 88 | 89.8k | vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \ | 89 | 89.8k | dst_stride, h, filter_row); \ | 90 | 89.8k | } \ | 91 | 224k | (void)num_taps; \ | 92 | 224k | } \ | 93 | 2.61M | } |
Unexecuted instantiation: vpx_convolve8_horiz_ssse3 Unexecuted instantiation: vpx_convolve8_vert_ssse3 Unexecuted instantiation: vpx_convolve8_avg_horiz_ssse3 Unexecuted instantiation: vpx_convolve8_avg_vert_ssse3 |
94 | | |
95 | | #define FUN_CONV_2D(avg, opt, is_avg) \ |
96 | | void vpx_convolve8_##avg##opt( \ |
97 | | const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ |
98 | | ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ |
99 | 74.4M | int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ |
100 | 74.4M | const int16_t *filter_x = filter[x0_q4]; \ |
101 | 74.4M | const int16_t *filter_y = filter[y0_q4]; \ |
102 | 74.4M | (void)filter_y; \ |
103 | 74.4M | assert(filter_x[3] != 128); \ |
104 | 74.4M | assert(filter_y[3] != 128); \ |
105 | 74.4M | assert(w <= 64); \ |
106 | 74.4M | assert(h <= 64); \ |
107 | 74.4M | assert(x_step_q4 == 16); \ |
108 | 74.4M | assert(y_step_q4 == 16); \ |
109 | 74.4M | if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \ |
110 | 37.8M | DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ |
111 | 37.8M | vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ |
112 | 37.8M | filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ |
113 | 37.8M | h + 7); \ |
114 | 37.8M | vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ |
115 | 37.8M | filter, x0_q4, x_step_q4, y0_q4, \ |
116 | 37.8M | y_step_q4, w, h); \ |
117 | 37.8M | } else if (filter_x[2] | filter_x[5]) { \ |
118 | 36.0M | const int num_taps = is_avg ? 8 : 4; \ |
119 | 36.0M | DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ |
120 | 36.0M | vpx_convolve8_horiz_##opt( \ |
121 | 36.0M | src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ |
122 | 36.0M | filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \ |
123 | 36.0M | vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \ |
124 | 36.0M | dst, dst_stride, filter, x0_q4, \ |
125 | 36.0M | x_step_q4, y0_q4, y_step_q4, w, h); \ |
126 | 36.0M | } else { \ |
127 | 523k | DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65] VPX_UNINITIALIZED); \ |
128 | 523k | vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \ |
129 | 523k | x_step_q4, y0_q4, y_step_q4, w, h + 1); \ |
130 | 523k | vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \ |
131 | 523k | x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ |
132 | 523k | h); \ |
133 | 523k | } \ |
134 | 74.4M | } Unexecuted instantiation: vpx_convolve8_sse2 Unexecuted instantiation: vpx_convolve8_avg_sse2 Line | Count | Source | 99 | 72.3M | int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ | 100 | 72.3M | const int16_t *filter_x = filter[x0_q4]; \ | 101 | 72.3M | const int16_t *filter_y = filter[y0_q4]; \ | 102 | 72.3M | (void)filter_y; \ | 103 | 72.3M | assert(filter_x[3] != 128); \ | 104 | 72.3M | assert(filter_y[3] != 128); \ | 105 | 72.3M | assert(w <= 64); \ | 106 | 72.3M | assert(h <= 64); \ | 107 | 72.3M | assert(x_step_q4 == 16); \ | 108 | 72.3M | assert(y_step_q4 == 16); \ | 109 | 72.3M | if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \ | 110 | 35.9M | DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ | 111 | 35.9M | vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ | 112 | 35.9M | filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ | 113 | 35.9M | h + 7); \ | 114 | 35.9M | vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ | 115 | 35.9M | filter, x0_q4, x_step_q4, y0_q4, \ | 116 | 35.9M | y_step_q4, w, h); \ | 117 | 36.3M | } else if (filter_x[2] | filter_x[5]) { \ | 118 | 36.0M | const int num_taps = is_avg ? 
8 : 4; \ | 119 | 36.0M | DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ | 120 | 36.0M | vpx_convolve8_horiz_##opt( \ | 121 | 36.0M | src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ | 122 | 36.0M | filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \ | 123 | 36.0M | vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \ | 124 | 36.0M | dst, dst_stride, filter, x0_q4, \ | 125 | 36.0M | x_step_q4, y0_q4, y_step_q4, w, h); \ | 126 | 36.0M | } else { \ | 127 | 356k | DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65] VPX_UNINITIALIZED); \ | 128 | 356k | vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \ | 129 | 356k | x_step_q4, y0_q4, y_step_q4, w, h + 1); \ | 130 | 356k | vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \ | 131 | 356k | x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ | 132 | 356k | h); \ | 133 | 356k | } \ | 134 | 72.3M | } |
Line | Count | Source | 99 | 2.03M | int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ | 100 | 2.03M | const int16_t *filter_x = filter[x0_q4]; \ | 101 | 2.03M | const int16_t *filter_y = filter[y0_q4]; \ | 102 | 2.03M | (void)filter_y; \ | 103 | 2.03M | assert(filter_x[3] != 128); \ | 104 | 2.03M | assert(filter_y[3] != 128); \ | 105 | 2.03M | assert(w <= 64); \ | 106 | 2.03M | assert(h <= 64); \ | 107 | 2.03M | assert(x_step_q4 == 16); \ | 108 | 2.03M | assert(y_step_q4 == 16); \ | 109 | 2.03M | if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \ | 110 | 1.86M | DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ | 111 | 1.86M | vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ | 112 | 1.86M | filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ | 113 | 1.86M | h + 7); \ | 114 | 1.86M | vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ | 115 | 1.86M | filter, x0_q4, x_step_q4, y0_q4, \ | 116 | 1.86M | y_step_q4, w, h); \ | 117 | 1.86M | } else if (filter_x[2] | filter_x[5]) { \ | 118 | 0 | const int num_taps = is_avg ? 
8 : 4; \ | 119 | 0 | DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ | 120 | 0 | vpx_convolve8_horiz_##opt( \ | 121 | 0 | src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ | 122 | 0 | filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \ | 123 | 0 | vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \ | 124 | 0 | dst, dst_stride, filter, x0_q4, \ | 125 | 0 | x_step_q4, y0_q4, y_step_q4, w, h); \ | 126 | 167k | } else { \ | 127 | 167k | DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65] VPX_UNINITIALIZED); \ | 128 | 167k | vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \ | 129 | 167k | x_step_q4, y0_q4, y_step_q4, w, h + 1); \ | 130 | 167k | vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \ | 131 | 167k | x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ | 132 | 167k | h); \ | 133 | 167k | } \ | 134 | 2.03M | } |
Unexecuted instantiation: vpx_convolve8_ssse3 Unexecuted instantiation: vpx_convolve8_avg_ssse3 |
135 | | |
136 | | #if CONFIG_VP9_HIGHBITDEPTH |
137 | | |
138 | | typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, |
139 | | const ptrdiff_t src_pitch, |
140 | | uint16_t *output_ptr, |
141 | | ptrdiff_t out_pitch, |
142 | | unsigned int output_height, |
143 | | const int16_t *filter, int bd); |
144 | | |
145 | | #define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, \ |
146 | | is_avg) \ |
147 | | void vpx_highbd_convolve8_##name##_##opt( \ |
148 | | const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ |
149 | | ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ |
150 | 3.99M | int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ |
151 | 3.99M | const int16_t *filter_row = filter_kernel[offset]; \ |
152 | 3.99M | if (step_q4 == 16 && filter_row[3] != 128) { \ |
153 | 3.86M | if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ |
154 | 3.01M | const int num_taps = 8; \ |
155 | 3.78M | while (w >= 16) { \ |
156 | 767k | vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ |
157 | 767k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ |
158 | 767k | src += 16; \ |
159 | 767k | dst += 16; \ |
160 | 767k | w -= 16; \ |
161 | 767k | } \ |
162 | 4.10M | while (w >= 8) { \ |
163 | 1.08M | vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ |
164 | 1.08M | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ |
165 | 1.08M | src += 8; \ |
166 | 1.08M | dst += 8; \ |
167 | 1.08M | w -= 8; \ |
168 | 1.08M | } \ |
169 | 4.42M | while (w >= 4) { \ |
170 | 1.41M | vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ |
171 | 1.41M | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ |
172 | 1.41M | src += 4; \ |
173 | 1.41M | dst += 4; \ |
174 | 1.41M | w -= 4; \ |
175 | 1.41M | } \ |
176 | 3.01M | (void)num_taps; \ |
177 | 3.01M | } else if (filter_row[2] | filter_row[5]) { \ |
178 | 0 | const int num_taps = is_avg ? 8 : 4; \ |
179 | 0 | while (w >= 16) { \ |
180 | 0 | vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ |
181 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ |
182 | 0 | src += 16; \ |
183 | 0 | dst += 16; \ |
184 | 0 | w -= 16; \ |
185 | 0 | } \ |
186 | 0 | while (w >= 8) { \ |
187 | 0 | vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ |
188 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ |
189 | 0 | src += 8; \ |
190 | 0 | dst += 8; \ |
191 | 0 | w -= 8; \ |
192 | 0 | } \ |
193 | 0 | while (w >= 4) { \ |
194 | 0 | vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ |
195 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ |
196 | 0 | src += 4; \ |
197 | 0 | dst += 4; \ |
198 | 0 | w -= 4; \ |
199 | 0 | } \ |
200 | 0 | (void)num_taps; \ |
201 | 854k | } else { \ |
202 | 854k | const int num_taps = 2; \ |
203 | 1.08M | while (w >= 16) { \ |
204 | 229k | vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ |
205 | 229k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ |
206 | 229k | src += 16; \ |
207 | 229k | dst += 16; \ |
208 | 229k | w -= 16; \ |
209 | 229k | } \ |
210 | 1.11M | while (w >= 8) { \ |
211 | 257k | vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ |
212 | 257k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ |
213 | 257k | src += 8; \ |
214 | 257k | dst += 8; \ |
215 | 257k | w -= 8; \ |
216 | 257k | } \ |
217 | 1.29M | while (w >= 4) { \ |
218 | 441k | vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ |
219 | 441k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ |
220 | 441k | src += 4; \ |
221 | 441k | dst += 4; \ |
222 | 441k | w -= 4; \ |
223 | 441k | } \ |
224 | 854k | (void)num_taps; \ |
225 | 854k | } \ |
226 | 3.86M | } \ |
227 | 3.99M | if (w) { \ |
228 | 127k | vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ |
229 | 127k | filter_kernel, x0_q4, x_step_q4, y0_q4, \ |
230 | 127k | y_step_q4, w, h, bd); \ |
231 | 127k | } \ |
232 | 3.99M | } Unexecuted instantiation: vpx_highbd_convolve8_horiz_sse2 Unexecuted instantiation: vpx_highbd_convolve8_vert_sse2 Unexecuted instantiation: vpx_highbd_convolve8_avg_horiz_sse2 Unexecuted instantiation: vpx_highbd_convolve8_avg_vert_sse2 vpx_highbd_convolve8_horiz_avx2 Line | Count | Source | 150 | 1.80M | int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ | 151 | 1.80M | const int16_t *filter_row = filter_kernel[offset]; \ | 152 | 1.80M | if (step_q4 == 16 && filter_row[3] != 128) { \ | 153 | 1.80M | if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ | 154 | 1.40M | const int num_taps = 8; \ | 155 | 1.75M | while (w >= 16) { \ | 156 | 344k | vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ | 157 | 344k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 158 | 344k | src += 16; \ | 159 | 344k | dst += 16; \ | 160 | 344k | w -= 16; \ | 161 | 344k | } \ | 162 | 1.92M | while (w >= 8) { \ | 163 | 521k | vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ | 164 | 521k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 165 | 521k | src += 8; \ | 166 | 521k | dst += 8; \ | 167 | 521k | w -= 8; \ | 168 | 521k | } \ | 169 | 2.05M | while (w >= 4) { \ | 170 | 652k | vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ | 171 | 652k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 172 | 652k | src += 4; \ | 173 | 652k | dst += 4; \ | 174 | 652k | w -= 4; \ | 175 | 652k | } \ | 176 | 1.40M | (void)num_taps; \ | 177 | 1.40M | } else if (filter_row[2] | filter_row[5]) { \ | 178 | 0 | const int num_taps = is_avg ? 
8 : 4; \ | 179 | 0 | while (w >= 16) { \ | 180 | 0 | vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ | 181 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 182 | 0 | src += 16; \ | 183 | 0 | dst += 16; \ | 184 | 0 | w -= 16; \ | 185 | 0 | } \ | 186 | 0 | while (w >= 8) { \ | 187 | 0 | vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ | 188 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 189 | 0 | src += 8; \ | 190 | 0 | dst += 8; \ | 191 | 0 | w -= 8; \ | 192 | 0 | } \ | 193 | 0 | while (w >= 4) { \ | 194 | 0 | vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ | 195 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 196 | 0 | src += 4; \ | 197 | 0 | dst += 4; \ | 198 | 0 | w -= 4; \ | 199 | 0 | } \ | 200 | 0 | (void)num_taps; \ | 201 | 394k | } else { \ | 202 | 394k | const int num_taps = 2; \ | 203 | 497k | while (w >= 16) { \ | 204 | 102k | vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ | 205 | 102k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 206 | 102k | src += 16; \ | 207 | 102k | dst += 16; \ | 208 | 102k | w -= 16; \ | 209 | 102k | } \ | 210 | 513k | while (w >= 8) { \ | 211 | 118k | vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ | 212 | 118k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 213 | 118k | src += 8; \ | 214 | 118k | dst += 8; \ | 215 | 118k | w -= 8; \ | 216 | 118k | } \ | 217 | 600k | while (w >= 4) { \ | 218 | 205k | vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ | 219 | 205k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 220 | 205k | src += 4; \ | 221 | 205k | dst += 4; \ | 222 | 205k | w -= 4; \ | 223 | 205k | } \ | 224 | 394k | (void)num_taps; \ | 225 | 394k | } \ | 226 | 1.80M | } \ | 227 | 1.80M | if (w) { \ | 228 | 7.48k | vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ | 229 | 7.48k | filter_kernel, x0_q4, x_step_q4, y0_q4, \ | 230 | 7.48k | y_step_q4, w, h, bd); \ | 231 | 7.48k | } \ | 
232 | 1.80M | } |
vpx_highbd_convolve8_vert_avx2 Line | Count | Source | 150 | 1.49M | int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ | 151 | 1.49M | const int16_t *filter_row = filter_kernel[offset]; \ | 152 | 1.49M | if (step_q4 == 16 && filter_row[3] != 128) { \ | 153 | 1.40M | if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ | 154 | 1.08M | const int num_taps = 8; \ | 155 | 1.35M | while (w >= 16) { \ | 156 | 266k | vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ | 157 | 266k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 158 | 266k | src += 16; \ | 159 | 266k | dst += 16; \ | 160 | 266k | w -= 16; \ | 161 | 266k | } \ | 162 | 1.49M | while (w >= 8) { \ | 163 | 406k | vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ | 164 | 406k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 165 | 406k | src += 8; \ | 166 | 406k | dst += 8; \ | 167 | 406k | w -= 8; \ | 168 | 406k | } \ | 169 | 1.58M | while (w >= 4) { \ | 170 | 499k | vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ | 171 | 499k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 172 | 499k | src += 4; \ | 173 | 499k | dst += 4; \ | 174 | 499k | w -= 4; \ | 175 | 499k | } \ | 176 | 1.08M | (void)num_taps; \ | 177 | 1.08M | } else if (filter_row[2] | filter_row[5]) { \ | 178 | 0 | const int num_taps = is_avg ? 
8 : 4; \ | 179 | 0 | while (w >= 16) { \ | 180 | 0 | vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ | 181 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 182 | 0 | src += 16; \ | 183 | 0 | dst += 16; \ | 184 | 0 | w -= 16; \ | 185 | 0 | } \ | 186 | 0 | while (w >= 8) { \ | 187 | 0 | vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ | 188 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 189 | 0 | src += 8; \ | 190 | 0 | dst += 8; \ | 191 | 0 | w -= 8; \ | 192 | 0 | } \ | 193 | 0 | while (w >= 4) { \ | 194 | 0 | vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ | 195 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 196 | 0 | src += 4; \ | 197 | 0 | dst += 4; \ | 198 | 0 | w -= 4; \ | 199 | 0 | } \ | 200 | 0 | (void)num_taps; \ | 201 | 326k | } else { \ | 202 | 326k | const int num_taps = 2; \ | 203 | 405k | while (w >= 16) { \ | 204 | 79.4k | vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ | 205 | 79.4k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 206 | 79.4k | src += 16; \ | 207 | 79.4k | dst += 16; \ | 208 | 79.4k | w -= 16; \ | 209 | 79.4k | } \ | 210 | 419k | while (w >= 8) { \ | 211 | 93.6k | vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ | 212 | 93.6k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 213 | 93.6k | src += 8; \ | 214 | 93.6k | dst += 8; \ | 215 | 93.6k | w -= 8; \ | 216 | 93.6k | } \ | 217 | 504k | while (w >= 4) { \ | 218 | 178k | vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ | 219 | 178k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 220 | 178k | src += 4; \ | 221 | 178k | dst += 4; \ | 222 | 178k | w -= 4; \ | 223 | 178k | } \ | 224 | 326k | (void)num_taps; \ | 225 | 326k | } \ | 226 | 1.40M | } \ | 227 | 1.49M | if (w) { \ | 228 | 83.6k | vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ | 229 | 83.6k | filter_kernel, x0_q4, x_step_q4, y0_q4, \ | 230 | 83.6k | y_step_q4, w, h, bd); \ | 231 | 
83.6k | } \ | 232 | 1.49M | } |
vpx_highbd_convolve8_avg_horiz_avx2 Line | Count | Source | 150 | 101k | int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ | 151 | 101k | const int16_t *filter_row = filter_kernel[offset]; \ | 152 | 101k | if (step_q4 == 16 && filter_row[3] != 128) { \ | 153 | 95.1k | if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ | 154 | 74.1k | const int num_taps = 8; \ | 155 | 100k | while (w >= 16) { \ | 156 | 26.1k | vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ | 157 | 26.1k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 158 | 26.1k | src += 16; \ | 159 | 26.1k | dst += 16; \ | 160 | 26.1k | w -= 16; \ | 161 | 26.1k | } \ | 162 | 96.6k | while (w >= 8) { \ | 163 | 22.5k | vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ | 164 | 22.5k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 165 | 22.5k | src += 8; \ | 166 | 22.5k | dst += 8; \ | 167 | 22.5k | w -= 8; \ | 168 | 22.5k | } \ | 169 | 108k | while (w >= 4) { \ | 170 | 34.7k | vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ | 171 | 34.7k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 172 | 34.7k | src += 4; \ | 173 | 34.7k | dst += 4; \ | 174 | 34.7k | w -= 4; \ | 175 | 34.7k | } \ | 176 | 74.1k | (void)num_taps; \ | 177 | 74.1k | } else if (filter_row[2] | filter_row[5]) { \ | 178 | 0 | const int num_taps = is_avg ? 
8 : 4; \ | 179 | 0 | while (w >= 16) { \ | 180 | 0 | vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ | 181 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 182 | 0 | src += 16; \ | 183 | 0 | dst += 16; \ | 184 | 0 | w -= 16; \ | 185 | 0 | } \ | 186 | 0 | while (w >= 8) { \ | 187 | 0 | vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ | 188 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 189 | 0 | src += 8; \ | 190 | 0 | dst += 8; \ | 191 | 0 | w -= 8; \ | 192 | 0 | } \ | 193 | 0 | while (w >= 4) { \ | 194 | 0 | vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ | 195 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 196 | 0 | src += 4; \ | 197 | 0 | dst += 4; \ | 198 | 0 | w -= 4; \ | 199 | 0 | } \ | 200 | 0 | (void)num_taps; \ | 201 | 21.0k | } else { \ | 202 | 21.0k | const int num_taps = 2; \ | 203 | 29.6k | while (w >= 16) { \ | 204 | 8.54k | vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ | 205 | 8.54k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 206 | 8.54k | src += 16; \ | 207 | 8.54k | dst += 16; \ | 208 | 8.54k | w -= 16; \ | 209 | 8.54k | } \ | 210 | 27.9k | while (w >= 8) { \ | 211 | 6.86k | vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ | 212 | 6.86k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 213 | 6.86k | src += 8; \ | 214 | 6.86k | dst += 8; \ | 215 | 6.86k | w -= 8; \ | 216 | 6.86k | } \ | 217 | 29.8k | while (w >= 4) { \ | 218 | 8.78k | vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ | 219 | 8.78k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 220 | 8.78k | src += 4; \ | 221 | 8.78k | dst += 4; \ | 222 | 8.78k | w -= 4; \ | 223 | 8.78k | } \ | 224 | 21.0k | (void)num_taps; \ | 225 | 21.0k | } \ | 226 | 95.1k | } \ | 227 | 101k | if (w) { \ | 228 | 6.56k | vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ | 229 | 6.56k | filter_kernel, x0_q4, x_step_q4, y0_q4, \ | 230 | 6.56k | y_step_q4, w, h, bd); 
\ | 231 | 6.56k | } \ | 232 | 101k | } |
vpx_highbd_convolve8_avg_vert_avx2 Line | Count | Source | 150 | 594k | int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ | 151 | 594k | const int16_t *filter_row = filter_kernel[offset]; \ | 152 | 594k | if (step_q4 == 16 && filter_row[3] != 128) { \ | 153 | 564k | if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ | 154 | 451k | const int num_taps = 8; \ | 155 | 581k | while (w >= 16) { \ | 156 | 129k | vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ | 157 | 129k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 158 | 129k | src += 16; \ | 159 | 129k | dst += 16; \ | 160 | 129k | w -= 16; \ | 161 | 129k | } \ | 162 | 589k | while (w >= 8) { \ | 163 | 138k | vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ | 164 | 138k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 165 | 138k | src += 8; \ | 166 | 138k | dst += 8; \ | 167 | 138k | w -= 8; \ | 168 | 138k | } \ | 169 | 677k | while (w >= 4) { \ | 170 | 225k | vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ | 171 | 225k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 172 | 225k | src += 4; \ | 173 | 225k | dst += 4; \ | 174 | 225k | w -= 4; \ | 175 | 225k | } \ | 176 | 451k | (void)num_taps; \ | 177 | 451k | } else if (filter_row[2] | filter_row[5]) { \ | 178 | 0 | const int num_taps = is_avg ? 
8 : 4; \ | 179 | 0 | while (w >= 16) { \ | 180 | 0 | vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ | 181 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 182 | 0 | src += 16; \ | 183 | 0 | dst += 16; \ | 184 | 0 | w -= 16; \ | 185 | 0 | } \ | 186 | 0 | while (w >= 8) { \ | 187 | 0 | vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ | 188 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 189 | 0 | src += 8; \ | 190 | 0 | dst += 8; \ | 191 | 0 | w -= 8; \ | 192 | 0 | } \ | 193 | 0 | while (w >= 4) { \ | 194 | 0 | vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ | 195 | 0 | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 196 | 0 | src += 4; \ | 197 | 0 | dst += 4; \ | 198 | 0 | w -= 4; \ | 199 | 0 | } \ | 200 | 0 | (void)num_taps; \ | 201 | 113k | } else { \ | 202 | 113k | const int num_taps = 2; \ | 203 | 151k | while (w >= 16) { \ | 204 | 38.8k | vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ | 205 | 38.8k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 206 | 38.8k | src += 16; \ | 207 | 38.8k | dst += 16; \ | 208 | 38.8k | w -= 16; \ | 209 | 38.8k | } \ | 210 | 151k | while (w >= 8) { \ | 211 | 38.1k | vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ | 212 | 38.1k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 213 | 38.1k | src += 8; \ | 214 | 38.1k | dst += 8; \ | 215 | 38.1k | w -= 8; \ | 216 | 38.1k | } \ | 217 | 162k | while (w >= 4) { \ | 218 | 49.0k | vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ | 219 | 49.0k | src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ | 220 | 49.0k | src += 4; \ | 221 | 49.0k | dst += 4; \ | 222 | 49.0k | w -= 4; \ | 223 | 49.0k | } \ | 224 | 113k | (void)num_taps; \ | 225 | 113k | } \ | 226 | 564k | } \ | 227 | 594k | if (w) { \ | 228 | 29.8k | vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ | 229 | 29.8k | filter_kernel, x0_q4, x_step_q4, y0_q4, \ | 230 | 29.8k | y_step_q4, w, h, bd); \ | 231 
| 29.8k | } \ | 232 | 594k | } |
|
233 | | |
// HIGH_FUN_CONV_2D(avg, opt, is_avg) defines
// vpx_highbd_convolve8_<avg><opt>(), a 2-D convolution on uint16_t samples
// built from a horizontal pass into a stack buffer (fdata2, row stride 64)
// followed by a vertical pass from that buffer into dst.
//
// Macro arguments:
//   avg    - token pasted into the names of the generated function, the
//            vertical-pass function and the C fallback.
//   opt    - suffix token pasted into the generated function name and into
//            the horizontal/vertical pass function names.
//   is_avg - selects num_taps in the middle branch (8 when non-zero, else 4).
//
// The generated function asserts w <= 64 and h <= 64, then:
//   * If x_step_q4 or y_step_q4 is not 16, it forwards all arguments to
//     vpx_highbd_convolve8_<avg>c().
//   * Otherwise the branch is chosen from filter_x = filter[x0_q4] only; the
//     kernel row at y0_q4 is not inspected here.
//       - Any of taps 0, 1, 6, 7 non-zero, or tap 3 == 128: the horizontal
//         pass starts 3 rows above src and produces h + 7 rows; the vertical
//         pass reads from fdata2 + 192 (3 rows of 64).
//       - Else tap 2 or tap 5 non-zero: the horizontal pass starts
//         (num_taps / 2 - 1) rows above src and produces h + num_taps - 1
//         rows; the vertical pass reads from
//         fdata2 + 64 * (num_taps / 2 - 1).
//       - Else: the horizontal pass starts at src and produces h + 1 rows
//         into a 64 * 65 buffer; the vertical pass reads from fdata2.
//
// NOTE(review): the TODO near the top of this file says the filter_x[3] == 128
// check exists because the C function assumes an 8-tap filter; confirm before
// changing the first branch condition.
234 | | #define HIGH_FUN_CONV_2D(avg, opt, is_avg) \ |
235 | | void vpx_highbd_convolve8_##avg##opt( \ |
236 | | const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ |
237 | | ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ |
238 | 1.59M | int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ |
239 | 1.59M | const int16_t *filter_x = filter[x0_q4]; \ |
240 | 1.59M | assert(w <= 64); \ |
241 | 1.59M | assert(h <= 64); \ |
242 | 1.59M | if (x_step_q4 == 16 && y_step_q4 == 16) { \ |
243 | 1.50M | if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \ |
244 | 1.50M | filter_x[3] == 128) { \ |
245 | 1.16M | DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ |
246 | 1.16M | vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ |
247 | 1.16M | fdata2, 64, filter, x0_q4, x_step_q4, \ |
248 | 1.16M | y0_q4, y_step_q4, w, h + 7, bd); \ |
249 | 1.16M | vpx_highbd_convolve8_##avg##vert_##opt( \ |
250 | 1.16M | fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ |
251 | 1.16M | y0_q4, y_step_q4, w, h, bd); \ |
252 | 1.16M | } else if (filter_x[2] | filter_x[5]) { \ |
253 | 0 | const int num_taps = is_avg ? 8 : 4; \ |
254 | 0 | DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ |
255 | 0 | vpx_highbd_convolve8_horiz_##opt( \ |
256 | 0 | src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ |
257 | 0 | filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \ |
258 | 0 | bd); \ |
259 | 0 | vpx_highbd_convolve8_##avg##vert_##opt( \ |
260 | 0 | fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \ |
261 | 0 | x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ |
262 | 336k | } else { \ |
263 | 336k | DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65] VPX_UNINITIALIZED); \ |
264 | 336k | vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ |
265 | 336k | x0_q4, x_step_q4, y0_q4, y_step_q4, \ |
266 | 336k | w, h + 1, bd); \ |
267 | 336k | vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ |
268 | 336k | filter, x0_q4, x_step_q4, \ |
269 | 336k | y0_q4, y_step_q4, w, h, bd); \ |
270 | 336k | } \ |
271 | 1.50M | } else { \ |
272 | 90.8k | vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \ |
273 | 90.8k | x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \ |
274 | 90.8k | bd); \ |
275 | 90.8k | } \ |
276 | 1.59M | } Unexecuted instantiation: vpx_highbd_convolve8_sse2 Unexecuted instantiation: vpx_highbd_convolve8_avg_sse2 vpx_highbd_convolve8_avx2 Line | Count | Source | 238 | 1.12M | int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ | 239 | 1.12M | const int16_t *filter_x = filter[x0_q4]; \ | 240 | 1.12M | assert(w <= 64); \ | 241 | 1.12M | assert(h <= 64); \ | 242 | 1.12M | if (x_step_q4 == 16 && y_step_q4 == 16) { \ | 243 | 1.06M | if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \ | 244 | 1.06M | filter_x[3] == 128) { \ | 245 | 818k | DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ | 246 | 818k | vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ | 247 | 818k | fdata2, 64, filter, x0_q4, x_step_q4, \ | 248 | 818k | y0_q4, y_step_q4, w, h + 7, bd); \ | 249 | 818k | vpx_highbd_convolve8_##avg##vert_##opt( \ | 250 | 818k | fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ | 251 | 818k | y0_q4, y_step_q4, w, h, bd); \ | 252 | 818k | } else if (filter_x[2] | filter_x[5]) { \ | 253 | 0 | const int num_taps = is_avg ? 
8 : 4; \ | 254 | 0 | DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ | 255 | 0 | vpx_highbd_convolve8_horiz_##opt( \ | 256 | 0 | src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ | 257 | 0 | filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \ | 258 | 0 | bd); \ | 259 | 0 | vpx_highbd_convolve8_##avg##vert_##opt( \ | 260 | 0 | fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \ | 261 | 0 | x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ | 262 | 248k | } else { \ | 263 | 248k | DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65] VPX_UNINITIALIZED); \ | 264 | 248k | vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ | 265 | 248k | x0_q4, x_step_q4, y0_q4, y_step_q4, \ | 266 | 248k | w, h + 1, bd); \ | 267 | 248k | vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ | 268 | 248k | filter, x0_q4, x_step_q4, \ | 269 | 248k | y0_q4, y_step_q4, w, h, bd); \ | 270 | 248k | } \ | 271 | 1.06M | } else { \ | 272 | 63.0k | vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \ | 273 | 63.0k | x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \ | 274 | 63.0k | bd); \ | 275 | 63.0k | } \ | 276 | 1.12M | } |
vpx_highbd_convolve8_avg_avx2 Line | Count | Source | 238 | 463k | int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ | 239 | 463k | const int16_t *filter_x = filter[x0_q4]; \ | 240 | 463k | assert(w <= 64); \ | 241 | 463k | assert(h <= 64); \ | 242 | 463k | if (x_step_q4 == 16 && y_step_q4 == 16) { \ | 243 | 435k | if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \ | 244 | 435k | filter_x[3] == 128) { \ | 245 | 346k | DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ | 246 | 346k | vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ | 247 | 346k | fdata2, 64, filter, x0_q4, x_step_q4, \ | 248 | 346k | y0_q4, y_step_q4, w, h + 7, bd); \ | 249 | 346k | vpx_highbd_convolve8_##avg##vert_##opt( \ | 250 | 346k | fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ | 251 | 346k | y0_q4, y_step_q4, w, h, bd); \ | 252 | 346k | } else if (filter_x[2] | filter_x[5]) { \ | 253 | 0 | const int num_taps = is_avg ? 8 : 4; \ | 254 | 0 | DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71] VPX_UNINITIALIZED); \ | 255 | 0 | vpx_highbd_convolve8_horiz_##opt( \ | 256 | 0 | src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ | 257 | 0 | filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \ | 258 | 0 | bd); \ | 259 | 0 | vpx_highbd_convolve8_##avg##vert_##opt( \ | 260 | 0 | fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \ | 261 | 0 | x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ | 262 | 88.7k | } else { \ | 263 | 88.7k | DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65] VPX_UNINITIALIZED); \ | 264 | 88.7k | vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ | 265 | 88.7k | x0_q4, x_step_q4, y0_q4, y_step_q4, \ | 266 | 88.7k | w, h + 1, bd); \ | 267 | 88.7k | vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ | 268 | 88.7k | filter, x0_q4, x_step_q4, \ | 269 | 88.7k | y0_q4, y_step_q4, w, h, bd); \ | 270 | 88.7k | } \ | 271 | 435k | } else 
{ \ | 272 | 27.7k | vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \ | 273 | 27.7k | x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \ | 274 | 27.7k | bd); \ | 275 | 27.7k | } \ | 276 | 463k | } |
|
277 | | |
278 | | #endif // CONFIG_VP9_HIGHBITDEPTH |
279 | | #endif // VPX_VPX_DSP_X86_CONVOLVE_H_ |