/src/aom/av1/common/cdef_block_simd.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ |
13 | | #define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ |
14 | | |
15 | | #include "config/aom_config.h" |
16 | | #include "config/av1_rtcd.h" |
17 | | |
18 | | #include "av1/common/cdef_block.h" |
19 | | |
20 | | /* partial A is a vector of 16-bit values of the form: |
21 | | [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: |
22 | | [0 y1 y2 y3 y4 y5 y6 y7]. |
23 | | This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... |
24 | | (x7^2+y7^2)*C7 + (x8^2+0^2)*C8, where the C1..C8 constants are in const1 |
25 | | and const2. */ |
26 | | static inline v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1, |
27 | 49.0k | v128 const2) { |
28 | 49.0k | v128 tmp; |
29 | | /* Reverse partial B. */ |
30 | 49.0k | partialb = v128_shuffle_8( |
31 | 49.0k | partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c)); |
32 | | /* Interleave the x and y values of identical indices and pair x8 with 0. */ |
33 | 49.0k | tmp = partiala; |
34 | 49.0k | partiala = v128_ziplo_16(partialb, partiala); |
35 | 49.0k | partialb = v128_ziphi_16(partialb, tmp); |
36 | | /* Square and add the corresponding x and y values. */ |
37 | 49.0k | partiala = v128_madd_s16(partiala, partiala); |
38 | 49.0k | partialb = v128_madd_s16(partialb, partialb); |
39 | | /* Multiply by constant. */ |
40 | 49.0k | partiala = v128_mullo_s32(partiala, const1); |
41 | 49.0k | partialb = v128_mullo_s32(partialb, const2); |
42 | | /* Sum all results. */ |
43 | 49.0k | partiala = v128_add_32(partiala, partialb); |
44 | 49.0k | return partiala; |
45 | 49.0k | } |
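A minimal scalar sketch of what fold_mul_and_sum computes, together with the hsum4 reduction that follows it (the helper below is illustrative and not part of this header; the SIMD version instead leaves four 32-bit partial sums per vector for hsum4 to fold):

    /* x[0..7] = x1..x8, y[0..6] = y1..y7 with y[7] = 0 (the lane paired
       with x8), c[0..7] = C1..C8 taken from const1 and const2. */
    static int64_t fold_mul_and_sum_scalar(const int16_t x[8],
                                           const int16_t y[8],
                                           const int32_t c[8]) {
      int64_t sum = 0;
      for (int k = 0; k < 8; k++)
        sum += (int64_t)((int32_t)x[k] * x[k] + (int32_t)y[k] * y[k]) * c[k];
      return sum;
    }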
46 | | |
47 | 16.3k | static inline v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) { |
48 | 16.3k | v128 t0, t1, t2, t3; |
49 | 16.3k | t0 = v128_ziplo_32(x1, x0); |
50 | 16.3k | t1 = v128_ziplo_32(x3, x2); |
51 | 16.3k | t2 = v128_ziphi_32(x1, x0); |
52 | 16.3k | t3 = v128_ziphi_32(x3, x2); |
53 | 16.3k | x0 = v128_ziplo_64(t1, t0); |
54 | 16.3k | x1 = v128_ziphi_64(t1, t0); |
55 | 16.3k | x2 = v128_ziplo_64(t3, t2); |
56 | 16.3k | x3 = v128_ziphi_64(t3, t2); |
57 | 16.3k | return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3)); |
58 | 16.3k | } |
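hsum4 is a 4x4 transpose of 32-bit lanes followed by three adds; semantically, lane i of the result is the horizontal sum of input vector i. A scalar sketch (illustrative, modeling each v128 as four int32 lanes):

    static void hsum4_scalar(const int32_t x[4][4], int32_t out[4]) {
      for (int i = 0; i < 4; i++)
        out[i] = x[i][0] + x[i][1] + x[i][2] + x[i][3];
    }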
59 | | |
60 | | /* Computes cost for directions 4, 5, 6 and 7. We can call this function again |
61 | | to compute the remaining directions. */ |
62 | 16.3k | static inline v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) { |
63 | 16.3k | v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b; |
64 | 16.3k | v128 partial6; |
65 | 16.3k | v128 tmp; |
66 | | /* Partial sums for lines 0 and 1. */ |
67 | 16.3k | partial4a = v128_shl_n_byte(lines[0], 14); |
68 | 16.3k | partial4b = v128_shr_n_byte(lines[0], 2); |
69 | 16.3k | partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12)); |
70 | 16.3k | partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4)); |
71 | 16.3k | tmp = v128_add_16(lines[0], lines[1]); |
72 | 16.3k | partial5a = v128_shl_n_byte(tmp, 10); |
73 | 16.3k | partial5b = v128_shr_n_byte(tmp, 6); |
74 | 16.3k | partial7a = v128_shl_n_byte(tmp, 4); |
75 | 16.3k | partial7b = v128_shr_n_byte(tmp, 12); |
76 | 16.3k | partial6 = tmp; |
77 | | |
78 | | /* Partial sums for lines 2 and 3. */ |
79 | 16.3k | partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10)); |
80 | 16.3k | partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6)); |
81 | 16.3k | partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8)); |
82 | 16.3k | partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8)); |
83 | 16.3k | tmp = v128_add_16(lines[2], lines[3]); |
84 | 16.3k | partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8)); |
85 | 16.3k | partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8)); |
86 | 16.3k | partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6)); |
87 | 16.3k | partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10)); |
88 | 16.3k | partial6 = v128_add_16(partial6, tmp); |
89 | | |
90 | | /* Partial sums for lines 4 and 5. */ |
91 | 16.3k | partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6)); |
92 | 16.3k | partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10)); |
93 | 16.3k | partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4)); |
94 | 16.3k | partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12)); |
95 | 16.3k | tmp = v128_add_16(lines[4], lines[5]); |
96 | 16.3k | partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6)); |
97 | 16.3k | partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10)); |
98 | 16.3k | partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8)); |
99 | 16.3k | partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8)); |
100 | 16.3k | partial6 = v128_add_16(partial6, tmp); |
101 | | |
102 | | /* Partial sums for lines 6 and 7. */ |
103 | 16.3k | partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2)); |
104 | 16.3k | partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14)); |
105 | 16.3k | partial4a = v128_add_16(partial4a, lines[7]); |
106 | 16.3k | tmp = v128_add_16(lines[6], lines[7]); |
107 | 16.3k | partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4)); |
108 | 16.3k | partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12)); |
109 | 16.3k | partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10)); |
110 | 16.3k | partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6)); |
111 | 16.3k | partial6 = v128_add_16(partial6, tmp); |
112 | | |
113 | | /* Compute costs in terms of partial sums. */ |
114 | 16.3k | partial4a = |
115 | 16.3k | fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840), |
116 | 16.3k | v128_from_32(105, 120, 140, 168)); |
117 | 16.3k | partial7a = |
118 | 16.3k | fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0), |
119 | 16.3k | v128_from_32(105, 105, 105, 140)); |
120 | 16.3k | partial5a = |
121 | 16.3k | fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0), |
122 | 16.3k | v128_from_32(105, 105, 105, 140)); |
123 | 16.3k | partial6 = v128_madd_s16(partial6, partial6); |
124 | 16.3k | partial6 = v128_mullo_s32(partial6, v128_dup_32(105)); |
125 | | |
126 | 16.3k | partial4a = hsum4(partial4a, partial5a, partial6, partial7a); |
127 | 16.3k | v128_store_unaligned(tmp_cost1, partial4a); |
128 | 16.3k | return partial4a; |
129 | 16.3k | } |
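A note on the weight constants, inferred from the arithmetic rather than stated in the file: each direction d partitions the 8x8 block into diagonal lines of N = 1..8 pixels, and its cost is the sum of squared line sums, each normalized by the line length. To stay in integer arithmetic the code multiplies by 840/N instead of dividing by N, 840 being lcm(1, ..., 8):

    cost_d = sum_k (840 / N_{d,k}) * s_{d,k}^2

where s_{d,k} is the k-th partial line sum for direction d. This is why the const1/const2 lanes above are drawn from 840/{1..8} = {840, 420, 280, 210, 168, 140, 120, 105}, and why partial6, whose eight lines each contain exactly 8 pixels, is simply squared and scaled by the single constant 105.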
130 | | |
131 | | /* Transpose and reverse the order of the lines -- equivalent to a 90-degree |
132 | | counter-clockwise rotation of the pixels. */ |
133 | 8.18k | static inline void array_reverse_transpose_8x8(v128 *in, v128 *res) { |
134 | 8.18k | const v128 tr0_0 = v128_ziplo_16(in[1], in[0]); |
135 | 8.18k | const v128 tr0_1 = v128_ziplo_16(in[3], in[2]); |
136 | 8.18k | const v128 tr0_2 = v128_ziphi_16(in[1], in[0]); |
137 | 8.18k | const v128 tr0_3 = v128_ziphi_16(in[3], in[2]); |
138 | 8.18k | const v128 tr0_4 = v128_ziplo_16(in[5], in[4]); |
139 | 8.18k | const v128 tr0_5 = v128_ziplo_16(in[7], in[6]); |
140 | 8.18k | const v128 tr0_6 = v128_ziphi_16(in[5], in[4]); |
141 | 8.18k | const v128 tr0_7 = v128_ziphi_16(in[7], in[6]); |
142 | | |
143 | 8.18k | const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0); |
144 | 8.18k | const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4); |
145 | 8.18k | const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0); |
146 | 8.18k | const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4); |
147 | 8.18k | const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2); |
148 | 8.18k | const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6); |
149 | 8.18k | const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2); |
150 | 8.18k | const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6); |
151 | | |
152 | 8.18k | res[7] = v128_ziplo_64(tr1_1, tr1_0); |
153 | 8.18k | res[6] = v128_ziphi_64(tr1_1, tr1_0); |
154 | 8.18k | res[5] = v128_ziplo_64(tr1_3, tr1_2); |
155 | 8.18k | res[4] = v128_ziphi_64(tr1_3, tr1_2); |
156 | 8.18k | res[3] = v128_ziplo_64(tr1_5, tr1_4); |
157 | 8.18k | res[2] = v128_ziphi_64(tr1_5, tr1_4); |
158 | 8.18k | res[1] = v128_ziplo_64(tr1_7, tr1_6); |
159 | 8.18k | res[0] = v128_ziphi_64(tr1_7, tr1_6); |
160 | 8.18k | } |
161 | | |
162 | | int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, |
163 | 8.18k | int coeff_shift) { |
164 | 8.18k | int i; |
165 | 8.18k | int32_t cost[8]; |
166 | 8.18k | int32_t best_cost = 0; |
167 | 8.18k | int best_dir = 0; |
168 | 8.18k | v128 lines[8]; |
169 | 73.6k | for (i = 0; i < 8; i++) { |
170 | 65.4k | lines[i] = v128_load_unaligned(&img[i * stride]); |
171 | 65.4k | lines[i] = |
172 | 65.4k | v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128)); |
173 | 65.4k | } |
174 | | |
175 | | /* Compute "mostly vertical" directions. */ |
176 | 8.18k | v128 dir47 = compute_directions(lines, cost + 4); |
177 | | |
178 | 8.18k | array_reverse_transpose_8x8(lines, lines); |
179 | | |
180 | | /* Compute "mostly horizontal" directions. */ |
181 | 8.18k | v128 dir03 = compute_directions(lines, cost); |
182 | | |
183 | 8.18k | v128 max = v128_max_s32(dir03, dir47); |
184 | 8.18k | max = v128_max_s32(max, v128_align(max, max, 8)); |
185 | 8.18k | max = v128_max_s32(max, v128_align(max, max, 4)); |
186 | 8.18k | best_cost = v128_low_u32(max); |
187 | 8.18k | v128 t = |
188 | 8.18k | v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03)); |
189 | 8.18k | best_dir = v128_movemask_8(v128_pack_s16_s8(t, t)); |
190 | 8.18k | best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros |
191 | | |
192 | | /* Difference between the optimal variance and the variance along the |
193 | | orthogonal direction. Again, the sum(x^2) terms cancel out. */ |
194 | 8.18k | *var = best_cost - cost[(best_dir + 4) & 7]; |
195 | | /* We'd normally divide by 840, but dividing by 1024 is close enough |
196 | | for what we're going to do with this. */ |
197 | 8.18k | *var >>= 10; |
198 | 8.18k | return best_dir; |
199 | 8.18k | } |
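The tail of the function packs the two cost vectors, takes a lane-wise max, and uses get_msb(x ^ (x - 1)) on the equality mask to pick the first direction attaining that max. A scalar sketch of the same selection (hypothetical helper, assuming cost[0..7] has already been filled in by compute_directions):

    static int pick_best_dir(const int32_t cost[8], int32_t *var) {
      int best_dir = 0;
      for (int d = 1; d < 8; d++)
        if (cost[d] > cost[best_dir]) best_dir = d;  /* lowest index on ties */
      /* Gap to the orthogonal direction, scaled down by 1024 (~840). */
      *var = (cost[best_dir] - cost[(best_dir + 4) & 7]) >> 10;
      return best_dir;
    }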
200 | | |
201 | | // Work around compiler out of memory issues with Win32 builds. This issue has |
202 | | // been observed with Visual Studio 2017, 2019, and 2022 (version 17.10.3). |
203 | | #if defined(_MSC_VER) && defined(_M_IX86) |
204 | | #define CDEF_INLINE static inline |
205 | | #else |
206 | | #define CDEF_INLINE SIMD_INLINE |
207 | | #endif |
208 | | |
209 | | // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) |
210 | | CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold, |
211 | 321M | unsigned int adjdamp) { |
212 | 321M | v256 diff = v256_sub_16(a, b); |
213 | 321M | const v256 sign = v256_shr_n_s16(diff, 15); |
214 | 321M | diff = v256_abs_s16(diff); |
215 | 321M | const v256 s = |
216 | 321M | v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp)); |
217 | 321M | return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign); |
218 | 321M | } |
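constrain16 is a branch-free form of the formula in the comment: v256_ssub_u16 supplies the max(0, ...) via unsigned saturation, and the shift/add/xor trio applies sign(a - b). A scalar sketch of one 16-bit lane (illustrative, not part of the header):

    #include <stdlib.h>

    static int constrain16_scalar(int a, int b, unsigned int threshold,
                                  unsigned int adjdamp) {
      const int diff = a - b;
      const int adiff = abs(diff);
      int s = (int)threshold - (adiff >> adjdamp);
      if (s < 0) s = 0;
      const int m = adiff < s ? adiff : s;
      return diff < 0 ? -m : m;
    }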
219 | | |
220 | | SIMD_INLINE v256 get_max_primary(const int is_lowbd, v256 *tap, v256 max, |
221 | 45.6M | v256 cdef_large_value_mask) { |
222 | 45.6M | if (is_lowbd) { |
223 | 22.8M | v256 max_u8; |
224 | 22.8M | max_u8 = tap[0]; |
225 | 22.8M | max_u8 = v256_max_u8(max_u8, tap[1]); |
226 | 22.8M | max_u8 = v256_max_u8(max_u8, tap[2]); |
227 | 22.8M | max_u8 = v256_max_u8(max_u8, tap[3]); |
228 | | /* The source is 16 bits; however, we only really care about the lower |
229 | | 8 bits. The upper 8 bits contain the "large" flag. After the final |
230 | | primary max has been calculated, zero out the upper 8 bits. Use this |
231 | | to find the "16 bit" max. */ |
232 | 22.8M | max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask)); |
233 | 22.8M | } else { |
234 | | /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ |
235 | 22.8M | max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask)); |
236 | 22.8M | max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask)); |
237 | 22.8M | max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask)); |
238 | 22.8M | max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask)); |
239 | 22.8M | } |
240 | 45.6M | return max; |
241 | 45.6M | } |
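Both branches rely on the same property: padding pixels are stored as CDEF_VERY_LARGE, and AND-ing with its complement maps them to values that cannot win the max (an exact CDEF_VERY_LARGE becomes 0, per the comment above). A scalar sketch of one masked max step in the high-bit-depth branch (illustrative):

    static int16_t masked_max(int16_t cur_max, uint16_t tap) {
      /* Clearing the CDEF_VERY_LARGE bits maps padding pixels to 0. */
      const int16_t v = (int16_t)(tap & (uint16_t)~CDEF_VERY_LARGE);
      return v > cur_max ? v : cur_max;
    }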
242 | | |
243 | | SIMD_INLINE v256 get_max_secondary(const int is_lowbd, v256 *tap, v256 max, |
244 | 49.2M | v256 cdef_large_value_mask) { |
245 | 49.2M | if (is_lowbd) { |
246 | 24.3M | v256 max_u8; |
247 | 24.3M | max_u8 = tap[0]; |
248 | 24.3M | max_u8 = v256_max_u8(max_u8, tap[1]); |
249 | 24.3M | max_u8 = v256_max_u8(max_u8, tap[2]); |
250 | 24.3M | max_u8 = v256_max_u8(max_u8, tap[3]); |
251 | 24.3M | max_u8 = v256_max_u8(max_u8, tap[4]); |
252 | 24.3M | max_u8 = v256_max_u8(max_u8, tap[5]); |
253 | 24.3M | max_u8 = v256_max_u8(max_u8, tap[6]); |
254 | 24.3M | max_u8 = v256_max_u8(max_u8, tap[7]); |
255 | | /* The source is 16 bits; however, we only really care about the lower |
256 | | 8 bits. The upper 8 bits contain the "large" flag. After the final |
257 | | secondary max has been calculated, zero out the upper 8 bits. Use this |
258 | | to find the "16 bit" max. */ |
259 | 24.3M | max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask)); |
260 | 24.9M | } else { |
261 | | /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ |
262 | 24.9M | max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask)); |
263 | 24.9M | max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask)); |
264 | 24.9M | max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask)); |
265 | 24.9M | max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask)); |
266 | 24.9M | max = v256_max_s16(max, v256_and(tap[4], cdef_large_value_mask)); |
267 | 24.9M | max = v256_max_s16(max, v256_and(tap[5], cdef_large_value_mask)); |
268 | 24.9M | max = v256_max_s16(max, v256_and(tap[6], cdef_large_value_mask)); |
269 | 24.9M | max = v256_max_s16(max, v256_and(tap[7], cdef_large_value_mask)); |
270 | 24.9M | } |
271 | 49.2M | return max; |
272 | 49.2M | } |
273 | | |
274 | | // MSVC takes far too much time optimizing these. |
275 | | // https://bugs.chromium.org/p/aomedia/issues/detail?id=3395 |
276 | | #if defined(_MSC_VER) && !defined(__clang__) |
277 | | #pragma optimize("", off) |
278 | | #endif |
279 | | |
280 | | CDEF_INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride, |
281 | | const uint16_t *in, int pri_strength, |
282 | | int sec_strength, int dir, int pri_damping, |
283 | | int sec_damping, int coeff_shift, int height, |
284 | 47.0M | int enable_primary, int enable_secondary) { |
285 | 47.0M | uint8_t *dst8 = (uint8_t *)dest; |
286 | 47.0M | uint16_t *dst16 = (uint16_t *)dest; |
287 | 47.0M | const int clipping_required = enable_primary && enable_secondary; |
288 | 47.0M | v256 p0, p1, p2, p3; |
289 | 47.0M | v256 sum, row, res; |
290 | 47.0M | v256 max, min; |
291 | 47.0M | const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE); |
292 | 47.0M | const int po1 = cdef_directions[dir][0]; |
293 | 47.0M | const int po2 = cdef_directions[dir][1]; |
294 | 47.0M | const int s1o1 = cdef_directions[dir + 2][0]; |
295 | 47.0M | const int s1o2 = cdef_directions[dir + 2][1]; |
296 | 47.0M | const int s2o1 = cdef_directions[dir - 2][0]; |
297 | 47.0M | const int s2o2 = cdef_directions[dir - 2][1]; |
298 | 47.0M | const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; |
299 | 47.0M | const int *sec_taps = cdef_sec_taps; |
300 | 47.0M | int i; |
301 | | |
302 | 47.0M | if (enable_primary && pri_strength) |
303 | 47.0M | pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); |
304 | 47.0M | if (enable_secondary && sec_strength) |
305 | 44.8M | sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); |
306 | | |
307 | 95.0M | for (i = 0; i < height; i += 4) { |
308 | 47.9M | sum = v256_zero(); |
309 | 47.9M | row = v256_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]), |
310 | 47.9M | v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]), |
311 | 47.9M | v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), |
312 | 47.9M | v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); |
313 | 47.9M | max = min = row; |
314 | | |
315 | 47.9M | if (enable_primary) { |
316 | 46.5M | v256 tap[4]; |
317 | | // Primary near taps |
318 | 46.5M | tap[0] = |
319 | 46.5M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po1]), |
320 | 46.5M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]), |
321 | 46.5M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]), |
322 | 46.5M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1])); |
323 | 46.5M | p0 = constrain16(tap[0], row, pri_strength, pri_damping); |
324 | 46.5M | tap[1] = |
325 | 46.5M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po1]), |
326 | 46.5M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]), |
327 | 46.5M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]), |
328 | 46.5M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1])); |
329 | 46.5M | p1 = constrain16(tap[1], row, pri_strength, pri_damping); |
330 | | |
331 | | // sum += pri_taps[0] * (p0 + p1) |
332 | 46.5M | sum = v256_add_16( |
333 | 46.5M | sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); |
334 | | |
335 | | // Primary far taps |
336 | 46.5M | tap[2] = |
337 | 46.5M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po2]), |
338 | 46.5M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]), |
339 | 46.5M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]), |
340 | 46.5M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2])); |
341 | 46.5M | p0 = constrain16(tap[2], row, pri_strength, pri_damping); |
342 | 46.5M | tap[3] = |
343 | 46.5M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po2]), |
344 | 46.5M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]), |
345 | 46.5M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]), |
346 | 46.5M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2])); |
347 | 46.5M | p1 = constrain16(tap[3], row, pri_strength, pri_damping); |
348 | | |
349 | | // sum += pri_taps[1] * (p0 + p1) |
350 | 46.5M | sum = v256_add_16( |
351 | 46.5M | sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); |
352 | 46.5M | if (clipping_required) { |
353 | 33.1M | max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); |
354 | | |
355 | 33.1M | min = v256_min_s16(min, tap[0]); |
356 | 33.1M | min = v256_min_s16(min, tap[1]); |
357 | 33.1M | min = v256_min_s16(min, tap[2]); |
358 | 33.1M | min = v256_min_s16(min, tap[3]); |
359 | 33.1M | } |
360 | 46.5M | } |
361 | | |
362 | 47.9M | if (enable_secondary) { |
363 | 33.5M | v256 tap[8]; |
364 | | // Secondary near taps |
365 | 33.5M | tap[0] = |
366 | 33.5M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o1]), |
367 | 33.5M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]), |
368 | 33.5M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]), |
369 | 33.5M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1])); |
370 | 33.5M | p0 = constrain16(tap[0], row, sec_strength, sec_damping); |
371 | 33.5M | tap[1] = |
372 | 33.5M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o1]), |
373 | 33.5M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]), |
374 | 33.5M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]), |
375 | 33.5M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1])); |
376 | 33.5M | p1 = constrain16(tap[1], row, sec_strength, sec_damping); |
377 | 33.5M | tap[2] = |
378 | 33.5M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o1]), |
379 | 33.5M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]), |
380 | 33.5M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]), |
381 | 33.5M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1])); |
382 | 33.5M | p2 = constrain16(tap[2], row, sec_strength, sec_damping); |
383 | 33.5M | tap[3] = |
384 | 33.5M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o1]), |
385 | 33.5M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]), |
386 | 33.5M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]), |
387 | 33.5M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1])); |
388 | 33.5M | p3 = constrain16(tap[3], row, sec_strength, sec_damping); |
389 | | |
390 | | // sum += sec_taps[0] * (p0 + p1 + p2 + p3) |
391 | 33.5M | sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), |
392 | 33.5M | v256_add_16(v256_add_16(p0, p1), |
393 | 33.5M | v256_add_16(p2, p3)))); |
394 | | |
395 | | // Secondary far taps |
396 | 33.5M | tap[4] = |
397 | 33.5M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o2]), |
398 | 33.5M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]), |
399 | 33.5M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]), |
400 | 33.5M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2])); |
401 | 33.5M | p0 = constrain16(tap[4], row, sec_strength, sec_damping); |
402 | 33.5M | tap[5] = |
403 | 33.5M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o2]), |
404 | 33.5M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]), |
405 | 33.5M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]), |
406 | 33.5M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2])); |
407 | 33.5M | p1 = constrain16(tap[5], row, sec_strength, sec_damping); |
408 | 33.5M | tap[6] = |
409 | 33.5M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o2]), |
410 | 33.5M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]), |
411 | 33.5M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]), |
412 | 33.5M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2])); |
413 | 33.5M | p2 = constrain16(tap[6], row, sec_strength, sec_damping); |
414 | 33.5M | tap[7] = |
415 | 33.5M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o2]), |
416 | 33.5M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]), |
417 | 33.5M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]), |
418 | 33.5M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2])); |
419 | 33.5M | p3 = constrain16(tap[7], row, sec_strength, sec_damping); |
420 | | |
421 | | // sum += sec_taps[1] * (p0 + p1 + p2 + p3) |
422 | 33.5M | sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), |
423 | 33.5M | v256_add_16(v256_add_16(p0, p1), |
424 | 33.5M | v256_add_16(p2, p3)))); |
425 | | |
426 | 34.4M | if (clipping_required) { |
427 | 34.4M | max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); |
428 | | |
429 | 34.4M | min = v256_min_s16(min, tap[0]); |
430 | 34.4M | min = v256_min_s16(min, tap[1]); |
431 | 34.4M | min = v256_min_s16(min, tap[2]); |
432 | 34.4M | min = v256_min_s16(min, tap[3]); |
433 | 34.4M | min = v256_min_s16(min, tap[4]); |
434 | 34.4M | min = v256_min_s16(min, tap[5]); |
435 | 34.4M | min = v256_min_s16(min, tap[6]); |
436 | 34.4M | min = v256_min_s16(min, tap[7]); |
437 | 34.4M | } |
438 | 33.5M | } |
439 | | |
440 | | // res = row + ((sum - (sum < 0) + 8) >> 4) |
441 | 47.9M | sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); |
442 | 47.9M | res = v256_add_16(sum, v256_dup_16(8)); |
443 | 47.9M | res = v256_shr_n_s16(res, 4); |
444 | 47.9M | res = v256_add_16(row, res); |
445 | 47.9M | if (clipping_required) { |
446 | 34.4M | res = v256_min_s16(v256_max_s16(res, min), max); |
447 | 34.4M | } |
448 | | |
449 | 47.9M | if (is_lowbd) { |
450 | 22.5M | const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res)); |
451 | 22.5M | u32_store_aligned(&dst8[(i + 0) * dstride], |
452 | 22.5M | v64_high_u32(v128_high_v64(res_128))); |
453 | 22.5M | u32_store_aligned(&dst8[(i + 1) * dstride], |
454 | 22.5M | v64_low_u32(v128_high_v64(res_128))); |
455 | 22.5M | u32_store_aligned(&dst8[(i + 2) * dstride], |
456 | 22.5M | v64_high_u32(v128_low_v64(res_128))); |
457 | 22.5M | u32_store_aligned(&dst8[(i + 3) * dstride], |
458 | 22.5M | v64_low_u32(v128_low_v64(res_128))); |
459 | 25.3M | } else { |
460 | 25.3M | v64_store_aligned(&dst16[(i + 0) * dstride], |
461 | 25.3M | v128_high_v64(v256_high_v128(res))); |
462 | 25.3M | v64_store_aligned(&dst16[(i + 1) * dstride], |
463 | 25.3M | v128_low_v64(v256_high_v128(res))); |
464 | 25.3M | v64_store_aligned(&dst16[(i + 2) * dstride], |
465 | 25.3M | v128_high_v64(v256_low_v128(res))); |
466 | 25.3M | v64_store_aligned(&dst16[(i + 3) * dstride], |
467 | 25.3M | v128_low_v64(v256_low_v128(res))); |
468 | 25.3M | } |
469 | 47.9M | } |
470 | 47.0M | } |
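The epilogue implements the rounding written in the comment: v256_cmplt_s16 yields -1 per lane when sum < 0, so adding it is sum -= (sum < 0), and the net effect is a divide-by-16 that rounds to nearest with ties away from zero. A scalar sketch for one lane (illustrative):

    static int16_t cdef_apply_rounding(int16_t row, int sum) {
      sum -= (sum < 0);  /* bias negative sums so the shift rounds symmetrically */
      return (int16_t)(row + ((sum + 8) >> 4));
    }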
471 | | |
472 | | CDEF_INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride, |
473 | | const uint16_t *in, int pri_strength, |
474 | | int sec_strength, int dir, int pri_damping, |
475 | | int sec_damping, int coeff_shift, int height, |
476 | 20.4M | int enable_primary, int enable_secondary) { |
477 | 20.4M | uint8_t *dst8 = (uint8_t *)dest; |
478 | 20.4M | uint16_t *dst16 = (uint16_t *)dest; |
479 | 20.4M | const int clipping_required = enable_primary && enable_secondary; |
480 | 20.4M | int i; |
481 | 20.4M | v256 sum, p0, p1, p2, p3, row, res; |
482 | 20.4M | const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE); |
483 | 20.4M | v256 max, min; |
484 | 20.4M | const int po1 = cdef_directions[dir][0]; |
485 | 20.4M | const int po2 = cdef_directions[dir][1]; |
486 | 20.4M | const int s1o1 = cdef_directions[dir + 2][0]; |
487 | 20.4M | const int s1o2 = cdef_directions[dir + 2][1]; |
488 | 20.4M | const int s2o1 = cdef_directions[dir - 2][0]; |
489 | 20.4M | const int s2o2 = cdef_directions[dir - 2][1]; |
490 | 20.4M | const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; |
491 | 20.4M | const int *sec_taps = cdef_sec_taps; |
492 | | |
493 | 20.4M | if (enable_primary && pri_strength) |
494 | 13.3M | pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); |
495 | 20.4M | if (enable_secondary && sec_strength) |
496 | 17.4M | sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); |
497 | | |
498 | 83.4M | for (i = 0; i < height; i += 2) { |
499 | 63.0M | v256 tap[8]; |
500 | 63.0M | sum = v256_zero(); |
501 | 63.0M | row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]), |
502 | 63.0M | v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); |
503 | | |
504 | 63.0M | min = max = row; |
505 | 63.0M | if (enable_primary) { |
506 | | // Primary near taps |
507 | 42.2M | tap[0] = v256_from_v128( |
508 | 42.2M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), |
509 | 42.2M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); |
510 | 42.2M | tap[1] = v256_from_v128( |
511 | 42.2M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), |
512 | 42.2M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); |
513 | 42.2M | p0 = constrain16(tap[0], row, pri_strength, pri_damping); |
514 | 42.2M | p1 = constrain16(tap[1], row, pri_strength, pri_damping); |
515 | | |
516 | | // sum += pri_taps[0] * (p0 + p1) |
517 | 42.2M | sum = v256_add_16( |
518 | 42.2M | sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); |
519 | | |
520 | | // Primary far taps |
521 | 42.2M | tap[2] = v256_from_v128( |
522 | 42.2M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), |
523 | 42.2M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); |
524 | 42.2M | tap[3] = v256_from_v128( |
525 | 42.2M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), |
526 | 42.2M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); |
527 | 42.2M | p0 = constrain16(tap[2], row, pri_strength, pri_damping); |
528 | 42.2M | p1 = constrain16(tap[3], row, pri_strength, pri_damping); |
529 | | |
530 | | // sum += pri_taps[1] * (p0 + p1) |
531 | 42.2M | sum = v256_add_16( |
532 | 42.2M | sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); |
533 | | |
534 | 42.2M | if (clipping_required) { |
535 | 26.3M | max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); |
536 | | |
537 | 26.3M | min = v256_min_s16(min, tap[0]); |
538 | 26.3M | min = v256_min_s16(min, tap[1]); |
539 | 26.3M | min = v256_min_s16(min, tap[2]); |
540 | 26.3M | min = v256_min_s16(min, tap[3]); |
541 | 26.3M | } |
542 | | // End primary |
543 | 42.2M | } |
544 | | |
545 | 63.0M | if (enable_secondary) { |
546 | | // Secondary near taps |
547 | 49.4M | tap[0] = v256_from_v128( |
548 | 49.4M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), |
549 | 49.4M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); |
550 | 49.4M | tap[1] = v256_from_v128( |
551 | 49.4M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), |
552 | 49.4M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); |
553 | 49.4M | tap[2] = v256_from_v128( |
554 | 49.4M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), |
555 | 49.4M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1])); |
556 | 49.4M | tap[3] = v256_from_v128( |
557 | 49.4M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), |
558 | 49.4M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); |
559 | 49.4M | p0 = constrain16(tap[0], row, sec_strength, sec_damping); |
560 | 49.4M | p1 = constrain16(tap[1], row, sec_strength, sec_damping); |
561 | 49.4M | p2 = constrain16(tap[2], row, sec_strength, sec_damping); |
562 | 49.4M | p3 = constrain16(tap[3], row, sec_strength, sec_damping); |
563 | | |
564 | | // sum += sec_taps[0] * (p0 + p1 + p2 + p3) |
565 | 49.4M | sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), |
566 | 49.4M | v256_add_16(v256_add_16(p0, p1), |
567 | 49.4M | v256_add_16(p2, p3)))); |
568 | | |
569 | | // Secondary far taps |
570 | 49.4M | tap[4] = v256_from_v128( |
571 | 49.4M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), |
572 | 49.4M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); |
573 | 49.4M | tap[5] = v256_from_v128( |
574 | 49.4M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), |
575 | 49.4M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); |
576 | 49.4M | tap[6] = v256_from_v128( |
577 | 49.4M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), |
578 | 49.4M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); |
579 | 49.4M | tap[7] = v256_from_v128( |
580 | 49.4M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), |
581 | 49.4M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); |
582 | 49.4M | p0 = constrain16(tap[4], row, sec_strength, sec_damping); |
583 | 49.4M | p1 = constrain16(tap[5], row, sec_strength, sec_damping); |
584 | 49.4M | p2 = constrain16(tap[6], row, sec_strength, sec_damping); |
585 | 49.4M | p3 = constrain16(tap[7], row, sec_strength, sec_damping); |
586 | | |
587 | | // sum += sec_taps[1] * (p0 + p1 + p2 + p3) |
588 | 49.4M | sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), |
589 | 49.4M | v256_add_16(v256_add_16(p0, p1), |
590 | 49.4M | v256_add_16(p2, p3)))); |
591 | | |
592 | 49.4M | if (clipping_required) { |
593 | 25.3M | max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); |
594 | | |
595 | 25.3M | min = v256_min_s16(min, tap[0]); |
596 | 25.3M | min = v256_min_s16(min, tap[1]); |
597 | 25.3M | min = v256_min_s16(min, tap[2]); |
598 | 25.3M | min = v256_min_s16(min, tap[3]); |
599 | 25.3M | min = v256_min_s16(min, tap[4]); |
600 | 25.3M | min = v256_min_s16(min, tap[5]); |
601 | 25.3M | min = v256_min_s16(min, tap[6]); |
602 | 25.3M | min = v256_min_s16(min, tap[7]); |
603 | 25.3M | } |
604 | | // End secondary |
605 | 49.4M | } |
606 | | |
607 | | // res = row + ((sum - (sum < 0) + 8) >> 4) |
608 | 63.0M | sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); |
609 | 63.0M | res = v256_add_16(sum, v256_dup_16(8)); |
610 | 63.0M | res = v256_shr_n_s16(res, 4); |
611 | 63.0M | res = v256_add_16(row, res); |
612 | 63.0M | if (clipping_required) { |
613 | 25.3M | res = v256_min_s16(v256_max_s16(res, min), max); |
614 | 25.3M | } |
615 | | |
616 | 63.0M | if (is_lowbd) { |
617 | 28.7M | const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res)); |
618 | 28.7M | v64_store_aligned(&dst8[i * dstride], v128_high_v64(res_128)); |
619 | 28.7M | v64_store_aligned(&dst8[(i + 1) * dstride], v128_low_v64(res_128)); |
620 | 34.2M | } else { |
621 | 34.2M | v128_store_unaligned(&dst16[i * dstride], v256_high_v128(res)); |
622 | 34.2M | v128_store_unaligned(&dst16[(i + 1) * dstride], v256_low_v128(res)); |
623 | 34.2M | } |
624 | 63.0M | } |
625 | 20.4M | } Unexecuted instantiation: cdef_block_sse4.c:filter_block_8x8 (executed instantiation: cdef_block_avx2.c:filter_block_8x8; per-line counts match the annotated source above)
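The filter's final step, res = row + ((sum - (sum < 0) + 8) >> 4), can be modeled per pixel in scalar C. A minimal sketch of that step, assuming an arithmetic right shift for negative values (as v256_shr_n_s16 provides); cdef_round_clamp_scalar is an illustrative name, not part of libaom:

/* Scalar model of the rounding and optional clamping applied to each
 * filtered pixel: biasing negative sums down by 1 makes the arithmetic
 * shift round symmetrically around zero; the result is then clamped to
 * the [mn, mx] range of the taps when both primary and secondary
 * filtering are enabled. */
static int cdef_round_clamp_scalar(int row, int sum, int mn, int mx,
                                   int clipping_required) {
  int res = row + ((sum - (sum < 0) + 8) >> 4);
  if (clipping_required) {
    if (res < mn) res = mn;
    if (res > mx) res = mx;
  }
  return res;
}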
|
626 | | |
627 | | #if defined(_MSC_VER) && !defined(__clang__) |
628 | | #pragma optimize("", on) |
629 | | #endif |
630 | | |
631 | | SIMD_INLINE void copy_block_4xh(const int is_lowbd, void *dest, int dstride, |
632 | 0 | const uint16_t *in, int height) { |
633 | 0 | uint8_t *dst8 = (uint8_t *)dest; |
634 | 0 | uint16_t *dst16 = (uint16_t *)dest; |
635 | 0 | int i; |
636 | 0 | for (i = 0; i < height; i += 4) { |
637 | 0 | const v128 row0 = |
638 | 0 | v128_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]), |
639 | 0 | v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); |
640 | 0 | const v128 row1 = |
641 | 0 | v128_from_v64(v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), |
642 | 0 | v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); |
643 | 0 | if (is_lowbd) { |
644 | 0 | /* Note: v128_pack_s16_u8(a, b) packs b into the low eight bytes and a into the high eight bytes. */
645 | 0 | const v128 res_128 = v128_pack_s16_u8(row1, row0); |
646 | 0 | u32_store_aligned(&dst8[(i + 0) * dstride], |
647 | 0 | v64_high_u32(v128_low_v64(res_128))); |
648 | 0 | u32_store_aligned(&dst8[(i + 1) * dstride], |
649 | 0 | v64_low_u32(v128_low_v64(res_128))); |
650 | 0 | u32_store_aligned(&dst8[(i + 2) * dstride], |
651 | 0 | v64_high_u32(v128_high_v64(res_128))); |
652 | 0 | u32_store_aligned(&dst8[(i + 3) * dstride], |
653 | 0 | v64_low_u32(v128_high_v64(res_128))); |
654 | 0 | } else { |
655 | 0 | v64_store_aligned(&dst16[(i + 0) * dstride], v128_high_v64(row0)); |
656 | 0 | v64_store_aligned(&dst16[(i + 1) * dstride], v128_low_v64(row0)); |
657 | 0 | v64_store_aligned(&dst16[(i + 2) * dstride], v128_high_v64(row1)); |
658 | 0 | v64_store_aligned(&dst16[(i + 3) * dstride], v128_low_v64(row1)); |
659 | 0 | } |
660 | 0 | } |
661 | 0 | } Unexecuted instantiation: cdef_block_sse4.c:copy_block_4xh Unexecuted instantiation: cdef_block_avx2.c:copy_block_4xh |
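As a cross-check of the lowbd path above, here is a scalar sketch of what copy_block_4xh produces, under the assumption that v128_pack_s16_u8() saturates each signed 16-bit lane to an unsigned byte; copy_block_4xh_scalar is an illustrative name. CDEF_BSTRIDE comes from av1/common/cdef_block.h.

#include <stdint.h>

/* Scalar model of copy_block_4xh: the highbd path is a plain 16-bit
 * copy; the lowbd path narrows each 16-bit pixel to 8 bits with
 * unsigned saturation, matching the pack in the SIMD version. */
static void copy_block_4xh_scalar(int is_lowbd, void *dest, int dstride,
                                  const uint16_t *in, int height) {
  uint8_t *dst8 = (uint8_t *)dest;
  uint16_t *dst16 = (uint16_t *)dest;
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < 4; j++) {
      const int16_t v = (int16_t)in[i * CDEF_BSTRIDE + j];
      if (is_lowbd)
        dst8[i * dstride + j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
      else
        dst16[i * dstride + j] = in[i * CDEF_BSTRIDE + j];
    }
  }
}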
662 | | |
663 | | SIMD_INLINE void copy_block_8xh(const int is_lowbd, void *dest, int dstride, |
664 | 5.22M | const uint16_t *in, int height) { |
665 | 5.22M | uint8_t *dst8 = (uint8_t *)dest; |
666 | 5.22M | uint16_t *dst16 = (uint16_t *)dest; |
667 | 5.22M | int i; |
668 | 26.0M | for (i = 0; i < height; i += 2) { |
669 | 20.8M | const v128 row0 = v128_load_aligned(&in[i * CDEF_BSTRIDE]); |
670 | 20.8M | const v128 row1 = v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]); |
671 | 20.8M | if (is_lowbd) { |
672 | | /* Note: v128_pack_s16_u8(a, b) packs b into the low eight bytes and a into the high eight bytes. */
673 | 11.0M | const v128 res_128 = v128_pack_s16_u8(row1, row0); |
674 | 11.0M | v64_store_aligned(&dst8[i * dstride], v128_low_v64(res_128)); |
675 | 11.0M | v64_store_aligned(&dst8[(i + 1) * dstride], v128_high_v64(res_128)); |
676 | 11.0M | } else { |
677 | 9.75M | v128_store_unaligned(&dst16[i * dstride], row0); |
678 | 9.75M | v128_store_unaligned(&dst16[(i + 1) * dstride], row1); |
679 | 9.75M | } |
680 | 20.8M | } |
681 | 5.22M | } Unexecuted instantiation: cdef_block_sse4.c:copy_block_8xh (executed instantiation: cdef_block_avx2.c:copy_block_8xh; per-line counts match the annotated source above)
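The swapped argument order matters because the low half of the packed vector feeds row i and the high half row i + 1. A minimal check of that ordering, assuming the v64/v128 helpers from aom_dsp/simd behave as they are used above; check_pack_order is an illustrative name:

#include <assert.h>
#include "aom_dsp/simd/v128_intrinsics.h"

/* v128_pack_s16_u8(a, b) packs b into the low 8 bytes and a into the
 * high 8 bytes, so pack(row1, row0) keeps row0 first in memory order. */
static void check_pack_order(void) {
  const v128 row0 = v128_dup_16(1);  /* eight 16-bit lanes of 1 */
  const v128 row1 = v128_dup_16(2);  /* eight 16-bit lanes of 2 */
  const v128 packed = v128_pack_s16_u8(row1, row0);
  assert(v64_u64(v128_low_v64(packed)) == 0x0101010101010101ULL);
  assert(v64_u64(v128_high_v64(packed)) == 0x0202020202020202ULL);
}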
|
682 | | |
683 | | void SIMD_FUNC(cdef_filter_8_0)(void *dest, int dstride, const uint16_t *in, |
684 | | int pri_strength, int sec_strength, int dir, |
685 | | int pri_damping, int sec_damping, |
686 | | int coeff_shift, int block_width, |
687 | 26.5M | int block_height) { |
688 | 26.5M | if (block_width == 8) { |
689 | 4.73M | filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, |
690 | 4.73M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
691 | 4.73M | block_height, /*enable_primary=*/1, |
692 | 4.73M | /*enable_secondary=*/1); |
693 | 21.7M | } else { |
694 | 21.7M | filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, |
695 | 21.7M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
696 | 21.7M | block_height, /*enable_primary=*/1, |
697 | 21.7M | /*enable_secondary=*/1); |
698 | 21.7M | } |
699 | 26.5M | } Unexecuted instantiation: cdef_filter_8_0_sse4_1 (executed instantiation's per-line counts match the annotated source above)
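For orientation, a hypothetical call of this entry point on an 8x8 lowbd block; the strength and damping values are illustrative only, and in must point into the padded, CDEF_BSTRIDE-strided 16-bit source buffer that the filter loads from:

/* Hypothetical usage sketch; parameter values are illustrative. */
static void example_call_8_0(uint8_t *dst, const uint16_t *in) {
  /* Fully enabled (primary + secondary) 8-bit filtering of an 8x8 block. */
  cdef_filter_8_0_sse4_1(dst, /*dstride=*/64, in,
                         /*pri_strength=*/4, /*sec_strength=*/2, /*dir=*/0,
                         /*pri_damping=*/3, /*sec_damping=*/3,
                         /*coeff_shift=*/0, /*block_width=*/8,
                         /*block_height=*/8);
}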
|
700 | | |
701 | | void SIMD_FUNC(cdef_filter_8_1)(void *dest, int dstride, const uint16_t *in, |
702 | | int pri_strength, int sec_strength, int dir, |
703 | | int pri_damping, int sec_damping, |
704 | | int coeff_shift, int block_width, |
705 | 3.15M | int block_height) { |
706 | 3.15M | if (block_width == 8) { |
707 | 1.80M | filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, |
708 | 1.80M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
709 | 1.80M | block_height, /*enable_primary=*/1, |
710 | 1.80M | /*enable_secondary=*/0); |
711 | 1.80M | } else { |
712 | 1.35M | filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, |
713 | 1.35M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
714 | 1.35M | block_height, /*enable_primary=*/1, |
715 | 1.35M | /*enable_secondary=*/0); |
716 | 1.35M | } |
717 | 3.15M | } Unexecuted instantiation: cdef_filter_8_1_sse4_1 (executed instantiation's per-line counts match the annotated source above)
|
718 | | void SIMD_FUNC(cdef_filter_8_2)(void *dest, int dstride, const uint16_t *in, |
719 | | int pri_strength, int sec_strength, int dir, |
720 | | int pri_damping, int sec_damping, |
721 | | int coeff_shift, int block_width, |
722 | 4.45M | int block_height) { |
723 | 4.45M | if (block_width == 8) { |
724 | 4.25M | filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, |
725 | 4.25M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
726 | 4.25M | block_height, /*enable_primary=*/0, |
727 | 4.25M | /*enable_secondary=*/1); |
728 | 4.25M | } else { |
729 | 199k | filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, |
730 | 199k | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
731 | 199k | block_height, /*enable_primary=*/0, |
732 | 199k | /*enable_secondary=*/1); |
733 | 199k | } |
734 | 4.45M | } Unexecuted instantiation: cdef_filter_8_2_sse4_1 (executed instantiation's per-line counts match the annotated source above)
|
735 | | |
736 | | void SIMD_FUNC(cdef_filter_8_3)(void *dest, int dstride, const uint16_t *in, |
737 | | int pri_strength, int sec_strength, int dir, |
738 | | int pri_damping, int sec_damping, |
739 | | int coeff_shift, int block_width, |
740 | 2.79M | int block_height) { |
741 | 2.79M | (void)pri_strength; |
742 | 2.79M | (void)sec_strength; |
743 | 2.79M | (void)dir; |
744 | 2.79M | (void)pri_damping; |
745 | 2.79M | (void)sec_damping; |
746 | 2.79M | (void)coeff_shift; |
747 | 2.79M | (void)block_width; |
748 | | |
749 | 2.79M | if (block_width == 8) { |
750 | 2.79M | copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height); |
751 | 18.4E | } else { |
752 | 18.4E | copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height); |
753 | 18.4E | } |
754 | 2.79M | } Unexecuted instantiation: cdef_filter_8_3_sse4_1 (executed instantiation's per-line counts match the annotated source above)
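The _0 to _3 suffixes encode which filters run: 0 is primary plus secondary, 1 primary only, 2 secondary only, 3 a plain copy. A plausible reconstruction of how a caller could map strengths to that index; the real dispatch lives elsewhere in libaom and is not shown in this file:

/* Hypothetical mapping from strengths to the variant suffix:
 * both nonzero -> 0, secondary zero -> 1, primary zero -> 2,
 * both zero -> 3 (copy only). */
static int cdef_variant_index(int pri_strength, int sec_strength) {
  return ((pri_strength == 0) << 1) | (sec_strength == 0);
}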
|
755 | | |
756 | | void SIMD_FUNC(cdef_filter_16_0)(void *dest, int dstride, const uint16_t *in, |
757 | | int pri_strength, int sec_strength, int dir, |
758 | | int pri_damping, int sec_damping, |
759 | | int coeff_shift, int block_width, |
760 | 25.0M | int block_height) { |
761 | 25.0M | if (block_width == 8) { |
762 | 4.96M | filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, |
763 | 4.96M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
764 | 4.96M | block_height, /*enable_primary=*/1, |
765 | 4.96M | /*enable_secondary=*/1); |
766 | 20.0M | } else { |
767 | 20.0M | filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, |
768 | 20.0M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
769 | 20.0M | block_height, /*enable_primary=*/1, |
770 | 20.0M | /*enable_secondary=*/1); |
771 | 20.0M | } |
772 | 25.0M | } Unexecuted instantiation: cdef_filter_16_0_sse4_1 (executed instantiation's per-line counts match the annotated source above)
|
773 | | |
774 | | void SIMD_FUNC(cdef_filter_16_1)(void *dest, int dstride, const uint16_t *in, |
775 | | int pri_strength, int sec_strength, int dir, |
776 | | int pri_damping, int sec_damping, |
777 | | int coeff_shift, int block_width, |
778 | 3.22M | int block_height) { |
779 | 3.22M | if (block_width == 8) { |
780 | 1.94M | filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, |
781 | 1.94M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
782 | 1.94M | block_height, /*enable_primary=*/1, |
783 | 1.94M | /*enable_secondary=*/0); |
784 | 1.94M | } else { |
785 | 1.27M | filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, |
786 | 1.27M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
787 | 1.27M | block_height, /*enable_primary=*/1, |
788 | 1.27M | /*enable_secondary=*/0); |
789 | 1.27M | } |
790 | 3.22M | } Unexecuted instantiation: cdef_filter_16_1_sse4_1 (executed instantiation's per-line counts match the annotated source above)
|
791 | | void SIMD_FUNC(cdef_filter_16_2)(void *dest, int dstride, const uint16_t *in, |
792 | | int pri_strength, int sec_strength, int dir, |
793 | | int pri_damping, int sec_damping, |
794 | | int coeff_shift, int block_width, |
795 | 3.63M | int block_height) { |
796 | 3.63M | if (block_width == 8) { |
797 | 3.49M | filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, |
798 | 3.49M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
799 | 3.49M | block_height, /*enable_primary=*/0, |
800 | 3.49M | /*enable_secondary=*/1); |
801 | 3.49M | } else { |
802 | 137k | filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, |
803 | 137k | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
804 | 137k | block_height, /*enable_primary=*/0, |
805 | 137k | /*enable_secondary=*/1); |
806 | 137k | } |
807 | 3.63M | } Unexecuted instantiation: cdef_filter_16_2_sse4_1 (executed instantiation's per-line counts match the annotated source above)
|
808 | | |
809 | | void SIMD_FUNC(cdef_filter_16_3)(void *dest, int dstride, const uint16_t *in, |
810 | | int pri_strength, int sec_strength, int dir, |
811 | | int pri_damping, int sec_damping, |
812 | | int coeff_shift, int block_width, |
813 | 2.43M | int block_height) { |
814 | 2.43M | (void)pri_strength; |
815 | 2.43M | (void)sec_strength; |
816 | 2.43M | (void)dir; |
817 | 2.43M | (void)pri_damping; |
818 | 2.43M | (void)sec_damping; |
819 | 2.43M | (void)coeff_shift; |
820 | 2.43M | (void)block_width; |
821 | 2.43M | if (block_width == 8) { |
822 | 2.43M | copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height); |
823 | 18.4E | } else { |
824 | 18.4E | copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height); |
825 | 18.4E | } |
826 | 2.43M | } Unexecuted instantiation: cdef_filter_16_3_sse4_1 (executed instantiation's per-line counts match the annotated source above)
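The 16-bit variants mirror the 8-bit ones but write uint16_t and take dstride in 16-bit pixels. A hypothetical call of the copy-only variant; as the (void) casts above show, all filter parameters are ignored, so zeros are passed here purely as placeholders:

/* Hypothetical usage sketch of the copy-only highbd variant. */
static void example_copy_16(uint16_t *dst16, const uint16_t *in) {
  cdef_filter_16_3_sse4_1(dst16, /*dstride=*/8, in,
                          /*pri_strength=*/0, /*sec_strength=*/0, /*dir=*/0,
                          /*pri_damping=*/0, /*sec_damping=*/0,
                          /*coeff_shift=*/0, /*block_width=*/8,
                          /*block_height=*/8);
}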
|
827 | | |
828 | | #if CONFIG_AV1_HIGHBITDEPTH |
829 | | void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, |
830 | | const uint16_t *src, int sstride, |
831 | 809k | int width, int height) { |
832 | 809k | int i, j; |
833 | 24.1M | for (i = 0; i < height; i++) { |
834 | 179M | for (j = 0; j < (width & ~0x7); j += 8) { |
835 | 156M | v128 row = v128_load_unaligned(&src[i * sstride + j]); |
836 | 156M | v128_store_unaligned(&dst[i * dstride + j], row); |
837 | 156M | } |
838 | 26.4M | for (; j < width; j++) { |
839 | 3.17M | dst[i * dstride + j] = src[i * sstride + j]; |
840 | 3.17M | } |
841 | 23.3M | } |
842 | 809k | } Unexecuted instantiation: cdef_copy_rect8_16bit_to_16bit_sse4_1 (executed instantiation: cdef_copy_rect8_16bit_to_16bit_avx2; per-line counts match the annotated source above)
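The bound width & ~0x7 rounds the width down to a multiple of 8 so the v128 loop never reads past the end of a row; the scalar loop then finishes the remainder. A quick check of the arithmetic (check_width_mask is an illustrative name):

#include <assert.h>

/* The vector-loop bound is the width rounded down to a multiple of 8. */
static void check_width_mask(void) {
  assert((13 & ~0x7) == 8);   /* pixels 0..7 vectorized, 8..12 in the tail */
  assert((16 & ~0x7) == 16);  /* no scalar tail when width is a multiple of 8 */
}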
|
843 | | #endif // CONFIG_AV1_HIGHBITDEPTH |
844 | | |
845 | | #undef CDEF_INLINE |
846 | | |
847 | | #endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ |