/src/aom/av1/common/cdef_block_simd.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ |
13 | | #define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ |
14 | | |
15 | | #include "config/av1_rtcd.h" |
16 | | |
17 | | #include "av1/common/cdef_block.h" |
18 | | |
/* partial A is a 16-bit vector of the form:
   [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
   [0  y1 y2 y3 y4 y5 y6 y7].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
   and const2. */
static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
                                    v128 const2) {
  v128 tmp;
  /* Reverse partial B so its lanes line up index-for-index with partial A. */
  partialb = v128_shuffle_8(
      partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = partiala;
  partiala = v128_ziplo_16(partialb, partiala);
  partialb = v128_ziphi_16(partialb, tmp);
  /* Square and add the corresponding x and y values: madd on an interleaved
     [y x] pair yields x^2 + y^2 in a 32-bit lane. */
  partiala = v128_madd_s16(partiala, partiala);
  partialb = v128_madd_s16(partialb, partialb);
  /* Multiply by constant (the per-index weights C1..C8). */
  partiala = v128_mullo_s32(partiala, const1);
  partialb = v128_mullo_s32(partialb, const2);
  /* Sum all results. */
  partiala = v128_add_32(partiala, partialb);
  return partiala;
}
45 | | |
46 | 18.9k | static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) { |
47 | 18.9k | v128 t0, t1, t2, t3; |
48 | 18.9k | t0 = v128_ziplo_32(x1, x0); |
49 | 18.9k | t1 = v128_ziplo_32(x3, x2); |
50 | 18.9k | t2 = v128_ziphi_32(x1, x0); |
51 | 18.9k | t3 = v128_ziphi_32(x3, x2); |
52 | 18.9k | x0 = v128_ziplo_64(t1, t0); |
53 | 18.9k | x1 = v128_ziphi_64(t1, t0); |
54 | 18.9k | x2 = v128_ziplo_64(t3, t2); |
55 | 18.9k | x3 = v128_ziphi_64(t3, t2); |
56 | 18.9k | return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3)); |
57 | 18.9k | } Unexecuted instantiation: cdef_block_sse2.c:hsum4 Unexecuted instantiation: cdef_block_ssse3.c:hsum4 Unexecuted instantiation: cdef_block_sse4.c:hsum4 Line | Count | Source | 46 | 18.9k | static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) { | 47 | 18.9k | v128 t0, t1, t2, t3; | 48 | 18.9k | t0 = v128_ziplo_32(x1, x0); | 49 | 18.9k | t1 = v128_ziplo_32(x3, x2); | 50 | 18.9k | t2 = v128_ziphi_32(x1, x0); | 51 | 18.9k | t3 = v128_ziphi_32(x3, x2); | 52 | 18.9k | x0 = v128_ziplo_64(t1, t0); | 53 | 18.9k | x1 = v128_ziphi_64(t1, t0); | 54 | 18.9k | x2 = v128_ziplo_64(t3, t2); | 55 | 18.9k | x3 = v128_ziphi_64(t3, t2); | 56 | 18.9k | return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3)); | 57 | 18.9k | } |
|
58 | | |
/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
   to compute the remaining directions (the caller transposes the input first).
   Each pair of v128_shl_n_byte/v128_shr_n_byte calls with byte shifts summing
   to 16 places a line at a diagonal offset across a conceptual 256-bit
   accumulator whose low half is the "a" partial and high half the "b" partial.
   Writes the four 32-bit costs to tmp_cost1 and also returns them as a
   vector. */
static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
  v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  v128 partial6;
  v128 tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = v128_shl_n_byte(lines[0], 14);
  partial4b = v128_shr_n_byte(lines[0], 2);
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
  /* Directions 5, 6 and 7 advance at most one column per two rows, so the
     sum of each row pair can be accumulated with a single shift. */
  tmp = v128_add_16(lines[0], lines[1]);
  partial5a = v128_shl_n_byte(tmp, 10);
  partial5b = v128_shr_n_byte(tmp, 6);
  partial7a = v128_shl_n_byte(tmp, 4);
  partial7b = v128_shr_n_byte(tmp, 12);
  partial6 = tmp;

  /* Partial sums for lines 2 and 3. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
  tmp = v128_add_16(lines[2], lines[3]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
  partial6 = v128_add_16(partial6, tmp);

  /* Partial sums for lines 4 and 5. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
  tmp = v128_add_16(lines[4], lines[5]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
  partial6 = v128_add_16(partial6, tmp);

  /* Partial sums for lines 6 and 7. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
  partial4a = v128_add_16(partial4a, lines[7]);
  tmp = v128_add_16(lines[6], lines[7]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
  partial6 = v128_add_16(partial6, tmp);

  /* Compute costs in terms of partial sums. The weight tables appear to
     normalize each diagonal by the number of pixels it contains (common
     scale 105 = 840/8) -- NOTE(review): confirm against the scalar
     cdef_find_dir reference implementation. */
  partial4a =
      fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
                       v128_from_32(105, 120, 140, 168));
  partial7a =
      fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
                       v128_from_32(105, 105, 105, 140));
  partial5a =
      fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
                       v128_from_32(105, 105, 105, 140));
  /* Direction 6 is purely horizontal rows: every row sum has 8 entries, so a
     single squared-sum times the uniform weight 105 suffices. */
  partial6 = v128_madd_s16(partial6, partial6);
  partial6 = v128_mullo_s32(partial6, v128_dup_32(105));

  /* Reduce each direction's vector to a single 32-bit cost per lane. */
  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
  v128_store_unaligned(tmp_cost1, partial4a);
  return partial4a;
}
129 | | |
/* transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. The transpose is the classic
   three-stage zip network (16-bit, then 32-bit, then 64-bit interleaves);
   writing the results to res[7]..res[0] provides the line reversal. */
static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
  /* Stage 1: interleave 16-bit lanes of adjacent rows. */
  const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
  const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
  const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
  const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
  const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
  const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
  const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
  const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);

  /* Stage 2: interleave 32-bit lanes of the stage-1 results. */
  const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
  const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
  const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
  const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
  const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
  const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
  const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
  const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);

  /* Stage 3: interleave 64-bit halves; store in reverse row order. Note that
     in == res is safe here since all inputs were consumed in stage 1. */
  res[7] = v128_ziplo_64(tr1_1, tr1_0);
  res[6] = v128_ziphi_64(tr1_1, tr1_0);
  res[5] = v128_ziplo_64(tr1_3, tr1_2);
  res[4] = v128_ziphi_64(tr1_3, tr1_2);
  res[3] = v128_ziplo_64(tr1_5, tr1_4);
  res[2] = v128_ziphi_64(tr1_5, tr1_4);
  res[1] = v128_ziplo_64(tr1_7, tr1_6);
  res[0] = v128_ziphi_64(tr1_7, tr1_6);
}
160 | | |
/* Estimates the dominant edge direction (0..7) of the 8x8 block at img.
   Also writes to *var a measure of directional contrast: the cost difference
   between the best direction and its orthogonal direction, scaled down.
   img points to 16-bit pixels with the given stride; coeff_shift drops the
   extra precision of high-bit-depth input. */
int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
                             int coeff_shift) {
  int i;
  int32_t cost[8];
  int32_t best_cost = 0;
  int best_dir = 0;
  v128 lines[8];
  for (i = 0; i < 8; i++) {
    lines[i] = v128_load_unaligned(&img[i * stride]);
    /* Normalize to 8-bit range and center around zero (subtract 128). */
    lines[i] =
        v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
  }

  /* Compute "mostly vertical" directions. */
  v128 dir47 = compute_directions(lines, cost + 4);

  array_reverse_transpose_8x8(lines, lines);

  /* Compute "mostly horizontal" directions. */
  v128 dir03 = compute_directions(lines, cost);

  /* Horizontal max across all 8 costs: rotate-and-max twice. */
  v128 max = v128_max_s32(dir03, dir47);
  max = v128_max_s32(max, v128_align(max, max, 8));
  max = v128_max_s32(max, v128_align(max, max, 4));
  best_cost = v128_low_u32(max);
  /* Pack the per-direction equality masks down to one byte per direction,
     then use movemask to get a bitmask of directions achieving the max. */
  v128 t =
      v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03));
  best_dir = v128_movemask_8(v128_pack_s16_s8(t, t));
  /* x ^ (x - 1) keeps only the lowest set bit (and the bits below it), so
     the MSB position equals the index of the first matching direction. */
  best_dir = get_msb(best_dir ^ (best_dir - 1));  // Count trailing zeros

  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var = best_cost - cost[(best_dir + 4) & 7];
  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var >>= 10;
  return best_dir;
}
199 | | |
200 | | // Work around compiler out of memory issues with Win32 builds. This issue has |
201 | | // been observed with Visual Studio 2017, 2019, and 2022 (version 17.4). |
202 | | #if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1940 |
203 | | #define CDEF_INLINE static INLINE |
204 | | #else |
205 | | #define CDEF_INLINE SIMD_INLINE |
206 | | #endif |
207 | | |
208 | | // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) |
209 | | CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold, |
210 | 256M | unsigned int adjdamp) { |
211 | 256M | v256 diff = v256_sub_16(a, b); |
212 | 256M | const v256 sign = v256_shr_n_s16(diff, 15); |
213 | 256M | diff = v256_abs_s16(diff); |
214 | 256M | const v256 s = |
215 | 256M | v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp)); |
216 | 256M | return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign); |
217 | 256M | } Unexecuted instantiation: cdef_block_sse2.c:constrain16 Unexecuted instantiation: cdef_block_ssse3.c:constrain16 Unexecuted instantiation: cdef_block_sse4.c:constrain16 cdef_block_avx2.c:constrain16 Line | Count | Source | 210 | 256M | unsigned int adjdamp) { | 211 | 256M | v256 diff = v256_sub_16(a, b); | 212 | 256M | const v256 sign = v256_shr_n_s16(diff, 15); | 213 | 256M | diff = v256_abs_s16(diff); | 214 | 256M | const v256 s = | 215 | 256M | v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp)); | 216 | 256M | return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign); | 217 | 256M | } |
|
218 | | |
219 | | SIMD_INLINE v256 get_max_primary(const int is_lowbd, v256 *tap, v256 max, |
220 | 38.9M | v256 cdef_large_value_mask) { |
221 | 38.9M | if (is_lowbd) { |
222 | 28.8M | v256 max_u8; |
223 | 28.8M | max_u8 = tap[0]; |
224 | 28.8M | max_u8 = v256_max_u8(max_u8, tap[1]); |
225 | 28.8M | max_u8 = v256_max_u8(max_u8, tap[2]); |
226 | 28.8M | max_u8 = v256_max_u8(max_u8, tap[3]); |
227 | | /* The source is 16 bits, however, we only really care about the lower |
228 | | 8 bits. The upper 8 bits contain the "large" flag. After the final |
229 | | primary max has been calculated, zero out the upper 8 bits. Use this |
230 | | to find the "16 bit" max. */ |
231 | 28.8M | max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask)); |
232 | 28.8M | } else { |
233 | | /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ |
234 | 10.1M | max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask)); |
235 | 10.1M | max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask)); |
236 | 10.1M | max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask)); |
237 | 10.1M | max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask)); |
238 | 10.1M | } |
239 | 38.9M | return max; |
240 | 38.9M | } Unexecuted instantiation: cdef_block_sse2.c:get_max_primary Unexecuted instantiation: cdef_block_ssse3.c:get_max_primary Unexecuted instantiation: cdef_block_sse4.c:get_max_primary cdef_block_avx2.c:get_max_primary Line | Count | Source | 220 | 38.9M | v256 cdef_large_value_mask) { | 221 | 38.9M | if (is_lowbd) { | 222 | 28.8M | v256 max_u8; | 223 | 28.8M | max_u8 = tap[0]; | 224 | 28.8M | max_u8 = v256_max_u8(max_u8, tap[1]); | 225 | 28.8M | max_u8 = v256_max_u8(max_u8, tap[2]); | 226 | 28.8M | max_u8 = v256_max_u8(max_u8, tap[3]); | 227 | | /* The source is 16 bits, however, we only really care about the lower | 228 | | 8 bits. The upper 8 bits contain the "large" flag. After the final | 229 | | primary max has been calculated, zero out the upper 8 bits. Use this | 230 | | to find the "16 bit" max. */ | 231 | 28.8M | max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask)); | 232 | 28.8M | } else { | 233 | | /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ | 234 | 10.1M | max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask)); | 235 | 10.1M | max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask)); | 236 | 10.1M | max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask)); | 237 | 10.1M | max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask)); | 238 | 10.1M | } | 239 | 38.9M | return max; | 240 | 38.9M | } |
|
241 | | |
242 | | SIMD_INLINE v256 get_max_secondary(const int is_lowbd, v256 *tap, v256 max, |
243 | 41.1M | v256 cdef_large_value_mask) { |
244 | 41.1M | if (is_lowbd) { |
245 | 30.3M | v256 max_u8; |
246 | 30.3M | max_u8 = tap[0]; |
247 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[1]); |
248 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[2]); |
249 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[3]); |
250 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[4]); |
251 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[5]); |
252 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[6]); |
253 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[7]); |
254 | | /* The source is 16 bits, however, we only really care about the lower |
255 | | 8 bits. The upper 8 bits contain the "large" flag. After the final |
256 | | primary max has been calculated, zero out the upper 8 bits. Use this |
257 | | to find the "16 bit" max. */ |
258 | 30.3M | max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask)); |
259 | 30.3M | } else { |
260 | | /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ |
261 | 10.7M | max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask)); |
262 | 10.7M | max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask)); |
263 | 10.7M | max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask)); |
264 | 10.7M | max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask)); |
265 | 10.7M | max = v256_max_s16(max, v256_and(tap[4], cdef_large_value_mask)); |
266 | 10.7M | max = v256_max_s16(max, v256_and(tap[5], cdef_large_value_mask)); |
267 | 10.7M | max = v256_max_s16(max, v256_and(tap[6], cdef_large_value_mask)); |
268 | 10.7M | max = v256_max_s16(max, v256_and(tap[7], cdef_large_value_mask)); |
269 | 10.7M | } |
270 | 41.1M | return max; |
271 | 41.1M | } Unexecuted instantiation: cdef_block_sse2.c:get_max_secondary Unexecuted instantiation: cdef_block_ssse3.c:get_max_secondary Unexecuted instantiation: cdef_block_sse4.c:get_max_secondary cdef_block_avx2.c:get_max_secondary Line | Count | Source | 243 | 41.1M | v256 cdef_large_value_mask) { | 244 | 41.1M | if (is_lowbd) { | 245 | 30.3M | v256 max_u8; | 246 | 30.3M | max_u8 = tap[0]; | 247 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[1]); | 248 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[2]); | 249 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[3]); | 250 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[4]); | 251 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[5]); | 252 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[6]); | 253 | 30.3M | max_u8 = v256_max_u8(max_u8, tap[7]); | 254 | | /* The source is 16 bits, however, we only really care about the lower | 255 | | 8 bits. The upper 8 bits contain the "large" flag. After the final | 256 | | primary max has been calculated, zero out the upper 8 bits. Use this | 257 | | to find the "16 bit" max. */ | 258 | 30.3M | max = v256_max_s16(max, v256_and(max_u8, cdef_large_value_mask)); | 259 | 30.3M | } else { | 260 | | /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ | 261 | 10.7M | max = v256_max_s16(max, v256_and(tap[0], cdef_large_value_mask)); | 262 | 10.7M | max = v256_max_s16(max, v256_and(tap[1], cdef_large_value_mask)); | 263 | 10.7M | max = v256_max_s16(max, v256_and(tap[2], cdef_large_value_mask)); | 264 | 10.7M | max = v256_max_s16(max, v256_and(tap[3], cdef_large_value_mask)); | 265 | 10.7M | max = v256_max_s16(max, v256_and(tap[4], cdef_large_value_mask)); | 266 | 10.7M | max = v256_max_s16(max, v256_and(tap[5], cdef_large_value_mask)); | 267 | 10.7M | max = v256_max_s16(max, v256_and(tap[6], cdef_large_value_mask)); | 268 | 10.7M | max = v256_max_s16(max, v256_and(tap[7], cdef_large_value_mask)); | 269 | 10.7M | } | 270 | 41.1M | return max; | 271 | 41.1M | } |
|
272 | | |
273 | | // MSVC takes far too much time optimizing these. |
274 | | // https://bugs.chromium.org/p/aomedia/issues/detail?id=3395 |
275 | | #if defined(_MSC_VER) && !defined(__clang__) |
276 | | #pragma optimize("", off) |
277 | | #endif |
278 | | |
279 | | CDEF_INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride, |
280 | | const uint16_t *in, int pri_strength, |
281 | | int sec_strength, int dir, int pri_damping, |
282 | | int sec_damping, int coeff_shift, int height, |
283 | 38.4M | int enable_primary, int enable_secondary) { |
284 | 38.4M | uint8_t *dst8 = (uint8_t *)dest; |
285 | 38.4M | uint16_t *dst16 = (uint16_t *)dest; |
286 | 38.4M | const int clipping_required = enable_primary && enable_secondary; |
287 | 38.4M | v256 p0, p1, p2, p3; |
288 | 38.4M | v256 sum, row, res; |
289 | 38.4M | v256 max, min; |
290 | 38.4M | const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE); |
291 | 38.4M | const int po1 = cdef_directions[dir][0]; |
292 | 38.4M | const int po2 = cdef_directions[dir][1]; |
293 | 38.4M | const int s1o1 = cdef_directions[dir + 2][0]; |
294 | 38.4M | const int s1o2 = cdef_directions[dir + 2][1]; |
295 | 38.4M | const int s2o1 = cdef_directions[dir - 2][0]; |
296 | 38.4M | const int s2o2 = cdef_directions[dir - 2][1]; |
297 | 38.4M | const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; |
298 | 38.4M | const int *sec_taps = cdef_sec_taps; |
299 | 38.4M | int i; |
300 | | |
301 | 39.3M | if (enable_primary && pri_strength) |
302 | 39.8M | pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); |
303 | 39.6M | if (enable_secondary && sec_strength) |
304 | 40.0M | sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); |
305 | | |
306 | 87.1M | for (i = 0; i < height; i += 4) { |
307 | 48.6M | sum = v256_zero(); |
308 | 48.6M | row = v256_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]), |
309 | 48.6M | v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]), |
310 | 48.6M | v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), |
311 | 48.6M | v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); |
312 | 48.6M | max = min = row; |
313 | | |
314 | 48.6M | if (enable_primary) { |
315 | 43.2M | v256 tap[4]; |
316 | | // Primary near taps |
317 | 43.2M | tap[0] = |
318 | 43.2M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po1]), |
319 | 43.2M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]), |
320 | 43.2M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]), |
321 | 43.2M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1])); |
322 | 43.2M | p0 = constrain16(tap[0], row, pri_strength, pri_damping); |
323 | 43.2M | tap[1] = |
324 | 43.2M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po1]), |
325 | 43.2M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]), |
326 | 43.2M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]), |
327 | 43.2M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1])); |
328 | 43.2M | p1 = constrain16(tap[1], row, pri_strength, pri_damping); |
329 | | |
330 | | // sum += pri_taps[0] * (p0 + p1) |
331 | 43.2M | sum = v256_add_16( |
332 | 43.2M | sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); |
333 | | |
334 | | // Primary far taps |
335 | 43.2M | tap[2] = |
336 | 43.2M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po2]), |
337 | 43.2M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]), |
338 | 43.2M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]), |
339 | 43.2M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2])); |
340 | 43.2M | p0 = constrain16(tap[2], row, pri_strength, pri_damping); |
341 | 43.2M | tap[3] = |
342 | 43.2M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po2]), |
343 | 43.2M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]), |
344 | 43.2M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]), |
345 | 43.2M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2])); |
346 | 43.2M | p1 = constrain16(tap[3], row, pri_strength, pri_damping); |
347 | | |
348 | | // sum += pri_taps[1] * (p0 + p1) |
349 | 43.2M | sum = v256_add_16( |
350 | 43.2M | sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); |
351 | 43.2M | if (clipping_required) { |
352 | 28.6M | max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); |
353 | | |
354 | 28.6M | min = v256_min_s16(min, tap[0]); |
355 | 28.6M | min = v256_min_s16(min, tap[1]); |
356 | 28.6M | min = v256_min_s16(min, tap[2]); |
357 | 28.6M | min = v256_min_s16(min, tap[3]); |
358 | 28.6M | } |
359 | 43.2M | } |
360 | | |
361 | 48.6M | if (enable_secondary) { |
362 | 29.1M | v256 tap[8]; |
363 | | // Secondary near taps |
364 | 29.1M | tap[0] = |
365 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o1]), |
366 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]), |
367 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]), |
368 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1])); |
369 | 29.1M | p0 = constrain16(tap[0], row, sec_strength, sec_damping); |
370 | 29.1M | tap[1] = |
371 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o1]), |
372 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]), |
373 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]), |
374 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1])); |
375 | 29.1M | p1 = constrain16(tap[1], row, sec_strength, sec_damping); |
376 | 29.1M | tap[2] = |
377 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o1]), |
378 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]), |
379 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]), |
380 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1])); |
381 | 29.1M | p2 = constrain16(tap[2], row, sec_strength, sec_damping); |
382 | 29.1M | tap[3] = |
383 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o1]), |
384 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]), |
385 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]), |
386 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1])); |
387 | 29.1M | p3 = constrain16(tap[3], row, sec_strength, sec_damping); |
388 | | |
389 | | // sum += sec_taps[0] * (p0 + p1 + p2 + p3) |
390 | 29.1M | sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), |
391 | 29.1M | v256_add_16(v256_add_16(p0, p1), |
392 | 29.1M | v256_add_16(p2, p3)))); |
393 | | |
394 | | // Secondary far taps |
395 | 29.1M | tap[4] = |
396 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o2]), |
397 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]), |
398 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]), |
399 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2])); |
400 | 29.1M | p0 = constrain16(tap[4], row, sec_strength, sec_damping); |
401 | 29.1M | tap[5] = |
402 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o2]), |
403 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]), |
404 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]), |
405 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2])); |
406 | 29.1M | p1 = constrain16(tap[5], row, sec_strength, sec_damping); |
407 | 29.1M | tap[6] = |
408 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o2]), |
409 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]), |
410 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]), |
411 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2])); |
412 | 29.1M | p2 = constrain16(tap[6], row, sec_strength, sec_damping); |
413 | 29.1M | tap[7] = |
414 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o2]), |
415 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]), |
416 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]), |
417 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2])); |
418 | 29.1M | p3 = constrain16(tap[7], row, sec_strength, sec_damping); |
419 | | |
420 | | // sum += sec_taps[1] * (p0 + p1 + p2 + p3) |
421 | 29.1M | sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), |
422 | 29.1M | v256_add_16(v256_add_16(p0, p1), |
423 | 29.1M | v256_add_16(p2, p3)))); |
424 | | |
425 | 29.1M | if (clipping_required) { |
426 | 28.7M | max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); |
427 | | |
428 | 28.7M | min = v256_min_s16(min, tap[0]); |
429 | 28.7M | min = v256_min_s16(min, tap[1]); |
430 | 28.7M | min = v256_min_s16(min, tap[2]); |
431 | 28.7M | min = v256_min_s16(min, tap[3]); |
432 | 28.7M | min = v256_min_s16(min, tap[4]); |
433 | 28.7M | min = v256_min_s16(min, tap[5]); |
434 | 28.7M | min = v256_min_s16(min, tap[6]); |
435 | 28.7M | min = v256_min_s16(min, tap[7]); |
436 | 28.7M | } |
437 | 29.1M | } |
438 | | |
439 | | // res = row + ((sum - (sum < 0) + 8) >> 4) |
440 | 48.6M | sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); |
441 | 48.6M | res = v256_add_16(sum, v256_dup_16(8)); |
442 | 48.6M | res = v256_shr_n_s16(res, 4); |
443 | 48.6M | res = v256_add_16(row, res); |
444 | 48.6M | if (clipping_required) { |
445 | 28.7M | res = v256_min_s16(v256_max_s16(res, min), max); |
446 | 28.7M | } |
447 | | |
448 | 48.6M | if (is_lowbd) { |
449 | 23.4M | const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res)); |
450 | 23.4M | u32_store_aligned(&dst8[(i + 0) * dstride], |
451 | 23.4M | v64_high_u32(v128_high_v64(res_128))); |
452 | 23.4M | u32_store_aligned(&dst8[(i + 1) * dstride], |
453 | 23.4M | v64_low_u32(v128_high_v64(res_128))); |
454 | 23.4M | u32_store_aligned(&dst8[(i + 2) * dstride], |
455 | 23.4M | v64_high_u32(v128_low_v64(res_128))); |
456 | 23.4M | u32_store_aligned(&dst8[(i + 3) * dstride], |
457 | 23.4M | v64_low_u32(v128_low_v64(res_128))); |
458 | 25.2M | } else { |
459 | 25.2M | v64_store_aligned(&dst16[(i + 0) * dstride], |
460 | 25.2M | v128_high_v64(v256_high_v128(res))); |
461 | 25.2M | v64_store_aligned(&dst16[(i + 1) * dstride], |
462 | 25.2M | v128_low_v64(v256_high_v128(res))); |
463 | 25.2M | v64_store_aligned(&dst16[(i + 2) * dstride], |
464 | 25.2M | v128_high_v64(v256_low_v128(res))); |
465 | 25.2M | v64_store_aligned(&dst16[(i + 3) * dstride], |
466 | 25.2M | v128_low_v64(v256_low_v128(res))); |
467 | 25.2M | } |
468 | 48.6M | } |
469 | 38.4M | } Unexecuted instantiation: cdef_block_sse2.c:filter_block_4x4 Unexecuted instantiation: cdef_block_ssse3.c:filter_block_4x4 Unexecuted instantiation: cdef_block_sse4.c:filter_block_4x4 cdef_block_avx2.c:filter_block_4x4 Line | Count | Source | 283 | 38.4M | int enable_primary, int enable_secondary) { | 284 | 38.4M | uint8_t *dst8 = (uint8_t *)dest; | 285 | 38.4M | uint16_t *dst16 = (uint16_t *)dest; | 286 | 38.4M | const int clipping_required = enable_primary && enable_secondary; | 287 | 38.4M | v256 p0, p1, p2, p3; | 288 | 38.4M | v256 sum, row, res; | 289 | 38.4M | v256 max, min; | 290 | 38.4M | const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE); | 291 | 38.4M | const int po1 = cdef_directions[dir][0]; | 292 | 38.4M | const int po2 = cdef_directions[dir][1]; | 293 | 38.4M | const int s1o1 = cdef_directions[dir + 2][0]; | 294 | 38.4M | const int s1o2 = cdef_directions[dir + 2][1]; | 295 | 38.4M | const int s2o1 = cdef_directions[dir - 2][0]; | 296 | 38.4M | const int s2o2 = cdef_directions[dir - 2][1]; | 297 | 38.4M | const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; | 298 | 38.4M | const int *sec_taps = cdef_sec_taps; | 299 | 38.4M | int i; | 300 | | | 301 | 39.3M | if (enable_primary && pri_strength) | 302 | 39.8M | pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); | 303 | 39.6M | if (enable_secondary && sec_strength) | 304 | 40.0M | sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); | 305 | | | 306 | 87.1M | for (i = 0; i < height; i += 4) { | 307 | 48.6M | sum = v256_zero(); | 308 | 48.6M | row = v256_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]), | 309 | 48.6M | v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]), | 310 | 48.6M | v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), | 311 | 48.6M | v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); | 312 | 48.6M | max = min = row; | 313 | | | 314 | 48.6M | if (enable_primary) { | 315 | 43.2M | v256 tap[4]; | 316 | | // Primary near 
taps | 317 | 43.2M | tap[0] = | 318 | 43.2M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po1]), | 319 | 43.2M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]), | 320 | 43.2M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]), | 321 | 43.2M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1])); | 322 | 43.2M | p0 = constrain16(tap[0], row, pri_strength, pri_damping); | 323 | 43.2M | tap[1] = | 324 | 43.2M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po1]), | 325 | 43.2M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]), | 326 | 43.2M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]), | 327 | 43.2M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1])); | 328 | 43.2M | p1 = constrain16(tap[1], row, pri_strength, pri_damping); | 329 | | | 330 | | // sum += pri_taps[0] * (p0 + p1) | 331 | 43.2M | sum = v256_add_16( | 332 | 43.2M | sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); | 333 | | | 334 | | // Primary far taps | 335 | 43.2M | tap[2] = | 336 | 43.2M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + po2]), | 337 | 43.2M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]), | 338 | 43.2M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]), | 339 | 43.2M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2])); | 340 | 43.2M | p0 = constrain16(tap[2], row, pri_strength, pri_damping); | 341 | 43.2M | tap[3] = | 342 | 43.2M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - po2]), | 343 | 43.2M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]), | 344 | 43.2M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]), | 345 | 43.2M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2])); | 346 | 43.2M | p1 = constrain16(tap[3], row, pri_strength, pri_damping); | 347 | | | 348 | | // sum += pri_taps[1] * (p0 + p1) | 349 | 43.2M | sum = v256_add_16( | 350 | 43.2M | sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); | 351 | 43.2M 
| if (clipping_required) { | 352 | 28.6M | max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); | 353 | | | 354 | 28.6M | min = v256_min_s16(min, tap[0]); | 355 | 28.6M | min = v256_min_s16(min, tap[1]); | 356 | 28.6M | min = v256_min_s16(min, tap[2]); | 357 | 28.6M | min = v256_min_s16(min, tap[3]); | 358 | 28.6M | } | 359 | 43.2M | } | 360 | | | 361 | 48.6M | if (enable_secondary) { | 362 | 29.1M | v256 tap[8]; | 363 | | // Secondary near taps | 364 | 29.1M | tap[0] = | 365 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o1]), | 366 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]), | 367 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]), | 368 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1])); | 369 | 29.1M | p0 = constrain16(tap[0], row, sec_strength, sec_damping); | 370 | 29.1M | tap[1] = | 371 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o1]), | 372 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]), | 373 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]), | 374 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1])); | 375 | 29.1M | p1 = constrain16(tap[1], row, sec_strength, sec_damping); | 376 | 29.1M | tap[2] = | 377 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o1]), | 378 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]), | 379 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]), | 380 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1])); | 381 | 29.1M | p2 = constrain16(tap[2], row, sec_strength, sec_damping); | 382 | 29.1M | tap[3] = | 383 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o1]), | 384 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]), | 385 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]), | 386 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1])); | 
387 | 29.1M | p3 = constrain16(tap[3], row, sec_strength, sec_damping); | 388 | | | 389 | | // sum += sec_taps[0] * (p0 + p1 + p2 + p3) | 390 | 29.1M | sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), | 391 | 29.1M | v256_add_16(v256_add_16(p0, p1), | 392 | 29.1M | v256_add_16(p2, p3)))); | 393 | | | 394 | | // Secondary far taps | 395 | 29.1M | tap[4] = | 396 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s1o2]), | 397 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]), | 398 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]), | 399 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2])); | 400 | 29.1M | p0 = constrain16(tap[4], row, sec_strength, sec_damping); | 401 | 29.1M | tap[5] = | 402 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s1o2]), | 403 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]), | 404 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]), | 405 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2])); | 406 | 29.1M | p1 = constrain16(tap[5], row, sec_strength, sec_damping); | 407 | 29.1M | tap[6] = | 408 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE + s2o2]), | 409 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]), | 410 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]), | 411 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2])); | 412 | 29.1M | p2 = constrain16(tap[6], row, sec_strength, sec_damping); | 413 | 29.1M | tap[7] = | 414 | 29.1M | v256_from_v64(v64_load_unaligned(&in[(i + 0) * CDEF_BSTRIDE - s2o2]), | 415 | 29.1M | v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]), | 416 | 29.1M | v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]), | 417 | 29.1M | v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2])); | 418 | 29.1M | p3 = constrain16(tap[7], row, sec_strength, sec_damping); | 419 | | | 420 | | // sum += sec_taps[1] * 
(p0 + p1 + p2 + p3) | 421 | 29.1M | sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), | 422 | 29.1M | v256_add_16(v256_add_16(p0, p1), | 423 | 29.1M | v256_add_16(p2, p3)))); | 424 | | | 425 | 29.1M | if (clipping_required) { | 426 | 28.7M | max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); | 427 | | | 428 | 28.7M | min = v256_min_s16(min, tap[0]); | 429 | 28.7M | min = v256_min_s16(min, tap[1]); | 430 | 28.7M | min = v256_min_s16(min, tap[2]); | 431 | 28.7M | min = v256_min_s16(min, tap[3]); | 432 | 28.7M | min = v256_min_s16(min, tap[4]); | 433 | 28.7M | min = v256_min_s16(min, tap[5]); | 434 | 28.7M | min = v256_min_s16(min, tap[6]); | 435 | 28.7M | min = v256_min_s16(min, tap[7]); | 436 | 28.7M | } | 437 | 29.1M | } | 438 | | | 439 | | // res = row + ((sum - (sum < 0) + 8) >> 4) | 440 | 48.6M | sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); | 441 | 48.6M | res = v256_add_16(sum, v256_dup_16(8)); | 442 | 48.6M | res = v256_shr_n_s16(res, 4); | 443 | 48.6M | res = v256_add_16(row, res); | 444 | 48.6M | if (clipping_required) { | 445 | 28.7M | res = v256_min_s16(v256_max_s16(res, min), max); | 446 | 28.7M | } | 447 | | | 448 | 48.6M | if (is_lowbd) { | 449 | 23.4M | const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res)); | 450 | 23.4M | u32_store_aligned(&dst8[(i + 0) * dstride], | 451 | 23.4M | v64_high_u32(v128_high_v64(res_128))); | 452 | 23.4M | u32_store_aligned(&dst8[(i + 1) * dstride], | 453 | 23.4M | v64_low_u32(v128_high_v64(res_128))); | 454 | 23.4M | u32_store_aligned(&dst8[(i + 2) * dstride], | 455 | 23.4M | v64_high_u32(v128_low_v64(res_128))); | 456 | 23.4M | u32_store_aligned(&dst8[(i + 3) * dstride], | 457 | 23.4M | v64_low_u32(v128_low_v64(res_128))); | 458 | 25.2M | } else { | 459 | 25.2M | v64_store_aligned(&dst16[(i + 0) * dstride], | 460 | 25.2M | v128_high_v64(v256_high_v128(res))); | 461 | 25.2M | v64_store_aligned(&dst16[(i + 1) * dstride], | 462 | 25.2M | 
v128_low_v64(v256_high_v128(res))); | 463 | 25.2M | v64_store_aligned(&dst16[(i + 2) * dstride], | 464 | 25.2M | v128_high_v64(v256_low_v128(res))); | 465 | 25.2M | v64_store_aligned(&dst16[(i + 3) * dstride], | 466 | 25.2M | v128_low_v64(v256_low_v128(res))); | 467 | 25.2M | } | 468 | 48.6M | } | 469 | 38.4M | } |
|
470 | | |
471 | | CDEF_INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride, |
472 | | const uint16_t *in, int pri_strength, |
473 | | int sec_strength, int dir, int pri_damping, |
474 | | int sec_damping, int coeff_shift, int height, |
475 | 18.8M | int enable_primary, int enable_secondary) { |
476 | 18.8M | uint8_t *dst8 = (uint8_t *)dest; |
477 | 18.8M | uint16_t *dst16 = (uint16_t *)dest; |
478 | 18.8M | const int clipping_required = enable_primary && enable_secondary; |
479 | 18.8M | int i; |
480 | 18.8M | v256 sum, p0, p1, p2, p3, row, res; |
481 | 18.8M | const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE); |
482 | 18.8M | v256 max, min; |
483 | 18.8M | const int po1 = cdef_directions[dir][0]; |
484 | 18.8M | const int po2 = cdef_directions[dir][1]; |
485 | 18.8M | const int s1o1 = cdef_directions[dir + 2][0]; |
486 | 18.8M | const int s1o2 = cdef_directions[dir + 2][1]; |
487 | 18.8M | const int s2o1 = cdef_directions[dir - 2][0]; |
488 | 18.8M | const int s2o2 = cdef_directions[dir - 2][1]; |
489 | 18.8M | const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; |
490 | 18.8M | const int *sec_taps = cdef_sec_taps; |
491 | | |
492 | 18.8M | if (enable_primary && pri_strength) |
493 | 13.4M | pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); |
494 | 18.8M | if (enable_secondary && sec_strength) |
495 | 15.9M | sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); |
496 | | |
497 | 72.1M | for (i = 0; i < height; i += 2) { |
498 | 53.2M | v256 tap[8]; |
499 | 53.2M | sum = v256_zero(); |
500 | 53.2M | row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]), |
501 | 53.2M | v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); |
502 | | |
503 | 53.2M | min = max = row; |
504 | 53.2M | if (enable_primary) { |
505 | | // Primary near taps |
506 | 36.6M | tap[0] = v256_from_v128( |
507 | 36.6M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), |
508 | 36.6M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); |
509 | 36.6M | tap[1] = v256_from_v128( |
510 | 36.6M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), |
511 | 36.6M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); |
512 | 36.6M | p0 = constrain16(tap[0], row, pri_strength, pri_damping); |
513 | 36.6M | p1 = constrain16(tap[1], row, pri_strength, pri_damping); |
514 | | |
515 | | // sum += pri_taps[0] * (p0 + p1) |
516 | 36.6M | sum = v256_add_16( |
517 | 36.6M | sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); |
518 | | |
519 | | // Primary far taps |
520 | 36.6M | tap[2] = v256_from_v128( |
521 | 36.6M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), |
522 | 36.6M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); |
523 | 36.6M | tap[3] = v256_from_v128( |
524 | 36.6M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), |
525 | 36.6M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); |
526 | 36.6M | p0 = constrain16(tap[2], row, pri_strength, pri_damping); |
527 | 36.6M | p1 = constrain16(tap[3], row, pri_strength, pri_damping); |
528 | | |
529 | | // sum += pri_taps[1] * (p0 + p1) |
530 | 36.6M | sum = v256_add_16( |
531 | 36.6M | sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); |
532 | | |
533 | 36.6M | if (clipping_required) { |
534 | 25.7M | max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); |
535 | | |
536 | 25.7M | min = v256_min_s16(min, tap[0]); |
537 | 25.7M | min = v256_min_s16(min, tap[1]); |
538 | 25.7M | min = v256_min_s16(min, tap[2]); |
539 | 25.7M | min = v256_min_s16(min, tap[3]); |
540 | 25.7M | } |
541 | | // End primary |
542 | 36.6M | } |
543 | | |
544 | 53.2M | if (enable_secondary) { |
545 | | // Secondary near taps |
546 | 43.0M | tap[0] = v256_from_v128( |
547 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), |
548 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); |
549 | 43.0M | tap[1] = v256_from_v128( |
550 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), |
551 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); |
552 | 43.0M | tap[2] = v256_from_v128( |
553 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), |
554 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1])); |
555 | 43.0M | tap[3] = v256_from_v128( |
556 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), |
557 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); |
558 | 43.0M | p0 = constrain16(tap[0], row, sec_strength, sec_damping); |
559 | 43.0M | p1 = constrain16(tap[1], row, sec_strength, sec_damping); |
560 | 43.0M | p2 = constrain16(tap[2], row, sec_strength, sec_damping); |
561 | 43.0M | p3 = constrain16(tap[3], row, sec_strength, sec_damping); |
562 | | |
563 | | // sum += sec_taps[0] * (p0 + p1 + p2 + p3) |
564 | 43.0M | sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), |
565 | 43.0M | v256_add_16(v256_add_16(p0, p1), |
566 | 43.0M | v256_add_16(p2, p3)))); |
567 | | |
568 | | // Secondary far taps |
569 | 43.0M | tap[4] = v256_from_v128( |
570 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), |
571 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); |
572 | 43.0M | tap[5] = v256_from_v128( |
573 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), |
574 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); |
575 | 43.0M | tap[6] = v256_from_v128( |
576 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), |
577 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); |
578 | 43.0M | tap[7] = v256_from_v128( |
579 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), |
580 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); |
581 | 43.0M | p0 = constrain16(tap[4], row, sec_strength, sec_damping); |
582 | 43.0M | p1 = constrain16(tap[5], row, sec_strength, sec_damping); |
583 | 43.0M | p2 = constrain16(tap[6], row, sec_strength, sec_damping); |
584 | 43.0M | p3 = constrain16(tap[7], row, sec_strength, sec_damping); |
585 | | |
586 | | // sum += sec_taps[1] * (p0 + p1 + p2 + p3) |
587 | 43.0M | sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), |
588 | 43.0M | v256_add_16(v256_add_16(p0, p1), |
589 | 43.0M | v256_add_16(p2, p3)))); |
590 | | |
591 | 43.0M | if (clipping_required) { |
592 | 27.2M | max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); |
593 | | |
594 | 27.2M | min = v256_min_s16(min, tap[0]); |
595 | 27.2M | min = v256_min_s16(min, tap[1]); |
596 | 27.2M | min = v256_min_s16(min, tap[2]); |
597 | 27.2M | min = v256_min_s16(min, tap[3]); |
598 | 27.2M | min = v256_min_s16(min, tap[4]); |
599 | 27.2M | min = v256_min_s16(min, tap[5]); |
600 | 27.2M | min = v256_min_s16(min, tap[6]); |
601 | 27.2M | min = v256_min_s16(min, tap[7]); |
602 | 27.2M | } |
603 | | // End secondary |
604 | 43.0M | } |
605 | | |
606 | | // res = row + ((sum - (sum < 0) + 8) >> 4) |
607 | 53.2M | sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); |
608 | 53.2M | res = v256_add_16(sum, v256_dup_16(8)); |
609 | 53.2M | res = v256_shr_n_s16(res, 4); |
610 | 53.2M | res = v256_add_16(row, res); |
611 | 53.2M | if (clipping_required) { |
612 | 27.2M | res = v256_min_s16(v256_max_s16(res, min), max); |
613 | 27.2M | } |
614 | | |
615 | 53.2M | if (is_lowbd) { |
616 | 35.8M | const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res)); |
617 | 35.8M | v64_store_aligned(&dst8[i * dstride], v128_high_v64(res_128)); |
618 | 35.8M | v64_store_aligned(&dst8[(i + 1) * dstride], v128_low_v64(res_128)); |
619 | 35.8M | } else { |
620 | 17.4M | v128_store_unaligned(&dst16[i * dstride], v256_high_v128(res)); |
621 | 17.4M | v128_store_unaligned(&dst16[(i + 1) * dstride], v256_low_v128(res)); |
622 | 17.4M | } |
623 | 53.2M | } |
624 | 18.8M | } Unexecuted instantiation: cdef_block_sse2.c:filter_block_8x8 Unexecuted instantiation: cdef_block_ssse3.c:filter_block_8x8 Unexecuted instantiation: cdef_block_sse4.c:filter_block_8x8 cdef_block_avx2.c:filter_block_8x8 Line | Count | Source | 475 | 18.8M | int enable_primary, int enable_secondary) { | 476 | 18.8M | uint8_t *dst8 = (uint8_t *)dest; | 477 | 18.8M | uint16_t *dst16 = (uint16_t *)dest; | 478 | 18.8M | const int clipping_required = enable_primary && enable_secondary; | 479 | 18.8M | int i; | 480 | 18.8M | v256 sum, p0, p1, p2, p3, row, res; | 481 | 18.8M | const v256 cdef_large_value_mask = v256_dup_16((uint16_t)~CDEF_VERY_LARGE); | 482 | 18.8M | v256 max, min; | 483 | 18.8M | const int po1 = cdef_directions[dir][0]; | 484 | 18.8M | const int po2 = cdef_directions[dir][1]; | 485 | 18.8M | const int s1o1 = cdef_directions[dir + 2][0]; | 486 | 18.8M | const int s1o2 = cdef_directions[dir + 2][1]; | 487 | 18.8M | const int s2o1 = cdef_directions[dir - 2][0]; | 488 | 18.8M | const int s2o2 = cdef_directions[dir - 2][1]; | 489 | 18.8M | const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; | 490 | 18.8M | const int *sec_taps = cdef_sec_taps; | 491 | | | 492 | 18.8M | if (enable_primary && pri_strength) | 493 | 13.4M | pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); | 494 | 18.8M | if (enable_secondary && sec_strength) | 495 | 15.9M | sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); | 496 | | | 497 | 72.1M | for (i = 0; i < height; i += 2) { | 498 | 53.2M | v256 tap[8]; | 499 | 53.2M | sum = v256_zero(); | 500 | 53.2M | row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]), | 501 | 53.2M | v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); | 502 | | | 503 | 53.2M | min = max = row; | 504 | 53.2M | if (enable_primary) { | 505 | | // Primary near taps | 506 | 36.6M | tap[0] = v256_from_v128( | 507 | 36.6M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), | 508 | 36.6M | 
v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); | 509 | 36.6M | tap[1] = v256_from_v128( | 510 | 36.6M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), | 511 | 36.6M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); | 512 | 36.6M | p0 = constrain16(tap[0], row, pri_strength, pri_damping); | 513 | 36.6M | p1 = constrain16(tap[1], row, pri_strength, pri_damping); | 514 | | | 515 | | // sum += pri_taps[0] * (p0 + p1) | 516 | 36.6M | sum = v256_add_16( | 517 | 36.6M | sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); | 518 | | | 519 | | // Primary far taps | 520 | 36.6M | tap[2] = v256_from_v128( | 521 | 36.6M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), | 522 | 36.6M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); | 523 | 36.6M | tap[3] = v256_from_v128( | 524 | 36.6M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), | 525 | 36.6M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); | 526 | 36.6M | p0 = constrain16(tap[2], row, pri_strength, pri_damping); | 527 | 36.6M | p1 = constrain16(tap[3], row, pri_strength, pri_damping); | 528 | | | 529 | | // sum += pri_taps[1] * (p0 + p1) | 530 | 36.6M | sum = v256_add_16( | 531 | 36.6M | sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); | 532 | | | 533 | 36.6M | if (clipping_required) { | 534 | 25.7M | max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); | 535 | | | 536 | 25.7M | min = v256_min_s16(min, tap[0]); | 537 | 25.7M | min = v256_min_s16(min, tap[1]); | 538 | 25.7M | min = v256_min_s16(min, tap[2]); | 539 | 25.7M | min = v256_min_s16(min, tap[3]); | 540 | 25.7M | } | 541 | | // End primary | 542 | 36.6M | } | 543 | | | 544 | 53.2M | if (enable_secondary) { | 545 | | // Secondary near taps | 546 | 43.0M | tap[0] = v256_from_v128( | 547 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), | 548 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); | 549 | 43.0M | tap[1] = v256_from_v128( | 550 | 
43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), | 551 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); | 552 | 43.0M | tap[2] = v256_from_v128( | 553 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), | 554 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1])); | 555 | 43.0M | tap[3] = v256_from_v128( | 556 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), | 557 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); | 558 | 43.0M | p0 = constrain16(tap[0], row, sec_strength, sec_damping); | 559 | 43.0M | p1 = constrain16(tap[1], row, sec_strength, sec_damping); | 560 | 43.0M | p2 = constrain16(tap[2], row, sec_strength, sec_damping); | 561 | 43.0M | p3 = constrain16(tap[3], row, sec_strength, sec_damping); | 562 | | | 563 | | // sum += sec_taps[0] * (p0 + p1 + p2 + p3) | 564 | 43.0M | sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), | 565 | 43.0M | v256_add_16(v256_add_16(p0, p1), | 566 | 43.0M | v256_add_16(p2, p3)))); | 567 | | | 568 | | // Secondary far taps | 569 | 43.0M | tap[4] = v256_from_v128( | 570 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), | 571 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); | 572 | 43.0M | tap[5] = v256_from_v128( | 573 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), | 574 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); | 575 | 43.0M | tap[6] = v256_from_v128( | 576 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), | 577 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); | 578 | 43.0M | tap[7] = v256_from_v128( | 579 | 43.0M | v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), | 580 | 43.0M | v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); | 581 | 43.0M | p0 = constrain16(tap[4], row, sec_strength, sec_damping); | 582 | 43.0M | p1 = constrain16(tap[5], row, sec_strength, sec_damping); | 583 | 43.0M | p2 = constrain16(tap[6], row, 
sec_strength, sec_damping); | 584 | 43.0M | p3 = constrain16(tap[7], row, sec_strength, sec_damping); | 585 | | | 586 | | // sum += sec_taps[1] * (p0 + p1 + p2 + p3) | 587 | 43.0M | sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), | 588 | 43.0M | v256_add_16(v256_add_16(p0, p1), | 589 | 43.0M | v256_add_16(p2, p3)))); | 590 | | | 591 | 43.0M | if (clipping_required) { | 592 | 27.2M | max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); | 593 | | | 594 | 27.2M | min = v256_min_s16(min, tap[0]); | 595 | 27.2M | min = v256_min_s16(min, tap[1]); | 596 | 27.2M | min = v256_min_s16(min, tap[2]); | 597 | 27.2M | min = v256_min_s16(min, tap[3]); | 598 | 27.2M | min = v256_min_s16(min, tap[4]); | 599 | 27.2M | min = v256_min_s16(min, tap[5]); | 600 | 27.2M | min = v256_min_s16(min, tap[6]); | 601 | 27.2M | min = v256_min_s16(min, tap[7]); | 602 | 27.2M | } | 603 | | // End secondary | 604 | 43.0M | } | 605 | | | 606 | | // res = row + ((sum - (sum < 0) + 8) >> 4) | 607 | 53.2M | sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); | 608 | 53.2M | res = v256_add_16(sum, v256_dup_16(8)); | 609 | 53.2M | res = v256_shr_n_s16(res, 4); | 610 | 53.2M | res = v256_add_16(row, res); | 611 | 53.2M | if (clipping_required) { | 612 | 27.2M | res = v256_min_s16(v256_max_s16(res, min), max); | 613 | 27.2M | } | 614 | | | 615 | 53.2M | if (is_lowbd) { | 616 | 35.8M | const v128 res_128 = v256_low_v128(v256_pack_s16_u8(res, res)); | 617 | 35.8M | v64_store_aligned(&dst8[i * dstride], v128_high_v64(res_128)); | 618 | 35.8M | v64_store_aligned(&dst8[(i + 1) * dstride], v128_low_v64(res_128)); | 619 | 35.8M | } else { | 620 | 17.4M | v128_store_unaligned(&dst16[i * dstride], v256_high_v128(res)); | 621 | 17.4M | v128_store_unaligned(&dst16[(i + 1) * dstride], v256_low_v128(res)); | 622 | 17.4M | } | 623 | 53.2M | } | 624 | 18.8M | } |
|
625 | | |
626 | | #if defined(_MSC_VER) && !defined(__clang__) |
627 | | #pragma optimize("", on) |
628 | | #endif |
629 | | |
630 | | SIMD_INLINE void copy_block_4xh(const int is_lowbd, void *dest, int dstride, |
631 | 0 | const uint16_t *in, int height) { |
632 | 0 | uint8_t *dst8 = (uint8_t *)dest; |
633 | 0 | uint16_t *dst16 = (uint16_t *)dest; |
634 | 0 | int i; |
635 | 0 | for (i = 0; i < height; i += 4) { |
636 | 0 | const v128 row0 = |
637 | 0 | v128_from_v64(v64_load_aligned(&in[(i + 0) * CDEF_BSTRIDE]), |
638 | 0 | v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); |
639 | 0 | const v128 row1 = |
640 | 0 | v128_from_v64(v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), |
641 | 0 | v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); |
642 | 0 | if (is_lowbd) { |
643 | | /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */ |
644 | 0 | const v128 res_128 = v128_pack_s16_u8(row1, row0); |
645 | 0 | u32_store_aligned(&dst8[(i + 0) * dstride], |
646 | 0 | v64_high_u32(v128_low_v64(res_128))); |
647 | 0 | u32_store_aligned(&dst8[(i + 1) * dstride], |
648 | 0 | v64_low_u32(v128_low_v64(res_128))); |
649 | 0 | u32_store_aligned(&dst8[(i + 2) * dstride], |
650 | 0 | v64_high_u32(v128_high_v64(res_128))); |
651 | 0 | u32_store_aligned(&dst8[(i + 3) * dstride], |
652 | 0 | v64_low_u32(v128_high_v64(res_128))); |
653 | 0 | } else { |
654 | 0 | v64_store_aligned(&dst16[(i + 0) * dstride], v128_high_v64(row0)); |
655 | 0 | v64_store_aligned(&dst16[(i + 1) * dstride], v128_low_v64(row0)); |
656 | 0 | v64_store_aligned(&dst16[(i + 2) * dstride], v128_high_v64(row1)); |
657 | 0 | v64_store_aligned(&dst16[(i + 3) * dstride], v128_low_v64(row1)); |
658 | 0 | } |
659 | 0 | } |
660 | 0 | } Unexecuted instantiation: cdef_block_sse2.c:copy_block_4xh Unexecuted instantiation: cdef_block_ssse3.c:copy_block_4xh Unexecuted instantiation: cdef_block_sse4.c:copy_block_4xh Unexecuted instantiation: cdef_block_avx2.c:copy_block_4xh |
661 | | |
662 | | SIMD_INLINE void copy_block_8xh(const int is_lowbd, void *dest, int dstride, |
663 | 5.23M | const uint16_t *in, int height) { |
664 | 5.23M | uint8_t *dst8 = (uint8_t *)dest; |
665 | 5.23M | uint16_t *dst16 = (uint16_t *)dest; |
666 | 5.23M | int i; |
667 | 26.0M | for (i = 0; i < height; i += 2) { |
668 | 20.8M | const v128 row0 = v128_load_aligned(&in[i * CDEF_BSTRIDE]); |
669 | 20.8M | const v128 row1 = v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]); |
670 | 20.8M | if (is_lowbd) { |
671 | | /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */ |
672 | 15.2M | const v128 res_128 = v128_pack_s16_u8(row1, row0); |
673 | 15.2M | v64_store_aligned(&dst8[i * dstride], v128_low_v64(res_128)); |
674 | 15.2M | v64_store_aligned(&dst8[(i + 1) * dstride], v128_high_v64(res_128)); |
675 | 15.2M | } else { |
676 | 5.52M | v128_store_unaligned(&dst16[i * dstride], row0); |
677 | 5.52M | v128_store_unaligned(&dst16[(i + 1) * dstride], row1); |
678 | 5.52M | } |
679 | 20.8M | } |
680 | 5.23M | } Unexecuted instantiation: cdef_block_sse2.c:copy_block_8xh Unexecuted instantiation: cdef_block_ssse3.c:copy_block_8xh Unexecuted instantiation: cdef_block_sse4.c:copy_block_8xh cdef_block_avx2.c:copy_block_8xh Line | Count | Source | 663 | 5.23M | const uint16_t *in, int height) { | 664 | 5.23M | uint8_t *dst8 = (uint8_t *)dest; | 665 | 5.23M | uint16_t *dst16 = (uint16_t *)dest; | 666 | 5.23M | int i; | 667 | 26.0M | for (i = 0; i < height; i += 2) { | 668 | 20.8M | const v128 row0 = v128_load_aligned(&in[i * CDEF_BSTRIDE]); | 669 | 20.8M | const v128 row1 = v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]); | 670 | 20.8M | if (is_lowbd) { | 671 | | /* Note: v128_pack_s16_u8(). The parameter order is swapped internally */ | 672 | 15.2M | const v128 res_128 = v128_pack_s16_u8(row1, row0); | 673 | 15.2M | v64_store_aligned(&dst8[i * dstride], v128_low_v64(res_128)); | 674 | 15.2M | v64_store_aligned(&dst8[(i + 1) * dstride], v128_high_v64(res_128)); | 675 | 15.2M | } else { | 676 | 5.52M | v128_store_unaligned(&dst16[i * dstride], row0); | 677 | 5.52M | v128_store_unaligned(&dst16[(i + 1) * dstride], row1); | 678 | 5.52M | } | 679 | 20.8M | } | 680 | 5.23M | } |
|
681 | | |
682 | | void SIMD_FUNC(cdef_filter_8_0)(void *dest, int dstride, const uint16_t *in, |
683 | | int pri_strength, int sec_strength, int dir, |
684 | | int pri_damping, int sec_damping, |
685 | | int coeff_shift, int block_width, |
686 | 30.7M | int block_height) { |
687 | 30.7M | if (block_width == 8) { |
688 | 7.19M | filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, |
689 | 7.19M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
690 | 7.19M | block_height, /*enable_primary=*/1, |
691 | 7.19M | /*enable_secondary=*/1); |
692 | 23.5M | } else { |
693 | 23.5M | filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, |
694 | 23.5M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
695 | 23.5M | block_height, /*enable_primary=*/1, |
696 | 23.5M | /*enable_secondary=*/1); |
697 | 23.5M | } |
698 | 30.7M | } Unexecuted instantiation: cdef_filter_8_0_sse2 Unexecuted instantiation: cdef_filter_8_0_ssse3 Unexecuted instantiation: cdef_filter_8_0_sse4_1 Line | Count | Source | 686 | 30.7M | int block_height) { | 687 | 30.7M | if (block_width == 8) { | 688 | 7.19M | filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, | 689 | 7.19M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, | 690 | 7.19M | block_height, /*enable_primary=*/1, | 691 | 7.19M | /*enable_secondary=*/1); | 692 | 23.5M | } else { | 693 | 23.5M | filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, | 694 | 23.5M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, | 695 | 23.5M | block_height, /*enable_primary=*/1, | 696 | 23.5M | /*enable_secondary=*/1); | 697 | 23.5M | } | 698 | 30.7M | } |
|
699 | | |
700 | | void SIMD_FUNC(cdef_filter_8_1)(void *dest, int dstride, const uint16_t *in, |
701 | | int pri_strength, int sec_strength, int dir, |
702 | | int pri_damping, int sec_damping, |
703 | | int coeff_shift, int block_width, |
704 | 3.92M | int block_height) { |
705 | 3.92M | if (block_width == 8) { |
706 | 3.08M | filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, |
707 | 3.08M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
708 | 3.08M | block_height, /*enable_primary=*/1, |
709 | 3.08M | /*enable_secondary=*/0); |
710 | 3.08M | } else { |
711 | 833k | filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, |
712 | 833k | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
713 | 833k | block_height, /*enable_primary=*/1, |
714 | 833k | /*enable_secondary=*/0); |
715 | 833k | } |
716 | 3.92M | } Unexecuted instantiation: cdef_filter_8_1_sse2 Unexecuted instantiation: cdef_filter_8_1_ssse3 Unexecuted instantiation: cdef_filter_8_1_sse4_1 Line | Count | Source | 704 | 3.92M | int block_height) { | 705 | 3.92M | if (block_width == 8) { | 706 | 3.08M | filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, | 707 | 3.08M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, | 708 | 3.08M | block_height, /*enable_primary=*/1, | 709 | 3.08M | /*enable_secondary=*/0); | 710 | 3.08M | } else { | 711 | 833k | filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, | 712 | 833k | sec_strength, dir, pri_damping, sec_damping, coeff_shift, | 713 | 833k | block_height, /*enable_primary=*/1, | 714 | 833k | /*enable_secondary=*/0); | 715 | 833k | } | 716 | 3.92M | } |
|
717 | | void SIMD_FUNC(cdef_filter_8_2)(void *dest, int dstride, const uint16_t *in, |
718 | | int pri_strength, int sec_strength, int dir, |
719 | | int pri_damping, int sec_damping, |
720 | | int coeff_shift, int block_width, |
721 | 4.76M | int block_height) { |
722 | 4.76M | if (block_width == 8) { |
723 | 4.46M | filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, |
724 | 4.46M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
725 | 4.46M | block_height, /*enable_primary=*/0, |
726 | 4.46M | /*enable_secondary=*/1); |
727 | 4.46M | } else { |
728 | 295k | filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, |
729 | 295k | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
730 | 295k | block_height, /*enable_primary=*/0, |
731 | 295k | /*enable_secondary=*/1); |
732 | 295k | } |
733 | 4.76M | } Unexecuted instantiation: cdef_filter_8_2_sse2 Unexecuted instantiation: cdef_filter_8_2_ssse3 Unexecuted instantiation: cdef_filter_8_2_sse4_1 Line | Count | Source | 721 | 4.76M | int block_height) { | 722 | 4.76M | if (block_width == 8) { | 723 | 4.46M | filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, | 724 | 4.46M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, | 725 | 4.46M | block_height, /*enable_primary=*/0, | 726 | 4.46M | /*enable_secondary=*/1); | 727 | 4.46M | } else { | 728 | 295k | filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, | 729 | 295k | sec_strength, dir, pri_damping, sec_damping, coeff_shift, | 730 | 295k | block_height, /*enable_primary=*/0, | 731 | 295k | /*enable_secondary=*/1); | 732 | 295k | } | 733 | 4.76M | } |
|
734 | | |
735 | | void SIMD_FUNC(cdef_filter_8_3)(void *dest, int dstride, const uint16_t *in, |
736 | | int pri_strength, int sec_strength, int dir, |
737 | | int pri_damping, int sec_damping, |
738 | | int coeff_shift, int block_width, |
739 | 3.85M | int block_height) { |
740 | 3.85M | (void)pri_strength; |
741 | 3.85M | (void)sec_strength; |
742 | 3.85M | (void)dir; |
743 | 3.85M | (void)pri_damping; |
744 | 3.85M | (void)sec_damping; |
745 | 3.85M | (void)coeff_shift; |
746 | 3.85M | (void)block_width; |
747 | | |
748 | 3.85M | if (block_width == 8) { |
749 | 3.85M | copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height); |
750 | 18.4E | } else { |
751 | 18.4E | copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height); |
752 | 18.4E | } |
753 | 3.85M | } Unexecuted instantiation: cdef_filter_8_3_sse2 Unexecuted instantiation: cdef_filter_8_3_ssse3 Unexecuted instantiation: cdef_filter_8_3_sse4_1 Line | Count | Source | 739 | 3.85M | int block_height) { | 740 | 3.85M | (void)pri_strength; | 741 | 3.85M | (void)sec_strength; | 742 | 3.85M | (void)dir; | 743 | 3.85M | (void)pri_damping; | 744 | 3.85M | (void)sec_damping; | 745 | 3.85M | (void)coeff_shift; | 746 | 3.85M | (void)block_width; | 747 | | | 748 | 3.85M | if (block_width == 8) { | 749 | 3.85M | copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height); | 750 | 18.4E | } else { | 751 | 18.4E | copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height); | 752 | 18.4E | } | 753 | 3.85M | } |
|
754 | | |
755 | | void SIMD_FUNC(cdef_filter_16_0)(void *dest, int dstride, const uint16_t *in, |
756 | | int pri_strength, int sec_strength, int dir, |
757 | | int pri_damping, int sec_damping, |
758 | | int coeff_shift, int block_width, |
759 | 10.0M | int block_height) { |
760 | 10.0M | if (block_width == 8) { |
761 | 2.50M | filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, |
762 | 2.50M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
763 | 2.50M | block_height, /*enable_primary=*/1, |
764 | 2.50M | /*enable_secondary=*/1); |
765 | 7.57M | } else { |
766 | 7.57M | filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, |
767 | 7.57M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
768 | 7.57M | block_height, /*enable_primary=*/1, |
769 | 7.57M | /*enable_secondary=*/1); |
770 | 7.57M | } |
771 | 10.0M | } Unexecuted instantiation: cdef_filter_16_0_sse2 Unexecuted instantiation: cdef_filter_16_0_ssse3 Unexecuted instantiation: cdef_filter_16_0_sse4_1 Line | Count | Source | 759 | 10.0M | int block_height) { | 760 | 10.0M | if (block_width == 8) { | 761 | 2.50M | filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, | 762 | 2.50M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, | 763 | 2.50M | block_height, /*enable_primary=*/1, | 764 | 2.50M | /*enable_secondary=*/1); | 765 | 7.57M | } else { | 766 | 7.57M | filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, | 767 | 7.57M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, | 768 | 7.57M | block_height, /*enable_primary=*/1, | 769 | 7.57M | /*enable_secondary=*/1); | 770 | 7.57M | } | 771 | 10.0M | } |
|
772 | | |
773 | | void SIMD_FUNC(cdef_filter_16_1)(void *dest, int dstride, const uint16_t *in, |
774 | | int pri_strength, int sec_strength, int dir, |
775 | | int pri_damping, int sec_damping, |
776 | | int coeff_shift, int block_width, |
777 | 2.07M | int block_height) { |
778 | 2.07M | if (block_width == 8) { |
779 | 1.04M | filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, |
780 | 1.04M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
781 | 1.04M | block_height, /*enable_primary=*/1, |
782 | 1.04M | /*enable_secondary=*/0); |
783 | 1.04M | } else { |
784 | 1.02M | filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, |
785 | 1.02M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
786 | 1.02M | block_height, /*enable_primary=*/1, |
787 | 1.02M | /*enable_secondary=*/0); |
788 | 1.02M | } |
789 | 2.07M | } Unexecuted instantiation: cdef_filter_16_1_sse2 Unexecuted instantiation: cdef_filter_16_1_ssse3 Unexecuted instantiation: cdef_filter_16_1_sse4_1 Line | Count | Source | 777 | 2.07M | int block_height) { | 778 | 2.07M | if (block_width == 8) { | 779 | 1.04M | filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, | 780 | 1.04M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, | 781 | 1.04M | block_height, /*enable_primary=*/1, | 782 | 1.04M | /*enable_secondary=*/0); | 783 | 1.04M | } else { | 784 | 1.02M | filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, | 785 | 1.02M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, | 786 | 1.02M | block_height, /*enable_primary=*/1, | 787 | 1.02M | /*enable_secondary=*/0); | 788 | 1.02M | } | 789 | 2.07M | } |
|
790 | | void SIMD_FUNC(cdef_filter_16_2)(void *dest, int dstride, const uint16_t *in, |
791 | | int pri_strength, int sec_strength, int dir, |
792 | | int pri_damping, int sec_damping, |
793 | | int coeff_shift, int block_width, |
794 | 2.06M | int block_height) { |
795 | 2.06M | if (block_width == 8) { |
796 | 1.91M | filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, |
797 | 1.91M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
798 | 1.91M | block_height, /*enable_primary=*/0, |
799 | 1.91M | /*enable_secondary=*/1); |
800 | 1.91M | } else { |
801 | 148k | filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, |
802 | 148k | sec_strength, dir, pri_damping, sec_damping, coeff_shift, |
803 | 148k | block_height, /*enable_primary=*/0, |
804 | 148k | /*enable_secondary=*/1); |
805 | 148k | } |
806 | 2.06M | } Unexecuted instantiation: cdef_filter_16_2_sse2 Unexecuted instantiation: cdef_filter_16_2_ssse3 Unexecuted instantiation: cdef_filter_16_2_sse4_1 Line | Count | Source | 794 | 2.06M | int block_height) { | 795 | 2.06M | if (block_width == 8) { | 796 | 1.91M | filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, | 797 | 1.91M | sec_strength, dir, pri_damping, sec_damping, coeff_shift, | 798 | 1.91M | block_height, /*enable_primary=*/0, | 799 | 1.91M | /*enable_secondary=*/1); | 800 | 1.91M | } else { | 801 | 148k | filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, | 802 | 148k | sec_strength, dir, pri_damping, sec_damping, coeff_shift, | 803 | 148k | block_height, /*enable_primary=*/0, | 804 | 148k | /*enable_secondary=*/1); | 805 | 148k | } | 806 | 2.06M | } |
|
807 | | |
808 | | void SIMD_FUNC(cdef_filter_16_3)(void *dest, int dstride, const uint16_t *in, |
809 | | int pri_strength, int sec_strength, int dir, |
810 | | int pri_damping, int sec_damping, |
811 | | int coeff_shift, int block_width, |
812 | 1.38M | int block_height) { |
813 | 1.38M | (void)pri_strength; |
814 | 1.38M | (void)sec_strength; |
815 | 1.38M | (void)dir; |
816 | 1.38M | (void)pri_damping; |
817 | 1.38M | (void)sec_damping; |
818 | 1.38M | (void)coeff_shift; |
819 | 1.38M | (void)block_width; |
820 | 1.38M | if (block_width == 8) { |
821 | 1.38M | copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height); |
822 | 18.4E | } else { |
823 | 18.4E | copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height); |
824 | 18.4E | } |
825 | 1.38M | } Unexecuted instantiation: cdef_filter_16_3_sse2 Unexecuted instantiation: cdef_filter_16_3_ssse3 Unexecuted instantiation: cdef_filter_16_3_sse4_1 Line | Count | Source | 812 | 1.38M | int block_height) { | 813 | 1.38M | (void)pri_strength; | 814 | 1.38M | (void)sec_strength; | 815 | 1.38M | (void)dir; | 816 | 1.38M | (void)pri_damping; | 817 | 1.38M | (void)sec_damping; | 818 | 1.38M | (void)coeff_shift; | 819 | 1.38M | (void)block_width; | 820 | 1.38M | if (block_width == 8) { | 821 | 1.38M | copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height); | 822 | 18.4E | } else { | 823 | 18.4E | copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height); | 824 | 18.4E | } | 825 | 1.38M | } |
|
826 | | |
827 | | void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, |
828 | | const uint16_t *src, int sstride, |
829 | 481k | int width, int height) { |
830 | 481k | int i, j; |
831 | 14.4M | for (i = 0; i < height; i++) { |
832 | 107M | for (j = 0; j < (width & ~0x7); j += 8) { |
833 | 93.6M | v128 row = v128_load_unaligned(&src[i * sstride + j]); |
834 | 93.6M | v128_store_unaligned(&dst[i * dstride + j], row); |
835 | 93.6M | } |
836 | 17.0M | for (; j < width; j++) { |
837 | 3.05M | dst[i * dstride + j] = src[i * sstride + j]; |
838 | 3.05M | } |
839 | 13.9M | } |
840 | 481k | } Unexecuted instantiation: cdef_copy_rect8_16bit_to_16bit_sse2 Unexecuted instantiation: cdef_copy_rect8_16bit_to_16bit_ssse3 Unexecuted instantiation: cdef_copy_rect8_16bit_to_16bit_sse4_1 cdef_copy_rect8_16bit_to_16bit_avx2 Line | Count | Source | 829 | 481k | int width, int height) { | 830 | 481k | int i, j; | 831 | 14.4M | for (i = 0; i < height; i++) { | 832 | 107M | for (j = 0; j < (width & ~0x7); j += 8) { | 833 | 93.6M | v128 row = v128_load_unaligned(&src[i * sstride + j]); | 834 | 93.6M | v128_store_unaligned(&dst[i * dstride + j], row); | 835 | 93.6M | } | 836 | 17.0M | for (; j < width; j++) { | 837 | 3.05M | dst[i * dstride + j] = src[i * sstride + j]; | 838 | 3.05M | } | 839 | 13.9M | } | 840 | 481k | } |
|
841 | | |
842 | | #undef CDEF_INLINE |
843 | | |
844 | | #endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ |