/src/aom/aom_dsp/x86/transpose_sse2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ |
13 | | #define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ |
14 | | |
15 | | #include <emmintrin.h> // SSE2 |
16 | | |
17 | | #include "config/aom_config.h" |
18 | | |
// Transposes a 4x4 block of 8-bit elements.
//
// Only the low 4 bytes of in[0..3] contribute to the result. The 16 transposed
// bytes are returned packed in a single register, one column after another.
static inline __m128i transpose_8bit_4x4(const __m128i *const in) {
  // Interleave the bytes of neighbouring rows. Starting from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // the low 8 bytes become:
  // rows01: 00 10 01 11  02 12 03 13
  // rows23: 20 30 21 31  22 32 23 33
  const __m128i rows01 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i rows23 = _mm_unpacklo_epi8(in[2], in[3]);

  // Interleave 16-bit pairs so that every column ends up contiguous:
  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
  const __m128i transposed = _mm_unpacklo_epi16(rows01, rows23);
  return transposed;
}
35 | | |
// Transposes an 8x8 block of 8-bit elements.
//
// Only the low 8 bytes of in[0..7] are used. Column c of the input is written
// to out[c]; the same 8 bytes are replicated in the upper half of out[c].
// Every input is read before any output is written.
static inline void transpose_8bit_8x8(const __m128i *const in,
                                      __m128i *const out) {
  // Stage 1: interleave bytes of neighbouring rows. Starting from:
  // in[r]: r0 r1 r2 r3 r4 r5 r6 r7   (r = 0..7)
  // this yields:
  // p01: 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
  // p23: 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
  // p45: 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
  // p67: 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
  const __m128i p01 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i p23 = _mm_unpacklo_epi8(in[2], in[3]);
  const __m128i p45 = _mm_unpacklo_epi8(in[4], in[5]);
  const __m128i p67 = _mm_unpacklo_epi8(in[6], in[7]);

  // Stage 2: interleave 16-bit pairs, giving 4-row slices of each column:
  // top_c0123: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  // top_c4567: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  // bot_c0123: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  // bot_c4567: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  const __m128i top_c0123 = _mm_unpacklo_epi16(p01, p23);
  const __m128i top_c4567 = _mm_unpackhi_epi16(p01, p23);
  const __m128i bot_c0123 = _mm_unpacklo_epi16(p45, p67);
  const __m128i bot_c4567 = _mm_unpackhi_epi16(p45, p67);

  // Stage 3: interleave 32-bit groups to join the top and bottom slices:
  // c01: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // c23: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // c45: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // c67: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const __m128i c01 = _mm_unpacklo_epi32(top_c0123, bot_c0123);
  const __m128i c23 = _mm_unpackhi_epi32(top_c0123, bot_c0123);
  const __m128i c45 = _mm_unpacklo_epi32(top_c4567, bot_c4567);
  const __m128i c67 = _mm_unpackhi_epi32(top_c4567, bot_c4567);

  // Stage 4: split each register into its two columns. Unpacking a register
  // with itself duplicates the selected 64-bit half into both halves:
  // out[c]: 0c 1c 2c 3c 4c 5c 6c 7c   (c = 0..7)
  out[0] = _mm_unpacklo_epi64(c01, c01);
  out[1] = _mm_unpackhi_epi64(c01, c01);
  out[2] = _mm_unpacklo_epi64(c23, c23);
  out[3] = _mm_unpackhi_epi64(c23, c23);
  out[4] = _mm_unpacklo_epi64(c45, c45);
  out[5] = _mm_unpackhi_epi64(c45, c45);
  out[6] = _mm_unpacklo_epi64(c67, c67);
  out[7] = _mm_unpackhi_epi64(c67, c67);
}
95 | | |
// Transposes a 4x4 block of 16-bit elements.
//
// Only the low 64 bits (4 lanes) of in[0..3] are used. Column c of the input
// is written to the low 64 bits of out[c]. Callers should rely on the low
// 64 bits of each output only.
static inline void transpose_16bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Interleave 16-bit lanes of neighbouring rows. Starting from:
  // in[0]: 00 01 02 03 XX XX XX XX
  // in[1]: 10 11 12 13 XX XX XX XX
  // in[2]: 20 21 22 23 XX XX XX XX
  // in[3]: 30 31 32 33 XX XX XX XX
  // this yields:
  // rows01: 00 10 01 11  02 12 03 13
  // rows23: 20 30 21 31  22 32 23 33
  const __m128i rows01 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i rows23 = _mm_unpacklo_epi16(in[2], in[3]);

  // Interleave 32-bit pairs; each register now carries two whole columns:
  // cols01: 00 10 20 30  01 11 21 31
  // cols23: 02 12 22 32  03 13 23 33
  const __m128i cols01 = _mm_unpacklo_epi32(rows01, rows23);
  const __m128i cols23 = _mm_unpackhi_epi32(rows01, rows23);

  // Even columns are already in the low half. Odd columns are brought down
  // with an 8-byte right shift, which zero-fills the upper half ("__"):
  // out[0]: 00 10 20 30  01 11 21 31
  // out[1]: 01 11 21 31  __ __ __ __
  // out[2]: 02 12 22 32  03 13 23 33
  // out[3]: 03 13 23 33  __ __ __ __
  out[0] = cols01;
  out[1] = _mm_srli_si128(cols01, 8);
  out[2] = cols23;
  out[3] = _mm_srli_si128(cols23, 8);
}
Unexecuted instantiation: resize_ssse3.c:transpose_16bit_4x4 Unexecuted instantiation: highbd_inv_txfm_sse4.c:transpose_16bit_4x4 Unexecuted instantiation: av1_inv_txfm_avx2.c:transpose_16bit_4x4 Unexecuted instantiation: highbd_inv_txfm_avx2.c:transpose_16bit_4x4 |
123 | | |
// Transposes a block of 8 rows x 4 columns of 16-bit elements.
//
// Only the low 64 bits (4 lanes) of in[0..7] are used. out[c] (c = 0..3)
// receives column c as a full 8-lane register. Every input is read before any
// output is written.
static inline void transpose_16bit_4x8(const __m128i *const in,
                                       __m128i *const out) {
  // Stage 1: interleave 16-bit lanes of neighbouring rows. Starting from:
  // in[r]: r0 r1 r2 r3 XX XX XX XX   (r = 0..7)
  // this yields:
  // rows01: 00 10 01 11  02 12 03 13
  // rows23: 20 30 21 31  22 32 23 33
  // rows45: 40 50 41 51  42 52 43 53
  // rows67: 60 70 61 71  62 72 63 73
  const __m128i rows01 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i rows23 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i rows45 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i rows67 = _mm_unpacklo_epi16(in[6], in[7]);

  // Stage 2: interleave 32-bit pairs, giving half-columns for the top
  // (rows 0-3) and bottom (rows 4-7) halves of the block:
  // c01_top: 00 10 20 30  01 11 21 31
  // c01_bot: 40 50 60 70  41 51 61 71
  // c23_top: 02 12 22 32  03 13 23 33
  // c23_bot: 42 52 62 72  43 53 63 73
  const __m128i c01_top = _mm_unpacklo_epi32(rows01, rows23);
  const __m128i c01_bot = _mm_unpacklo_epi32(rows45, rows67);
  const __m128i c23_top = _mm_unpackhi_epi32(rows01, rows23);
  const __m128i c23_bot = _mm_unpackhi_epi32(rows45, rows67);

  // Stage 3: join top and bottom halves with 64-bit unpacks:
  // out[0]: 00 10 20 30 40 50 60 70
  // out[1]: 01 11 21 31 41 51 61 71
  // out[2]: 02 12 22 32 42 52 62 72
  // out[3]: 03 13 23 33 43 53 63 73
  out[0] = _mm_unpacklo_epi64(c01_top, c01_bot);
  out[1] = _mm_unpackhi_epi64(c01_top, c01_bot);
  out[2] = _mm_unpacklo_epi64(c23_top, c23_bot);
  out[3] = _mm_unpackhi_epi64(c23_top, c23_bot);
}
Unexecuted instantiation: resize_ssse3.c:transpose_16bit_4x8 Unexecuted instantiation: highbd_inv_txfm_sse4.c:transpose_16bit_4x8 Unexecuted instantiation: av1_inv_txfm_avx2.c:transpose_16bit_4x8 Unexecuted instantiation: highbd_inv_txfm_avx2.c:transpose_16bit_4x8 |
165 | | |
// Transposes a block of 4 rows x 8 columns of 16-bit elements.
//
// in[0..3] are full 8-lane rows. out[c] (c = 0..7) receives column c in its
// low 64 bits; the upper 64 bits of every output are zero. Every input is
// read before any output is written.
static inline void transpose_16bit_8x4(const __m128i *const in,
                                       __m128i *const out) {
  // Stage 1: interleave 16-bit lanes of neighbouring rows. Starting from:
  // in[0]: 00 01 02 03 04 05 06 07
  // in[1]: 10 11 12 13 14 15 16 17
  // in[2]: 20 21 22 23 24 25 26 27
  // in[3]: 30 31 32 33 34 35 36 37
  // this yields:
  // lo01: 00 10 01 11  02 12 03 13
  // lo23: 20 30 21 31  22 32 23 33
  // hi01: 04 14 05 15  06 16 07 17
  // hi23: 24 34 25 35  26 36 27 37
  const __m128i lo01 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i lo23 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i hi01 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i hi23 = _mm_unpackhi_epi16(in[2], in[3]);

  // Stage 2: interleave 32-bit pairs; each register holds two full columns:
  // c01: 00 10 20 30  01 11 21 31
  // c23: 02 12 22 32  03 13 23 33
  // c45: 04 14 24 34  05 15 25 35
  // c67: 06 16 26 36  07 17 27 37
  const __m128i c01 = _mm_unpacklo_epi32(lo01, lo23);
  const __m128i c23 = _mm_unpackhi_epi32(lo01, lo23);
  const __m128i c45 = _mm_unpacklo_epi32(hi01, hi23);
  const __m128i c67 = _mm_unpackhi_epi32(hi01, hi23);

  // Stage 3: isolate each column by pairing it with a zero register, which
  // leaves zeros ("__") in the upper half:
  // out[c]: 0c 1c 2c 3c  __ __ __ __   (c = 0..7)
  const __m128i zero = _mm_setzero_si128();
  out[0] = _mm_unpacklo_epi64(c01, zero);
  out[1] = _mm_unpackhi_epi64(c01, zero);
  out[2] = _mm_unpacklo_epi64(c23, zero);
  out[3] = _mm_unpackhi_epi64(c23, zero);
  out[4] = _mm_unpacklo_epi64(c45, zero);
  out[5] = _mm_unpackhi_epi64(c45, zero);
  out[6] = _mm_unpacklo_epi64(c67, zero);
  out[7] = _mm_unpackhi_epi64(c67, zero);
}
Unexecuted instantiation: resize_ssse3.c:transpose_16bit_8x4 Unexecuted instantiation: highbd_inv_txfm_sse4.c:transpose_16bit_8x4 Unexecuted instantiation: av1_inv_txfm_avx2.c:transpose_16bit_8x4 Unexecuted instantiation: highbd_inv_txfm_avx2.c:transpose_16bit_8x4 |
213 | | |
// Transposes an 8x8 block of 16-bit elements.
//
// in[0..7] are the rows; out[c] (c = 0..7) receives column c. Every input is
// read before any output is written, so calling with out == in transposes in
// place.
static inline void transpose_16bit_8x8(const __m128i *const in,
                                       __m128i *const out) {
  // Stage 1: interleave 16-bit lanes of neighbouring rows. Starting from:
  // in[r]: r0 r1 r2 r3 r4 r5 r6 r7   (r = 0..7)
  // this yields:
  // lo01: 00 10 01 11  02 12 03 13
  // lo23: 20 30 21 31  22 32 23 33
  // lo45: 40 50 41 51  42 52 43 53
  // lo67: 60 70 61 71  62 72 63 73
  // hi01: 04 14 05 15  06 16 07 17
  // hi23: 24 34 25 35  26 36 27 37
  // hi45: 44 54 45 55  46 56 47 57
  // hi67: 64 74 65 75  66 76 67 77
  const __m128i lo01 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i lo23 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i lo45 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i lo67 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i hi01 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i hi23 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i hi45 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i hi67 = _mm_unpackhi_epi16(in[6], in[7]);

  // Stage 2: interleave 32-bit pairs. "top" covers rows 0-3 and "bot" covers
  // rows 4-7 of the two columns named in each register:
  // c01_top: 00 10 20 30  01 11 21 31
  // c01_bot: 40 50 60 70  41 51 61 71
  // c23_top: 02 12 22 32  03 13 23 33
  // c23_bot: 42 52 62 72  43 53 63 73
  // c45_top: 04 14 24 34  05 15 25 35
  // c45_bot: 44 54 64 74  45 55 65 75
  // c67_top: 06 16 26 36  07 17 27 37
  // c67_bot: 46 56 66 76  47 57 67 77
  const __m128i c01_top = _mm_unpacklo_epi32(lo01, lo23);
  const __m128i c01_bot = _mm_unpacklo_epi32(lo45, lo67);
  const __m128i c23_top = _mm_unpackhi_epi32(lo01, lo23);
  const __m128i c23_bot = _mm_unpackhi_epi32(lo45, lo67);
  const __m128i c45_top = _mm_unpacklo_epi32(hi01, hi23);
  const __m128i c45_bot = _mm_unpacklo_epi32(hi45, hi67);
  const __m128i c67_top = _mm_unpackhi_epi32(hi01, hi23);
  const __m128i c67_bot = _mm_unpackhi_epi32(hi45, hi67);

  // Stage 3: join the top and bottom halves of each column:
  // out[c]: 0c 1c 2c 3c 4c 5c 6c 7c   (c = 0..7)
  out[0] = _mm_unpacklo_epi64(c01_top, c01_bot);
  out[1] = _mm_unpackhi_epi64(c01_top, c01_bot);
  out[2] = _mm_unpacklo_epi64(c23_top, c23_bot);
  out[3] = _mm_unpackhi_epi64(c23_top, c23_bot);
  out[4] = _mm_unpacklo_epi64(c45_top, c45_bot);
  out[5] = _mm_unpackhi_epi64(c45_top, c45_bot);
  out[6] = _mm_unpacklo_epi64(c67_top, c67_bot);
  out[7] = _mm_unpackhi_epi64(c67_top, c67_bot);
}
Unexecuted instantiation: resize_ssse3.c:transpose_16bit_8x8 Unexecuted instantiation: highbd_inv_txfm_sse4.c:transpose_16bit_8x8 Unexecuted instantiation: av1_inv_txfm_avx2.c:transpose_16bit_8x8 Unexecuted instantiation: highbd_inv_txfm_avx2.c:transpose_16bit_8x8 |
279 | | |
// Transposes a 16x16 block of 16-bit elements in place.
//
// left[0..15] and right[0..15] are each treated as two stacked 8x8 tiles.
// Every tile is transposed with transpose_16bit_8x8(), and the tiles held in
// right[0..7] and left[8..15] trade places. This amounts to a full 16x16
// transpose when left[r] / right[r] hold the first / last 8 elements of row r
// (layout presumed from the parameter names -- confirm against callers).
static inline void transpose_16bit_16x16(__m128i *const left,
                                         __m128i *const right) {
  // Holds the transposed right[0..7] tile until left[8..15] has been
  // consumed.
  __m128i upper_right_t[8];

  // Passing the same pointer as input and output is fine here:
  // transpose_16bit_8x8() reads all eight inputs before storing any output.
  transpose_16bit_8x8(left, left);
  transpose_16bit_8x8(right, upper_right_t);
  transpose_16bit_8x8(left + 8, right);
  transpose_16bit_8x8(right + 8, right + 8);

  for (int i = 0; i < 8; ++i) {
    left[8 + i] = upper_right_t[i];
  }
}
298 | | |
// Transposes a 4x4 block of 32-bit elements.
//
// in[0..3] are the rows; out[c] (c = 0..3) receives column c. Every input is
// read before any output is written.
static inline void transpose_32bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Interleave 32-bit lanes of neighbouring rows. Starting from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // this yields:
  // lo01: 00 10 01 11
  // lo23: 20 30 21 31
  // hi01: 02 12 03 13
  // hi23: 22 32 23 33
  const __m128i lo01 = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i lo23 = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i hi01 = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i hi23 = _mm_unpackhi_epi32(in[2], in[3]);

  // Join matching 64-bit halves to complete each column:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  out[0] = _mm_unpacklo_epi64(lo01, lo23);
  out[1] = _mm_unpackhi_epi64(lo01, lo23);
  out[2] = _mm_unpacklo_epi64(hi01, hi23);
  out[3] = _mm_unpackhi_epi64(hi01, hi23);
}
Unexecuted instantiation: av1_inv_txfm_avx2.c:transpose_32bit_4x4 Unexecuted instantiation: highbd_inv_txfm_avx2.c:transpose_32bit_4x4 |
327 | | |
// Transposes two independent 4x4 blocks of 32-bit elements.
//
// in[0..3] are the rows of the first block and in[4..7] the rows of the
// second. out[0..3] receive the columns of the first block and out[4..7] the
// columns of the second. Every input is read before any output is written.
static inline void transpose_32bit_4x4x2(const __m128i *const in,
                                         __m128i *const out) {
  // Interleave 32-bit lanes of neighbouring rows. Starting from:
  // in[0]: 00 01 02 03        in[4]: 04 05 06 07
  // in[1]: 10 11 12 13        in[5]: 14 15 16 17
  // in[2]: 20 21 22 23        in[6]: 24 25 26 27
  // in[3]: 30 31 32 33        in[7]: 34 35 36 37
  // this yields:
  // blk0_lo01: 00 10 01 11    blk1_lo01: 04 14 05 15
  // blk0_lo23: 20 30 21 31    blk1_lo23: 24 34 25 35
  // blk0_hi01: 02 12 03 13    blk1_hi01: 06 16 07 17
  // blk0_hi23: 22 32 23 33    blk1_hi23: 26 36 27 37
  const __m128i blk0_lo01 = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i blk0_lo23 = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i blk0_hi01 = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i blk0_hi23 = _mm_unpackhi_epi32(in[2], in[3]);
  const __m128i blk1_lo01 = _mm_unpacklo_epi32(in[4], in[5]);
  const __m128i blk1_lo23 = _mm_unpacklo_epi32(in[6], in[7]);
  const __m128i blk1_hi01 = _mm_unpackhi_epi32(in[4], in[5]);
  const __m128i blk1_hi23 = _mm_unpackhi_epi32(in[6], in[7]);

  // Join matching 64-bit halves to complete each column:
  // out[0]: 00 10 20 30       out[4]: 04 14 24 34
  // out[1]: 01 11 21 31       out[5]: 05 15 25 35
  // out[2]: 02 12 22 32       out[6]: 06 16 26 36
  // out[3]: 03 13 23 33       out[7]: 07 17 27 37
  out[0] = _mm_unpacklo_epi64(blk0_lo01, blk0_lo23);
  out[1] = _mm_unpackhi_epi64(blk0_lo01, blk0_lo23);
  out[2] = _mm_unpacklo_epi64(blk0_hi01, blk0_hi23);
  out[3] = _mm_unpackhi_epi64(blk0_hi01, blk0_hi23);
  out[4] = _mm_unpacklo_epi64(blk1_lo01, blk1_lo23);
  out[5] = _mm_unpackhi_epi64(blk1_lo01, blk1_lo23);
  out[6] = _mm_unpacklo_epi64(blk1_hi01, blk1_hi23);
  out[7] = _mm_unpackhi_epi64(blk1_hi01, blk1_hi23);
}
375 | | |
// Transposes a block of 4 rows x 8 columns of 32-bit elements.
//
// Each row occupies two consecutive registers: in[2 * r] holds columns 0-3
// and in[2 * r + 1] holds columns 4-7 of row r. out[c] (c = 0..7) receives
// column c. Every input is read before any output is written.
static inline void transpose_32bit_8x4(const __m128i *const in,
                                       __m128i *const out) {
  // Interleave 32-bit lanes of neighbouring rows, separately for the left
  // (columns 0-3) and right (columns 4-7) halves. Starting from:
  // in[0]: 00 01 02 03        in[1]: 04 05 06 07
  // in[2]: 10 11 12 13        in[3]: 14 15 16 17
  // in[4]: 20 21 22 23        in[5]: 24 25 26 27
  // in[6]: 30 31 32 33        in[7]: 34 35 36 37
  // this yields:
  // left_lo01: 00 10 01 11    right_lo01: 04 14 05 15
  // left_lo23: 20 30 21 31    right_lo23: 24 34 25 35
  // left_hi01: 02 12 03 13    right_hi01: 06 16 07 17
  // left_hi23: 22 32 23 33    right_hi23: 26 36 27 37
  const __m128i left_lo01 = _mm_unpacklo_epi32(in[0], in[2]);
  const __m128i left_lo23 = _mm_unpacklo_epi32(in[4], in[6]);
  const __m128i left_hi01 = _mm_unpackhi_epi32(in[0], in[2]);
  const __m128i left_hi23 = _mm_unpackhi_epi32(in[4], in[6]);
  const __m128i right_lo01 = _mm_unpacklo_epi32(in[1], in[3]);
  const __m128i right_lo23 = _mm_unpacklo_epi32(in[5], in[7]);
  const __m128i right_hi01 = _mm_unpackhi_epi32(in[1], in[3]);
  const __m128i right_hi23 = _mm_unpackhi_epi32(in[5], in[7]);

  // Join matching 64-bit halves to complete each column:
  // out[0]: 00 10 20 30       out[4]: 04 14 24 34
  // out[1]: 01 11 21 31       out[5]: 05 15 25 35
  // out[2]: 02 12 22 32       out[6]: 06 16 26 36
  // out[3]: 03 13 23 33       out[7]: 07 17 27 37
  out[0] = _mm_unpacklo_epi64(left_lo01, left_lo23);
  out[1] = _mm_unpackhi_epi64(left_lo01, left_lo23);
  out[2] = _mm_unpacklo_epi64(left_hi01, left_hi23);
  out[3] = _mm_unpackhi_epi64(left_hi01, left_hi23);
  out[4] = _mm_unpacklo_epi64(right_lo01, right_lo23);
  out[5] = _mm_unpackhi_epi64(right_lo01, right_lo23);
  out[6] = _mm_unpacklo_epi64(right_hi01, right_hi23);
  out[7] = _mm_unpackhi_epi64(right_hi01, right_hi23);
}
423 | | |
424 | | #endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ |