/src/aom/aom_dsp/x86/lpf_common_sse2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2017, Alliance for Open Media. All rights reserved. |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
10 | | */ |
11 | | |
12 | | #ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ |
13 | | #define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ |
14 | | |
15 | | #include <emmintrin.h> // SSE2 |
16 | | |
17 | | #include "config/aom_config.h" |
18 | | |
// Unaligned 8-byte stores taken from a 16-byte value.
//   mm_storelu(dst, v): copies bytes 0..7  of `v` to `dst`.
//   mm_storehu(dst, v): copies bytes 8..15 of `v` to `dst`.
// `v` must be an lvalue because its address is taken. The copy goes through
// memcpy, so `dst` has no alignment requirement.
// NOTE(review): memcpy is declared in <string.h>, which is not among the
// includes visible at the top of this header -- presumably the including
// file provides it; confirm.
#define mm_storelu(dst, v) memcpy((dst), &((const char *)&(v))[0], 8)
#define mm_storehu(dst, v) memcpy((dst), &((const char *)&(v))[8], 8)
21 | | |
// Transposes a 6x6 block of 16-bit values.
//
// Input:  x0..x5 hold rows 0..5; only lanes 0..5 of each register matter.
//   x0: 00 01 02 03 04 05 xx xx
//   x1: 10 11 12 13 14 15 xx xx
//   x2: 20 21 22 23 24 25 xx xx
//   x3: 30 31 32 33 34 35 xx xx
//   x4: 40 41 42 43 44 45 xx xx
//   x5: 50 51 52 53 54 55 xx xx
// Output: d0..d5 hold columns 0..5 in lanes 0..5; lanes 6..7 carry leftover
// data and must be ignored by the caller.
//   d0: 00 10 20 30 40 50 xx xx
//   d1: 01 11 21 31 41 51 xx xx
//   d2: 02 12 22 32 42 52 xx xx
//   d3: 03 13 23 33 43 53 xx xx
//   d4: 04 14 24 34 44 54 xx xx
//   d5: 05 15 25 35 45 55 xx xx
// All six inputs are loaded before the first output is written.
static inline void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
                                            __m128i *x2, __m128i *x3,
                                            __m128i *x4, __m128i *x5,
                                            __m128i *d0, __m128i *d1,
                                            __m128i *d2, __m128i *d3,
                                            __m128i *d4, __m128i *d5) {
  const __m128i r0 = *x0;
  const __m128i r1 = *x1;
  const __m128i r2 = *x2;
  const __m128i r3 = *x3;
  const __m128i r4 = *x4;
  const __m128i r5 = *x5;

  // Interleave row pairs: columns 0..3 ...
  const __m128i w0 = _mm_unpacklo_epi16(r0, r1);  // 00 10 01 11 02 12 03 13
  const __m128i w1 = _mm_unpacklo_epi16(r2, r3);  // 20 30 21 31 22 32 23 33
  const __m128i w2 = _mm_unpacklo_epi16(r4, r5);  // 40 50 41 51 42 52 43 53
  // ... and columns 4..5 (the upper lanes carry the unused columns 6..7).
  const __m128i w3 = _mm_unpackhi_epi16(r0, r1);  // 04 14 05 15 xx xx xx xx
  const __m128i w4 = _mm_unpackhi_epi16(r2, r3);  // 24 34 25 35 xx xx xx xx
  const __m128i w5 = _mm_unpackhi_epi16(r4, r5);  // 44 54 45 55 xx xx xx xx

  __m128i ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  *d0 = _mm_unpacklo_epi64(ww0, w2);         // 00 10 20 30 40 50 41 51
  // unpackhi takes the HIGH 64 bits of its second operand, so w2 has to be
  // shifted LEFT by two lanes to bring (41 51) to the bottom of the high half.
  // A right shift here would deliver (43 53) and corrupt column 1.
  *d1 = _mm_unpackhi_epi64(ww0,
                           _mm_slli_si128(w2, 4));  // 01 11 21 31 41 51 42 52

  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  *d2 = _mm_unpacklo_epi64(ww0,
                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 43 53
  *d3 = _mm_unpackhi_epi64(ww0,
                           _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53 00 00

  ww0 = _mm_unpacklo_epi32(w3, w4);   // 04 14 24 34 05 15 25 35
  *d4 = _mm_unpacklo_epi64(ww0, w5);  // 04 14 24 34 44 54 45 55
  *d5 = _mm_unpackhi_epi64(ww0,
                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
}
61 | | |
// Transposes the left 4x4 corner (columns 0..3) of four rows of 16-bit values.
//   in : x0..x3 hold rows 0..3; only lanes 0..3 of each are read.
//   out: d0..d3 hold columns 0..3 in lanes 0..3; lanes 4..7 are zero.
static inline void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
                                                    __m128i *x2, __m128i *x3,
                                                    __m128i *d0, __m128i *d1,
                                                    __m128i *d2, __m128i *d3) {
  // 16-bit interleave of rows (0,1) and (2,3).
  // rows01: 00 10 01 11 02 12 03 13
  // rows23: 20 30 21 31 22 32 23 33
  const __m128i rows01 = _mm_unpacklo_epi16(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi16(*x2, *x3);

  // 32-bit interleave gathers two complete columns per register.
  // cols01: 00 10 20 30 01 11 21 31
  // cols23: 02 12 22 32 03 13 23 33
  const __m128i cols01 = _mm_unpacklo_epi32(rows01, rows23);
  const __m128i cols23 = _mm_unpackhi_epi32(rows01, rows23);

  // Split each register into its two columns, zero-filling the upper half.
  *d0 = _mm_move_epi64(cols01);     // 00 10 20 30 00 00 00 00
  *d1 = _mm_srli_si128(cols01, 8);  // 01 11 21 31 00 00 00 00
  *d2 = _mm_move_epi64(cols23);     // 02 12 22 32 00 00 00 00
  *d3 = _mm_srli_si128(cols23, 8);  // 03 13 23 33 00 00 00 00
}
80 | | |
// Transposes the right 4x4 corner (columns 4..7) of four rows of 16-bit
// values.
//   in : x0..x3 hold rows 0..3; only lanes 4..7 of each are read.
//   out: d4..d7 hold columns 4..7 in lanes 0..3; lanes 4..7 are zero.
static inline void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
                                                     __m128i *x2, __m128i *x3,
                                                     __m128i *d4, __m128i *d5,
                                                     __m128i *d6, __m128i *d7) {
  // 16-bit interleave of the upper halves of rows (0,1) and (2,3).
  // rows01: 04 14 05 15 06 16 07 17
  // rows23: 24 34 25 35 26 36 27 37
  const __m128i rows01 = _mm_unpackhi_epi16(*x0, *x1);
  const __m128i rows23 = _mm_unpackhi_epi16(*x2, *x3);

  // 32-bit interleave gathers two complete columns per register.
  // cols45: 04 14 24 34 05 15 25 35
  // cols67: 06 16 26 36 07 17 27 37
  const __m128i cols45 = _mm_unpacklo_epi32(rows01, rows23);
  const __m128i cols67 = _mm_unpackhi_epi32(rows01, rows23);

  // Split each register into its two columns, zero-filling the upper half.
  *d4 = _mm_move_epi64(cols45);     // 04 14 24 34 00 00 00 00
  *d5 = _mm_srli_si128(cols45, 8);  // 05 15 25 35 00 00 00 00
  *d6 = _mm_move_epi64(cols67);     // 06 16 26 36 00 00 00 00
  *d7 = _mm_srli_si128(cols67, 8);  // 07 17 27 37 00 00 00 00
}
99 | | |
// Transposes four rows of eight 16-bit values into eight columns of four.
//
//   in : x0 00 01 02 03 04 05 06 07
//        x1 10 11 12 13 14 15 16 17
//        x2 20 21 22 23 24 25 26 27
//        x3 30 31 32 33 34 35 36 37
//   out: d0 00 10 20 30 xx xx xx xx
//        d1 01 11 21 31 xx xx xx xx
//        d2 02 12 22 32 xx xx xx xx
//        d3 03 13 23 33 xx xx xx xx
//        d4 04 14 24 34 xx xx xx xx
//        d5 05 15 25 35 xx xx xx xx
//        d6 06 16 26 36 xx xx xx xx
//        d7 07 17 27 37 xx xx xx xx
//
// The output pointers (d) must not alias the input pointers (x): the inputs
// are not cached locally and are read again by the second helper call, after
// d0..d3 have already been written.
static inline void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
                                                __m128i *x2, __m128i *x3,
                                                __m128i *d0, __m128i *d1,
                                                __m128i *d2, __m128i *d3,
                                                __m128i *d4, __m128i *d5,
                                                __m128i *d6, __m128i *d7) {
  // Columns 0..3 first, then columns 4..7.
  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
}
125 | | |
// Produces columns 0..3 of the transpose of an 8x8 block of 16-bit values.
//   in : x0..x7 hold rows 0..7 (xR = R0 R1 R2 R3 R4 R5 R6 R7); only lanes
//        0..3 of each row are read.
//   out: dC = 0C 1C 2C 3C 4C 5C 6C 7C for C = 0..3.
static inline void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
                                                __m128i *x2, __m128i *x3,
                                                __m128i *x4, __m128i *x5,
                                                __m128i *x6, __m128i *x7,
                                                __m128i *d0, __m128i *d1,
                                                __m128i *d2, __m128i *d3) {
  // Stage 1: 16-bit interleave of adjacent row pairs (left half of each row).
  const __m128i r01 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  const __m128i r23 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  const __m128i r45 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
  const __m128i r67 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73

  // Stage 2: 32-bit interleave; each register now holds half of two columns.
  const __m128i top01 = _mm_unpacklo_epi32(r01, r23);  // 00 10 20 30 01 11 21 31
  const __m128i bot01 = _mm_unpacklo_epi32(r45, r67);  // 40 50 60 70 41 51 61 71
  const __m128i top23 = _mm_unpackhi_epi32(r01, r23);  // 02 12 22 32 03 13 23 33
  const __m128i bot23 = _mm_unpackhi_epi32(r45, r67);  // 42 52 62 72 43 53 63 73

  // Stage 3: 64-bit interleave joins the top and bottom halves of a column.
  *d0 = _mm_unpacklo_epi64(top01, bot01);  // 00 10 20 30 40 50 60 70
  *d1 = _mm_unpackhi_epi64(top01, bot01);  // 01 11 21 31 41 51 61 71
  *d2 = _mm_unpacklo_epi64(top23, bot23);  // 02 12 22 32 42 52 62 72
  *d3 = _mm_unpackhi_epi64(top23, bot23);  // 03 13 23 33 43 53 63 73
}
159 | | |
// Produces columns 4..7 of the transpose of an 8x8 block of 16-bit values.
//   in : x0..x7 hold rows 0..7 (xR = R0 R1 R2 R3 R4 R5 R6 R7); only lanes
//        4..7 of each row are read.
//   out: dC = 0C 1C 2C 3C 4C 5C 6C 7C for C = 4..7.
static inline void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
                                                 __m128i *x2, __m128i *x3,
                                                 __m128i *x4, __m128i *x5,
                                                 __m128i *x6, __m128i *x7,
                                                 __m128i *d4, __m128i *d5,
                                                 __m128i *d6, __m128i *d7) {
  // Stage 1: 16-bit interleave of adjacent row pairs (right half of each row).
  const __m128i r01 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
  const __m128i r23 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
  const __m128i r45 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
  const __m128i r67 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77

  // Stage 2: 32-bit interleave; each register now holds half of two columns.
  const __m128i top45 = _mm_unpacklo_epi32(r01, r23);  // 04 14 24 34 05 15 25 35
  const __m128i bot45 = _mm_unpacklo_epi32(r45, r67);  // 44 54 64 74 45 55 65 75
  const __m128i top67 = _mm_unpackhi_epi32(r01, r23);  // 06 16 26 36 07 17 27 37
  const __m128i bot67 = _mm_unpackhi_epi32(r45, r67);  // 46 56 66 76 47 57 67 77

  // Stage 3: 64-bit interleave joins the top and bottom halves of a column.
  *d4 = _mm_unpacklo_epi64(top45, bot45);  // 04 14 24 34 44 54 64 74
  *d5 = _mm_unpackhi_epi64(top45, bot45);  // 05 15 25 35 45 55 65 75
  *d6 = _mm_unpacklo_epi64(top67, bot67);  // 06 16 26 36 46 56 66 76
  *d7 = _mm_unpackhi_epi64(top67, bot67);  // 07 17 27 37 47 57 67 77
}
192 | | |
// Full 8x8 transpose of 16-bit values: rows come in through x0..x7, columns
// go out through d0..d7.
//
// The output pointers (d) must not alias the input pointers (x): the inputs
// are not cached locally and are read again by the second helper call, after
// d0..d3 have already been written.
static inline void highbd_transpose8x8_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    __m128i *d7) {
  // Columns 0..3 first, then columns 4..7.
  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
}
203 | | |
// Runs two independent 8x8 transposes of 16-bit values: one on element [0]
// and one on element [1] of every argument. Each of x0..x7 and d0..d7 is
// therefore treated as pointing to two consecutive __m128i values.
//
// The output arrays (d) must be distinct from the input arrays (x); the
// inputs are not cached locally.
static inline void highbd_transpose8x16_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    __m128i *d7) {
  for (int half = 0; half < 2; ++half) {
    highbd_transpose8x8_sse2(x0 + half, x1 + half, x2 + half, x3 + half,
                             x4 + half, x5 + half, x6 + half, x7 + half,
                             d0 + half, d1 + half, d2 + half, d3 + half,
                             d4 + half, d5 + half, d6 + half, d7 + half);
  }
}
217 | | |
218 | | // Low bit depth functions |
// 8-bit variant: transposes columns 0..3 of four rows of bytes.
//
//   in : x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
//        x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
//        x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
//        x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
//   out: d0 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
//        d1 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
//        d2 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
//        d3 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
//
// Only the low four bytes of each output are the requested column. The
// remaining bytes hold the following columns (then zeros shifted in) and
// should be ignored.
static inline void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
                                             __m128i *x2, __m128i *x3,
                                             __m128i *d0, __m128i *d1,
                                             __m128i *d2, __m128i *d3) {
  // Byte interleave of rows (0,1) and (2,3).
  // rows01: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  // rows23: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);

  // 16-bit interleave packs columns 0..3 back to back:
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  *d0 = _mm_unpacklo_epi16(rows01, rows23);

  // Slide the packed register down one column (4 bytes) at a time.
  *d1 = _mm_srli_si128(*d0, 4);   // 01 11 21 31 ...
  *d2 = _mm_srli_si128(*d0, 8);   // 02 12 22 32 ...
  *d3 = _mm_srli_si128(*d0, 12);  // 03 13 23 33 ...
}
251 | | |
// 8-bit variant: transposes four rows of eight bytes into eight columns of
// four bytes.
//
//   in : x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
//        x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
//        x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
//        x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
//   out: d0 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
//        d1 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
//        d2 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
//        d3 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
//        d4 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
//        d5 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
//        d6 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
//        d7 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
//
// Only the low four bytes of each output are the requested column. The
// remaining bytes hold the following columns (then zeros shifted in) and
// should be ignored.
static inline void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                         __m128i *x3, __m128i *d0, __m128i *d1,
                                         __m128i *d2, __m128i *d3, __m128i *d4,
                                         __m128i *d5, __m128i *d6,
                                         __m128i *d7) {
  // Byte interleave of rows (0,1) and (2,3).
  // rows01: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  // rows23: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);

  // 16-bit interleave packs four columns back to back in each register.
  // cols0123: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  // cols4567: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  const __m128i cols0123 = _mm_unpacklo_epi16(rows01, rows23);
  const __m128i cols4567 = _mm_unpackhi_epi16(rows01, rows23);

  // Slide each packed register down one column (4 bytes) at a time.
  *d0 = cols0123;                      // 00 10 20 30 ...
  *d1 = _mm_srli_si128(cols0123, 4);   // 01 11 21 31 ...
  *d2 = _mm_srli_si128(cols0123, 8);   // 02 12 22 32 ...
  *d3 = _mm_srli_si128(cols0123, 12);  // 03 13 23 33 ...

  *d4 = cols4567;                      // 04 14 24 34 ...
  *d5 = _mm_srli_si128(cols4567, 4);   // 05 15 25 35 ...
  *d6 = _mm_srli_si128(cols4567, 8);   // 06 16 26 36 ...
  *d7 = _mm_srli_si128(cols4567, 12);  // 07 17 27 37 ...
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose4x8_8x4_sse2 |
300 | | |
// Transposes the low 8 bytes of eight rows (x0..x7) and writes the first four
// transposed rows to d0..d3. Every byte interleave below is an unpacklo, so
// only the low 64 bits of each input register are read.
// Each output carries its 8-byte row in the low half. d0 and d2 still hold the
// following row in their high half, while d1 and d3 come from an 8-byte right
// shift and therefore have a zero-filled high half.
301 | | static inline void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, |
302 | | __m128i *x3, __m128i *x4, __m128i *x5, |
303 | | __m128i *x6, __m128i *x7, __m128i *d0, |
304 | | __m128i *d1, __m128i *d2, |
305 | 536k | __m128i *d3) { |
306 | | // input (low 8 bytes of each register) |
307 | | // x0 00 01 02 03 04 05 06 07 |
308 | | // x1 10 11 12 13 14 15 16 17 |
309 | | // x2 20 21 22 23 24 25 26 27 |
310 | | // x3 30 31 32 33 34 35 36 37 |
311 | | // x4 40 41 42 43 44 45 46 47 |
312 | | // x5 50 51 52 53 54 55 56 57 |
313 | | // x6 60 61 62 63 64 65 66 67 |
314 | | // x7 70 71 72 73 74 75 76 77 |
315 | | // output (xx = bytes beyond the 8-byte transposed row) |
316 | | // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx |
317 | | // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx |
318 | | // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx |
319 | | // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx |
320 | | |
321 | 536k | __m128i w0, w1, w2, w3, w4, w5; |
322 | | |
323 | 536k | w0 = _mm_unpacklo_epi8( |
324 | 536k | *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 |
325 | | |
326 | 536k | w1 = _mm_unpacklo_epi8( |
327 | 536k | *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 |
328 | | |
329 | 536k | w2 = _mm_unpacklo_epi8( |
330 | 536k | *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 |
331 | | |
332 | 536k | w3 = _mm_unpacklo_epi8( |
333 | 536k | *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 |
334 | | |
335 | 536k | w4 = _mm_unpacklo_epi16( |
336 | 536k | w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 |
337 | 536k | w5 = _mm_unpacklo_epi16( |
338 | 536k | w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 |
339 | | |
340 | 536k | *d0 = _mm_unpacklo_epi32( |
341 | 536k | w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 |
// d1 is the high half of d0 moved into the low half: 01 11 21 31 41 51 61 71
342 | 536k | *d1 = _mm_srli_si128(*d0, 8); |
343 | 536k | *d2 = _mm_unpackhi_epi32( |
344 | 536k | w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 |
// d3 is the high half of d2 moved into the low half: 03 13 23 33 43 53 63 73
345 | 536k | *d3 = _mm_srli_si128(*d2, 8); |
346 | 536k | } loopfilter_sse2.c:transpose8x8_low_sse2 Line | Count | Source | 305 | 495k | __m128i *d3) { | 306 | | // input | 307 | | // x0 00 01 02 03 04 05 06 07 | 308 | | // x1 10 11 12 13 14 15 16 17 | 309 | | // x2 20 21 22 23 24 25 26 27 | 310 | | // x3 30 31 32 33 34 35 36 37 | 311 | | // x4 40 41 42 43 44 45 46 47 | 312 | | // x5 50 51 52 53 54 55 56 57 | 313 | | // x6 60 61 62 63 64 65 66 67 | 314 | | // x7 70 71 72 73 74 75 76 77 | 315 | | // output | 316 | | // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx | 317 | | // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx | 318 | | // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx | 319 | | // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx | 320 | | | 321 | 495k | __m128i w0, w1, w2, w3, w4, w5; | 322 | | | 323 | 495k | w0 = _mm_unpacklo_epi8( | 324 | 495k | *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 | 325 | | | 326 | 495k | w1 = _mm_unpacklo_epi8( | 327 | 495k | *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 | 328 | | | 329 | 495k | w2 = _mm_unpacklo_epi8( | 330 | 495k | *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 | 331 | | | 332 | 495k | w3 = _mm_unpacklo_epi8( | 333 | 495k | *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 | 334 | | | 335 | 495k | w4 = _mm_unpacklo_epi16( | 336 | 495k | w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 | 337 | 495k | w5 = _mm_unpacklo_epi16( | 338 | 495k | w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 | 339 | | | 340 | 495k | *d0 = _mm_unpacklo_epi32( | 341 | 495k | w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 | 342 | 495k | *d1 = _mm_srli_si128(*d0, 8); | 343 | 495k | *d2 = _mm_unpackhi_epi32( | 344 | 495k | w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 | 345 | 495k | *d3 = _mm_srli_si128(*d2, 8); | 346 | 495k | } |
Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose8x8_low_sse2 Unexecuted instantiation: intrapred_sse4.c:transpose8x8_low_sse2 intrapred_avx2.c:transpose8x8_low_sse2 Line | Count | Source | 305 | 40.7k | __m128i *d3) { | 306 | | // input | 307 | | // x0 00 01 02 03 04 05 06 07 | 308 | | // x1 10 11 12 13 14 15 16 17 | 309 | | // x2 20 21 22 23 24 25 26 27 | 310 | | // x3 30 31 32 33 34 35 36 37 | 311 | | // x4 40 41 42 43 44 45 46 47 | 312 | | // x5 50 51 52 53 54 55 56 57 | 313 | | // x6 60 61 62 63 64 65 66 67 | 314 | | // x7 70 71 72 73 74 75 76 77 | 315 | | // output | 316 | | // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx | 317 | | // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx | 318 | | // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx | 319 | | // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx | 320 | | | 321 | 40.7k | __m128i w0, w1, w2, w3, w4, w5; | 322 | | | 323 | 40.7k | w0 = _mm_unpacklo_epi8( | 324 | 40.7k | *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 | 325 | | | 326 | 40.7k | w1 = _mm_unpacklo_epi8( | 327 | 40.7k | *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 | 328 | | | 329 | 40.7k | w2 = _mm_unpacklo_epi8( | 330 | 40.7k | *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 | 331 | | | 332 | 40.7k | w3 = _mm_unpacklo_epi8( | 333 | 40.7k | *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 | 334 | | | 335 | 40.7k | w4 = _mm_unpacklo_epi16( | 336 | 40.7k | w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 | 337 | 40.7k | w5 = _mm_unpacklo_epi16( | 338 | 40.7k | w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 | 339 | | | 340 | 40.7k | *d0 = _mm_unpacklo_epi32( | 341 | 40.7k | w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 | 342 | 40.7k | *d1 = _mm_srli_si128(*d0, 8); | 343 | 40.7k | *d2 = _mm_unpackhi_epi32( | 344 | 40.7k | w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 | 345 | 40.7k | *d3 = _mm_srli_si128(*d2, 8); | 346 
| 40.7k | } |
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose8x8_low_sse2 |
347 | | |
// Transposes an 8x8 block of bytes taken from the low 8 bytes of x0..x7
// (every byte interleave below is an unpacklo, so the high 8 bytes of the
// inputs are not read).
// The eight transposed rows are returned packed two per register: the low
// 8 bytes of d0d1 hold transposed row 0 and the high 8 bytes hold row 1;
// d2d3, d4d5 and d6d7 hold rows 2/3, 4/5 and 6/7 in the same way.
348 | | static inline void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, |
349 | | __m128i *x3, __m128i *x4, __m128i *x5, |
350 | | __m128i *x6, __m128i *x7, __m128i *d0d1, |
351 | | __m128i *d2d3, __m128i *d4d5, |
352 | 98.0k | __m128i *d6d7) { |
353 | 98.0k | __m128i w0, w1, w2, w3, w4, w5, w6, w7; |
354 | | // x0 00 01 02 03 04 05 06 07 |
355 | | // x1 10 11 12 13 14 15 16 17 |
356 | 98.0k | w0 = _mm_unpacklo_epi8( |
357 | 98.0k | *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 |
358 | | |
359 | | // x2 20 21 22 23 24 25 26 27 |
360 | | // x3 30 31 32 33 34 35 36 37 |
361 | 98.0k | w1 = _mm_unpacklo_epi8( |
362 | 98.0k | *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 |
363 | | |
364 | | // x4 40 41 42 43 44 45 46 47 |
365 | | // x5 50 51 52 53 54 55 56 57 |
366 | 98.0k | w2 = _mm_unpacklo_epi8( |
367 | 98.0k | *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 |
368 | | |
369 | | // x6 60 61 62 63 64 65 66 67 |
370 | | // x7 70 71 72 73 74 75 76 77 |
371 | 98.0k | w3 = _mm_unpacklo_epi8( |
372 | 98.0k | *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 |
373 | | |
// Low halves of the byte-interleaved pairs produce transposed rows 0..3.
374 | 98.0k | w4 = _mm_unpacklo_epi16( |
375 | 98.0k | w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 |
376 | 98.0k | w5 = _mm_unpacklo_epi16( |
377 | 98.0k | w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 |
378 | | |
379 | 98.0k | *d0d1 = _mm_unpacklo_epi32( |
380 | 98.0k | w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 |
381 | 98.0k | *d2d3 = _mm_unpackhi_epi32( |
382 | 98.0k | w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 |
383 | | |
// High halves of the byte-interleaved pairs produce transposed rows 4..7.
384 | 98.0k | w6 = _mm_unpackhi_epi16( |
385 | 98.0k | w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 |
386 | 98.0k | w7 = _mm_unpackhi_epi16( |
387 | 98.0k | w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 |
388 | | |
389 | 98.0k | *d4d5 = _mm_unpacklo_epi32( |
390 | 98.0k | w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 |
391 | 98.0k | *d6d7 = _mm_unpackhi_epi32( |
392 | 98.0k | w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 |
393 | 98.0k | } Unexecuted instantiation: loopfilter_sse2.c:transpose8x8_sse2 Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose8x8_sse2 Unexecuted instantiation: intrapred_sse4.c:transpose8x8_sse2 intrapred_avx2.c:transpose8x8_sse2 Line | Count | Source | 352 | 98.0k | __m128i *d6d7) { | 353 | 98.0k | __m128i w0, w1, w2, w3, w4, w5, w6, w7; | 354 | | // x0 00 01 02 03 04 05 06 07 | 355 | | // x1 10 11 12 13 14 15 16 17 | 356 | 98.0k | w0 = _mm_unpacklo_epi8( | 357 | 98.0k | *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 | 358 | | | 359 | | // x2 20 21 22 23 24 25 26 27 | 360 | | // x3 30 31 32 33 34 35 36 37 | 361 | 98.0k | w1 = _mm_unpacklo_epi8( | 362 | 98.0k | *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 | 363 | | | 364 | | // x4 40 41 42 43 44 45 46 47 | 365 | | // x5 50 51 52 53 54 55 56 57 | 366 | 98.0k | w2 = _mm_unpacklo_epi8( | 367 | 98.0k | *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 | 368 | | | 369 | | // x6 60 61 62 63 64 65 66 67 | 370 | | // x7 70 71 72 73 74 75 76 77 | 371 | 98.0k | w3 = _mm_unpacklo_epi8( | 372 | 98.0k | *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 | 373 | | | 374 | 98.0k | w4 = _mm_unpacklo_epi16( | 375 | 98.0k | w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 | 376 | 98.0k | w5 = _mm_unpacklo_epi16( | 377 | 98.0k | w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 | 378 | | | 379 | 98.0k | *d0d1 = _mm_unpacklo_epi32( | 380 | 98.0k | w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 | 381 | 98.0k | *d2d3 = _mm_unpackhi_epi32( | 382 | 98.0k | w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 | 383 | | | 384 | 98.0k | w6 = _mm_unpackhi_epi16( | 385 | 98.0k | w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 | 386 | 98.0k | w7 = _mm_unpackhi_epi16( | 387 | 98.0k | w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 | 388 | | | 389 | 98.0k | *d4d5 = _mm_unpacklo_epi32( | 390 | 98.0k | w6, w7); // 04 
14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 | 391 | 98.0k | *d6d7 = _mm_unpackhi_epi32( | 392 | 98.0k | w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 | 393 | 98.0k | } |
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose8x8_sse2 |
394 | | |
// Transposes a block of 16 rows by 8 byte columns into 8 rows of 16 bytes.
// Only the low 8 bytes of each of x0..x15 are read (every byte interleave
// below is an unpacklo).
// Output dK (K = 0..7) holds input column K: bytes 0..7 come from rows
// x0..x7 and bytes 8..15 come from rows x8..x15.
395 | | static inline void transpose16x8_8x16_sse2( |
396 | | __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, |
397 | | __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, |
398 | | __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, |
399 | | __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, |
400 | 195k | __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { |
401 | 195k | __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; |
402 | 195k | __m128i w10, w11, w12, w13, w14, w15; |
403 | | |
// Byte-interleave adjacent rows: w0..w3 cover rows 0..7.
404 | 195k | w0 = _mm_unpacklo_epi8(*x0, *x1); |
405 | 195k | w1 = _mm_unpacklo_epi8(*x2, *x3); |
406 | 195k | w2 = _mm_unpacklo_epi8(*x4, *x5); |
407 | 195k | w3 = _mm_unpacklo_epi8(*x6, *x7); |
408 | | |
// Same for rows 8..15: w8..w11.
409 | 195k | w8 = _mm_unpacklo_epi8(*x8, *x9); |
410 | 195k | w9 = _mm_unpacklo_epi8(*x10, *x11); |
411 | 195k | w10 = _mm_unpacklo_epi8(*x12, *x13); |
412 | 195k | w11 = _mm_unpacklo_epi8(*x14, *x15); |
413 | | |
// Low halves of the interleaved pairs carry input columns 0..3.
414 | 195k | w4 = _mm_unpacklo_epi16(w0, w1); |
415 | 195k | w5 = _mm_unpacklo_epi16(w2, w3); |
416 | 195k | w12 = _mm_unpacklo_epi16(w8, w9); |
417 | 195k | w13 = _mm_unpacklo_epi16(w10, w11); |
418 | | |
419 | 195k | w6 = _mm_unpacklo_epi32(w4, w5); |
420 | 195k | w7 = _mm_unpackhi_epi32(w4, w5); |
421 | 195k | w14 = _mm_unpacklo_epi32(w12, w13); |
422 | 195k | w15 = _mm_unpackhi_epi32(w12, w13); |
423 | | |
424 | | // Store first 4-line result: d0..d3 = input columns 0..3 |
425 | 195k | *d0 = _mm_unpacklo_epi64(w6, w14); |
426 | 195k | *d1 = _mm_unpackhi_epi64(w6, w14); |
427 | 195k | *d2 = _mm_unpacklo_epi64(w7, w15); |
428 | 195k | *d3 = _mm_unpackhi_epi64(w7, w15); |
429 | | |
// High halves of the interleaved pairs carry input columns 4..7.
430 | 195k | w4 = _mm_unpackhi_epi16(w0, w1); |
431 | 195k | w5 = _mm_unpackhi_epi16(w2, w3); |
432 | 195k | w12 = _mm_unpackhi_epi16(w8, w9); |
433 | 195k | w13 = _mm_unpackhi_epi16(w10, w11); |
434 | | |
435 | 195k | w6 = _mm_unpacklo_epi32(w4, w5); |
436 | 195k | w7 = _mm_unpackhi_epi32(w4, w5); |
437 | 195k | w14 = _mm_unpacklo_epi32(w12, w13); |
438 | 195k | w15 = _mm_unpackhi_epi32(w12, w13); |
439 | | |
440 | | // Store second 4-line result: d4..d7 = input columns 4..7 |
441 | 195k | *d4 = _mm_unpacklo_epi64(w6, w14); |
442 | 195k | *d5 = _mm_unpackhi_epi64(w6, w14); |
443 | 195k | *d6 = _mm_unpacklo_epi64(w7, w15); |
444 | 195k | *d7 = _mm_unpackhi_epi64(w7, w15); |
445 | 195k | } Unexecuted instantiation: loopfilter_sse2.c:transpose16x8_8x16_sse2 Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose16x8_8x16_sse2 Unexecuted instantiation: intrapred_sse4.c:transpose16x8_8x16_sse2 intrapred_avx2.c:transpose16x8_8x16_sse2 Line | Count | Source | 400 | 195k | __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { | 401 | 195k | __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; | 402 | 195k | __m128i w10, w11, w12, w13, w14, w15; | 403 | | | 404 | 195k | w0 = _mm_unpacklo_epi8(*x0, *x1); | 405 | 195k | w1 = _mm_unpacklo_epi8(*x2, *x3); | 406 | 195k | w2 = _mm_unpacklo_epi8(*x4, *x5); | 407 | 195k | w3 = _mm_unpacklo_epi8(*x6, *x7); | 408 | | | 409 | 195k | w8 = _mm_unpacklo_epi8(*x8, *x9); | 410 | 195k | w9 = _mm_unpacklo_epi8(*x10, *x11); | 411 | 195k | w10 = _mm_unpacklo_epi8(*x12, *x13); | 412 | 195k | w11 = _mm_unpacklo_epi8(*x14, *x15); | 413 | | | 414 | 195k | w4 = _mm_unpacklo_epi16(w0, w1); | 415 | 195k | w5 = _mm_unpacklo_epi16(w2, w3); | 416 | 195k | w12 = _mm_unpacklo_epi16(w8, w9); | 417 | 195k | w13 = _mm_unpacklo_epi16(w10, w11); | 418 | | | 419 | 195k | w6 = _mm_unpacklo_epi32(w4, w5); | 420 | 195k | w7 = _mm_unpackhi_epi32(w4, w5); | 421 | 195k | w14 = _mm_unpacklo_epi32(w12, w13); | 422 | 195k | w15 = _mm_unpackhi_epi32(w12, w13); | 423 | | | 424 | | // Store first 4-line result | 425 | 195k | *d0 = _mm_unpacklo_epi64(w6, w14); | 426 | 195k | *d1 = _mm_unpackhi_epi64(w6, w14); | 427 | 195k | *d2 = _mm_unpacklo_epi64(w7, w15); | 428 | 195k | *d3 = _mm_unpackhi_epi64(w7, w15); | 429 | | | 430 | 195k | w4 = _mm_unpackhi_epi16(w0, w1); | 431 | 195k | w5 = _mm_unpackhi_epi16(w2, w3); | 432 | 195k | w12 = _mm_unpackhi_epi16(w8, w9); | 433 | 195k | w13 = _mm_unpackhi_epi16(w10, w11); | 434 | | | 435 | 195k | w6 = _mm_unpacklo_epi32(w4, w5); | 436 | 195k | w7 = _mm_unpackhi_epi32(w4, w5); | 437 | 195k | w14 = _mm_unpacklo_epi32(w12, w13); | 438 | 195k | w15 = _mm_unpackhi_epi32(w12, w13); | 439 | | | 440 | | // Store 
second 4-line result | 441 | 195k | *d4 = _mm_unpacklo_epi64(w6, w14); | 442 | 195k | *d5 = _mm_unpackhi_epi64(w6, w14); | 443 | 195k | *d6 = _mm_unpacklo_epi64(w7, w15); | 444 | 195k | *d7 = _mm_unpackhi_epi64(w7, w15); | 445 | 195k | } |
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose16x8_8x16_sse2 |
446 | | |
// Transposes a block of 8 rows by 16 byte columns (all 16 bytes of x0..x7
// are read: unpacklo for columns 0..7, unpackhi for columns 8..15).
// NOTE: each output register pairs input column k (low 8 bytes) with input
// column k + 8 (high 8 bytes), not two adjacent columns:
//   d0d1   = column 0 | column 8      d8d9   = column 4 | column 12
//   d2d3   = column 1 | column 9      d10d11 = column 5 | column 13
//   d4d5   = column 2 | column 10     d12d13 = column 6 | column 14
//   d6d7   = column 3 | column 11     d14d15 = column 7 | column 15
// Each 8-byte column holds the bytes of rows x0..x7 in order.
447 | | static inline void transpose8x16_16x8_sse2( |
448 | | __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, |
449 | | __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, |
450 | | __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, |
451 | 26.2k | __m128i *d12d13, __m128i *d14d15) { |
452 | 26.2k | __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; |
453 | 26.2k | __m128i w10, w11, w12, w13, w14, w15; |
454 | | |
// w0..w3: byte-interleaved row pairs for input columns 0..7.
455 | 26.2k | w0 = _mm_unpacklo_epi8(*x0, *x1); |
456 | 26.2k | w1 = _mm_unpacklo_epi8(*x2, *x3); |
457 | 26.2k | w2 = _mm_unpacklo_epi8(*x4, *x5); |
458 | 26.2k | w3 = _mm_unpacklo_epi8(*x6, *x7); |
459 | | |
// w8..w11: byte-interleaved row pairs for input columns 8..15.
460 | 26.2k | w8 = _mm_unpackhi_epi8(*x0, *x1); |
461 | 26.2k | w9 = _mm_unpackhi_epi8(*x2, *x3); |
462 | 26.2k | w10 = _mm_unpackhi_epi8(*x4, *x5); |
463 | 26.2k | w11 = _mm_unpackhi_epi8(*x6, *x7); |
464 | | |
465 | 26.2k | w4 = _mm_unpacklo_epi16(w0, w1); |
466 | 26.2k | w5 = _mm_unpacklo_epi16(w2, w3); |
467 | 26.2k | w12 = _mm_unpacklo_epi16(w8, w9); |
468 | 26.2k | w13 = _mm_unpacklo_epi16(w10, w11); |
469 | | |
470 | 26.2k | w6 = _mm_unpacklo_epi32(w4, w5); |
471 | 26.2k | w7 = _mm_unpackhi_epi32(w4, w5); |
472 | 26.2k | w14 = _mm_unpacklo_epi32(w12, w13); |
473 | 26.2k | w15 = _mm_unpackhi_epi32(w12, w13); |
474 | | |
475 | | // Store first 4-line result: columns 0..3 (low halves) with columns 8..11 (high halves) |
476 | 26.2k | *d0d1 = _mm_unpacklo_epi64(w6, w14); |
477 | 26.2k | *d2d3 = _mm_unpackhi_epi64(w6, w14); |
478 | 26.2k | *d4d5 = _mm_unpacklo_epi64(w7, w15); |
479 | 26.2k | *d6d7 = _mm_unpackhi_epi64(w7, w15); |
480 | | |
481 | 26.2k | w4 = _mm_unpackhi_epi16(w0, w1); |
482 | 26.2k | w5 = _mm_unpackhi_epi16(w2, w3); |
483 | 26.2k | w12 = _mm_unpackhi_epi16(w8, w9); |
484 | 26.2k | w13 = _mm_unpackhi_epi16(w10, w11); |
485 | | |
486 | 26.2k | w6 = _mm_unpacklo_epi32(w4, w5); |
487 | 26.2k | w7 = _mm_unpackhi_epi32(w4, w5); |
488 | 26.2k | w14 = _mm_unpacklo_epi32(w12, w13); |
489 | 26.2k | w15 = _mm_unpackhi_epi32(w12, w13); |
490 | | |
491 | | // Store second 4-line result: columns 4..7 (low halves) with columns 12..15 (high halves) |
492 | 26.2k | *d8d9 = _mm_unpacklo_epi64(w6, w14); |
493 | 26.2k | *d10d11 = _mm_unpackhi_epi64(w6, w14); |
494 | 26.2k | *d12d13 = _mm_unpacklo_epi64(w7, w15); |
495 | 26.2k | *d14d15 = _mm_unpackhi_epi64(w7, w15); |
496 | 26.2k | } Unexecuted instantiation: loopfilter_sse2.c:transpose8x16_16x8_sse2 Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose8x16_16x8_sse2 Unexecuted instantiation: intrapred_sse4.c:transpose8x16_16x8_sse2 intrapred_avx2.c:transpose8x16_16x8_sse2 Line | Count | Source | 451 | 26.2k | __m128i *d12d13, __m128i *d14d15) { | 452 | 26.2k | __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; | 453 | 26.2k | __m128i w10, w11, w12, w13, w14, w15; | 454 | | | 455 | 26.2k | w0 = _mm_unpacklo_epi8(*x0, *x1); | 456 | 26.2k | w1 = _mm_unpacklo_epi8(*x2, *x3); | 457 | 26.2k | w2 = _mm_unpacklo_epi8(*x4, *x5); | 458 | 26.2k | w3 = _mm_unpacklo_epi8(*x6, *x7); | 459 | | | 460 | 26.2k | w8 = _mm_unpackhi_epi8(*x0, *x1); | 461 | 26.2k | w9 = _mm_unpackhi_epi8(*x2, *x3); | 462 | 26.2k | w10 = _mm_unpackhi_epi8(*x4, *x5); | 463 | 26.2k | w11 = _mm_unpackhi_epi8(*x6, *x7); | 464 | | | 465 | 26.2k | w4 = _mm_unpacklo_epi16(w0, w1); | 466 | 26.2k | w5 = _mm_unpacklo_epi16(w2, w3); | 467 | 26.2k | w12 = _mm_unpacklo_epi16(w8, w9); | 468 | 26.2k | w13 = _mm_unpacklo_epi16(w10, w11); | 469 | | | 470 | 26.2k | w6 = _mm_unpacklo_epi32(w4, w5); | 471 | 26.2k | w7 = _mm_unpackhi_epi32(w4, w5); | 472 | 26.2k | w14 = _mm_unpacklo_epi32(w12, w13); | 473 | 26.2k | w15 = _mm_unpackhi_epi32(w12, w13); | 474 | | | 475 | | // Store first 4-line result | 476 | 26.2k | *d0d1 = _mm_unpacklo_epi64(w6, w14); | 477 | 26.2k | *d2d3 = _mm_unpackhi_epi64(w6, w14); | 478 | 26.2k | *d4d5 = _mm_unpacklo_epi64(w7, w15); | 479 | 26.2k | *d6d7 = _mm_unpackhi_epi64(w7, w15); | 480 | | | 481 | 26.2k | w4 = _mm_unpackhi_epi16(w0, w1); | 482 | 26.2k | w5 = _mm_unpackhi_epi16(w2, w3); | 483 | 26.2k | w12 = _mm_unpackhi_epi16(w8, w9); | 484 | 26.2k | w13 = _mm_unpackhi_epi16(w10, w11); | 485 | | | 486 | 26.2k | w6 = _mm_unpacklo_epi32(w4, w5); | 487 | 26.2k | w7 = _mm_unpackhi_epi32(w4, w5); | 488 | 26.2k | w14 = _mm_unpacklo_epi32(w12, w13); | 489 | 26.2k | w15 = _mm_unpackhi_epi32(w12, w13); | 490 | | | 
491 | | // Store second 4-line result | 492 | 26.2k | *d8d9 = _mm_unpacklo_epi64(w6, w14); | 493 | 26.2k | *d10d11 = _mm_unpackhi_epi64(w6, w14); | 494 | 26.2k | *d12d13 = _mm_unpacklo_epi64(w7, w15); | 495 | 26.2k | *d14d15 = _mm_unpackhi_epi64(w7, w15); | 496 | 26.2k | } |
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose8x16_16x8_sse2 |
497 | | |
// Transposes two 8x8 byte blocks from memory and writes them side by side.
//   in0, in1: start of the two source blocks; eight rows of 8 bytes are
//             loaded from each, at offsets 0, in_p, ..., 7 * in_p.
//   in_p:     byte stride between source rows (shared by in0 and in1).
//   out:      destination; eight rows of 16 bytes are written with unaligned
//             stores at offsets 0, out_p, ..., 7 * out_p.
//   out_p:    byte stride between destination rows.
// Destination row j holds column j of the in0 block in bytes 0..7 and
// column j of the in1 block in bytes 8..15.
498 | | static inline void transpose_16x8(unsigned char *in0, unsigned char *in1, |
499 | 0 | int in_p, unsigned char *out, int out_p) { |
500 | 0 | __m128i x0, x1, x2, x3, x4, x5, x6, x7; |
501 | 0 | __m128i x8, x9, x10, x11, x12, x13, x14, x15; |
502 | |
|
// in0 block: load row pairs and byte-interleave them (results reuse x0..x3).
503 | 0 | x0 = _mm_loadl_epi64((__m128i *)in0); |
504 | 0 | x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); |
505 | 0 | x0 = _mm_unpacklo_epi8(x0, x1); |
506 | |
|
507 | 0 | x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); |
508 | 0 | x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); |
509 | 0 | x1 = _mm_unpacklo_epi8(x2, x3); |
510 | |
|
511 | 0 | x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); |
512 | 0 | x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); |
513 | 0 | x2 = _mm_unpacklo_epi8(x4, x5); |
514 | |
|
515 | 0 | x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); |
516 | 0 | x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); |
517 | 0 | x3 = _mm_unpacklo_epi8(x6, x7); |
518 | 0 | x4 = _mm_unpacklo_epi16(x0, x1); |
519 | |
|
// in1 block: same pattern (results reuse x8..x11).
520 | 0 | x8 = _mm_loadl_epi64((__m128i *)in1); |
521 | 0 | x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); |
522 | 0 | x8 = _mm_unpacklo_epi8(x8, x9); |
523 | 0 | x5 = _mm_unpacklo_epi16(x2, x3); |
524 | |
|
525 | 0 | x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); |
526 | 0 | x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); |
527 | 0 | x9 = _mm_unpacklo_epi8(x10, x11); |
528 | |
|
529 | 0 | x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); |
530 | 0 | x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); |
531 | 0 | x10 = _mm_unpacklo_epi8(x12, x13); |
532 | 0 | x12 = _mm_unpacklo_epi16(x8, x9); |
533 | |
|
534 | 0 | x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); |
535 | 0 | x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); |
536 | 0 | x11 = _mm_unpacklo_epi8(x14, x15); |
537 | 0 | x13 = _mm_unpacklo_epi16(x10, x11); |
538 | |
|
539 | 0 | x6 = _mm_unpacklo_epi32(x4, x5); |
540 | 0 | x7 = _mm_unpackhi_epi32(x4, x5); |
541 | 0 | x14 = _mm_unpacklo_epi32(x12, x13); |
542 | 0 | x15 = _mm_unpackhi_epi32(x12, x13); |
543 | | |
544 | | // Store first 4-line result: destination rows 0..3 (source columns 0..3) |
545 | 0 | _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); |
546 | 0 | _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); |
547 | 0 | _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); |
548 | 0 | _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); |
549 | |
|
// The high halves of the byte-interleaved pairs carry source columns 4..7.
550 | 0 | x4 = _mm_unpackhi_epi16(x0, x1); |
551 | 0 | x5 = _mm_unpackhi_epi16(x2, x3); |
552 | 0 | x12 = _mm_unpackhi_epi16(x8, x9); |
553 | 0 | x13 = _mm_unpackhi_epi16(x10, x11); |
554 | |
|
555 | 0 | x6 = _mm_unpacklo_epi32(x4, x5); |
556 | 0 | x7 = _mm_unpackhi_epi32(x4, x5); |
557 | 0 | x14 = _mm_unpacklo_epi32(x12, x13); |
558 | 0 | x15 = _mm_unpackhi_epi32(x12, x13); |
559 | | |
560 | | // Store second 4-line result: destination rows 4..7 (source columns 4..7) |
561 | 0 | _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); |
562 | 0 | _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); |
563 | 0 | _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); |
564 | 0 | _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); |
565 | 0 | } Unexecuted instantiation: loopfilter_sse2.c:transpose_16x8 Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose_16x8 Unexecuted instantiation: intrapred_sse4.c:transpose_16x8 Unexecuted instantiation: intrapred_avx2.c:transpose_16x8 Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose_16x8 |
566 | | |
// Transposes a block of 8 rows by 16 byte columns from memory.
//   src:   source; eight rows of 16 bytes are read with unaligned loads at
//          offsets 0, in_p, ..., 7 * in_p.
//   in_p:  byte stride between source rows.
//   dst:   destination; sixteen rows of 8 bytes are written at offsets
//          0, out_p, ..., 15 * out_p via mm_storelu / mm_storehu.
//   out_p: byte stride between destination rows.
// Destination row k holds source column k (bytes of source rows 0..7).
// Diagram notation: the letter names the source column (a..h = columns 0..7,
// A..H = columns 8..15) and the digit names the source row. From the second
// diagram on, each diagram line shows one 8-byte half taken from two
// different registers (left: from an unpacklo result, right: from the
// matching unpackhi result), not the 16 bytes of a single register.
567 | | static inline void transpose_16x8_to_8x16(unsigned char *src, int in_p, |
568 | 0 | unsigned char *dst, int out_p) { |
569 | | // a0 b0 c0 d0 e0 f0 g0 h0 A0 B0 C0 D0 E0 F0 G0 H0 |
570 | | // a1 b1 c1 d1 e1 f1 g1 h1 A1 B1 C1 D1 E1 F1 G1 H1 |
571 | | // a2 b2 c2 d2 e2 f2 g2 h2 A2 B2 C2 D2 E2 F2 G2 H2 |
572 | | // a3 b3 c3 d3 e3 f3 g3 h3 A3 B3 C3 D3 E3 F3 G3 H3 |
573 | | // a4 b4 c4 d4 e4 f4 g4 h4 A4 B4 C4 D4 E4 F4 G4 H4 |
574 | | // a5 b5 c5 d5 e5 f5 g5 h5 A5 B5 C5 D5 E5 F5 G5 H5 |
575 | | // a6 b6 c6 d6 e6 f6 g6 h6 A6 B6 C6 D6 E6 F6 G6 H6 |
576 | | // a7 b7 c7 d7 e7 f7 g7 h7 A7 B7 C7 D7 E7 F7 G7 H7 |
577 | 0 | const __m128i x0 = _mm_loadu_si128((__m128i *)(src)); |
578 | 0 | const __m128i x1 = _mm_loadu_si128((__m128i *)(src + (1 * in_p))); |
579 | 0 | const __m128i x2 = _mm_loadu_si128((__m128i *)(src + (2 * in_p))); |
580 | 0 | const __m128i x3 = _mm_loadu_si128((__m128i *)(src + (3 * in_p))); |
581 | 0 | const __m128i x4 = _mm_loadu_si128((__m128i *)(src + (4 * in_p))); |
582 | 0 | const __m128i x5 = _mm_loadu_si128((__m128i *)(src + (5 * in_p))); |
583 | 0 | const __m128i x6 = _mm_loadu_si128((__m128i *)(src + (6 * in_p))); |
584 | 0 | const __m128i x7 = _mm_loadu_si128((__m128i *)(src + (7 * in_p))); |
585 | | |
586 | | // a0 a1 b0 b1 c0 c1 d0 d1 A0 A1 B0 B1 C0 C1 D0 D1 |
587 | | // e0 e1 f0 f1 g0 g1 h0 h1 E0 E1 F0 F1 G0 G1 H0 H1 |
588 | | // a2 a3 b2 b3 c2 c3 d2 d3 A2 A3 B2 B3 C2 C3 D2 D3 |
589 | | // e2 e3 f2 f3 g2 g3 h2 h3 E2 E3 F2 F3 G2 G3 H2 H3 |
590 | | // a4 a5 b4 b5 c4 c5 d4 d5 A4 A5 B4 B5 C4 C5 D4 D5 |
591 | | // e4 e5 f4 f5 g4 g5 h4 h5 E4 E5 F4 F5 G4 G5 H4 H5 |
592 | | // a6 a7 b6 b7 c6 c7 d6 d7 A6 A7 B6 B7 C6 C7 D6 D7 |
593 | | // e6 e7 f6 f7 g6 g7 h6 h7 E6 E7 F6 F7 G6 G7 H6 H7 |
594 | 0 | const __m128i x_s10 = _mm_unpacklo_epi8(x0, x1); |
595 | 0 | const __m128i x_s11 = _mm_unpackhi_epi8(x0, x1); |
596 | 0 | const __m128i x_s12 = _mm_unpacklo_epi8(x2, x3); |
597 | 0 | const __m128i x_s13 = _mm_unpackhi_epi8(x2, x3); |
598 | 0 | const __m128i x_s14 = _mm_unpacklo_epi8(x4, x5); |
599 | 0 | const __m128i x_s15 = _mm_unpackhi_epi8(x4, x5); |
600 | 0 | const __m128i x_s16 = _mm_unpacklo_epi8(x6, x7); |
601 | 0 | const __m128i x_s17 = _mm_unpackhi_epi8(x6, x7); |
602 | | |
603 | | // a0 a1 a2 a3 b0 b1 b2 b3 | A0 A1 A2 A3 B0 B1 B2 B3 |
604 | | // c0 c1 c2 c3 d0 d1 d2 d3 | C0 C1 C2 C3 D0 D1 D2 D3 |
605 | | // e0 e1 e2 e3 f0 f1 f2 f3 | E0 E1 E2 E3 F0 F1 F2 F3 |
606 | | // g0 g1 g2 g3 h0 h1 h2 h3 | G0 G1 G2 G3 H0 H1 H2 H3 |
607 | | // a4 a5 a6 a7 b4 b5 b6 b7 | A4 A5 A6 A7 B4 B5 B6 B7 |
608 | | // c4 c5 c6 c7 d4 d5 d6 d7 | C4 C5 C6 C7 D4 D5 D6 D7 |
609 | | // e4 e5 e6 e7 f4 f5 f6 f7 | E4 E5 E6 E7 F4 F5 F6 F7 |
610 | | // g4 g5 g6 g7 h4 h5 h6 h7 | G4 G5 G6 G7 H4 H5 H6 H7 |
611 | 0 | const __m128i x_s20 = _mm_unpacklo_epi16(x_s10, x_s12); |
612 | 0 | const __m128i x_s21 = _mm_unpackhi_epi16(x_s10, x_s12); |
613 | 0 | const __m128i x_s22 = _mm_unpacklo_epi16(x_s11, x_s13); |
614 | 0 | const __m128i x_s23 = _mm_unpackhi_epi16(x_s11, x_s13); |
615 | 0 | const __m128i x_s24 = _mm_unpacklo_epi16(x_s14, x_s16); |
616 | 0 | const __m128i x_s25 = _mm_unpackhi_epi16(x_s14, x_s16); |
617 | 0 | const __m128i x_s26 = _mm_unpacklo_epi16(x_s15, x_s17); |
618 | 0 | const __m128i x_s27 = _mm_unpackhi_epi16(x_s15, x_s17); |
619 | | |
620 | | // a0 a1 a2 a3 a4 a5 a6 a7 | A0 A1 A2 A3 A4 A5 A6 A7 |
621 | | // b0 b1 b2 b3 b4 b5 b6 b7 | B0 B1 B2 B3 B4 B5 B6 B7 |
622 | | // c0 c1 c2 c3 c4 c5 c6 c7 | C0 C1 C2 C3 C4 C5 C6 C7 |
623 | | // d0 d1 d2 d3 d4 d5 d6 d7 | D0 D1 D2 D3 D4 D5 D6 D7 |
624 | | // e0 e1 e2 e3 e4 e5 e6 e7 | E0 E1 E2 E3 E4 E5 E6 E7 |
625 | | // f0 f1 f2 f3 f4 f5 f6 f7 | F0 F1 F2 F3 F4 F5 F6 F7 |
626 | | // g0 g1 g2 g3 g4 g5 g6 g7 | G0 G1 G2 G3 G4 G5 G6 G7 |
627 | | // h0 h1 h2 h3 h4 h5 h6 h7 | H0 H1 H2 H3 H4 H5 H6 H7 |
628 | 0 | const __m128i x_s30 = _mm_unpacklo_epi32(x_s20, x_s24); |
629 | 0 | const __m128i x_s31 = _mm_unpackhi_epi32(x_s20, x_s24); |
630 | 0 | const __m128i x_s32 = _mm_unpacklo_epi32(x_s21, x_s25); |
631 | 0 | const __m128i x_s33 = _mm_unpackhi_epi32(x_s21, x_s25); |
632 | 0 | const __m128i x_s34 = _mm_unpacklo_epi32(x_s22, x_s26); |
633 | 0 | const __m128i x_s35 = _mm_unpackhi_epi32(x_s22, x_s26); |
634 | 0 | const __m128i x_s36 = _mm_unpacklo_epi32(x_s23, x_s27); |
635 | 0 | const __m128i x_s37 = _mm_unpackhi_epi32(x_s23, x_s27); |
636 | |
|
// Each x_s3N register holds two consecutive source columns (low half, then
// high half); write them out as two 8-byte destination rows.
637 | 0 | mm_storelu(dst, x_s30); |
638 | 0 | mm_storehu(dst + (1 * out_p), x_s30); |
639 | 0 | mm_storelu(dst + (2 * out_p), x_s31); |
640 | 0 | mm_storehu(dst + (3 * out_p), x_s31); |
641 | 0 | mm_storelu(dst + (4 * out_p), x_s32); |
642 | 0 | mm_storehu(dst + (5 * out_p), x_s32); |
643 | 0 | mm_storelu(dst + (6 * out_p), x_s33); |
644 | 0 | mm_storehu(dst + (7 * out_p), x_s33); |
645 | 0 | mm_storelu(dst + (8 * out_p), x_s34); |
646 | 0 | mm_storehu(dst + (9 * out_p), x_s34); |
647 | 0 | mm_storelu(dst + (10 * out_p), x_s35); |
648 | 0 | mm_storehu(dst + (11 * out_p), x_s35); |
649 | 0 | mm_storelu(dst + (12 * out_p), x_s36); |
650 | 0 | mm_storehu(dst + (13 * out_p), x_s36); |
651 | 0 | mm_storelu(dst + (14 * out_p), x_s37); |
652 | 0 | mm_storehu(dst + (15 * out_p), x_s37); |
653 | 0 | } Unexecuted instantiation: loopfilter_sse2.c:transpose_16x8_to_8x16 Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose_16x8_to_8x16 Unexecuted instantiation: intrapred_sse4.c:transpose_16x8_to_8x16 Unexecuted instantiation: intrapred_avx2.c:transpose_16x8_to_8x16 Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose_16x8_to_8x16 |
654 | | |
// Transposes one or more independent 8x8 blocks of bytes.
//
// For every block index i in [0, num_8x8_to_transpose), eight rows of eight
// bytes are read from src[i] (row r starts at src[i] + r * in_p) and the
// transposed block is written to dst[i] (row c starts at dst[i] + c * out_p),
// i.e. dst[i][c * out_p + r] == src[i][r * in_p + c] for r, c in [0, 8).
//
//   src, dst              arrays of per-block base pointers; only the first
//                         num_8x8_to_transpose entries are accessed.
//   in_p, out_p           row strides, in bytes, of the input/output blocks.
//   num_8x8_to_transpose  number of blocks to process; a value <= 0 makes
//                         the call a no-op (nothing is read or written).
//
// All eight input rows of a block are loaded before any of its output rows
// is stored. Output rows are written with 8-byte memcpy() calls, so the
// destination rows need no particular alignment.
static inline void transpose_8xn(unsigned char *src[], int in_p,
                                 unsigned char *dst[], int out_p,
                                 int num_8x8_to_transpose) {
  // A pre-checked loop: a count of zero (or a negative count) must not touch
  // src[0] / dst[0].
  for (int blk = 0; blk < num_8x8_to_transpose; ++blk) {
    const unsigned char *in = src[blk];
    unsigned char *out = dst[blk];

    // Stage 1: load the rows pairwise and interleave their bytes.
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    const __m128i r01 =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(in + 0 * in_p)),
                          _mm_loadl_epi64((const __m128i *)(in + 1 * in_p)));
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    const __m128i r23 =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(in + 2 * in_p)),
                          _mm_loadl_epi64((const __m128i *)(in + 3 * in_p)));
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    const __m128i r45 =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(in + 4 * in_p)),
                          _mm_loadl_epi64((const __m128i *)(in + 5 * in_p)));
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    const __m128i r67 =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(in + 6 * in_p)),
                          _mm_loadl_epi64((const __m128i *)(in + 7 * in_p)));

    // Stage 2: interleave 16-bit pairs to gather four rows per column.
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    const __m128i top_lo = _mm_unpacklo_epi16(r01, r23);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    const __m128i bot_lo = _mm_unpacklo_epi16(r45, r67);
    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    const __m128i top_hi = _mm_unpackhi_epi16(r01, r23);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    const __m128i bot_hi = _mm_unpackhi_epi16(r45, r67);

    // Stage 3: interleave 32-bit quads; each register now holds two complete
    // output rows (low 8 bytes and high 8 bytes).
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    const __m128i col01 = _mm_unpacklo_epi32(top_lo, bot_lo);
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    const __m128i col23 = _mm_unpackhi_epi32(top_lo, bot_lo);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    const __m128i col45 = _mm_unpacklo_epi32(top_hi, bot_hi);
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    const __m128i col67 = _mm_unpackhi_epi32(top_hi, bot_hi);

    // Store the low half, then the high half, of each register as one
    // 8-byte output row.
    memcpy(out + 0 * out_p, (const char *)&col01, 8);
    memcpy(out + 1 * out_p, (const char *)&col01 + 8, 8);
    memcpy(out + 2 * out_p, (const char *)&col23, 8);
    memcpy(out + 3 * out_p, (const char *)&col23 + 8, 8);
    memcpy(out + 4 * out_p, (const char *)&col45, 8);
    memcpy(out + 5 * out_p, (const char *)&col45 + 8, 8);
    memcpy(out + 6 * out_p, (const char *)&col67, 8);
    memcpy(out + 7 * out_p, (const char *)&col67 + 8, 8);
  }
}
720 | | |
721 | | #endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ |