/src/libvpx/vpx_dsp/x86/transpose_sse2.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 | | * |
4 | | * Use of this source code is governed by a BSD-style license |
5 | | * that can be found in the LICENSE file in the root of the source |
6 | | * tree. An additional intellectual property rights grant can be found |
7 | | * in the file PATENTS. All contributing project authors may |
8 | | * be found in the AUTHORS file in the root of the source tree. |
9 | | */ |
10 | | |
11 | | #ifndef VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ |
12 | | #define VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ |
13 | | |
14 | | #include <emmintrin.h> // SSE2 |
15 | | |
16 | | #include "./vpx_config.h" |
17 | | |
18 | 0 | static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { |
19 | | // Unpack 8 bit elements. Goes from: |
20 | | // in[0]: 00 01 02 03 |
21 | | // in[1]: 10 11 12 13 |
22 | | // in[2]: 20 21 22 23 |
23 | | // in[3]: 30 31 32 33 |
24 | | // to: |
25 | | // a0: 00 10 01 11 02 12 03 13 |
26 | | // a1: 20 30 21 31 22 32 23 33 |
27 | 0 | const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); |
28 | 0 | const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); |
29 | | |
30 | | // Unpack 16 bit elements resulting in: |
31 | | // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 |
32 | 0 | return _mm_unpacklo_epi16(a0, a1); |
33 | 0 | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_8bit_4x4 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_8bit_4x4 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_8bit_4x4 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:transpose_8bit_4x4 Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_8bit_4x4 Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_8bit_4x4 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:transpose_8bit_4x4 Unexecuted instantiation: inv_txfm_sse2.c:transpose_8bit_4x4 Unexecuted instantiation: inv_txfm_ssse3.c:transpose_8bit_4x4 Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_8bit_4x4 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_8bit_4x4 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_8bit_4x4 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_8bit_4x4 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_8bit_4x4 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_8bit_4x4 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_8bit_4x4 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_8bit_4x4 |
34 | | |
35 | | static INLINE void transpose_8bit_8x8(const __m128i *const in, |
36 | 0 | __m128i *const out) { |
37 | | // Unpack 8 bit elements. Goes from: |
38 | | // in[0]: 00 01 02 03 04 05 06 07 |
39 | | // in[1]: 10 11 12 13 14 15 16 17 |
40 | | // in[2]: 20 21 22 23 24 25 26 27 |
41 | | // in[3]: 30 31 32 33 34 35 36 37 |
42 | | // in[4]: 40 41 42 43 44 45 46 47 |
43 | | // in[5]: 50 51 52 53 54 55 56 57 |
44 | | // in[6]: 60 61 62 63 64 65 66 67 |
45 | | // in[7]: 70 71 72 73 74 75 76 77 |
46 | | // to: |
47 | | // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 |
48 | | // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 |
49 | | // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 |
50 | | // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 |
51 | 0 | const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); |
52 | 0 | const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); |
53 | 0 | const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); |
54 | 0 | const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); |
55 | | |
56 | | // Unpack 16 bit elements resulting in: |
57 | | // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 |
58 | | // b1: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 |
59 | | // b2: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 |
60 | | // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 |
61 | 0 | const __m128i b0 = _mm_unpacklo_epi16(a0, a1); |
62 | 0 | const __m128i b1 = _mm_unpackhi_epi16(a0, a1); |
63 | 0 | const __m128i b2 = _mm_unpacklo_epi16(a2, a3); |
64 | 0 | const __m128i b3 = _mm_unpackhi_epi16(a2, a3); |
65 | | |
66 | | // Unpack 32 bit elements resulting in: |
67 | | // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 |
68 | | // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 |
69 | | // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 |
70 | | // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 |
71 | 0 | const __m128i c0 = _mm_unpacklo_epi32(b0, b2); |
72 | 0 | const __m128i c1 = _mm_unpackhi_epi32(b0, b2); |
73 | 0 | const __m128i c2 = _mm_unpacklo_epi32(b1, b3); |
74 | 0 | const __m128i c3 = _mm_unpackhi_epi32(b1, b3); |
75 | | |
76 | | // Unpack 64 bit elements resulting in: |
77 | | // out[0]: 00 10 20 30 40 50 60 70 |
78 | | // out[1]: 01 11 21 31 41 51 61 71 |
79 | | // out[2]: 02 12 22 32 42 52 62 72 |
80 | | // out[3]: 03 13 23 33 43 53 63 73 |
81 | | // out[4]: 04 14 24 34 44 54 64 74 |
82 | | // out[5]: 05 15 25 35 45 55 65 75 |
83 | | // out[6]: 06 16 26 36 46 56 66 76 |
84 | | // out[7]: 07 17 27 37 47 57 67 77 |
85 | 0 | out[0] = _mm_unpacklo_epi64(c0, c0); |
86 | 0 | out[1] = _mm_unpackhi_epi64(c0, c0); |
87 | 0 | out[2] = _mm_unpacklo_epi64(c1, c1); |
88 | 0 | out[3] = _mm_unpackhi_epi64(c1, c1); |
89 | 0 | out[4] = _mm_unpacklo_epi64(c2, c2); |
90 | 0 | out[5] = _mm_unpackhi_epi64(c2, c2); |
91 | 0 | out[6] = _mm_unpacklo_epi64(c3, c3); |
92 | 0 | out[7] = _mm_unpackhi_epi64(c3, c3); |
93 | 0 | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_8bit_8x8 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_8bit_8x8 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_8bit_8x8 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:transpose_8bit_8x8 Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_8bit_8x8 Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_8bit_8x8 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:transpose_8bit_8x8 Unexecuted instantiation: inv_txfm_sse2.c:transpose_8bit_8x8 Unexecuted instantiation: inv_txfm_ssse3.c:transpose_8bit_8x8 Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_8bit_8x8 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_8bit_8x8 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_8bit_8x8 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_8bit_8x8 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_8bit_8x8 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_8bit_8x8 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_8bit_8x8 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_8bit_8x8 |
94 | | |
95 | | static INLINE void transpose_16bit_4x4(const __m128i *const in, |
96 | 721k | __m128i *const out) { |
97 | | // Unpack 16 bit elements. Goes from: |
98 | | // in[0]: 00 01 02 03 XX XX XX XX |
99 | | // in[1]: 10 11 12 13 XX XX XX XX |
100 | | // in[2]: 20 21 22 23 XX XX XX XX |
101 | | // in[3]: 30 31 32 33 XX XX XX XX |
102 | | // to: |
103 | | // a0: 00 10 01 11 02 12 03 13 |
104 | | // a1: 20 30 21 31 22 32 23 33 |
105 | 721k | const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); |
106 | 721k | const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); |
107 | | |
108 | | // Unpack 32 bit elements resulting in: |
109 | | // out[0]: 00 10 20 30 01 11 21 31 |
110 | | // out[1]: 02 12 22 32 03 13 23 33 |
111 | 721k | out[0] = _mm_unpacklo_epi32(a0, a1); |
112 | 721k | out[1] = _mm_unpackhi_epi32(a0, a1); |
113 | 721k | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_16bit_4x4 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_16bit_4x4 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_16bit_4x4 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:transpose_16bit_4x4 Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_16bit_4x4 Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_16bit_4x4 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:transpose_16bit_4x4 inv_txfm_sse2.c:transpose_16bit_4x4 Line | Count | Source | 96 | 26.0k | __m128i *const out) { | 97 | | // Unpack 16 bit elements. Goes from: | 98 | | // in[0]: 00 01 02 03 XX XX XX XX | 99 | | // in[1]: 10 11 12 13 XX XX XX XX | 100 | | // in[2]: 20 21 22 23 XX XX XX XX | 101 | | // in[3]: 30 31 32 33 XX XX XX XX | 102 | | // to: | 103 | | // a0: 00 10 01 11 02 12 03 13 | 104 | | // a1: 20 30 21 31 22 32 23 33 | 105 | 26.0k | const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); | 106 | 26.0k | const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); | 107 | | | 108 | | // Unpack 32 bit elements resulting in: | 109 | | // out[0]: 00 10 20 30 01 11 21 31 | 110 | | // out[1]: 02 12 22 32 03 13 23 33 | 111 | 26.0k | out[0] = _mm_unpacklo_epi32(a0, a1); | 112 | 26.0k | out[1] = _mm_unpackhi_epi32(a0, a1); | 113 | 26.0k | } |
inv_txfm_ssse3.c:transpose_16bit_4x4 Line | Count | Source | 96 | 695k | __m128i *const out) { | 97 | | // Unpack 16 bit elements. Goes from: | 98 | | // in[0]: 00 01 02 03 XX XX XX XX | 99 | | // in[1]: 10 11 12 13 XX XX XX XX | 100 | | // in[2]: 20 21 22 23 XX XX XX XX | 101 | | // in[3]: 30 31 32 33 XX XX XX XX | 102 | | // to: | 103 | | // a0: 00 10 01 11 02 12 03 13 | 104 | | // a1: 20 30 21 31 22 32 23 33 | 105 | 695k | const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); | 106 | 695k | const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); | 107 | | | 108 | | // Unpack 32 bit elements resulting in: | 109 | | // out[0]: 00 10 20 30 01 11 21 31 | 110 | | // out[1]: 02 12 22 32 03 13 23 33 | 111 | 695k | out[0] = _mm_unpacklo_epi32(a0, a1); | 112 | 695k | out[1] = _mm_unpackhi_epi32(a0, a1); | 113 | 695k | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_16bit_4x4 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_16bit_4x4 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_16bit_4x4 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_16bit_4x4 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_16bit_4x4 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_16bit_4x4 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_16bit_4x4 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_16bit_4x4 |
114 | | |
115 | | static INLINE void transpose_16bit_4x8(const __m128i *const in, |
116 | 52.0k | __m128i *const out) { |
117 | | // Unpack 16 bit elements. Goes from: |
118 | | // in[0]: 00 01 02 03 XX XX XX XX |
119 | | // in[1]: 10 11 12 13 XX XX XX XX |
120 | | // in[2]: 20 21 22 23 XX XX XX XX |
121 | | // in[3]: 30 31 32 33 XX XX XX XX |
122 | | // in[4]: 40 41 42 43 XX XX XX XX |
123 | | // in[5]: 50 51 52 53 XX XX XX XX |
124 | | // in[6]: 60 61 62 63 XX XX XX XX |
125 | | // in[7]: 70 71 72 73 XX XX XX XX |
126 | | // to: |
127 | | // a0: 00 10 01 11 02 12 03 13 |
128 | | // a1: 20 30 21 31 22 32 23 33 |
129 | | // a2: 40 50 41 51 42 52 43 53 |
130 | | // a3: 60 70 61 71 62 72 63 73 |
131 | 52.0k | const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); |
132 | 52.0k | const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); |
133 | 52.0k | const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); |
134 | 52.0k | const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); |
135 | | |
136 | | // Unpack 32 bit elements resulting in: |
137 | | // b0: 00 10 20 30 01 11 21 31 |
138 | | // b1: 40 50 60 70 41 51 61 71 |
139 | | // b2: 02 12 22 32 03 13 23 33 |
140 | | // b3: 42 52 62 72 43 53 63 73 |
141 | 52.0k | const __m128i b0 = _mm_unpacklo_epi32(a0, a1); |
142 | 52.0k | const __m128i b1 = _mm_unpacklo_epi32(a2, a3); |
143 | 52.0k | const __m128i b2 = _mm_unpackhi_epi32(a0, a1); |
144 | 52.0k | const __m128i b3 = _mm_unpackhi_epi32(a2, a3); |
145 | | |
146 | | // Unpack 64 bit elements resulting in: |
147 | | // out[0]: 00 10 20 30 40 50 60 70 |
148 | | // out[1]: 01 11 21 31 41 51 61 71 |
149 | | // out[2]: 02 12 22 32 42 52 62 72 |
150 | | // out[3]: 03 13 23 33 43 53 63 73 |
151 | 52.0k | out[0] = _mm_unpacklo_epi64(b0, b1); |
152 | 52.0k | out[1] = _mm_unpackhi_epi64(b0, b1); |
153 | 52.0k | out[2] = _mm_unpacklo_epi64(b2, b3); |
154 | 52.0k | out[3] = _mm_unpackhi_epi64(b2, b3); |
155 | 52.0k | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_16bit_4x8 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_16bit_4x8 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_16bit_4x8 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:transpose_16bit_4x8 Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_16bit_4x8 Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_16bit_4x8 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:transpose_16bit_4x8 inv_txfm_sse2.c:transpose_16bit_4x8 Line | Count | Source | 116 | 52.0k | __m128i *const out) { | 117 | | // Unpack 16 bit elements. Goes from: | 118 | | // in[0]: 00 01 02 03 XX XX XX XX | 119 | | // in[1]: 10 11 12 13 XX XX XX XX | 120 | | // in[2]: 20 21 22 23 XX XX XX XX | 121 | | // in[3]: 30 31 32 33 XX XX XX XX | 122 | | // in[4]: 40 41 42 43 XX XX XX XX | 123 | | // in[5]: 50 51 52 53 XX XX XX XX | 124 | | // in[6]: 60 61 62 63 XX XX XX XX | 125 | | // in[7]: 70 71 72 73 XX XX XX XX | 126 | | // to: | 127 | | // a0: 00 10 01 11 02 12 03 13 | 128 | | // a1: 20 30 21 31 22 32 23 33 | 129 | | // a2: 40 50 41 51 42 52 43 53 | 130 | | // a3: 60 70 61 71 62 72 63 73 | 131 | 52.0k | const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); | 132 | 52.0k | const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); | 133 | 52.0k | const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); | 134 | 52.0k | const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); | 135 | | | 136 | | // Unpack 32 bit elements resulting in: | 137 | | // b0: 00 10 20 30 01 11 21 31 | 138 | | // b1: 40 50 60 70 41 51 61 71 | 139 | | // b2: 02 12 22 32 03 13 23 33 | 140 | | // b3: 42 52 62 72 43 53 63 73 | 141 | 52.0k | const __m128i b0 = _mm_unpacklo_epi32(a0, a1); | 142 | 52.0k | const __m128i b1 = _mm_unpacklo_epi32(a2, a3); | 143 | 52.0k | const __m128i b2 = _mm_unpackhi_epi32(a0, a1); | 144 | 52.0k | const __m128i b3 = _mm_unpackhi_epi32(a2, a3); | 145 | | | 146 | | // Unpack 64 bit 
elements resulting in: | 147 | | // out[0]: 00 10 20 30 40 50 60 70 | 148 | | // out[1]: 01 11 21 31 41 51 61 71 | 149 | | // out[2]: 02 12 22 32 42 52 62 72 | 150 | | // out[3]: 03 13 23 33 43 53 63 73 | 151 | 52.0k | out[0] = _mm_unpacklo_epi64(b0, b1); | 152 | 52.0k | out[1] = _mm_unpackhi_epi64(b0, b1); | 153 | 52.0k | out[2] = _mm_unpacklo_epi64(b2, b3); | 154 | 52.0k | out[3] = _mm_unpackhi_epi64(b2, b3); | 155 | 52.0k | } |
Unexecuted instantiation: inv_txfm_ssse3.c:transpose_16bit_4x8 Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_16bit_4x8 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_16bit_4x8 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_16bit_4x8 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_16bit_4x8 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_16bit_4x8 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_16bit_4x8 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_16bit_4x8 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_16bit_4x8 |
156 | | |
157 | | static INLINE void transpose_16bit_8x8(const __m128i *const in, |
158 | 266M | __m128i *const out) { |
159 | | // Unpack 16 bit elements. Goes from: |
160 | | // in[0]: 00 01 02 03 04 05 06 07 |
161 | | // in[1]: 10 11 12 13 14 15 16 17 |
162 | | // in[2]: 20 21 22 23 24 25 26 27 |
163 | | // in[3]: 30 31 32 33 34 35 36 37 |
164 | | // in[4]: 40 41 42 43 44 45 46 47 |
165 | | // in[5]: 50 51 52 53 54 55 56 57 |
166 | | // in[6]: 60 61 62 63 64 65 66 67 |
167 | | // in[7]: 70 71 72 73 74 75 76 77 |
168 | | // to: |
169 | | // a0: 00 10 01 11 02 12 03 13 |
170 | | // a1: 20 30 21 31 22 32 23 33 |
171 | | // a2: 40 50 41 51 42 52 43 53 |
172 | | // a3: 60 70 61 71 62 72 63 73 |
173 | | // a4: 04 14 05 15 06 16 07 17 |
174 | | // a5: 24 34 25 35 26 36 27 37 |
175 | | // a6: 44 54 45 55 46 56 47 57 |
176 | | // a7: 64 74 65 75 66 76 67 77 |
177 | 266M | const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); |
178 | 266M | const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); |
179 | 266M | const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); |
180 | 266M | const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); |
181 | 266M | const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); |
182 | 266M | const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); |
183 | 266M | const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); |
184 | 266M | const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); |
185 | | |
186 | | // Unpack 32 bit elements resulting in: |
187 | | // b0: 00 10 20 30 01 11 21 31 |
188 | | // b1: 40 50 60 70 41 51 61 71 |
189 | | // b2: 04 14 24 34 05 15 25 35 |
190 | | // b3: 44 54 64 74 45 55 65 75 |
191 | | // b4: 02 12 22 32 03 13 23 33 |
192 | | // b5: 42 52 62 72 43 53 63 73 |
193 | | // b6: 06 16 26 36 07 17 27 37 |
194 | | // b7: 46 56 66 76 47 57 67 77 |
195 | 266M | const __m128i b0 = _mm_unpacklo_epi32(a0, a1); |
196 | 266M | const __m128i b1 = _mm_unpacklo_epi32(a2, a3); |
197 | 266M | const __m128i b2 = _mm_unpacklo_epi32(a4, a5); |
198 | 266M | const __m128i b3 = _mm_unpacklo_epi32(a6, a7); |
199 | 266M | const __m128i b4 = _mm_unpackhi_epi32(a0, a1); |
200 | 266M | const __m128i b5 = _mm_unpackhi_epi32(a2, a3); |
201 | 266M | const __m128i b6 = _mm_unpackhi_epi32(a4, a5); |
202 | 266M | const __m128i b7 = _mm_unpackhi_epi32(a6, a7); |
203 | | |
204 | | // Unpack 64 bit elements resulting in: |
205 | | // out[0]: 00 10 20 30 40 50 60 70 |
206 | | // out[1]: 01 11 21 31 41 51 61 71 |
207 | | // out[2]: 02 12 22 32 42 52 62 72 |
208 | | // out[3]: 03 13 23 33 43 53 63 73 |
209 | | // out[4]: 04 14 24 34 44 54 64 74 |
210 | | // out[5]: 05 15 25 35 45 55 65 75 |
211 | | // out[6]: 06 16 26 36 46 56 66 76 |
212 | | // out[7]: 07 17 27 37 47 57 67 77 |
213 | 266M | out[0] = _mm_unpacklo_epi64(b0, b1); |
214 | 266M | out[1] = _mm_unpackhi_epi64(b0, b1); |
215 | 266M | out[2] = _mm_unpacklo_epi64(b4, b5); |
216 | 266M | out[3] = _mm_unpackhi_epi64(b4, b5); |
217 | 266M | out[4] = _mm_unpacklo_epi64(b2, b3); |
218 | 266M | out[5] = _mm_unpackhi_epi64(b2, b3); |
219 | 266M | out[6] = _mm_unpacklo_epi64(b6, b7); |
220 | 266M | out[7] = _mm_unpackhi_epi64(b6, b7); |
221 | 266M | } Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_16bit_8x8 Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_16bit_8x8 Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_16bit_8x8 Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:transpose_16bit_8x8 vp9_dct_intrin_sse2.c:transpose_16bit_8x8 Line | Count | Source | 158 | 116M | __m128i *const out) { | 159 | | // Unpack 16 bit elements. Goes from: | 160 | | // in[0]: 00 01 02 03 04 05 06 07 | 161 | | // in[1]: 10 11 12 13 14 15 16 17 | 162 | | // in[2]: 20 21 22 23 24 25 26 27 | 163 | | // in[3]: 30 31 32 33 34 35 36 37 | 164 | | // in[4]: 40 41 42 43 44 45 46 47 | 165 | | // in[5]: 50 51 52 53 54 55 56 57 | 166 | | // in[6]: 60 61 62 63 64 65 66 67 | 167 | | // in[7]: 70 71 72 73 74 75 76 77 | 168 | | // to: | 169 | | // a0: 00 10 01 11 02 12 03 13 | 170 | | // a1: 20 30 21 31 22 32 23 33 | 171 | | // a2: 40 50 41 51 42 52 43 53 | 172 | | // a3: 60 70 61 71 62 72 63 73 | 173 | | // a4: 04 14 05 15 06 16 07 17 | 174 | | // a5: 24 34 25 35 26 36 27 37 | 175 | | // a6: 44 54 45 55 46 56 47 57 | 176 | | // a7: 64 74 65 75 66 76 67 77 | 177 | 116M | const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); | 178 | 116M | const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); | 179 | 116M | const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); | 180 | 116M | const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); | 181 | 116M | const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); | 182 | 116M | const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); | 183 | 116M | const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); | 184 | 116M | const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); | 185 | | | 186 | | // Unpack 32 bit elements resulting in: | 187 | | // b0: 00 10 20 30 01 11 21 31 | 188 | | // b1: 40 50 60 70 41 51 61 71 | 189 | | // b2: 04 14 24 34 05 15 25 35 | 190 | | // b3: 44 54 64 74 45 55 65 75 | 191 | | // b4: 02 12 22 32 03 13 23 33 | 192 | | // b5: 42 52 62 72 43 
53 63 73 | 193 | | // b6: 06 16 26 36 07 17 27 37 | 194 | | // b7: 46 56 66 76 47 57 67 77 | 195 | 116M | const __m128i b0 = _mm_unpacklo_epi32(a0, a1); | 196 | 116M | const __m128i b1 = _mm_unpacklo_epi32(a2, a3); | 197 | 116M | const __m128i b2 = _mm_unpacklo_epi32(a4, a5); | 198 | 116M | const __m128i b3 = _mm_unpacklo_epi32(a6, a7); | 199 | 116M | const __m128i b4 = _mm_unpackhi_epi32(a0, a1); | 200 | 116M | const __m128i b5 = _mm_unpackhi_epi32(a2, a3); | 201 | 116M | const __m128i b6 = _mm_unpackhi_epi32(a4, a5); | 202 | 116M | const __m128i b7 = _mm_unpackhi_epi32(a6, a7); | 203 | | | 204 | | // Unpack 64 bit elements resulting in: | 205 | | // out[0]: 00 10 20 30 40 50 60 70 | 206 | | // out[1]: 01 11 21 31 41 51 61 71 | 207 | | // out[2]: 02 12 22 32 42 52 62 72 | 208 | | // out[3]: 03 13 23 33 43 53 63 73 | 209 | | // out[4]: 04 14 24 34 44 54 64 74 | 210 | | // out[5]: 05 15 25 35 45 55 65 75 | 211 | | // out[6]: 06 16 26 36 46 56 66 76 | 212 | | // out[7]: 07 17 27 37 47 57 67 77 | 213 | 116M | out[0] = _mm_unpacklo_epi64(b0, b1); | 214 | 116M | out[1] = _mm_unpackhi_epi64(b0, b1); | 215 | 116M | out[2] = _mm_unpacklo_epi64(b4, b5); | 216 | 116M | out[3] = _mm_unpackhi_epi64(b4, b5); | 217 | 116M | out[4] = _mm_unpacklo_epi64(b2, b3); | 218 | 116M | out[5] = _mm_unpackhi_epi64(b2, b3); | 219 | 116M | out[6] = _mm_unpacklo_epi64(b6, b7); | 220 | 116M | out[7] = _mm_unpackhi_epi64(b6, b7); | 221 | 116M | } |
Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_16bit_8x8 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:transpose_16bit_8x8 inv_txfm_sse2.c:transpose_16bit_8x8 Line | Count | Source | 158 | 149M | __m128i *const out) { | 159 | | // Unpack 16 bit elements. Goes from: | 160 | | // in[0]: 00 01 02 03 04 05 06 07 | 161 | | // in[1]: 10 11 12 13 14 15 16 17 | 162 | | // in[2]: 20 21 22 23 24 25 26 27 | 163 | | // in[3]: 30 31 32 33 34 35 36 37 | 164 | | // in[4]: 40 41 42 43 44 45 46 47 | 165 | | // in[5]: 50 51 52 53 54 55 56 57 | 166 | | // in[6]: 60 61 62 63 64 65 66 67 | 167 | | // in[7]: 70 71 72 73 74 75 76 77 | 168 | | // to: | 169 | | // a0: 00 10 01 11 02 12 03 13 | 170 | | // a1: 20 30 21 31 22 32 23 33 | 171 | | // a2: 40 50 41 51 42 52 43 53 | 172 | | // a3: 60 70 61 71 62 72 63 73 | 173 | | // a4: 04 14 05 15 06 16 07 17 | 174 | | // a5: 24 34 25 35 26 36 27 37 | 175 | | // a6: 44 54 45 55 46 56 47 57 | 176 | | // a7: 64 74 65 75 66 76 67 77 | 177 | 149M | const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); | 178 | 149M | const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); | 179 | 149M | const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); | 180 | 149M | const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); | 181 | 149M | const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); | 182 | 149M | const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); | 183 | 149M | const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); | 184 | 149M | const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); | 185 | | | 186 | | // Unpack 32 bit elements resulting in: | 187 | | // b0: 00 10 20 30 01 11 21 31 | 188 | | // b1: 40 50 60 70 41 51 61 71 | 189 | | // b2: 04 14 24 34 05 15 25 35 | 190 | | // b3: 44 54 64 74 45 55 65 75 | 191 | | // b4: 02 12 22 32 03 13 23 33 | 192 | | // b5: 42 52 62 72 43 53 63 73 | 193 | | // b6: 06 16 26 36 07 17 27 37 | 194 | | // b7: 46 56 66 76 47 57 67 77 | 195 | 149M | const __m128i b0 = _mm_unpacklo_epi32(a0, a1); | 196 | 149M | 
const __m128i b1 = _mm_unpacklo_epi32(a2, a3); | 197 | 149M | const __m128i b2 = _mm_unpacklo_epi32(a4, a5); | 198 | 149M | const __m128i b3 = _mm_unpacklo_epi32(a6, a7); | 199 | 149M | const __m128i b4 = _mm_unpackhi_epi32(a0, a1); | 200 | 149M | const __m128i b5 = _mm_unpackhi_epi32(a2, a3); | 201 | 149M | const __m128i b6 = _mm_unpackhi_epi32(a4, a5); | 202 | 149M | const __m128i b7 = _mm_unpackhi_epi32(a6, a7); | 203 | | | 204 | | // Unpack 64 bit elements resulting in: | 205 | | // out[0]: 00 10 20 30 40 50 60 70 | 206 | | // out[1]: 01 11 21 31 41 51 61 71 | 207 | | // out[2]: 02 12 22 32 42 52 62 72 | 208 | | // out[3]: 03 13 23 33 43 53 63 73 | 209 | | // out[4]: 04 14 24 34 44 54 64 74 | 210 | | // out[5]: 05 15 25 35 45 55 65 75 | 211 | | // out[6]: 06 16 26 36 46 56 66 76 | 212 | | // out[7]: 07 17 27 37 47 57 67 77 | 213 | 149M | out[0] = _mm_unpacklo_epi64(b0, b1); | 214 | 149M | out[1] = _mm_unpackhi_epi64(b0, b1); | 215 | 149M | out[2] = _mm_unpacklo_epi64(b4, b5); | 216 | 149M | out[3] = _mm_unpackhi_epi64(b4, b5); | 217 | 149M | out[4] = _mm_unpacklo_epi64(b2, b3); | 218 | 149M | out[5] = _mm_unpackhi_epi64(b2, b3); | 219 | 149M | out[6] = _mm_unpacklo_epi64(b6, b7); | 220 | 149M | out[7] = _mm_unpackhi_epi64(b6, b7); | 221 | 149M | } |
inv_txfm_ssse3.c:transpose_16bit_8x8 Line | Count | Source | 158 | 102k | __m128i *const out) { | 159 | | // Unpack 16 bit elements. Goes from: | 160 | | // in[0]: 00 01 02 03 04 05 06 07 | 161 | | // in[1]: 10 11 12 13 14 15 16 17 | 162 | | // in[2]: 20 21 22 23 24 25 26 27 | 163 | | // in[3]: 30 31 32 33 34 35 36 37 | 164 | | // in[4]: 40 41 42 43 44 45 46 47 | 165 | | // in[5]: 50 51 52 53 54 55 56 57 | 166 | | // in[6]: 60 61 62 63 64 65 66 67 | 167 | | // in[7]: 70 71 72 73 74 75 76 77 | 168 | | // to: | 169 | | // a0: 00 10 01 11 02 12 03 13 | 170 | | // a1: 20 30 21 31 22 32 23 33 | 171 | | // a2: 40 50 41 51 42 52 43 53 | 172 | | // a3: 60 70 61 71 62 72 63 73 | 173 | | // a4: 04 14 05 15 06 16 07 17 | 174 | | // a5: 24 34 25 35 26 36 27 37 | 175 | | // a6: 44 54 45 55 46 56 47 57 | 176 | | // a7: 64 74 65 75 66 76 67 77 | 177 | 102k | const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); | 178 | 102k | const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); | 179 | 102k | const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); | 180 | 102k | const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); | 181 | 102k | const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); | 182 | 102k | const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); | 183 | 102k | const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); | 184 | 102k | const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); | 185 | | | 186 | | // Unpack 32 bit elements resulting in: | 187 | | // b0: 00 10 20 30 01 11 21 31 | 188 | | // b1: 40 50 60 70 41 51 61 71 | 189 | | // b2: 04 14 24 34 05 15 25 35 | 190 | | // b3: 44 54 64 74 45 55 65 75 | 191 | | // b4: 02 12 22 32 03 13 23 33 | 192 | | // b5: 42 52 62 72 43 53 63 73 | 193 | | // b6: 06 16 26 36 07 17 27 37 | 194 | | // b7: 46 56 66 76 47 57 67 77 | 195 | 102k | const __m128i b0 = _mm_unpacklo_epi32(a0, a1); | 196 | 102k | const __m128i b1 = _mm_unpacklo_epi32(a2, a3); | 197 | 102k | const __m128i b2 = _mm_unpacklo_epi32(a4, a5); | 198 | 102k | const __m128i b3 = 
_mm_unpacklo_epi32(a6, a7); | 199 | 102k | const __m128i b4 = _mm_unpackhi_epi32(a0, a1); | 200 | 102k | const __m128i b5 = _mm_unpackhi_epi32(a2, a3); | 201 | 102k | const __m128i b6 = _mm_unpackhi_epi32(a4, a5); | 202 | 102k | const __m128i b7 = _mm_unpackhi_epi32(a6, a7); | 203 | | | 204 | | // Unpack 64 bit elements resulting in: | 205 | | // out[0]: 00 10 20 30 40 50 60 70 | 206 | | // out[1]: 01 11 21 31 41 51 61 71 | 207 | | // out[2]: 02 12 22 32 42 52 62 72 | 208 | | // out[3]: 03 13 23 33 43 53 63 73 | 209 | | // out[4]: 04 14 24 34 44 54 64 74 | 210 | | // out[5]: 05 15 25 35 45 55 65 75 | 211 | | // out[6]: 06 16 26 36 46 56 66 76 | 212 | | // out[7]: 07 17 27 37 47 57 67 77 | 213 | 102k | out[0] = _mm_unpacklo_epi64(b0, b1); | 214 | 102k | out[1] = _mm_unpackhi_epi64(b0, b1); | 215 | 102k | out[2] = _mm_unpacklo_epi64(b4, b5); | 216 | 102k | out[3] = _mm_unpackhi_epi64(b4, b5); | 217 | 102k | out[4] = _mm_unpacklo_epi64(b2, b3); | 218 | 102k | out[5] = _mm_unpackhi_epi64(b2, b3); | 219 | 102k | out[6] = _mm_unpacklo_epi64(b6, b7); | 220 | 102k | out[7] = _mm_unpackhi_epi64(b6, b7); | 221 | 102k | } |
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_16bit_8x8 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_16bit_8x8 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_16bit_8x8 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_16bit_8x8 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_16bit_8x8 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_16bit_8x8 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_16bit_8x8 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_16bit_8x8 |
222 | | |
// Transposes a 16x16 block of 16-bit elements in place. Row i of the block is
// the pair left[i] (first 8 elements) and right[i] (last 8 elements), for
// i = 0..15, so the block is handled as four 8x8 quadrants:
//   left[0..7]  : top-left       right[0..7]  : top-right
//   left[8..15] : bottom-left    right[8..15] : bottom-right
// The two diagonal quadrants are transposed where they sit; the two
// off-diagonal quadrants are transposed and exchanged with each other.
static INLINE void transpose_16bit_16x16(__m128i *const left,
                                         __m128i *const right) {
  // Parks the transposed top-right quadrant until the bottom-left quadrant,
  // whose slot it takes over, has been read.
  __m128i top_right_t[8];
  int i;

  transpose_16bit_8x8(left, left);            // top-left, in place
  transpose_16bit_8x8(right, top_right_t);    // top-right -> scratch
  transpose_16bit_8x8(left + 8, right);       // bottom-left -> top-right slot
  transpose_16bit_8x8(right + 8, right + 8);  // bottom-right, in place

  // Drop the parked quadrant into the bottom-left slot.
  for (i = 0; i < 8; ++i) {
    left[8 + i] = top_right_t[i];
  }
}
Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_16bit_16x16 Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:transpose_16bit_16x16 inv_txfm_sse2.c:transpose_16bit_16x16 Line | Count | Source | 225 | 12.6M | __m128i *const right) { | 226 | 12.6M | __m128i tbuf[8]; | 227 | 12.6M | transpose_16bit_8x8(left, left); | 228 | 12.6M | transpose_16bit_8x8(right, tbuf); | 229 | 12.6M | transpose_16bit_8x8(left + 8, right); | 230 | 12.6M | transpose_16bit_8x8(right + 8, right + 8); | 231 | | | 232 | 12.6M | left[8] = tbuf[0]; | 233 | 12.6M | left[9] = tbuf[1]; | 234 | 12.6M | left[10] = tbuf[2]; | 235 | 12.6M | left[11] = tbuf[3]; | 236 | 12.6M | left[12] = tbuf[4]; | 237 | 12.6M | left[13] = tbuf[5]; | 238 | 12.6M | left[14] = tbuf[6]; | 239 | 12.6M | left[15] = tbuf[7]; | 240 | 12.6M | } |
Unexecuted instantiation: inv_txfm_ssse3.c:transpose_16bit_16x16 Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_16bit_16x16 Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_16bit_16x16 Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_16bit_16x16 Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_16bit_16x16 Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_16bit_16x16 Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_16bit_16x16 Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_16bit_16x16 Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_16bit_16x16 |
241 | | |
// Transposes a 4x4 matrix of 32-bit elements. in[r] holds row r and out[c]
// receives column c. All four inputs are read before the first output is
// stored.
static INLINE void transpose_32bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Step 1: interleave the 32-bit lanes of each pair of neighbouring rows.
  // "top" is rows 0-1, "bot" is rows 2-3; cNM names the columns covered.
  //   in[0]: 00 01 02 03        c01_top: 00 10 01 11
  //   in[1]: 10 11 12 13   =>   c01_bot: 20 30 21 31
  //   in[2]: 20 21 22 23        c23_top: 02 12 03 13
  //   in[3]: 30 31 32 33        c23_bot: 22 32 23 33
  const __m128i c01_top = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i c23_top = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i c01_bot = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i c23_bot = _mm_unpackhi_epi32(in[2], in[3]);

  // Step 2: join matching 64-bit halves of the top and bottom results.
  //   out[0]: 00 10 20 30
  //   out[1]: 01 11 21 31
  //   out[2]: 02 12 22 32
  //   out[3]: 03 13 23 33
  out[0] = _mm_unpacklo_epi64(c01_top, c01_bot);
  out[1] = _mm_unpackhi_epi64(c01_top, c01_bot);
  out[2] = _mm_unpacklo_epi64(c23_top, c23_bot);
  out[3] = _mm_unpackhi_epi64(c23_top, c23_bot);
}
270 | | |
// Transposes two side-by-side 4x4 blocks of 32-bit elements, i.e. a matrix
// with 4 rows and 8 columns. in[0..3] hold columns 0-3 of rows 0-3 and
// in[4..7] hold columns 4-7 of the same rows; out[c] receives column c for
// c = 0..7. All eight inputs are read before the first output is stored.
static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
                                         __m128i *const out) {
  // Step 1: interleave the 32-bit lanes of each pair of neighbouring rows.
  // "top" is rows 0-1, "bot" is rows 2-3; cNM names the columns covered.
  //   in[0]: 00 01 02 03        c01_top: 00 10 01 11
  //   in[1]: 10 11 12 13        c01_bot: 20 30 21 31
  //   in[2]: 20 21 22 23        c23_top: 02 12 03 13
  //   in[3]: 30 31 32 33   =>   c23_bot: 22 32 23 33
  //   in[4]: 04 05 06 07        c45_top: 04 14 05 15
  //   in[5]: 14 15 16 17        c45_bot: 24 34 25 35
  //   in[6]: 24 25 26 27        c67_top: 06 16 07 17
  //   in[7]: 34 35 36 37        c67_bot: 26 36 27 37
  const __m128i c01_top = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i c23_top = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i c01_bot = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i c23_bot = _mm_unpackhi_epi32(in[2], in[3]);
  const __m128i c45_top = _mm_unpacklo_epi32(in[4], in[5]);
  const __m128i c67_top = _mm_unpackhi_epi32(in[4], in[5]);
  const __m128i c45_bot = _mm_unpacklo_epi32(in[6], in[7]);
  const __m128i c67_bot = _mm_unpackhi_epi32(in[6], in[7]);

  // Step 2: join matching 64-bit halves of the top and bottom results.
  //   out[0]: 00 10 20 30      out[4]: 04 14 24 34
  //   out[1]: 01 11 21 31      out[5]: 05 15 25 35
  //   out[2]: 02 12 22 32      out[6]: 06 16 26 36
  //   out[3]: 03 13 23 33      out[7]: 07 17 27 37
  out[0] = _mm_unpacklo_epi64(c01_top, c01_bot);
  out[1] = _mm_unpackhi_epi64(c01_top, c01_bot);
  out[2] = _mm_unpacklo_epi64(c23_top, c23_bot);
  out[3] = _mm_unpackhi_epi64(c23_top, c23_bot);
  out[4] = _mm_unpacklo_epi64(c45_top, c45_bot);
  out[5] = _mm_unpackhi_epi64(c45_top, c45_bot);
  out[6] = _mm_unpacklo_epi64(c67_top, c67_bot);
  out[7] = _mm_unpackhi_epi64(c67_top, c67_bot);
}
318 | | |
// Transposes a matrix of 32-bit elements with 4 rows and 8 columns whose rows
// are stored as consecutive register pairs: in[2 * r] holds columns 0-3 of
// row r and in[2 * r + 1] holds columns 4-7. out[c] receives column c for
// c = 0..7. All eight inputs are read before the first output is stored.
static INLINE void transpose_32bit_8x4(const __m128i *const in,
                                       __m128i *const out) {
  // Step 1: interleave the 32-bit lanes of matching halves of neighbouring
  // rows. "top" is rows 0-1, "bot" is rows 2-3; cNM names the columns covered.
  //   in[0]: 00 01 02 03        c01_top: 00 10 01 11
  //   in[1]: 04 05 06 07        c01_bot: 20 30 21 31
  //   in[2]: 10 11 12 13        c23_top: 02 12 03 13
  //   in[3]: 14 15 16 17   =>   c23_bot: 22 32 23 33
  //   in[4]: 20 21 22 23        c45_top: 04 14 05 15
  //   in[5]: 24 25 26 27        c45_bot: 24 34 25 35
  //   in[6]: 30 31 32 33        c67_top: 06 16 07 17
  //   in[7]: 34 35 36 37        c67_bot: 26 36 27 37
  const __m128i c01_top = _mm_unpacklo_epi32(in[0], in[2]);
  const __m128i c23_top = _mm_unpackhi_epi32(in[0], in[2]);
  const __m128i c01_bot = _mm_unpacklo_epi32(in[4], in[6]);
  const __m128i c23_bot = _mm_unpackhi_epi32(in[4], in[6]);
  const __m128i c45_top = _mm_unpacklo_epi32(in[1], in[3]);
  const __m128i c67_top = _mm_unpackhi_epi32(in[1], in[3]);
  const __m128i c45_bot = _mm_unpacklo_epi32(in[5], in[7]);
  const __m128i c67_bot = _mm_unpackhi_epi32(in[5], in[7]);

  // Step 2: join matching 64-bit halves of the top and bottom results.
  //   out[0]: 00 10 20 30      out[4]: 04 14 24 34
  //   out[1]: 01 11 21 31      out[5]: 05 15 25 35
  //   out[2]: 02 12 22 32      out[6]: 06 16 26 36
  //   out[3]: 03 13 23 33      out[7]: 07 17 27 37
  out[0] = _mm_unpacklo_epi64(c01_top, c01_bot);
  out[1] = _mm_unpackhi_epi64(c01_top, c01_bot);
  out[2] = _mm_unpacklo_epi64(c23_top, c23_bot);
  out[3] = _mm_unpackhi_epi64(c23_top, c23_bot);
  out[4] = _mm_unpacklo_epi64(c45_top, c45_bot);
  out[5] = _mm_unpackhi_epi64(c45_top, c45_bot);
  out[6] = _mm_unpacklo_epi64(c67_top, c67_bot);
  out[7] = _mm_unpackhi_epi64(c67_top, c67_bot);
}
366 | | |
367 | | #endif // VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ |