Coverage Report

Created: 2025-07-23 06:32

/src/aom/aom_dsp/x86/lpf_common_sse2.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
13
#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
14
15
#include <emmintrin.h>  // SSE2
16
17
#include "config/aom_config.h"
18
19
0
// Copies the first 8 bytes of `v` to `dst` with memcpy (presumably `v` is an
// __m128i, in which case these are its low 64 bits on x86). `v` must be an
// lvalue because its address is taken.
// NOTE(review): memcpy is used here but <string.h> is not included in the
// visible part of this header -- confirm it is pulled in elsewhere.
#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
20
0
// Copies bytes 8..15 of `v` to `dst` with memcpy (presumably `v` is an
// __m128i, in which case these are its high 64 bits on x86). `v` must be an
// lvalue because its address is taken.
#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)
21
22
// Transposes a 6x6 block of 16-bit values.
//
// Inputs:  x0..x5, one row each in 16-bit lanes 0..5 (lanes 6 and 7 ignored).
// Outputs: d0..d5, one column each in 16-bit lanes 0..5. Lanes 6 and 7 of the
//          outputs hold leftover data and must not be relied upon.
//
// Every input is loaded before any output is stored.
static inline void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
                                            __m128i *x2, __m128i *x3,
                                            __m128i *x4, __m128i *x5,
                                            __m128i *d0, __m128i *d1,
                                            __m128i *d2, __m128i *d3,
                                            __m128i *d4, __m128i *d5) {
  __m128i w0, w1, w2, w3, w4, w5, ww0;

  // 00 01 02 03 04 05 xx xx
  // 10 11 12 13 14 15 xx xx
  // 20 21 22 23 24 25 xx xx
  // 30 31 32 33 34 35 xx xx
  // 40 41 42 43 44 45 xx xx
  // 50 51 52 53 54 55 xx xx

  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53

  w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
  w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
  w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx

  ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
  *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
  // Column 1 needs "41 51" in the upper half of the second operand, so w2 has
  // to be shifted LEFT by two lanes. A right shift would put "43 53" there.
  *d1 = _mm_unpackhi_epi64(ww0,
                           _mm_slli_si128(w2, 4));  // 01 11 21 31 41 51 42 52

  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  *d2 = _mm_unpacklo_epi64(ww0,
                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 43 53
  *d3 = _mm_unpackhi_epi64(ww0,
                           _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53 00 00

  ww0 = _mm_unpacklo_epi32(w3, w4);   // 04 14 24 34 05 15 25 35
  *d4 = _mm_unpacklo_epi64(ww0, w5);  // 04 14 24 34 44 54 45 55
  *d5 = _mm_unpackhi_epi64(ww0,
                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
}
Unexecuted instantiation: loopfilter_sse2.c:highbd_transpose6x6_sse2
Unexecuted instantiation: highbd_loopfilter_sse2.c:highbd_transpose6x6_sse2
Unexecuted instantiation: intrapred_sse4.c:highbd_transpose6x6_sse2
Unexecuted instantiation: intrapred_avx2.c:highbd_transpose6x6_sse2
Unexecuted instantiation: highbd_loopfilter_avx2.c:highbd_transpose6x6_sse2
61
62
// Transposes columns 0..3 of a 4-row block of 16-bit values.
//
// Inputs:  x0..x3, one row each; only 16-bit lanes 0..3 are used.
// Outputs: d0..d3, column k in lanes 0..3 of d<k>; lanes 4..7 are zero.
static inline void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
                                                    __m128i *x2, __m128i *x3,
                                                    __m128i *d0, __m128i *d1,
                                                    __m128i *d2, __m128i *d3) {
  const __m128i zeros = _mm_setzero_si128();

  // Interleave 16-bit lanes of each row pair.
  const __m128i rows01 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  const __m128i rows23 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33

  // Interleave 32-bit pairs so each 64-bit half holds one full column.
  const __m128i cols01 =
      _mm_unpacklo_epi32(rows01, rows23);  // 00 10 20 30 01 11 21 31
  const __m128i cols23 =
      _mm_unpackhi_epi32(rows01, rows23);  // 02 12 22 32 03 13 23 33

  // Split the halves apart, padding the upper 64 bits with zeros.
  *d0 = _mm_unpacklo_epi64(cols01, zeros);  // 00 10 20 30 00 00 00 00
  *d1 = _mm_unpackhi_epi64(cols01, zeros);  // 01 11 21 31 00 00 00 00
  *d2 = _mm_unpacklo_epi64(cols23, zeros);  // 02 12 22 32 00 00 00 00
  *d3 = _mm_unpackhi_epi64(cols23, zeros);  // 03 13 23 33 00 00 00 00
}
Unexecuted instantiation: loopfilter_sse2.c:highbd_transpose4x8_8x4_low_sse2
highbd_loopfilter_sse2.c:highbd_transpose4x8_8x4_low_sse2
Line
Count
Source
65
61.4M
                                                    __m128i *d2, __m128i *d3) {
66
61.4M
  __m128i zero = _mm_setzero_si128();
67
61.4M
  __m128i w0, w1, ww0, ww1;
68
69
61.4M
  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
70
61.4M
  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
71
72
61.4M
  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
73
61.4M
  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
74
75
61.4M
  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
76
61.4M
  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
77
61.4M
  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
78
61.4M
  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
79
61.4M
}
Unexecuted instantiation: intrapred_sse4.c:highbd_transpose4x8_8x4_low_sse2
intrapred_avx2.c:highbd_transpose4x8_8x4_low_sse2
Line
Count
Source
65
265k
                                                    __m128i *d2, __m128i *d3) {
66
265k
  __m128i zero = _mm_setzero_si128();
67
265k
  __m128i w0, w1, ww0, ww1;
68
69
265k
  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
70
265k
  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
71
72
265k
  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
73
265k
  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
74
75
265k
  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
76
265k
  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
77
265k
  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
78
265k
  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
79
265k
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:highbd_transpose4x8_8x4_low_sse2
80
81
// Transposes columns 4..7 of a 4-row block of 16-bit values.
//
// Inputs:  x0..x3, one row each; only 16-bit lanes 4..7 are used.
// Outputs: d4..d7, column k in lanes 0..3 of d<k>; lanes 4..7 are zero.
static inline void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
                                                     __m128i *x2, __m128i *x3,
                                                     __m128i *d4, __m128i *d5,
                                                     __m128i *d6, __m128i *d7) {
  const __m128i zeros = _mm_setzero_si128();

  // Interleave 16-bit lanes of the upper half of each row pair.
  const __m128i rows01 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
  const __m128i rows23 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37

  // Interleave 32-bit pairs so each 64-bit half holds one full column.
  const __m128i cols45 =
      _mm_unpacklo_epi32(rows01, rows23);  // 04 14 24 34 05 15 25 35
  const __m128i cols67 =
      _mm_unpackhi_epi32(rows01, rows23);  // 06 16 26 36 07 17 27 37

  // Split the halves apart, padding the upper 64 bits with zeros.
  *d4 = _mm_unpacklo_epi64(cols45, zeros);  // 04 14 24 34 00 00 00 00
  *d5 = _mm_unpackhi_epi64(cols45, zeros);  // 05 15 25 35 00 00 00 00
  *d6 = _mm_unpacklo_epi64(cols67, zeros);  // 06 16 26 36 00 00 00 00
  *d7 = _mm_unpackhi_epi64(cols67, zeros);  // 07 17 27 37 00 00 00 00
}
Unexecuted instantiation: loopfilter_sse2.c:highbd_transpose4x8_8x4_high_sse2
highbd_loopfilter_sse2.c:highbd_transpose4x8_8x4_high_sse2
Line
Count
Source
84
13.7M
                                                     __m128i *d6, __m128i *d7) {
85
13.7M
  __m128i w0, w1, ww2, ww3;
86
13.7M
  __m128i zero = _mm_setzero_si128();
87
88
13.7M
  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
89
13.7M
  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
90
91
13.7M
  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
92
13.7M
  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
93
94
13.7M
  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
95
13.7M
  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
96
13.7M
  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
97
13.7M
  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
98
13.7M
}
Unexecuted instantiation: intrapred_sse4.c:highbd_transpose4x8_8x4_high_sse2
intrapred_avx2.c:highbd_transpose4x8_8x4_high_sse2
Line
Count
Source
84
29.6k
                                                     __m128i *d6, __m128i *d7) {
85
29.6k
  __m128i w0, w1, ww2, ww3;
86
29.6k
  __m128i zero = _mm_setzero_si128();
87
88
29.6k
  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
89
29.6k
  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
90
91
29.6k
  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
92
29.6k
  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
93
94
29.6k
  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
95
29.6k
  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
96
29.6k
  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
97
29.6k
  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
98
29.6k
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:highbd_transpose4x8_8x4_high_sse2
99
100
// Transposes a 4-row x 8-column block of 16-bit values into eight vectors.
//
// Inputs (one row per vector):
//   x0 00 01 02 03 04 05 06 07
//   x1 10 11 12 13 14 15 16 17
//   x2 20 21 22 23 24 25 26 27
//   x3 30 31 32 33 34 35 36 37
// Outputs (one column per vector, in lanes 0..3):
//   d0 00 10 20 30 xx xx xx xx
//   d1 01 11 21 31 xx xx xx xx
//   d2 02 12 22 32 xx xx xx xx
//   d3 03 13 23 33 xx xx xx xx
//   d4 04 14 24 34 xx xx xx xx
//   d5 05 15 25 35 xx xx xx xx
//   d6 06 16 26 36 xx xx xx xx
//   d7 07 17 27 37 xx xx xx xx
//
// The output pointers must not alias the input pointers: the inputs are read
// again for d4..d7 after d0..d3 have already been written.
static inline void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
                                                __m128i *x2, __m128i *x3,
                                                __m128i *d0, __m128i *d1,
                                                __m128i *d2, __m128i *d3,
                                                __m128i *d4, __m128i *d5,
                                                __m128i *d6, __m128i *d7) {
  // Columns 0..3 first, then columns 4..7.
  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
}
Unexecuted instantiation: loopfilter_sse2.c:highbd_transpose4x8_8x4_sse2
highbd_loopfilter_sse2.c:highbd_transpose4x8_8x4_sse2
Line
Count
Source
107
13.0M
                                                __m128i *d6, __m128i *d7) {
108
  // input
109
  // x0 00 01 02 03 04 05 06 07
110
  // x1 10 11 12 13 14 15 16 17
111
  // x2 20 21 22 23 24 25 26 27
112
  // x3 30 31 32 33 34 35 36 37
113
  // output
114
  // 00 10 20 30 xx xx xx xx
115
  // 01 11 21 31 xx xx xx xx
116
  // 02 12 22 32 xx xx xx xx
117
  // 03 13 23 33 xx xx xx xx
118
  // 04 14 24 34 xx xx xx xx
119
  // 05 15 25 35 xx xx xx xx
120
  // 06 16 26 36 xx xx xx xx
121
  // 07 17 27 37 xx xx xx xx
122
13.0M
  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
123
13.0M
  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
124
13.0M
}
Unexecuted instantiation: intrapred_sse4.c:highbd_transpose4x8_8x4_sse2
intrapred_avx2.c:highbd_transpose4x8_8x4_sse2
Line
Count
Source
107
29.6k
                                                __m128i *d6, __m128i *d7) {
108
  // input
109
  // x0 00 01 02 03 04 05 06 07
110
  // x1 10 11 12 13 14 15 16 17
111
  // x2 20 21 22 23 24 25 26 27
112
  // x3 30 31 32 33 34 35 36 37
113
  // output
114
  // 00 10 20 30 xx xx xx xx
115
  // 01 11 21 31 xx xx xx xx
116
  // 02 12 22 32 xx xx xx xx
117
  // 03 13 23 33 xx xx xx xx
118
  // 04 14 24 34 xx xx xx xx
119
  // 05 15 25 35 xx xx xx xx
120
  // 06 16 26 36 xx xx xx xx
121
  // 07 17 27 37 xx xx xx xx
122
29.6k
  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
123
29.6k
  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
124
29.6k
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:highbd_transpose4x8_8x4_sse2
125
126
// Transposes columns 0..3 of an 8x8 block of 16-bit values.
//
// Inputs (one row per vector; only lanes 0..3 are used here):
//   x0 00 01 02 03 04 05 06 07
//   x1 10 11 12 13 14 15 16 17
//   x2 20 21 22 23 24 25 26 27
//   x3 30 31 32 33 34 35 36 37
//   x4 40 41 42 43 44 45 46 47
//   x5 50 51 52 53 54 55 56 57
//   x6 60 61 62 63 64 65 66 67
//   x7 70 71 72 73 74 75 76 77
// Outputs: d0..d3, each holding one full 8-element column.
static inline void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
                                                __m128i *x2, __m128i *x3,
                                                __m128i *x4, __m128i *x5,
                                                __m128i *x6, __m128i *x7,
                                                __m128i *d0, __m128i *d1,
                                                __m128i *d2, __m128i *d3) {
  // Interleave 16-bit lanes of the low half of each row pair.
  const __m128i rows01 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  const __m128i rows23 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  const __m128i rows45 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
  const __m128i rows67 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73

  // Interleave 32-bit pairs: each 64-bit half is now half of one column.
  const __m128i top01 =
      _mm_unpacklo_epi32(rows01, rows23);  // 00 10 20 30 01 11 21 31
  const __m128i bot01 =
      _mm_unpacklo_epi32(rows45, rows67);  // 40 50 60 70 41 51 61 71
  const __m128i top23 =
      _mm_unpackhi_epi32(rows01, rows23);  // 02 12 22 32 03 13 23 33
  const __m128i bot23 =
      _mm_unpackhi_epi32(rows45, rows67);  // 42 52 62 72 43 53 63 73

  // Join the top and bottom halves of each column.
  *d0 = _mm_unpacklo_epi64(top01, bot01);  // 00 10 20 30 40 50 60 70
  *d1 = _mm_unpackhi_epi64(top01, bot01);  // 01 11 21 31 41 51 61 71
  *d2 = _mm_unpacklo_epi64(top23, bot23);  // 02 12 22 32 42 52 62 72
  *d3 = _mm_unpackhi_epi64(top23, bot23);  // 03 13 23 33 43 53 63 73
}
Unexecuted instantiation: loopfilter_sse2.c:highbd_transpose8x8_low_sse2
highbd_loopfilter_sse2.c:highbd_transpose8x8_low_sse2
Line
Count
Source
131
9.11M
                                                __m128i *d2, __m128i *d3) {
132
9.11M
  __m128i w0, w1, w2, w3, ww0, ww1;
133
  // x0 00 01 02 03 04 05 06 07
134
  // x1 10 11 12 13 14 15 16 17
135
  // x2 20 21 22 23 24 25 26 27
136
  // x3 30 31 32 33 34 35 36 37
137
  // x4 40 41 42 43 44 45 46 47
138
  // x5 50 51 52 53 54 55 56 57
139
  // x6 60 61 62 63 64 65 66 67
140
  // x7 70 71 72 73 74 75 76 77
141
142
9.11M
  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
143
9.11M
  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
144
9.11M
  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
145
9.11M
  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
146
147
9.11M
  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
148
9.11M
  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
149
150
9.11M
  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
151
9.11M
  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
152
153
9.11M
  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
154
9.11M
  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
155
156
9.11M
  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
157
9.11M
  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
158
9.11M
}
Unexecuted instantiation: intrapred_sse4.c:highbd_transpose8x8_low_sse2
intrapred_avx2.c:highbd_transpose8x8_low_sse2
Line
Count
Source
131
591k
                                                __m128i *d2, __m128i *d3) {
132
591k
  __m128i w0, w1, w2, w3, ww0, ww1;
133
  // x0 00 01 02 03 04 05 06 07
134
  // x1 10 11 12 13 14 15 16 17
135
  // x2 20 21 22 23 24 25 26 27
136
  // x3 30 31 32 33 34 35 36 37
137
  // x4 40 41 42 43 44 45 46 47
138
  // x5 50 51 52 53 54 55 56 57
139
  // x6 60 61 62 63 64 65 66 67
140
  // x7 70 71 72 73 74 75 76 77
141
142
591k
  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
143
591k
  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
144
591k
  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
145
591k
  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
146
147
591k
  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
148
591k
  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
149
150
591k
  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
151
591k
  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
152
153
591k
  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
154
591k
  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
155
156
591k
  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
157
591k
  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
158
591k
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:highbd_transpose8x8_low_sse2
159
160
// Transposes columns 4..7 of an 8x8 block of 16-bit values.
//
// Inputs (one row per vector; only lanes 4..7 are used here):
//   x0 00 01 02 03 04 05 06 07
//   x1 10 11 12 13 14 15 16 17
//   x2 20 21 22 23 24 25 26 27
//   x3 30 31 32 33 34 35 36 37
//   x4 40 41 42 43 44 45 46 47
//   x5 50 51 52 53 54 55 56 57
//   x6 60 61 62 63 64 65 66 67
//   x7 70 71 72 73 74 75 76 77
// Outputs: d4..d7, each holding one full 8-element column.
static inline void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
                                                 __m128i *x2, __m128i *x3,
                                                 __m128i *x4, __m128i *x5,
                                                 __m128i *x6, __m128i *x7,
                                                 __m128i *d4, __m128i *d5,
                                                 __m128i *d6, __m128i *d7) {
  // Interleave 16-bit lanes of the high half of each row pair.
  const __m128i rows01 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
  const __m128i rows23 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
  const __m128i rows45 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
  const __m128i rows67 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77

  // Interleave 32-bit pairs: each 64-bit half is now half of one column.
  const __m128i top45 =
      _mm_unpacklo_epi32(rows01, rows23);  // 04 14 24 34 05 15 25 35
  const __m128i bot45 =
      _mm_unpacklo_epi32(rows45, rows67);  // 44 54 64 74 45 55 65 75
  const __m128i top67 =
      _mm_unpackhi_epi32(rows01, rows23);  // 06 16 26 36 07 17 27 37
  const __m128i bot67 =
      _mm_unpackhi_epi32(rows45, rows67);  // 46 56 66 76 47 57 67 77

  // Join the top and bottom halves of each column.
  *d4 = _mm_unpacklo_epi64(top45, bot45);  // 04 14 24 34 44 54 64 74
  *d5 = _mm_unpackhi_epi64(top45, bot45);  // 05 15 25 35 45 55 65 75
  *d6 = _mm_unpacklo_epi64(top67, bot67);  // 06 16 26 36 46 56 66 76
  *d7 = _mm_unpackhi_epi64(top67, bot67);  // 07 17 27 37 47 57 67 77
}
Unexecuted instantiation: loopfilter_sse2.c:highbd_transpose8x8_high_sse2
Unexecuted instantiation: highbd_loopfilter_sse2.c:highbd_transpose8x8_high_sse2
Unexecuted instantiation: intrapred_sse4.c:highbd_transpose8x8_high_sse2
intrapred_avx2.c:highbd_transpose8x8_high_sse2
Line
Count
Source
165
532k
                                                 __m128i *d6, __m128i *d7) {
166
532k
  __m128i w0, w1, w2, w3, ww0, ww1;
167
  // x0 00 01 02 03 04 05 06 07
168
  // x1 10 11 12 13 14 15 16 17
169
  // x2 20 21 22 23 24 25 26 27
170
  // x3 30 31 32 33 34 35 36 37
171
  // x4 40 41 42 43 44 45 46 47
172
  // x5 50 51 52 53 54 55 56 57
173
  // x6 60 61 62 63 64 65 66 67
174
  // x7 70 71 72 73 74 75 76 77
175
532k
  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
176
532k
  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
177
532k
  w2 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
178
532k
  w3 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77
179
180
532k
  ww0 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
181
532k
  ww1 = _mm_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
182
183
532k
  *d4 = _mm_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
184
532k
  *d5 = _mm_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
185
186
532k
  ww0 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
187
532k
  ww1 = _mm_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
188
189
532k
  *d6 = _mm_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
190
532k
  *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
191
532k
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:highbd_transpose8x8_high_sse2
192
193
// Transposes a full 8x8 block of 16-bit values: rows come in through x0..x7
// and columns go out through d0..d7.
//
// The output pointers must not alias the input pointers: the inputs are read
// again for d4..d7 after d0..d3 have already been written.
static inline void highbd_transpose8x8_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    __m128i *d7) {
  // Columns 0..3 first, then columns 4..7.
  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
}
Unexecuted instantiation: loopfilter_sse2.c:highbd_transpose8x8_sse2
Unexecuted instantiation: highbd_loopfilter_sse2.c:highbd_transpose8x8_sse2
Unexecuted instantiation: intrapred_sse4.c:highbd_transpose8x8_sse2
intrapred_avx2.c:highbd_transpose8x8_sse2
Line
Count
Source
199
532k
    __m128i *d7) {
200
532k
  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
201
532k
  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
202
532k
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:highbd_transpose8x8_sse2
203
204
// Runs two independent 8x8 transposes of 16-bit values: one on element [0] and
// one on element [1] of every x/d argument. Each pointer must therefore refer
// to at least two consecutive __m128i values.
//
// The output arrays must not alias the input arrays: the inputs are read
// again after some outputs have already been written.
static inline void highbd_transpose8x16_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    __m128i *d7) {
  // First 8x8 block: element [0] of every array.
  highbd_transpose8x8_sse2(&x0[0], &x1[0], &x2[0], &x3[0], &x4[0], &x5[0],
                           &x6[0], &x7[0], &d0[0], &d1[0], &d2[0], &d3[0],
                           &d4[0], &d5[0], &d6[0], &d7[0]);
  // Second 8x8 block: element [1] of every array.
  highbd_transpose8x8_sse2(&x0[1], &x1[1], &x2[1], &x3[1], &x4[1], &x5[1],
                           &x6[1], &x7[1], &d0[1], &d1[1], &d2[1], &d3[1],
                           &d4[1], &d5[1], &d6[1], &d7[1]);
}
Unexecuted instantiation: loopfilter_sse2.c:highbd_transpose8x16_sse2
Unexecuted instantiation: highbd_loopfilter_sse2.c:highbd_transpose8x16_sse2
Unexecuted instantiation: intrapred_sse4.c:highbd_transpose8x16_sse2
Unexecuted instantiation: intrapred_avx2.c:highbd_transpose8x16_sse2
Unexecuted instantiation: highbd_loopfilter_avx2.c:highbd_transpose8x16_sse2
217
218
// Low bit depth functions
219
// Transposes columns 0..3 of a 4-row block of 8-bit values.
//
// Inputs (one row per vector, in bytes 0..7):
//   x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
//   x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
//   x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
//   x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
// Outputs (one column per vector, in bytes 0..3):
//   d0 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
//   d1 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
//   d2 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
//   d3 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
static inline void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
                                             __m128i *x2, __m128i *x3,
                                             __m128i *d0, __m128i *d1,
                                             __m128i *d2, __m128i *d3) {
  // Interleave bytes of each row pair.
  // rows01: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  // rows23: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);

  // d0 holds all four columns back to back:
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  *d0 = _mm_unpacklo_epi16(rows01, rows23);

  // Shift the packed columns down so each one starts at byte 0.
  *d1 = _mm_srli_si128(*d0, 4);   // 01 11 21 31 ...
  *d2 = _mm_srli_si128(*d0, 8);   // 02 12 22 32 ...
  *d3 = _mm_srli_si128(*d0, 12);  // 03 13 23 33 ...
}
loopfilter_sse2.c:transpose4x8_8x4_low_sse2
Line
Count
Source
222
3.60M
                                             __m128i *d2, __m128i *d3) {
223
  // input
224
  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
225
  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
226
  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
227
  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
228
  // output
229
  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
230
  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
231
  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
232
  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
233
234
3.60M
  __m128i w0, w1;
235
236
3.60M
  w0 = _mm_unpacklo_epi8(
237
3.60M
      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
238
3.60M
  w1 = _mm_unpacklo_epi8(
239
3.60M
      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
240
241
3.60M
  *d0 = _mm_unpacklo_epi16(
242
3.60M
      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
243
244
3.60M
  *d1 = _mm_srli_si128(*d0,
245
3.60M
                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
246
3.60M
  *d2 = _mm_srli_si128(*d0,
247
3.60M
                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
248
3.60M
  *d3 = _mm_srli_si128(*d0,
249
3.60M
                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
250
3.60M
}
Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose4x8_8x4_low_sse2
Unexecuted instantiation: intrapred_sse4.c:transpose4x8_8x4_low_sse2
intrapred_avx2.c:transpose4x8_8x4_low_sse2
Line
Count
Source
222
73.2k
                                             __m128i *d2, __m128i *d3) {
223
  // input
224
  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
225
  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
226
  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
227
  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
228
  // output
229
  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
230
  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
231
  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
232
  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
233
234
73.2k
  __m128i w0, w1;
235
236
73.2k
  w0 = _mm_unpacklo_epi8(
237
73.2k
      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
238
73.2k
  w1 = _mm_unpacklo_epi8(
239
73.2k
      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
240
241
73.2k
  *d0 = _mm_unpacklo_epi16(
242
73.2k
      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
243
244
73.2k
  *d1 = _mm_srli_si128(*d0,
245
73.2k
                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
246
73.2k
  *d2 = _mm_srli_si128(*d0,
247
73.2k
                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
248
73.2k
  *d3 = _mm_srli_si128(*d0,
249
73.2k
                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
250
73.2k
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose4x8_8x4_low_sse2
251
252
// Transposes a 4-row x 8-column block of 8-bit values into eight vectors.
//
// Inputs (one row per vector, in bytes 0..7):
//   x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
//   x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
//   x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
//   x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
// Outputs (one column per vector, in bytes 0..3):
//   d0 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
//   d1 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
//   d2 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
//   d3 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
//   d4 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
//   d5 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
//   d6 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
//   d7 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
static inline void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                         __m128i *x3, __m128i *d0, __m128i *d1,
                                         __m128i *d2, __m128i *d3, __m128i *d4,
                                         __m128i *d5, __m128i *d6,
                                         __m128i *d7) {
  // Interleave bytes of each row pair.
  // rows01: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  // rows23: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);

  // Four columns packed back to back in each vector.
  // cols0123: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  // cols4567: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  const __m128i cols0123 = _mm_unpacklo_epi16(rows01, rows23);
  const __m128i cols4567 = _mm_unpackhi_epi16(rows01, rows23);

  // Shift each packed column down to byte 0 of its own output.
  *d0 = cols0123;                       // 00 10 20 30 ...
  *d1 = _mm_srli_si128(cols0123, 4);    // 01 11 21 31 ...
  *d2 = _mm_srli_si128(cols0123, 8);    // 02 12 22 32 ...
  *d3 = _mm_srli_si128(cols0123, 12);   // 03 13 23 33 ...

  *d4 = cols4567;                       // 04 14 24 34 ...
  *d5 = _mm_srli_si128(cols4567, 4);    // 05 15 25 35 ...
  *d6 = _mm_srli_si128(cols4567, 8);    // 06 16 26 36 ...
  *d7 = _mm_srli_si128(cols4567, 12);   // 07 17 27 37 ...
}
loopfilter_sse2.c:transpose4x8_8x4_sse2
Line
Count
Source
256
4.22M
                                         __m128i *d7) {
257
  // input
258
  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
259
  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
260
  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
261
  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
262
  // output
263
  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
264
  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
265
  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
266
  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
267
  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
268
  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
269
  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
270
  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
271
272
4.22M
  __m128i w0, w1, ww0, ww1;
273
274
4.22M
  w0 = _mm_unpacklo_epi8(
275
4.22M
      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
276
4.22M
  w1 = _mm_unpacklo_epi8(
277
4.22M
      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
278
279
4.22M
  ww0 = _mm_unpacklo_epi16(
280
4.22M
      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
281
4.22M
  ww1 = _mm_unpackhi_epi16(
282
4.22M
      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
283
284
4.22M
  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
285
4.22M
  *d1 = _mm_srli_si128(ww0,
286
4.22M
                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
287
4.22M
  *d2 = _mm_srli_si128(ww0,
288
4.22M
                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
289
4.22M
  *d3 = _mm_srli_si128(ww0,
290
4.22M
                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
291
292
4.22M
  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
293
4.22M
  *d5 = _mm_srli_si128(ww1,
294
4.22M
                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
295
4.22M
  *d6 = _mm_srli_si128(ww1,
296
4.22M
                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
297
4.22M
  *d7 = _mm_srli_si128(ww1,
298
4.22M
                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
299
4.22M
}
Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose4x8_8x4_sse2
Unexecuted instantiation: intrapred_sse4.c:transpose4x8_8x4_sse2
intrapred_avx2.c:transpose4x8_8x4_sse2
Line
Count
Source
256
21.8k
                                         __m128i *d7) {
257
  // input
258
  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
259
  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
260
  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
261
  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
262
  // output
263
  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
264
  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
265
  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
266
  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
267
  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
268
  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
269
  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
270
  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
271
272
21.8k
  __m128i w0, w1, ww0, ww1;
273
274
21.8k
  w0 = _mm_unpacklo_epi8(
275
21.8k
      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
276
21.8k
  w1 = _mm_unpacklo_epi8(
277
21.8k
      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
278
279
21.8k
  ww0 = _mm_unpacklo_epi16(
280
21.8k
      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
281
21.8k
  ww1 = _mm_unpackhi_epi16(
282
21.8k
      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
283
284
21.8k
  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
285
21.8k
  *d1 = _mm_srli_si128(ww0,
286
21.8k
                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
287
21.8k
  *d2 = _mm_srli_si128(ww0,
288
21.8k
                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
289
21.8k
  *d3 = _mm_srli_si128(ww0,
290
21.8k
                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
291
292
21.8k
  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
293
21.8k
  *d5 = _mm_srli_si128(ww1,
294
21.8k
                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
295
21.8k
  *d6 = _mm_srli_si128(ww1,
296
21.8k
                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
297
21.8k
  *d7 = _mm_srli_si128(ww1,
298
21.8k
                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
299
21.8k
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose4x8_8x4_sse2
300
301
// Transposes the four leftmost byte columns of an 8x8 block.
//
// Only the low 8 bytes of each input row x0..x7 are read. On return the low
// 8 bytes of d0..d3 hold columns 0..3 of the block, one byte per input row in
// row order. The high 8 bytes of d0 and d2 hold columns 1 and 3 respectively;
// the high 8 bytes of d1 and d3 are the zeros shifted in by _mm_srli_si128.
//
// input
// x0 00 01 02 03 04 05 06 07
// x1 10 11 12 13 14 15 16 17
// x2 20 21 22 23 24 25 26 27
// x3 30 31 32 33 34 35 36 37
// x4 40 41 42 43 44 45 46 47
// x5 50 51 52 53 54 55 56 57
// x6 60 61 62 63 64 65 66 67
// x7 70 71 72 73 74 75 76 77
// output (low 8 bytes)
// d0 00 10 20 30 40 50 60 70
// d1 01 11 21 31 41 51 61 71
// d2 02 12 22 32 42 52 62 72
// d3 03 13 23 33 43 53 63 73
static inline void transpose8x8_low_sse2(const __m128i *x0, const __m128i *x1,
                                         const __m128i *x2, const __m128i *x3,
                                         const __m128i *x4, const __m128i *x5,
                                         const __m128i *x6, const __m128i *x7,
                                         __m128i *d0, __m128i *d1, __m128i *d2,
                                         __m128i *d3) {
  // All inputs are consumed before any output is written, so an output may
  // share storage with an input.

  // Byte interleave of adjacent rows: 00 10 01 11 02 12 ... 07 17, etc.
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i rows45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i rows67 = _mm_unpacklo_epi8(*x6, *x7);

  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  const __m128i upper = _mm_unpacklo_epi16(rows01, rows23);
  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  const __m128i lower = _mm_unpacklo_epi16(rows45, rows67);

  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  const __m128i cols01 = _mm_unpacklo_epi32(upper, lower);
  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  const __m128i cols23 = _mm_unpackhi_epi32(upper, lower);

  *d0 = cols01;
  *d1 = _mm_srli_si128(cols01, 8);  // bring column 1 down to the low half
  *d2 = cols23;
  *d3 = _mm_srli_si128(cols23, 8);  // bring column 3 down to the low half
}
loopfilter_sse2.c:transpose8x8_low_sse2
Line
Count
Source
305
445k
                                         __m128i *d3) {
306
  // input
307
  // x0 00 01 02 03 04 05 06 07
308
  // x1 10 11 12 13 14 15 16 17
309
  // x2 20 21 22 23 24 25 26 27
310
  // x3 30 31 32 33 34 35 36 37
311
  // x4 40 41 42 43 44 45 46 47
312
  // x5  50 51 52 53 54 55 56 57
313
  // x6  60 61 62 63 64 65 66 67
314
  // x7 70 71 72 73 74 75 76 77
315
  // output
316
  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
317
  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
318
  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
319
  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
320
321
445k
  __m128i w0, w1, w2, w3, w4, w5;
322
323
445k
  w0 = _mm_unpacklo_epi8(
324
445k
      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
325
326
445k
  w1 = _mm_unpacklo_epi8(
327
445k
      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
328
329
445k
  w2 = _mm_unpacklo_epi8(
330
445k
      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
331
332
445k
  w3 = _mm_unpacklo_epi8(
333
445k
      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
334
335
445k
  w4 = _mm_unpacklo_epi16(
336
445k
      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
337
445k
  w5 = _mm_unpacklo_epi16(
338
445k
      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
339
340
445k
  *d0 = _mm_unpacklo_epi32(
341
445k
      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
342
445k
  *d1 = _mm_srli_si128(*d0, 8);
343
445k
  *d2 = _mm_unpackhi_epi32(
344
445k
      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
345
445k
  *d3 = _mm_srli_si128(*d2, 8);
346
445k
}
Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose8x8_low_sse2
Unexecuted instantiation: intrapred_sse4.c:transpose8x8_low_sse2
intrapred_avx2.c:transpose8x8_low_sse2
Line
Count
Source
305
34.8k
                                         __m128i *d3) {
306
  // input
307
  // x0 00 01 02 03 04 05 06 07
308
  // x1 10 11 12 13 14 15 16 17
309
  // x2 20 21 22 23 24 25 26 27
310
  // x3 30 31 32 33 34 35 36 37
311
  // x4 40 41 42 43 44 45 46 47
312
  // x5  50 51 52 53 54 55 56 57
313
  // x6  60 61 62 63 64 65 66 67
314
  // x7 70 71 72 73 74 75 76 77
315
  // output
316
  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
317
  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
318
  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
319
  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
320
321
34.8k
  __m128i w0, w1, w2, w3, w4, w5;
322
323
34.8k
  w0 = _mm_unpacklo_epi8(
324
34.8k
      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
325
326
34.8k
  w1 = _mm_unpacklo_epi8(
327
34.8k
      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
328
329
34.8k
  w2 = _mm_unpacklo_epi8(
330
34.8k
      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
331
332
34.8k
  w3 = _mm_unpacklo_epi8(
333
34.8k
      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
334
335
34.8k
  w4 = _mm_unpacklo_epi16(
336
34.8k
      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
337
34.8k
  w5 = _mm_unpacklo_epi16(
338
34.8k
      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
339
340
34.8k
  *d0 = _mm_unpacklo_epi32(
341
34.8k
      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
342
34.8k
  *d1 = _mm_srli_si128(*d0, 8);
343
34.8k
  *d2 = _mm_unpackhi_epi32(
344
34.8k
      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
345
34.8k
  *d3 = _mm_srli_si128(*d2, 8);
346
34.8k
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose8x8_low_sse2
347
348
// Transposes a full 8x8 block of bytes.
//
// Only the low 8 bytes of each input row x0..x7 are read. Every output packs
// two columns of the block: the even column in its low 8 bytes and the
// following odd column in its high 8 bytes.
//
// input
// x0 00 01 02 03 04 05 06 07
// x1 10 11 12 13 14 15 16 17
// x2 20 21 22 23 24 25 26 27
// x3 30 31 32 33 34 35 36 37
// x4 40 41 42 43 44 45 46 47
// x5 50 51 52 53 54 55 56 57
// x6 60 61 62 63 64 65 66 67
// x7 70 71 72 73 74 75 76 77
// output
// d0d1 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
// d2d3 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
// d4d5 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
// d6d7 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
static inline void transpose8x8_sse2(const __m128i *x0, const __m128i *x1,
                                     const __m128i *x2, const __m128i *x3,
                                     const __m128i *x4, const __m128i *x5,
                                     const __m128i *x6, const __m128i *x7,
                                     __m128i *d0d1, __m128i *d2d3,
                                     __m128i *d4d5, __m128i *d6d7) {
  // All inputs are consumed before any output is written, so an output may
  // share storage with an input.

  // Byte interleave of adjacent rows: 00 10 01 11 02 12 ... 07 17, etc.
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i rows45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i rows67 = _mm_unpacklo_epi8(*x6, *x7);

  // Columns 0..3, rows 0..3: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  const __m128i left_top = _mm_unpacklo_epi16(rows01, rows23);
  // Columns 0..3, rows 4..7: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  const __m128i left_bot = _mm_unpacklo_epi16(rows45, rows67);
  // Columns 4..7, rows 0..3: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  const __m128i right_top = _mm_unpackhi_epi16(rows01, rows23);
  // Columns 4..7, rows 4..7: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
  const __m128i right_bot = _mm_unpackhi_epi16(rows45, rows67);

  // Join the top and bottom halves of each column.
  *d0d1 = _mm_unpacklo_epi32(left_top, left_bot);
  *d2d3 = _mm_unpackhi_epi32(left_top, left_bot);
  *d4d5 = _mm_unpacklo_epi32(right_top, right_bot);
  *d6d7 = _mm_unpackhi_epi32(right_top, right_bot);
}
Unexecuted instantiation: loopfilter_sse2.c:transpose8x8_sse2
Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose8x8_sse2
Unexecuted instantiation: intrapred_sse4.c:transpose8x8_sse2
intrapred_avx2.c:transpose8x8_sse2
Line
Count
Source
352
88.1k
                                     __m128i *d6d7) {
353
88.1k
  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
354
  // x0 00 01 02 03 04 05 06 07
355
  // x1 10 11 12 13 14 15 16 17
356
88.1k
  w0 = _mm_unpacklo_epi8(
357
88.1k
      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
358
359
  // x2 20 21 22 23 24 25 26 27
360
  // x3 30 31 32 33 34 35 36 37
361
88.1k
  w1 = _mm_unpacklo_epi8(
362
88.1k
      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
363
364
  // x4 40 41 42 43 44 45 46 47
365
  // x5  50 51 52 53 54 55 56 57
366
88.1k
  w2 = _mm_unpacklo_epi8(
367
88.1k
      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
368
369
  // x6  60 61 62 63 64 65 66 67
370
  // x7 70 71 72 73 74 75 76 77
371
88.1k
  w3 = _mm_unpacklo_epi8(
372
88.1k
      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
373
374
88.1k
  w4 = _mm_unpacklo_epi16(
375
88.1k
      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
376
88.1k
  w5 = _mm_unpacklo_epi16(
377
88.1k
      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
378
379
88.1k
  *d0d1 = _mm_unpacklo_epi32(
380
88.1k
      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
381
88.1k
  *d2d3 = _mm_unpackhi_epi32(
382
88.1k
      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
383
384
88.1k
  w6 = _mm_unpackhi_epi16(
385
88.1k
      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
386
88.1k
  w7 = _mm_unpackhi_epi16(
387
88.1k
      w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
388
389
88.1k
  *d4d5 = _mm_unpacklo_epi32(
390
88.1k
      w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
391
88.1k
  *d6d7 = _mm_unpackhi_epi32(
392
88.1k
      w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
393
88.1k
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose8x8_sse2
394
395
// Transposes a block of 16 rows by 8 byte columns into 8 rows of 16 bytes.
//
// Only the low 8 bytes of each input row x0..x15 are read. Output dk
// (k = 0..7) receives column k of the block: byte r of dk is byte k of row
// xr, for r = 0..15.
static inline void transpose16x8_8x16_sse2(
    const __m128i *x0, const __m128i *x1, const __m128i *x2,
    const __m128i *x3, const __m128i *x4, const __m128i *x5,
    const __m128i *x6, const __m128i *x7, const __m128i *x8,
    const __m128i *x9, const __m128i *x10, const __m128i *x11,
    const __m128i *x12, const __m128i *x13, const __m128i *x14,
    const __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
  // All inputs are consumed before any output is written, so an output may
  // share storage with an input.

  // Byte interleave of adjacent rows, rows 0..7 ("a") and rows 8..15 ("b").
  const __m128i a01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i a23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i a45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i a67 = _mm_unpacklo_epi8(*x6, *x7);
  const __m128i b01 = _mm_unpacklo_epi8(*x8, *x9);
  const __m128i b23 = _mm_unpacklo_epi8(*x10, *x11);
  const __m128i b45 = _mm_unpacklo_epi8(*x12, *x13);
  const __m128i b67 = _mm_unpacklo_epi8(*x14, *x15);

  // Columns 0..3: gather four rows per 32-bit lane, then eight rows per
  // 64-bit half.
  __m128i a_top = _mm_unpacklo_epi16(a01, a23);
  __m128i a_bot = _mm_unpacklo_epi16(a45, a67);
  __m128i b_top = _mm_unpacklo_epi16(b01, b23);
  __m128i b_bot = _mm_unpacklo_epi16(b45, b67);

  __m128i a_even = _mm_unpacklo_epi32(a_top, a_bot);  // cols 0,1 rows 0..7
  __m128i a_odd = _mm_unpackhi_epi32(a_top, a_bot);   // cols 2,3 rows 0..7
  __m128i b_even = _mm_unpacklo_epi32(b_top, b_bot);  // cols 0,1 rows 8..15
  __m128i b_odd = _mm_unpackhi_epi32(b_top, b_bot);   // cols 2,3 rows 8..15

  // Store first 4-line result
  *d0 = _mm_unpacklo_epi64(a_even, b_even);
  *d1 = _mm_unpackhi_epi64(a_even, b_even);
  *d2 = _mm_unpacklo_epi64(a_odd, b_odd);
  *d3 = _mm_unpackhi_epi64(a_odd, b_odd);

  // Columns 4..7: same steps on the high halves of the byte interleaves.
  a_top = _mm_unpackhi_epi16(a01, a23);
  a_bot = _mm_unpackhi_epi16(a45, a67);
  b_top = _mm_unpackhi_epi16(b01, b23);
  b_bot = _mm_unpackhi_epi16(b45, b67);

  a_even = _mm_unpacklo_epi32(a_top, a_bot);  // cols 4,5 rows 0..7
  a_odd = _mm_unpackhi_epi32(a_top, a_bot);   // cols 6,7 rows 0..7
  b_even = _mm_unpacklo_epi32(b_top, b_bot);  // cols 4,5 rows 8..15
  b_odd = _mm_unpackhi_epi32(b_top, b_bot);   // cols 6,7 rows 8..15

  // Store second 4-line result
  *d4 = _mm_unpacklo_epi64(a_even, b_even);
  *d5 = _mm_unpackhi_epi64(a_even, b_even);
  *d6 = _mm_unpacklo_epi64(a_odd, b_odd);
  *d7 = _mm_unpackhi_epi64(a_odd, b_odd);
}
Unexecuted instantiation: loopfilter_sse2.c:transpose16x8_8x16_sse2
Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose16x8_8x16_sse2
Unexecuted instantiation: intrapred_sse4.c:transpose16x8_8x16_sse2
intrapred_avx2.c:transpose16x8_8x16_sse2
Line
Count
Source
400
169k
    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
401
169k
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
402
169k
  __m128i w10, w11, w12, w13, w14, w15;
403
404
169k
  w0 = _mm_unpacklo_epi8(*x0, *x1);
405
169k
  w1 = _mm_unpacklo_epi8(*x2, *x3);
406
169k
  w2 = _mm_unpacklo_epi8(*x4, *x5);
407
169k
  w3 = _mm_unpacklo_epi8(*x6, *x7);
408
409
169k
  w8 = _mm_unpacklo_epi8(*x8, *x9);
410
169k
  w9 = _mm_unpacklo_epi8(*x10, *x11);
411
169k
  w10 = _mm_unpacklo_epi8(*x12, *x13);
412
169k
  w11 = _mm_unpacklo_epi8(*x14, *x15);
413
414
169k
  w4 = _mm_unpacklo_epi16(w0, w1);
415
169k
  w5 = _mm_unpacklo_epi16(w2, w3);
416
169k
  w12 = _mm_unpacklo_epi16(w8, w9);
417
169k
  w13 = _mm_unpacklo_epi16(w10, w11);
418
419
169k
  w6 = _mm_unpacklo_epi32(w4, w5);
420
169k
  w7 = _mm_unpackhi_epi32(w4, w5);
421
169k
  w14 = _mm_unpacklo_epi32(w12, w13);
422
169k
  w15 = _mm_unpackhi_epi32(w12, w13);
423
424
  // Store first 4-line result
425
169k
  *d0 = _mm_unpacklo_epi64(w6, w14);
426
169k
  *d1 = _mm_unpackhi_epi64(w6, w14);
427
169k
  *d2 = _mm_unpacklo_epi64(w7, w15);
428
169k
  *d3 = _mm_unpackhi_epi64(w7, w15);
429
430
169k
  w4 = _mm_unpackhi_epi16(w0, w1);
431
169k
  w5 = _mm_unpackhi_epi16(w2, w3);
432
169k
  w12 = _mm_unpackhi_epi16(w8, w9);
433
169k
  w13 = _mm_unpackhi_epi16(w10, w11);
434
435
169k
  w6 = _mm_unpacklo_epi32(w4, w5);
436
169k
  w7 = _mm_unpackhi_epi32(w4, w5);
437
169k
  w14 = _mm_unpacklo_epi32(w12, w13);
438
169k
  w15 = _mm_unpackhi_epi32(w12, w13);
439
440
  // Store second 4-line result
441
169k
  *d4 = _mm_unpacklo_epi64(w6, w14);
442
169k
  *d5 = _mm_unpackhi_epi64(w6, w14);
443
169k
  *d6 = _mm_unpacklo_epi64(w7, w15);
444
169k
  *d7 = _mm_unpackhi_epi64(w7, w15);
445
169k
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose16x8_8x16_sse2
446
447
// Transposes a block of 8 rows by 16 byte columns; all 16 bytes of each
// input row x0..x7 are read.
//
// Output layout as produced by the code below: the k-th output (k = 0..7, in
// parameter order d0d1, d2d3, ..., d14d15) holds column k of the block in its
// low 8 bytes and column k + 8 in its high 8 bytes, each as eight bytes in
// row order. That is, d0d1 = column 0 | column 8, d2d3 = column 1 | column 9,
// and so on up to d14d15 = column 7 | column 15.
// NOTE(review): the parameter names suggest columns 2k and 2k+1; that is not
// what the unpack sequence yields. Confirm callers expect the k / k + 8
// pairing.
static inline void transpose8x16_16x8_sse2(
    const __m128i *x0, const __m128i *x1, const __m128i *x2,
    const __m128i *x3, const __m128i *x4, const __m128i *x5,
    const __m128i *x6, const __m128i *x7, __m128i *d0d1, __m128i *d2d3,
    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
    __m128i *d12d13, __m128i *d14d15) {
  // All inputs are consumed before any output is written, so an output may
  // share storage with an input.

  // Byte interleave of adjacent rows: "lo" covers columns 0..7, "hi" covers
  // columns 8..15.
  const __m128i lo01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i lo23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i lo45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i lo67 = _mm_unpacklo_epi8(*x6, *x7);
  const __m128i hi01 = _mm_unpackhi_epi8(*x0, *x1);
  const __m128i hi23 = _mm_unpackhi_epi8(*x2, *x3);
  const __m128i hi45 = _mm_unpackhi_epi8(*x4, *x5);
  const __m128i hi67 = _mm_unpackhi_epi8(*x6, *x7);

  // Columns 0..3 (from lo*) and 8..11 (from hi*).
  __m128i lo_top = _mm_unpacklo_epi16(lo01, lo23);
  __m128i lo_bot = _mm_unpacklo_epi16(lo45, lo67);
  __m128i hi_top = _mm_unpacklo_epi16(hi01, hi23);
  __m128i hi_bot = _mm_unpacklo_epi16(hi45, hi67);

  __m128i lo_a = _mm_unpacklo_epi32(lo_top, lo_bot);  // cols 0, 1
  __m128i lo_b = _mm_unpackhi_epi32(lo_top, lo_bot);  // cols 2, 3
  __m128i hi_a = _mm_unpacklo_epi32(hi_top, hi_bot);  // cols 8, 9
  __m128i hi_b = _mm_unpackhi_epi32(hi_top, hi_bot);  // cols 10, 11

  // Store first 4-line result
  *d0d1 = _mm_unpacklo_epi64(lo_a, hi_a);  // col 0 | col 8
  *d2d3 = _mm_unpackhi_epi64(lo_a, hi_a);  // col 1 | col 9
  *d4d5 = _mm_unpacklo_epi64(lo_b, hi_b);  // col 2 | col 10
  *d6d7 = _mm_unpackhi_epi64(lo_b, hi_b);  // col 3 | col 11

  // Columns 4..7 (from lo*) and 12..15 (from hi*).
  lo_top = _mm_unpackhi_epi16(lo01, lo23);
  lo_bot = _mm_unpackhi_epi16(lo45, lo67);
  hi_top = _mm_unpackhi_epi16(hi01, hi23);
  hi_bot = _mm_unpackhi_epi16(hi45, hi67);

  lo_a = _mm_unpacklo_epi32(lo_top, lo_bot);  // cols 4, 5
  lo_b = _mm_unpackhi_epi32(lo_top, lo_bot);  // cols 6, 7
  hi_a = _mm_unpacklo_epi32(hi_top, hi_bot);  // cols 12, 13
  hi_b = _mm_unpackhi_epi32(hi_top, hi_bot);  // cols 14, 15

  // Store second 4-line result
  *d8d9 = _mm_unpacklo_epi64(lo_a, hi_a);    // col 4 | col 12
  *d10d11 = _mm_unpackhi_epi64(lo_a, hi_a);  // col 5 | col 13
  *d12d13 = _mm_unpacklo_epi64(lo_b, hi_b);  // col 6 | col 14
  *d14d15 = _mm_unpackhi_epi64(lo_b, hi_b);  // col 7 | col 15
}
Unexecuted instantiation: loopfilter_sse2.c:transpose8x16_16x8_sse2
Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose8x16_16x8_sse2
Unexecuted instantiation: intrapred_sse4.c:transpose8x16_16x8_sse2
intrapred_avx2.c:transpose8x16_16x8_sse2
Line
Count
Source
451
22.6k
    __m128i *d12d13, __m128i *d14d15) {
452
22.6k
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
453
22.6k
  __m128i w10, w11, w12, w13, w14, w15;
454
455
22.6k
  w0 = _mm_unpacklo_epi8(*x0, *x1);
456
22.6k
  w1 = _mm_unpacklo_epi8(*x2, *x3);
457
22.6k
  w2 = _mm_unpacklo_epi8(*x4, *x5);
458
22.6k
  w3 = _mm_unpacklo_epi8(*x6, *x7);
459
460
22.6k
  w8 = _mm_unpackhi_epi8(*x0, *x1);
461
22.6k
  w9 = _mm_unpackhi_epi8(*x2, *x3);
462
22.6k
  w10 = _mm_unpackhi_epi8(*x4, *x5);
463
22.6k
  w11 = _mm_unpackhi_epi8(*x6, *x7);
464
465
22.6k
  w4 = _mm_unpacklo_epi16(w0, w1);
466
22.6k
  w5 = _mm_unpacklo_epi16(w2, w3);
467
22.6k
  w12 = _mm_unpacklo_epi16(w8, w9);
468
22.6k
  w13 = _mm_unpacklo_epi16(w10, w11);
469
470
22.6k
  w6 = _mm_unpacklo_epi32(w4, w5);
471
22.6k
  w7 = _mm_unpackhi_epi32(w4, w5);
472
22.6k
  w14 = _mm_unpacklo_epi32(w12, w13);
473
22.6k
  w15 = _mm_unpackhi_epi32(w12, w13);
474
475
  // Store first 4-line result
476
22.6k
  *d0d1 = _mm_unpacklo_epi64(w6, w14);
477
22.6k
  *d2d3 = _mm_unpackhi_epi64(w6, w14);
478
22.6k
  *d4d5 = _mm_unpacklo_epi64(w7, w15);
479
22.6k
  *d6d7 = _mm_unpackhi_epi64(w7, w15);
480
481
22.6k
  w4 = _mm_unpackhi_epi16(w0, w1);
482
22.6k
  w5 = _mm_unpackhi_epi16(w2, w3);
483
22.6k
  w12 = _mm_unpackhi_epi16(w8, w9);
484
22.6k
  w13 = _mm_unpackhi_epi16(w10, w11);
485
486
22.6k
  w6 = _mm_unpacklo_epi32(w4, w5);
487
22.6k
  w7 = _mm_unpackhi_epi32(w4, w5);
488
22.6k
  w14 = _mm_unpacklo_epi32(w12, w13);
489
22.6k
  w15 = _mm_unpackhi_epi32(w12, w13);
490
491
  // Store second 4-line result
492
22.6k
  *d8d9 = _mm_unpacklo_epi64(w6, w14);
493
22.6k
  *d10d11 = _mm_unpackhi_epi64(w6, w14);
494
22.6k
  *d12d13 = _mm_unpacklo_epi64(w7, w15);
495
22.6k
  *d14d15 = _mm_unpackhi_epi64(w7, w15);
496
22.6k
}
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose8x16_16x8_sse2
497
498
// Transposes two 8x8 byte blocks from memory into eight 16-byte rows.
//
// in0, in1: each points at 8 rows of 8 bytes, rows in_p bytes apart.
// out:      receives 8 rows of 16 bytes, rows out_p bytes apart. Output row c
//           holds column c of the in0 block in bytes 0..7 followed by column
//           c of the in1 block in bytes 8..15.
// Loads and stores are unaligned-safe (_mm_loadl_epi64 / _mm_storeu_si128).
// Every load is issued before the first store.
static inline void transpose_16x8(const unsigned char *in0,
                                  const unsigned char *in1, int in_p,
                                  unsigned char *out, int out_p) {
  // Byte interleave of adjacent rows of the in0 block.
  const __m128i a01 =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)in0),
                        _mm_loadl_epi64((const __m128i *)(in0 + in_p)));
  const __m128i a23 =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(in0 + 2 * in_p)),
                        _mm_loadl_epi64((const __m128i *)(in0 + 3 * in_p)));
  const __m128i a45 =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(in0 + 4 * in_p)),
                        _mm_loadl_epi64((const __m128i *)(in0 + 5 * in_p)));
  const __m128i a67 =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(in0 + 6 * in_p)),
                        _mm_loadl_epi64((const __m128i *)(in0 + 7 * in_p)));

  // Byte interleave of adjacent rows of the in1 block.
  const __m128i b01 =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)in1),
                        _mm_loadl_epi64((const __m128i *)(in1 + in_p)));
  const __m128i b23 =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(in1 + 2 * in_p)),
                        _mm_loadl_epi64((const __m128i *)(in1 + 3 * in_p)));
  const __m128i b45 =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(in1 + 4 * in_p)),
                        _mm_loadl_epi64((const __m128i *)(in1 + 5 * in_p)));
  const __m128i b67 =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(in1 + 6 * in_p)),
                        _mm_loadl_epi64((const __m128i *)(in1 + 7 * in_p)));

  // Columns 0..3 of both blocks.
  __m128i a_top = _mm_unpacklo_epi16(a01, a23);
  __m128i a_bot = _mm_unpacklo_epi16(a45, a67);
  __m128i b_top = _mm_unpacklo_epi16(b01, b23);
  __m128i b_bot = _mm_unpacklo_epi16(b45, b67);

  __m128i a_even = _mm_unpacklo_epi32(a_top, a_bot);
  __m128i a_odd = _mm_unpackhi_epi32(a_top, a_bot);
  __m128i b_even = _mm_unpacklo_epi32(b_top, b_bot);
  __m128i b_odd = _mm_unpackhi_epi32(b_top, b_bot);

  // Store first 4-line result
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(a_even, b_even));
  _mm_storeu_si128((__m128i *)(out + out_p),
                   _mm_unpackhi_epi64(a_even, b_even));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p),
                   _mm_unpacklo_epi64(a_odd, b_odd));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p),
                   _mm_unpackhi_epi64(a_odd, b_odd));

  // Columns 4..7 of both blocks.
  a_top = _mm_unpackhi_epi16(a01, a23);
  a_bot = _mm_unpackhi_epi16(a45, a67);
  b_top = _mm_unpackhi_epi16(b01, b23);
  b_bot = _mm_unpackhi_epi16(b45, b67);

  a_even = _mm_unpacklo_epi32(a_top, a_bot);
  a_odd = _mm_unpackhi_epi32(a_top, a_bot);
  b_even = _mm_unpacklo_epi32(b_top, b_bot);
  b_odd = _mm_unpackhi_epi32(b_top, b_bot);

  // Store second 4-line result
  _mm_storeu_si128((__m128i *)(out + 4 * out_p),
                   _mm_unpacklo_epi64(a_even, b_even));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p),
                   _mm_unpackhi_epi64(a_even, b_even));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p),
                   _mm_unpacklo_epi64(a_odd, b_odd));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p),
                   _mm_unpackhi_epi64(a_odd, b_odd));
}
Unexecuted instantiation: loopfilter_sse2.c:transpose_16x8
Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose_16x8
Unexecuted instantiation: intrapred_sse4.c:transpose_16x8
Unexecuted instantiation: intrapred_avx2.c:transpose_16x8
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose_16x8
566
567
// Transposes 8 rows of 16 bytes from memory into 16 rows of 8 bytes.
//
// src: 8 rows of 16 bytes, rows in_p bytes apart (unaligned loads).
// dst: 16 rows of 8 bytes, rows out_p bytes apart. Output row c holds column
//      c of the source, i.e. byte c of each of the 8 source rows in row order.
//
// In the layout comments below a letter names a source column (a..h are
// columns 0..7, A..H are columns 8..15) and a digit names a source row.
static inline void transpose_16x8_to_8x16(const unsigned char *src, int in_p,
                                          unsigned char *dst, int out_p) {
  // a0 b0 c0 d0 e0 f0 g0 h0 A0 B0 C0 D0 E0 F0 G0 H0
  // a1 b1 c1 d1 e1 f1 g1 h1 A1 B1 C1 D1 E1 F1 G1 H1
  // ...
  // a7 b7 c7 d7 e7 f7 g7 h7 A7 B7 C7 D7 E7 F7 G7 H7
  const __m128i row0 = _mm_loadu_si128((const __m128i *)(src));
  const __m128i row1 = _mm_loadu_si128((const __m128i *)(src + (1 * in_p)));
  const __m128i row2 = _mm_loadu_si128((const __m128i *)(src + (2 * in_p)));
  const __m128i row3 = _mm_loadu_si128((const __m128i *)(src + (3 * in_p)));
  const __m128i row4 = _mm_loadu_si128((const __m128i *)(src + (4 * in_p)));
  const __m128i row5 = _mm_loadu_si128((const __m128i *)(src + (5 * in_p)));
  const __m128i row6 = _mm_loadu_si128((const __m128i *)(src + (6 * in_p)));
  const __m128i row7 = _mm_loadu_si128((const __m128i *)(src + (7 * in_p)));

  // Stage 1: byte interleave of adjacent rows.
  // lo: a0 a1 b0 b1 c0 c1 d0 d1 e0 e1 f0 f1 g0 g1 h0 h1
  // hi: A0 A1 B0 B1 C0 C1 D0 D1 E0 E1 F0 F1 G0 G1 H0 H1
  // (and likewise for row pairs 2/3, 4/5, 6/7)
  const __m128i lo01 = _mm_unpacklo_epi8(row0, row1);
  const __m128i hi01 = _mm_unpackhi_epi8(row0, row1);
  const __m128i lo23 = _mm_unpacklo_epi8(row2, row3);
  const __m128i hi23 = _mm_unpackhi_epi8(row2, row3);
  const __m128i lo45 = _mm_unpacklo_epi8(row4, row5);
  const __m128i hi45 = _mm_unpackhi_epi8(row4, row5);
  const __m128i lo67 = _mm_unpacklo_epi8(row6, row7);
  const __m128i hi67 = _mm_unpackhi_epi8(row6, row7);

  // Stage 2: 16-bit interleave gathers four rows of each column.
  // abcd_03: a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3
  // efgh_03: e0 e1 e2 e3 f0 f1 f2 f3 g0 g1 g2 g3 h0 h1 h2 h3
  // The *_47 values are the same columns for rows 4..7; the "u" prefix marks
  // the upper-case columns A..H.
  const __m128i abcd_03 = _mm_unpacklo_epi16(lo01, lo23);
  const __m128i efgh_03 = _mm_unpackhi_epi16(lo01, lo23);
  const __m128i uabcd_03 = _mm_unpacklo_epi16(hi01, hi23);
  const __m128i uefgh_03 = _mm_unpackhi_epi16(hi01, hi23);
  const __m128i abcd_47 = _mm_unpacklo_epi16(lo45, lo67);
  const __m128i efgh_47 = _mm_unpackhi_epi16(lo45, lo67);
  const __m128i uabcd_47 = _mm_unpacklo_epi16(hi45, hi67);
  const __m128i uefgh_47 = _mm_unpackhi_epi16(hi45, hi67);

  // Stage 3: 32-bit interleave completes two columns per register.
  // cols_ab: a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7, and so on.
  const __m128i cols_ab = _mm_unpacklo_epi32(abcd_03, abcd_47);
  const __m128i cols_cd = _mm_unpackhi_epi32(abcd_03, abcd_47);
  const __m128i cols_ef = _mm_unpacklo_epi32(efgh_03, efgh_47);
  const __m128i cols_gh = _mm_unpackhi_epi32(efgh_03, efgh_47);
  const __m128i ucols_ab = _mm_unpacklo_epi32(uabcd_03, uabcd_47);
  const __m128i ucols_cd = _mm_unpackhi_epi32(uabcd_03, uabcd_47);
  const __m128i ucols_ef = _mm_unpacklo_epi32(uefgh_03, uefgh_47);
  const __m128i ucols_gh = _mm_unpackhi_epi32(uefgh_03, uefgh_47);

  // Each register carries two output rows: low half, then high half.
  mm_storelu(dst, cols_ab);
  mm_storehu(dst + (1 * out_p), cols_ab);
  mm_storelu(dst + (2 * out_p), cols_cd);
  mm_storehu(dst + (3 * out_p), cols_cd);
  mm_storelu(dst + (4 * out_p), cols_ef);
  mm_storehu(dst + (5 * out_p), cols_ef);
  mm_storelu(dst + (6 * out_p), cols_gh);
  mm_storehu(dst + (7 * out_p), cols_gh);
  mm_storelu(dst + (8 * out_p), ucols_ab);
  mm_storehu(dst + (9 * out_p), ucols_ab);
  mm_storelu(dst + (10 * out_p), ucols_cd);
  mm_storehu(dst + (11 * out_p), ucols_cd);
  mm_storelu(dst + (12 * out_p), ucols_ef);
  mm_storehu(dst + (13 * out_p), ucols_ef);
  mm_storelu(dst + (14 * out_p), ucols_gh);
  mm_storehu(dst + (15 * out_p), ucols_gh);
}
Unexecuted instantiation: loopfilter_sse2.c:transpose_16x8_to_8x16
Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose_16x8_to_8x16
Unexecuted instantiation: intrapred_sse4.c:transpose_16x8_to_8x16
Unexecuted instantiation: intrapred_avx2.c:transpose_16x8_to_8x16
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose_16x8_to_8x16
654
655
// Transposes one or more independent 8x8 byte blocks.
//
// For each index i in [0, num_8x8_to_transpose), reads 8 rows of 8 bytes from
// src[i] (rows in_p bytes apart) and writes the transposed block as 8 rows of
// 8 bytes to dst[i] (rows out_p bytes apart). The loop body runs before the
// count is tested, so block 0 is always processed.
static inline void transpose_8xn(unsigned char *src[], int in_p,
                                 unsigned char *dst[], int out_p,
                                 int num_8x8_to_transpose) {
  int block = 0;
  do {
    unsigned char *const in = src[block];
    unsigned char *const out = dst[block];

    // Load the eight source rows (row r = r0 r1 r2 r3 r4 r5 r6 r7).
    const __m128i row0 = _mm_loadl_epi64((__m128i *)(in + 0 * in_p));
    const __m128i row1 = _mm_loadl_epi64((__m128i *)(in + 1 * in_p));
    const __m128i row2 = _mm_loadl_epi64((__m128i *)(in + 2 * in_p));
    const __m128i row3 = _mm_loadl_epi64((__m128i *)(in + 3 * in_p));
    const __m128i row4 = _mm_loadl_epi64((__m128i *)(in + 4 * in_p));
    const __m128i row5 = _mm_loadl_epi64((__m128i *)(in + 5 * in_p));
    const __m128i row6 = _mm_loadl_epi64((__m128i *)(in + 6 * in_p));
    const __m128i row7 = _mm_loadl_epi64((__m128i *)(in + 7 * in_p));

    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    const __m128i rows01 = _mm_unpacklo_epi8(row0, row1);
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    const __m128i rows23 = _mm_unpacklo_epi8(row2, row3);
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    const __m128i rows45 = _mm_unpacklo_epi8(row4, row5);
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    const __m128i rows67 = _mm_unpacklo_epi8(row6, row7);

    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    const __m128i left_top = _mm_unpacklo_epi16(rows01, rows23);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    const __m128i left_bot = _mm_unpacklo_epi16(rows45, rows67);
    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    const __m128i right_top = _mm_unpackhi_epi16(rows01, rows23);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    const __m128i right_bot = _mm_unpackhi_epi16(rows45, rows67);

    // Each register below holds two finished output rows (low | high half).
    const __m128i cols01 = _mm_unpacklo_epi32(left_top, left_bot);
    const __m128i cols23 = _mm_unpackhi_epi32(left_top, left_bot);
    const __m128i cols45 = _mm_unpacklo_epi32(right_top, right_bot);
    const __m128i cols67 = _mm_unpackhi_epi32(right_top, right_bot);

    mm_storelu(out + 0 * out_p, cols01);  // 00 10 20 30 40 50 60 70
    mm_storehu(out + 1 * out_p, cols01);  // 01 11 21 31 41 51 61 71
    mm_storelu(out + 2 * out_p, cols23);  // 02 12 22 32 42 52 62 72
    mm_storehu(out + 3 * out_p, cols23);  // 03 13 23 33 43 53 63 73
    mm_storelu(out + 4 * out_p, cols45);  // 04 14 24 34 44 54 64 74
    mm_storehu(out + 5 * out_p, cols45);  // 05 15 25 35 45 55 65 75
    mm_storelu(out + 6 * out_p, cols67);  // 06 16 26 36 46 56 66 76
    mm_storehu(out + 7 * out_p, cols67);  // 07 17 27 37 47 57 67 77

    ++block;
  } while (block < num_8x8_to_transpose);
}
Unexecuted instantiation: loopfilter_sse2.c:transpose_8xn
Unexecuted instantiation: highbd_loopfilter_sse2.c:transpose_8xn
Unexecuted instantiation: intrapred_sse4.c:transpose_8xn
Unexecuted instantiation: intrapred_avx2.c:transpose_8xn
Unexecuted instantiation: highbd_loopfilter_avx2.c:transpose_8xn
720
721
#endif  // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_