Coverage Report

Created: 2026-06-07 06:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libvpx/vpx_dsp/x86/transpose_sse2.h
Line
Count
Source
1
/*
2
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#ifndef VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
12
#define VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_
13
14
#include <emmintrin.h>  // SSE2
15
16
#include "./vpx_config.h"
17
18
5.70M
static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
19
  // Unpack 8 bit elements. Goes from:
20
  // in[0]: 00 01 02 03
21
  // in[1]: 10 11 12 13
22
  // in[2]: 20 21 22 23
23
  // in[3]: 30 31 32 33
24
  // to:
25
  // a0:    00 10 01 11  02 12 03 13
26
  // a1:    20 30 21 31  22 32 23 33
27
5.70M
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
28
5.70M
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
29
30
  // Unpack 16 bit elements resulting in:
31
  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
32
5.70M
  return _mm_unpacklo_epi16(a0, a1);
33
5.70M
}
vpx_subpixel_8t_intrin_ssse3.c:transpose_8bit_4x4
Line
Count
Source
18
5.70M
static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
19
  // Unpack 8 bit elements. Goes from:
20
  // in[0]: 00 01 02 03
21
  // in[1]: 10 11 12 13
22
  // in[2]: 20 21 22 23
23
  // in[3]: 30 31 32 33
24
  // to:
25
  // a0:    00 10 01 11  02 12 03 13
26
  // a1:    20 30 21 31  22 32 23 33
27
5.70M
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
28
5.70M
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
29
30
  // Unpack 16 bit elements resulting in:
31
  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
32
5.70M
  return _mm_unpacklo_epi16(a0, a1);
33
5.70M
}
Unexecuted instantiation: inv_txfm_sse2.c:transpose_8bit_4x4
Unexecuted instantiation: inv_txfm_ssse3.c:transpose_8bit_4x4
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_8bit_4x4
Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_8bit_4x4
Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_8bit_4x4
Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_8bit_4x4
Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_8bit_4x4
Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_8bit_4x4
Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_8bit_4x4
Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_8bit_4x4
Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_8bit_4x4
Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_8bit_4x4
Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_8bit_4x4
Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:transpose_8bit_4x4
Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_8bit_4x4
Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_8bit_4x4
34
35
static INLINE void transpose_8bit_8x8(const __m128i *const in,
36
24.3M
                                      __m128i *const out) {
37
  // Unpack 8 bit elements. Goes from:
38
  // in[0]: 00 01 02 03 04 05 06 07
39
  // in[1]: 10 11 12 13 14 15 16 17
40
  // in[2]: 20 21 22 23 24 25 26 27
41
  // in[3]: 30 31 32 33 34 35 36 37
42
  // in[4]: 40 41 42 43 44 45 46 47
43
  // in[5]: 50 51 52 53 54 55 56 57
44
  // in[6]: 60 61 62 63 64 65 66 67
45
  // in[7]: 70 71 72 73 74 75 76 77
46
  // to:
47
  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
48
  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
49
  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
50
  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
51
24.3M
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
52
24.3M
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
53
24.3M
  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
54
24.3M
  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
55
56
  // Unpack 16 bit elements resulting in:
57
  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
58
  // b1: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
59
  // b2: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
60
  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
61
24.3M
  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
62
24.3M
  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
63
24.3M
  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
64
24.3M
  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
65
66
  // Unpack 32 bit elements resulting in:
67
  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
68
  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
69
  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
70
  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
71
24.3M
  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
72
24.3M
  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
73
24.3M
  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
74
24.3M
  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
75
76
  // Unpack 64 bit elements resulting in:
77
  // out[0]: 00 10 20 30 40 50 60 70
78
  // out[1]: 01 11 21 31 41 51 61 71
79
  // out[2]: 02 12 22 32 42 52 62 72
80
  // out[3]: 03 13 23 33 43 53 63 73
81
  // out[4]: 04 14 24 34 44 54 64 74
82
  // out[5]: 05 15 25 35 45 55 65 75
83
  // out[6]: 06 16 26 36 46 56 66 76
84
  // out[7]: 07 17 27 37 47 57 67 77
85
24.3M
  out[0] = _mm_unpacklo_epi64(c0, c0);
86
24.3M
  out[1] = _mm_unpackhi_epi64(c0, c0);
87
24.3M
  out[2] = _mm_unpacklo_epi64(c1, c1);
88
24.3M
  out[3] = _mm_unpackhi_epi64(c1, c1);
89
24.3M
  out[4] = _mm_unpacklo_epi64(c2, c2);
90
24.3M
  out[5] = _mm_unpackhi_epi64(c2, c2);
91
24.3M
  out[6] = _mm_unpacklo_epi64(c3, c3);
92
24.3M
  out[7] = _mm_unpackhi_epi64(c3, c3);
93
24.3M
}
vpx_subpixel_8t_intrin_ssse3.c:transpose_8bit_8x8
Line
Count
Source
36
24.3M
                                      __m128i *const out) {
37
  // Unpack 8 bit elements. Goes from:
38
  // in[0]: 00 01 02 03 04 05 06 07
39
  // in[1]: 10 11 12 13 14 15 16 17
40
  // in[2]: 20 21 22 23 24 25 26 27
41
  // in[3]: 30 31 32 33 34 35 36 37
42
  // in[4]: 40 41 42 43 44 45 46 47
43
  // in[5]: 50 51 52 53 54 55 56 57
44
  // in[6]: 60 61 62 63 64 65 66 67
45
  // in[7]: 70 71 72 73 74 75 76 77
46
  // to:
47
  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
48
  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
49
  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
50
  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
51
24.3M
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
52
24.3M
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
53
24.3M
  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
54
24.3M
  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
55
56
  // Unpack 16 bit elements resulting in:
57
  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
58
  // b1: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
59
  // b2: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
60
  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
61
24.3M
  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
62
24.3M
  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
63
24.3M
  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
64
24.3M
  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
65
66
  // Unpack 32 bit elements resulting in:
67
  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
68
  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
69
  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
70
  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
71
24.3M
  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
72
24.3M
  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
73
24.3M
  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
74
24.3M
  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
75
76
  // Unpack 64 bit elements resulting in:
77
  // out[0]: 00 10 20 30 40 50 60 70
78
  // out[1]: 01 11 21 31 41 51 61 71
79
  // out[2]: 02 12 22 32 42 52 62 72
80
  // out[3]: 03 13 23 33 43 53 63 73
81
  // out[4]: 04 14 24 34 44 54 64 74
82
  // out[5]: 05 15 25 35 45 55 65 75
83
  // out[6]: 06 16 26 36 46 56 66 76
84
  // out[7]: 07 17 27 37 47 57 67 77
85
24.3M
  out[0] = _mm_unpacklo_epi64(c0, c0);
86
24.3M
  out[1] = _mm_unpackhi_epi64(c0, c0);
87
24.3M
  out[2] = _mm_unpacklo_epi64(c1, c1);
88
24.3M
  out[3] = _mm_unpackhi_epi64(c1, c1);
89
24.3M
  out[4] = _mm_unpacklo_epi64(c2, c2);
90
24.3M
  out[5] = _mm_unpackhi_epi64(c2, c2);
91
24.3M
  out[6] = _mm_unpacklo_epi64(c3, c3);
92
24.3M
  out[7] = _mm_unpackhi_epi64(c3, c3);
93
24.3M
}
Unexecuted instantiation: inv_txfm_sse2.c:transpose_8bit_8x8
Unexecuted instantiation: inv_txfm_ssse3.c:transpose_8bit_8x8
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_8bit_8x8
Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_8bit_8x8
Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_8bit_8x8
Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_8bit_8x8
Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_8bit_8x8
Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_8bit_8x8
Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_8bit_8x8
Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_8bit_8x8
Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_8bit_8x8
Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_8bit_8x8
Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_8bit_8x8
Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:transpose_8bit_8x8
Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_8bit_8x8
Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_8bit_8x8
94
95
static INLINE void transpose_16bit_4x4(const __m128i *const in,
96
22.8M
                                       __m128i *const out) {
97
  // Unpack 16 bit elements. Goes from:
98
  // in[0]: 00 01 02 03  XX XX XX XX
99
  // in[1]: 10 11 12 13  XX XX XX XX
100
  // in[2]: 20 21 22 23  XX XX XX XX
101
  // in[3]: 30 31 32 33  XX XX XX XX
102
  // to:
103
  // a0:    00 10 01 11  02 12 03 13
104
  // a1:    20 30 21 31  22 32 23 33
105
22.8M
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
106
22.8M
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
107
108
  // Unpack 32 bit elements resulting in:
109
  // out[0]: 00 10 20 30  01 11 21 31
110
  // out[1]: 02 12 22 32  03 13 23 33
111
22.8M
  out[0] = _mm_unpacklo_epi32(a0, a1);
112
22.8M
  out[1] = _mm_unpackhi_epi32(a0, a1);
113
22.8M
}
vpx_subpixel_8t_intrin_ssse3.c:transpose_16bit_4x4
Line
Count
Source
96
20.3M
                                       __m128i *const out) {
97
  // Unpack 16 bit elements. Goes from:
98
  // in[0]: 00 01 02 03  XX XX XX XX
99
  // in[1]: 10 11 12 13  XX XX XX XX
100
  // in[2]: 20 21 22 23  XX XX XX XX
101
  // in[3]: 30 31 32 33  XX XX XX XX
102
  // to:
103
  // a0:    00 10 01 11  02 12 03 13
104
  // a1:    20 30 21 31  22 32 23 33
105
20.3M
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
106
20.3M
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
107
108
  // Unpack 32 bit elements resulting in:
109
  // out[0]: 00 10 20 30  01 11 21 31
110
  // out[1]: 02 12 22 32  03 13 23 33
111
20.3M
  out[0] = _mm_unpacklo_epi32(a0, a1);
112
20.3M
  out[1] = _mm_unpackhi_epi32(a0, a1);
113
20.3M
}
inv_txfm_sse2.c:transpose_16bit_4x4
Line
Count
Source
96
692k
                                       __m128i *const out) {
97
  // Unpack 16 bit elements. Goes from:
98
  // in[0]: 00 01 02 03  XX XX XX XX
99
  // in[1]: 10 11 12 13  XX XX XX XX
100
  // in[2]: 20 21 22 23  XX XX XX XX
101
  // in[3]: 30 31 32 33  XX XX XX XX
102
  // to:
103
  // a0:    00 10 01 11  02 12 03 13
104
  // a1:    20 30 21 31  22 32 23 33
105
692k
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
106
692k
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
107
108
  // Unpack 32 bit elements resulting in:
109
  // out[0]: 00 10 20 30  01 11 21 31
110
  // out[1]: 02 12 22 32  03 13 23 33
111
692k
  out[0] = _mm_unpacklo_epi32(a0, a1);
112
692k
  out[1] = _mm_unpackhi_epi32(a0, a1);
113
692k
}
inv_txfm_ssse3.c:transpose_16bit_4x4
Line
Count
Source
96
1.77M
                                       __m128i *const out) {
97
  // Unpack 16 bit elements. Goes from:
98
  // in[0]: 00 01 02 03  XX XX XX XX
99
  // in[1]: 10 11 12 13  XX XX XX XX
100
  // in[2]: 20 21 22 23  XX XX XX XX
101
  // in[3]: 30 31 32 33  XX XX XX XX
102
  // to:
103
  // a0:    00 10 01 11  02 12 03 13
104
  // a1:    20 30 21 31  22 32 23 33
105
1.77M
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
106
1.77M
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
107
108
  // Unpack 32 bit elements resulting in:
109
  // out[0]: 00 10 20 30  01 11 21 31
110
  // out[1]: 02 12 22 32  03 13 23 33
111
1.77M
  out[0] = _mm_unpacklo_epi32(a0, a1);
112
1.77M
  out[1] = _mm_unpackhi_epi32(a0, a1);
113
1.77M
}
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_16bit_4x4
Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_16bit_4x4
Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_16bit_4x4
Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_16bit_4x4
Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_16bit_4x4
Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_16bit_4x4
Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_16bit_4x4
Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_16bit_4x4
Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_16bit_4x4
Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_16bit_4x4
Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_16bit_4x4
Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:transpose_16bit_4x4
Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_16bit_4x4
Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_16bit_4x4
114
115
static INLINE void transpose_16bit_4x8(const __m128i *const in,
116
181M
                                       __m128i *const out) {
117
  // Unpack 16 bit elements. Goes from:
118
  // in[0]: 00 01 02 03  XX XX XX XX
119
  // in[1]: 10 11 12 13  XX XX XX XX
120
  // in[2]: 20 21 22 23  XX XX XX XX
121
  // in[3]: 30 31 32 33  XX XX XX XX
122
  // in[4]: 40 41 42 43  XX XX XX XX
123
  // in[5]: 50 51 52 53  XX XX XX XX
124
  // in[6]: 60 61 62 63  XX XX XX XX
125
  // in[7]: 70 71 72 73  XX XX XX XX
126
  // to:
127
  // a0:    00 10 01 11  02 12 03 13
128
  // a1:    20 30 21 31  22 32 23 33
129
  // a2:    40 50 41 51  42 52 43 53
130
  // a3:    60 70 61 71  62 72 63 73
131
181M
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
132
181M
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
133
181M
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
134
181M
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
135
136
  // Unpack 32 bit elements resulting in:
137
  // b0: 00 10 20 30  01 11 21 31
138
  // b1: 40 50 60 70  41 51 61 71
139
  // b2: 02 12 22 32  03 13 23 33
140
  // b3: 42 52 62 72  43 53 63 73
141
181M
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
142
181M
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
143
181M
  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
144
181M
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
145
146
  // Unpack 64 bit elements resulting in:
147
  // out[0]: 00 10 20 30  40 50 60 70
148
  // out[1]: 01 11 21 31  41 51 61 71
149
  // out[2]: 02 12 22 32  42 52 62 72
150
  // out[3]: 03 13 23 33  43 53 63 73
151
181M
  out[0] = _mm_unpacklo_epi64(b0, b1);
152
181M
  out[1] = _mm_unpackhi_epi64(b0, b1);
153
181M
  out[2] = _mm_unpacklo_epi64(b2, b3);
154
181M
  out[3] = _mm_unpackhi_epi64(b2, b3);
155
181M
}
vpx_subpixel_8t_intrin_ssse3.c:transpose_16bit_4x8
Line
Count
Source
116
180M
                                       __m128i *const out) {
117
  // Unpack 16 bit elements. Goes from:
118
  // in[0]: 00 01 02 03  XX XX XX XX
119
  // in[1]: 10 11 12 13  XX XX XX XX
120
  // in[2]: 20 21 22 23  XX XX XX XX
121
  // in[3]: 30 31 32 33  XX XX XX XX
122
  // in[4]: 40 41 42 43  XX XX XX XX
123
  // in[5]: 50 51 52 53  XX XX XX XX
124
  // in[6]: 60 61 62 63  XX XX XX XX
125
  // in[7]: 70 71 72 73  XX XX XX XX
126
  // to:
127
  // a0:    00 10 01 11  02 12 03 13
128
  // a1:    20 30 21 31  22 32 23 33
129
  // a2:    40 50 41 51  42 52 43 53
130
  // a3:    60 70 61 71  62 72 63 73
131
180M
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
132
180M
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
133
180M
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
134
180M
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
135
136
  // Unpack 32 bit elements resulting in:
137
  // b0: 00 10 20 30  01 11 21 31
138
  // b1: 40 50 60 70  41 51 61 71
139
  // b2: 02 12 22 32  03 13 23 33
140
  // b3: 42 52 62 72  43 53 63 73
141
180M
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
142
180M
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
143
180M
  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
144
180M
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
145
146
  // Unpack 64 bit elements resulting in:
147
  // out[0]: 00 10 20 30  40 50 60 70
148
  // out[1]: 01 11 21 31  41 51 61 71
149
  // out[2]: 02 12 22 32  42 52 62 72
150
  // out[3]: 03 13 23 33  43 53 63 73
151
180M
  out[0] = _mm_unpacklo_epi64(b0, b1);
152
180M
  out[1] = _mm_unpackhi_epi64(b0, b1);
153
180M
  out[2] = _mm_unpacklo_epi64(b2, b3);
154
180M
  out[3] = _mm_unpackhi_epi64(b2, b3);
155
180M
}
inv_txfm_sse2.c:transpose_16bit_4x8
Line
Count
Source
116
1.38M
                                       __m128i *const out) {
117
  // Unpack 16 bit elements. Goes from:
118
  // in[0]: 00 01 02 03  XX XX XX XX
119
  // in[1]: 10 11 12 13  XX XX XX XX
120
  // in[2]: 20 21 22 23  XX XX XX XX
121
  // in[3]: 30 31 32 33  XX XX XX XX
122
  // in[4]: 40 41 42 43  XX XX XX XX
123
  // in[5]: 50 51 52 53  XX XX XX XX
124
  // in[6]: 60 61 62 63  XX XX XX XX
125
  // in[7]: 70 71 72 73  XX XX XX XX
126
  // to:
127
  // a0:    00 10 01 11  02 12 03 13
128
  // a1:    20 30 21 31  22 32 23 33
129
  // a2:    40 50 41 51  42 52 43 53
130
  // a3:    60 70 61 71  62 72 63 73
131
1.38M
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
132
1.38M
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
133
1.38M
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
134
1.38M
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
135
136
  // Unpack 32 bit elements resulting in:
137
  // b0: 00 10 20 30  01 11 21 31
138
  // b1: 40 50 60 70  41 51 61 71
139
  // b2: 02 12 22 32  03 13 23 33
140
  // b3: 42 52 62 72  43 53 63 73
141
1.38M
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
142
1.38M
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
143
1.38M
  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
144
1.38M
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
145
146
  // Unpack 64 bit elements resulting in:
147
  // out[0]: 00 10 20 30  40 50 60 70
148
  // out[1]: 01 11 21 31  41 51 61 71
149
  // out[2]: 02 12 22 32  42 52 62 72
150
  // out[3]: 03 13 23 33  43 53 63 73
151
1.38M
  out[0] = _mm_unpacklo_epi64(b0, b1);
152
1.38M
  out[1] = _mm_unpackhi_epi64(b0, b1);
153
1.38M
  out[2] = _mm_unpacklo_epi64(b2, b3);
154
1.38M
  out[3] = _mm_unpackhi_epi64(b2, b3);
155
1.38M
}
Unexecuted instantiation: inv_txfm_ssse3.c:transpose_16bit_4x8
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_16bit_4x8
Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_16bit_4x8
Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_16bit_4x8
Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_16bit_4x8
Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_16bit_4x8
Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_16bit_4x8
Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_16bit_4x8
Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_16bit_4x8
Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_16bit_4x8
Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_16bit_4x8
Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_16bit_4x8
Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:transpose_16bit_4x8
Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_16bit_4x8
Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_16bit_4x8
156
157
static INLINE void transpose_16bit_8x8(const __m128i *const in,
158
257M
                                       __m128i *const out) {
159
  // Unpack 16 bit elements. Goes from:
160
  // in[0]: 00 01 02 03  04 05 06 07
161
  // in[1]: 10 11 12 13  14 15 16 17
162
  // in[2]: 20 21 22 23  24 25 26 27
163
  // in[3]: 30 31 32 33  34 35 36 37
164
  // in[4]: 40 41 42 43  44 45 46 47
165
  // in[5]: 50 51 52 53  54 55 56 57
166
  // in[6]: 60 61 62 63  64 65 66 67
167
  // in[7]: 70 71 72 73  74 75 76 77
168
  // to:
169
  // a0:    00 10 01 11  02 12 03 13
170
  // a1:    20 30 21 31  22 32 23 33
171
  // a2:    40 50 41 51  42 52 43 53
172
  // a3:    60 70 61 71  62 72 63 73
173
  // a4:    04 14 05 15  06 16 07 17
174
  // a5:    24 34 25 35  26 36 27 37
175
  // a6:    44 54 45 55  46 56 47 57
176
  // a7:    64 74 65 75  66 76 67 77
177
257M
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
178
257M
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
179
257M
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
180
257M
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
181
257M
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
182
257M
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
183
257M
  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
184
257M
  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
185
186
  // Unpack 32 bit elements resulting in:
187
  // b0: 00 10 20 30  01 11 21 31
188
  // b1: 40 50 60 70  41 51 61 71
189
  // b2: 04 14 24 34  05 15 25 35
190
  // b3: 44 54 64 74  45 55 65 75
191
  // b4: 02 12 22 32  03 13 23 33
192
  // b5: 42 52 62 72  43 53 63 73
193
  // b6: 06 16 26 36  07 17 27 37
194
  // b7: 46 56 66 76  47 57 67 77
195
257M
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
196
257M
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
197
257M
  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
198
257M
  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
199
257M
  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
200
257M
  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
201
257M
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
202
257M
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
203
204
  // Unpack 64 bit elements resulting in:
205
  // out[0]: 00 10 20 30  40 50 60 70
206
  // out[1]: 01 11 21 31  41 51 61 71
207
  // out[2]: 02 12 22 32  42 52 62 72
208
  // out[3]: 03 13 23 33  43 53 63 73
209
  // out[4]: 04 14 24 34  44 54 64 74
210
  // out[5]: 05 15 25 35  45 55 65 75
211
  // out[6]: 06 16 26 36  46 56 66 76
212
  // out[7]: 07 17 27 37  47 57 67 77
213
257M
  out[0] = _mm_unpacklo_epi64(b0, b1);
214
257M
  out[1] = _mm_unpackhi_epi64(b0, b1);
215
257M
  out[2] = _mm_unpacklo_epi64(b4, b5);
216
257M
  out[3] = _mm_unpackhi_epi64(b4, b5);
217
257M
  out[4] = _mm_unpacklo_epi64(b2, b3);
218
257M
  out[5] = _mm_unpackhi_epi64(b2, b3);
219
257M
  out[6] = _mm_unpacklo_epi64(b6, b7);
220
257M
  out[7] = _mm_unpackhi_epi64(b6, b7);
221
257M
}
Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:transpose_16bit_8x8
inv_txfm_sse2.c:transpose_16bit_8x8
Line
Count
Source
158
162M
                                       __m128i *const out) {
159
  // Unpack 16 bit elements. Goes from:
160
  // in[0]: 00 01 02 03  04 05 06 07
161
  // in[1]: 10 11 12 13  14 15 16 17
162
  // in[2]: 20 21 22 23  24 25 26 27
163
  // in[3]: 30 31 32 33  34 35 36 37
164
  // in[4]: 40 41 42 43  44 45 46 47
165
  // in[5]: 50 51 52 53  54 55 56 57
166
  // in[6]: 60 61 62 63  64 65 66 67
167
  // in[7]: 70 71 72 73  74 75 76 77
168
  // to:
169
  // a0:    00 10 01 11  02 12 03 13
170
  // a1:    20 30 21 31  22 32 23 33
171
  // a2:    40 50 41 51  42 52 43 53
172
  // a3:    60 70 61 71  62 72 63 73
173
  // a4:    04 14 05 15  06 16 07 17
174
  // a5:    24 34 25 35  26 36 27 37
175
  // a6:    44 54 45 55  46 56 47 57
176
  // a7:    64 74 65 75  66 76 67 77
177
162M
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
178
162M
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
179
162M
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
180
162M
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
181
162M
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
182
162M
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
183
162M
  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
184
162M
  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
185
186
  // Unpack 32 bit elements resulting in:
187
  // b0: 00 10 20 30  01 11 21 31
188
  // b1: 40 50 60 70  41 51 61 71
189
  // b2: 04 14 24 34  05 15 25 35
190
  // b3: 44 54 64 74  45 55 65 75
191
  // b4: 02 12 22 32  03 13 23 33
192
  // b5: 42 52 62 72  43 53 63 73
193
  // b6: 06 16 26 36  07 17 27 37
194
  // b7: 46 56 66 76  47 57 67 77
195
162M
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
196
162M
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
197
162M
  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
198
162M
  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
199
162M
  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
200
162M
  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
201
162M
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
202
162M
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
203
204
  // Unpack 64 bit elements resulting in:
205
  // out[0]: 00 10 20 30  40 50 60 70
206
  // out[1]: 01 11 21 31  41 51 61 71
207
  // out[2]: 02 12 22 32  42 52 62 72
208
  // out[3]: 03 13 23 33  43 53 63 73
209
  // out[4]: 04 14 24 34  44 54 64 74
210
  // out[5]: 05 15 25 35  45 55 65 75
211
  // out[6]: 06 16 26 36  46 56 66 76
212
  // out[7]: 07 17 27 37  47 57 67 77
213
162M
  out[0] = _mm_unpacklo_epi64(b0, b1);
214
162M
  out[1] = _mm_unpackhi_epi64(b0, b1);
215
162M
  out[2] = _mm_unpacklo_epi64(b4, b5);
216
162M
  out[3] = _mm_unpackhi_epi64(b4, b5);
217
162M
  out[4] = _mm_unpacklo_epi64(b2, b3);
218
162M
  out[5] = _mm_unpackhi_epi64(b2, b3);
219
162M
  out[6] = _mm_unpacklo_epi64(b6, b7);
220
162M
  out[7] = _mm_unpackhi_epi64(b6, b7);
221
162M
}
inv_txfm_ssse3.c:transpose_16bit_8x8
Line
Count
Source
158
3.81M
                                       __m128i *const out) {
159
  // Unpack 16 bit elements. Goes from:
160
  // in[0]: 00 01 02 03  04 05 06 07
161
  // in[1]: 10 11 12 13  14 15 16 17
162
  // in[2]: 20 21 22 23  24 25 26 27
163
  // in[3]: 30 31 32 33  34 35 36 37
164
  // in[4]: 40 41 42 43  44 45 46 47
165
  // in[5]: 50 51 52 53  54 55 56 57
166
  // in[6]: 60 61 62 63  64 65 66 67
167
  // in[7]: 70 71 72 73  74 75 76 77
168
  // to:
169
  // a0:    00 10 01 11  02 12 03 13
170
  // a1:    20 30 21 31  22 32 23 33
171
  // a2:    40 50 41 51  42 52 43 53
172
  // a3:    60 70 61 71  62 72 63 73
173
  // a4:    04 14 05 15  06 16 07 17
174
  // a5:    24 34 25 35  26 36 27 37
175
  // a6:    44 54 45 55  46 56 47 57
176
  // a7:    64 74 65 75  66 76 67 77
177
3.81M
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
178
3.81M
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
179
3.81M
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
180
3.81M
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
181
3.81M
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
182
3.81M
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
183
3.81M
  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
184
3.81M
  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
185
186
  // Unpack 32 bit elements resulting in:
187
  // b0: 00 10 20 30  01 11 21 31
188
  // b1: 40 50 60 70  41 51 61 71
189
  // b2: 04 14 24 34  05 15 25 35
190
  // b3: 44 54 64 74  45 55 65 75
191
  // b4: 02 12 22 32  03 13 23 33
192
  // b5: 42 52 62 72  43 53 63 73
193
  // b6: 06 16 26 36  07 17 27 37
194
  // b7: 46 56 66 76  47 57 67 77
195
3.81M
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
196
3.81M
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
197
3.81M
  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
198
3.81M
  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
199
3.81M
  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
200
3.81M
  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
201
3.81M
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
202
3.81M
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
203
204
  // Unpack 64 bit elements resulting in:
205
  // out[0]: 00 10 20 30  40 50 60 70
206
  // out[1]: 01 11 21 31  41 51 61 71
207
  // out[2]: 02 12 22 32  42 52 62 72
208
  // out[3]: 03 13 23 33  43 53 63 73
209
  // out[4]: 04 14 24 34  44 54 64 74
210
  // out[5]: 05 15 25 35  45 55 65 75
211
  // out[6]: 06 16 26 36  46 56 66 76
212
  // out[7]: 07 17 27 37  47 57 67 77
213
3.81M
  out[0] = _mm_unpacklo_epi64(b0, b1);
214
3.81M
  out[1] = _mm_unpackhi_epi64(b0, b1);
215
3.81M
  out[2] = _mm_unpacklo_epi64(b4, b5);
216
3.81M
  out[3] = _mm_unpackhi_epi64(b4, b5);
217
3.81M
  out[4] = _mm_unpacklo_epi64(b2, b3);
218
3.81M
  out[5] = _mm_unpackhi_epi64(b2, b3);
219
3.81M
  out[6] = _mm_unpacklo_epi64(b6, b7);
220
3.81M
  out[7] = _mm_unpackhi_epi64(b6, b7);
221
3.81M
}
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_16bit_8x8
Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_16bit_8x8
Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_16bit_8x8
Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_16bit_8x8
Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_16bit_8x8
Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_16bit_8x8
Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_16bit_8x8
Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_16bit_8x8
Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_16bit_8x8
Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_16bit_8x8
Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_16bit_8x8
Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:transpose_16bit_8x8
vp9_dct_intrin_sse2.c:transpose_16bit_8x8
Line
Count
Source
158
91.6M
                                       __m128i *const out) {
159
  // Unpack 16 bit elements. Goes from:
160
  // in[0]: 00 01 02 03  04 05 06 07
161
  // in[1]: 10 11 12 13  14 15 16 17
162
  // in[2]: 20 21 22 23  24 25 26 27
163
  // in[3]: 30 31 32 33  34 35 36 37
164
  // in[4]: 40 41 42 43  44 45 46 47
165
  // in[5]: 50 51 52 53  54 55 56 57
166
  // in[6]: 60 61 62 63  64 65 66 67
167
  // in[7]: 70 71 72 73  74 75 76 77
168
  // to:
169
  // a0:    00 10 01 11  02 12 03 13
170
  // a1:    20 30 21 31  22 32 23 33
171
  // a2:    40 50 41 51  42 52 43 53
172
  // a3:    60 70 61 71  62 72 63 73
173
  // a4:    04 14 05 15  06 16 07 17
174
  // a5:    24 34 25 35  26 36 27 37
175
  // a6:    44 54 45 55  46 56 47 57
176
  // a7:    64 74 65 75  66 76 67 77
177
91.6M
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
178
91.6M
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
179
91.6M
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
180
91.6M
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
181
91.6M
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
182
91.6M
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
183
91.6M
  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
184
91.6M
  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
185
186
  // Unpack 32 bit elements resulting in:
187
  // b0: 00 10 20 30  01 11 21 31
188
  // b1: 40 50 60 70  41 51 61 71
189
  // b2: 04 14 24 34  05 15 25 35
190
  // b3: 44 54 64 74  45 55 65 75
191
  // b4: 02 12 22 32  03 13 23 33
192
  // b5: 42 52 62 72  43 53 63 73
193
  // b6: 06 16 26 36  07 17 27 37
194
  // b7: 46 56 66 76  47 57 67 77
195
91.6M
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
196
91.6M
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
197
91.6M
  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
198
91.6M
  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
199
91.6M
  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
200
91.6M
  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
201
91.6M
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
202
91.6M
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
203
204
  // Unpack 64 bit elements resulting in:
205
  // out[0]: 00 10 20 30  40 50 60 70
206
  // out[1]: 01 11 21 31  41 51 61 71
207
  // out[2]: 02 12 22 32  42 52 62 72
208
  // out[3]: 03 13 23 33  43 53 63 73
209
  // out[4]: 04 14 24 34  44 54 64 74
210
  // out[5]: 05 15 25 35  45 55 65 75
211
  // out[6]: 06 16 26 36  46 56 66 76
212
  // out[7]: 07 17 27 37  47 57 67 77
213
91.6M
  out[0] = _mm_unpacklo_epi64(b0, b1);
214
91.6M
  out[1] = _mm_unpackhi_epi64(b0, b1);
215
91.6M
  out[2] = _mm_unpacklo_epi64(b4, b5);
216
91.6M
  out[3] = _mm_unpackhi_epi64(b4, b5);
217
91.6M
  out[4] = _mm_unpacklo_epi64(b2, b3);
218
91.6M
  out[5] = _mm_unpackhi_epi64(b2, b3);
219
91.6M
  out[6] = _mm_unpacklo_epi64(b6, b7);
220
91.6M
  out[7] = _mm_unpackhi_epi64(b6, b7);
221
91.6M
}
Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_16bit_8x8
222
223
// Transpose in-place
224
static INLINE void transpose_16bit_16x16(__m128i *const left,
225
22.6M
                                         __m128i *const right) {
226
22.6M
  __m128i tbuf[8];
227
22.6M
  transpose_16bit_8x8(left, left);
228
22.6M
  transpose_16bit_8x8(right, tbuf);
229
22.6M
  transpose_16bit_8x8(left + 8, right);
230
22.6M
  transpose_16bit_8x8(right + 8, right + 8);
231
232
22.6M
  left[8] = tbuf[0];
233
22.6M
  left[9] = tbuf[1];
234
22.6M
  left[10] = tbuf[2];
235
22.6M
  left[11] = tbuf[3];
236
22.6M
  left[12] = tbuf[4];
237
22.6M
  left[13] = tbuf[5];
238
22.6M
  left[14] = tbuf[6];
239
22.6M
  left[15] = tbuf[7];
240
22.6M
}
Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:transpose_16bit_16x16
inv_txfm_sse2.c:transpose_16bit_16x16
Line
Count
Source
225
11.2M
                                         __m128i *const right) {
226
11.2M
  __m128i tbuf[8];
227
11.2M
  transpose_16bit_8x8(left, left);
228
11.2M
  transpose_16bit_8x8(right, tbuf);
229
11.2M
  transpose_16bit_8x8(left + 8, right);
230
11.2M
  transpose_16bit_8x8(right + 8, right + 8);
231
232
11.2M
  left[8] = tbuf[0];
233
11.2M
  left[9] = tbuf[1];
234
11.2M
  left[10] = tbuf[2];
235
11.2M
  left[11] = tbuf[3];
236
11.2M
  left[12] = tbuf[4];
237
11.2M
  left[13] = tbuf[5];
238
11.2M
  left[14] = tbuf[6];
239
11.2M
  left[15] = tbuf[7];
240
11.2M
}
Unexecuted instantiation: inv_txfm_ssse3.c:transpose_16bit_16x16
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_16bit_16x16
Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_16bit_16x16
Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_16bit_16x16
Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_16bit_16x16
Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_16bit_16x16
Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_16bit_16x16
Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_16bit_16x16
Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_16bit_16x16
Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_16bit_16x16
Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_16bit_16x16
Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_16bit_16x16
Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:transpose_16bit_16x16
vp9_dct_intrin_sse2.c:transpose_16bit_16x16
Line
Count
Source
225
11.3M
                                         __m128i *const right) {
226
11.3M
  __m128i tbuf[8];
227
11.3M
  transpose_16bit_8x8(left, left);
228
11.3M
  transpose_16bit_8x8(right, tbuf);
229
11.3M
  transpose_16bit_8x8(left + 8, right);
230
11.3M
  transpose_16bit_8x8(right + 8, right + 8);
231
232
11.3M
  left[8] = tbuf[0];
233
11.3M
  left[9] = tbuf[1];
234
11.3M
  left[10] = tbuf[2];
235
11.3M
  left[11] = tbuf[3];
236
11.3M
  left[12] = tbuf[4];
237
11.3M
  left[13] = tbuf[5];
238
11.3M
  left[14] = tbuf[6];
239
11.3M
  left[15] = tbuf[7];
240
11.3M
}
Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_16bit_16x16
241
242
static INLINE void transpose_32bit_4x4(const __m128i *const in,
243
20.7M
                                       __m128i *const out) {
244
  // Unpack 32 bit elements. Goes from:
245
  // in[0]: 00 01 02 03
246
  // in[1]: 10 11 12 13
247
  // in[2]: 20 21 22 23
248
  // in[3]: 30 31 32 33
249
  // to:
250
  // a0:    00 10 01 11
251
  // a1:    20 30 21 31
252
  // a2:    02 12 03 13
253
  // a3:    22 32 23 33
254
255
20.7M
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
256
20.7M
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
257
20.7M
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
258
20.7M
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
259
260
  // Unpack 64 bit elements resulting in:
261
  // out[0]: 00 10 20 30
262
  // out[1]: 01 11 21 31
263
  // out[2]: 02 12 22 32
264
  // out[3]: 03 13 23 33
265
20.7M
  out[0] = _mm_unpacklo_epi64(a0, a1);
266
20.7M
  out[1] = _mm_unpackhi_epi64(a0, a1);
267
20.7M
  out[2] = _mm_unpacklo_epi64(a2, a3);
268
20.7M
  out[3] = _mm_unpackhi_epi64(a2, a3);
269
20.7M
}
Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:transpose_32bit_4x4
Unexecuted instantiation: inv_txfm_sse2.c:transpose_32bit_4x4
Unexecuted instantiation: inv_txfm_ssse3.c:transpose_32bit_4x4
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_32bit_4x4
Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_32bit_4x4
Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_32bit_4x4
Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_32bit_4x4
highbd_idct4x4_add_sse4.c:transpose_32bit_4x4
Line
Count
Source
243
2.31M
                                       __m128i *const out) {
244
  // Unpack 32 bit elements. Goes from:
245
  // in[0]: 00 01 02 03
246
  // in[1]: 10 11 12 13
247
  // in[2]: 20 21 22 23
248
  // in[3]: 30 31 32 33
249
  // to:
250
  // a0:    00 10 01 11
251
  // a1:    20 30 21 31
252
  // a2:    02 12 03 13
253
  // a3:    22 32 23 33
254
255
2.31M
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
256
2.31M
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
257
2.31M
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
258
2.31M
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
259
260
  // Unpack 64 bit elements resulting in:
261
  // out[0]: 00 10 20 30
262
  // out[1]: 01 11 21 31
263
  // out[2]: 02 12 22 32
264
  // out[3]: 03 13 23 33
265
2.31M
  out[0] = _mm_unpacklo_epi64(a0, a1);
266
2.31M
  out[1] = _mm_unpackhi_epi64(a0, a1);
267
2.31M
  out[2] = _mm_unpacklo_epi64(a2, a3);
268
2.31M
  out[3] = _mm_unpackhi_epi64(a2, a3);
269
2.31M
}
highbd_idct8x8_add_sse4.c:transpose_32bit_4x4
Line
Count
Source
243
397k
                                       __m128i *const out) {
244
  // Unpack 32 bit elements. Goes from:
245
  // in[0]: 00 01 02 03
246
  // in[1]: 10 11 12 13
247
  // in[2]: 20 21 22 23
248
  // in[3]: 30 31 32 33
249
  // to:
250
  // a0:    00 10 01 11
251
  // a1:    20 30 21 31
252
  // a2:    02 12 03 13
253
  // a3:    22 32 23 33
254
255
397k
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
256
397k
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
257
397k
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
258
397k
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
259
260
  // Unpack 64 bit elements resulting in:
261
  // out[0]: 00 10 20 30
262
  // out[1]: 01 11 21 31
263
  // out[2]: 02 12 22 32
264
  // out[3]: 03 13 23 33
265
397k
  out[0] = _mm_unpacklo_epi64(a0, a1);
266
397k
  out[1] = _mm_unpackhi_epi64(a0, a1);
267
397k
  out[2] = _mm_unpacklo_epi64(a2, a3);
268
397k
  out[3] = _mm_unpackhi_epi64(a2, a3);
269
397k
}
highbd_idct16x16_add_sse4.c:transpose_32bit_4x4
Line
Count
Source
243
4.73M
                                       __m128i *const out) {
244
  // Unpack 32 bit elements. Goes from:
245
  // in[0]: 00 01 02 03
246
  // in[1]: 10 11 12 13
247
  // in[2]: 20 21 22 23
248
  // in[3]: 30 31 32 33
249
  // to:
250
  // a0:    00 10 01 11
251
  // a1:    20 30 21 31
252
  // a2:    02 12 03 13
253
  // a3:    22 32 23 33
254
255
4.73M
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
256
4.73M
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
257
4.73M
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
258
4.73M
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
259
260
  // Unpack 64 bit elements resulting in:
261
  // out[0]: 00 10 20 30
262
  // out[1]: 01 11 21 31
263
  // out[2]: 02 12 22 32
264
  // out[3]: 03 13 23 33
265
4.73M
  out[0] = _mm_unpacklo_epi64(a0, a1);
266
4.73M
  out[1] = _mm_unpackhi_epi64(a0, a1);
267
4.73M
  out[2] = _mm_unpacklo_epi64(a2, a3);
268
4.73M
  out[3] = _mm_unpackhi_epi64(a2, a3);
269
4.73M
}
highbd_idct32x32_add_sse4.c:transpose_32bit_4x4
Line
Count
Source
243
9.45M
                                       __m128i *const out) {
244
  // Unpack 32 bit elements. Goes from:
245
  // in[0]: 00 01 02 03
246
  // in[1]: 10 11 12 13
247
  // in[2]: 20 21 22 23
248
  // in[3]: 30 31 32 33
249
  // to:
250
  // a0:    00 10 01 11
251
  // a1:    20 30 21 31
252
  // a2:    02 12 03 13
253
  // a3:    22 32 23 33
254
255
9.45M
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
256
9.45M
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
257
9.45M
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
258
9.45M
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
259
260
  // Unpack 64 bit elements resulting in:
261
  // out[0]: 00 10 20 30
262
  // out[1]: 01 11 21 31
263
  // out[2]: 02 12 22 32
264
  // out[3]: 03 13 23 33
265
9.45M
  out[0] = _mm_unpacklo_epi64(a0, a1);
266
9.45M
  out[1] = _mm_unpackhi_epi64(a0, a1);
267
9.45M
  out[2] = _mm_unpacklo_epi64(a2, a3);
268
9.45M
  out[3] = _mm_unpackhi_epi64(a2, a3);
269
9.45M
}
Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_32bit_4x4
vp9_highbd_iht4x4_add_sse4.c:transpose_32bit_4x4
Line
Count
Source
243
492k
                                       __m128i *const out) {
244
  // Unpack 32 bit elements. Goes from:
245
  // in[0]: 00 01 02 03
246
  // in[1]: 10 11 12 13
247
  // in[2]: 20 21 22 23
248
  // in[3]: 30 31 32 33
249
  // to:
250
  // a0:    00 10 01 11
251
  // a1:    20 30 21 31
252
  // a2:    02 12 03 13
253
  // a3:    22 32 23 33
254
255
492k
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
256
492k
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
257
492k
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
258
492k
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
259
260
  // Unpack 64 bit elements resulting in:
261
  // out[0]: 00 10 20 30
262
  // out[1]: 01 11 21 31
263
  // out[2]: 02 12 22 32
264
  // out[3]: 03 13 23 33
265
492k
  out[0] = _mm_unpacklo_epi64(a0, a1);
266
492k
  out[1] = _mm_unpackhi_epi64(a0, a1);
267
492k
  out[2] = _mm_unpacklo_epi64(a2, a3);
268
492k
  out[3] = _mm_unpackhi_epi64(a2, a3);
269
492k
}
Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_32bit_4x4
vp9_highbd_iht16x16_add_sse4.c:transpose_32bit_4x4
Line
Count
Source
243
3.35M
                                       __m128i *const out) {
244
  // Unpack 32 bit elements. Goes from:
245
  // in[0]: 00 01 02 03
246
  // in[1]: 10 11 12 13
247
  // in[2]: 20 21 22 23
248
  // in[3]: 30 31 32 33
249
  // to:
250
  // a0:    00 10 01 11
251
  // a1:    20 30 21 31
252
  // a2:    02 12 03 13
253
  // a3:    22 32 23 33
254
255
3.35M
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
256
3.35M
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
257
3.35M
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
258
3.35M
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
259
260
  // Unpack 64 bit elements resulting in:
261
  // out[0]: 00 10 20 30
262
  // out[1]: 01 11 21 31
263
  // out[2]: 02 12 22 32
264
  // out[3]: 03 13 23 33
265
3.35M
  out[0] = _mm_unpacklo_epi64(a0, a1);
266
3.35M
  out[1] = _mm_unpackhi_epi64(a0, a1);
267
3.35M
  out[2] = _mm_unpacklo_epi64(a2, a3);
268
3.35M
  out[3] = _mm_unpackhi_epi64(a2, a3);
269
3.35M
}
Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_32bit_4x4
Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_32bit_4x4
270
271
static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
272
1.41M
                                         __m128i *const out) {
273
  // Unpack 32 bit elements. Goes from:
274
  // in[0]: 00 01 02 03
275
  // in[1]: 10 11 12 13
276
  // in[2]: 20 21 22 23
277
  // in[3]: 30 31 32 33
278
  // in[4]: 04 05 06 07
279
  // in[5]: 14 15 16 17
280
  // in[6]: 24 25 26 27
281
  // in[7]: 34 35 36 37
282
  // to:
283
  // a0:    00 10 01 11
284
  // a1:    20 30 21 31
285
  // a2:    02 12 03 13
286
  // a3:    22 32 23 33
287
  // a4:    04 14 05 15
288
  // a5:    24 34 25 35
289
  // a6:    06 16 07 17
290
  // a7:    26 36 27 37
291
1.41M
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
292
1.41M
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
293
1.41M
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
294
1.41M
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
295
1.41M
  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
296
1.41M
  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
297
1.41M
  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
298
1.41M
  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
299
300
  // Unpack 64 bit elements resulting in:
301
  // out[0]: 00 10 20 30
302
  // out[1]: 01 11 21 31
303
  // out[2]: 02 12 22 32
304
  // out[3]: 03 13 23 33
305
  // out[4]: 04 14 24 34
306
  // out[5]: 05 15 25 35
307
  // out[6]: 06 16 26 36
308
  // out[7]: 07 17 27 37
309
1.41M
  out[0] = _mm_unpacklo_epi64(a0, a1);
310
1.41M
  out[1] = _mm_unpackhi_epi64(a0, a1);
311
1.41M
  out[2] = _mm_unpacklo_epi64(a2, a3);
312
1.41M
  out[3] = _mm_unpackhi_epi64(a2, a3);
313
1.41M
  out[4] = _mm_unpacklo_epi64(a4, a5);
314
1.41M
  out[5] = _mm_unpackhi_epi64(a4, a5);
315
1.41M
  out[6] = _mm_unpacklo_epi64(a6, a7);
316
1.41M
  out[7] = _mm_unpackhi_epi64(a6, a7);
317
1.41M
}
Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:transpose_32bit_4x4x2
Unexecuted instantiation: inv_txfm_sse2.c:transpose_32bit_4x4x2
Unexecuted instantiation: inv_txfm_ssse3.c:transpose_32bit_4x4x2
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_32bit_4x4x2
Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_32bit_4x4x2
Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_32bit_4x4x2
Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_32bit_4x4x2
Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_32bit_4x4x2
highbd_idct8x8_add_sse4.c:transpose_32bit_4x4x2
Line
Count
Source
272
973k
                                         __m128i *const out) {
273
  // Unpack 32 bit elements. Goes from:
274
  // in[0]: 00 01 02 03
275
  // in[1]: 10 11 12 13
276
  // in[2]: 20 21 22 23
277
  // in[3]: 30 31 32 33
278
  // in[4]: 04 05 06 07
279
  // in[5]: 14 15 16 17
280
  // in[6]: 24 25 26 27
281
  // in[7]: 34 35 36 37
282
  // to:
283
  // a0:    00 10 01 11
284
  // a1:    20 30 21 31
285
  // a2:    02 12 03 13
286
  // a3:    22 32 23 33
287
  // a4:    04 14 05 15
288
  // a5:    24 34 25 35
289
  // a6:    06 16 07 17
290
  // a7:    26 36 27 37
291
973k
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
292
973k
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
293
973k
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
294
973k
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
295
973k
  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
296
973k
  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
297
973k
  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
298
973k
  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
299
300
  // Unpack 64 bit elements resulting in:
301
  // out[0]: 00 10 20 30
302
  // out[1]: 01 11 21 31
303
  // out[2]: 02 12 22 32
304
  // out[3]: 03 13 23 33
305
  // out[4]: 04 14 24 34
306
  // out[5]: 05 15 25 35
307
  // out[6]: 06 16 26 36
308
  // out[7]: 07 17 27 37
309
973k
  out[0] = _mm_unpacklo_epi64(a0, a1);
310
973k
  out[1] = _mm_unpackhi_epi64(a0, a1);
311
973k
  out[2] = _mm_unpacklo_epi64(a2, a3);
312
973k
  out[3] = _mm_unpackhi_epi64(a2, a3);
313
973k
  out[4] = _mm_unpacklo_epi64(a4, a5);
314
973k
  out[5] = _mm_unpackhi_epi64(a4, a5);
315
973k
  out[6] = _mm_unpacklo_epi64(a6, a7);
316
973k
  out[7] = _mm_unpackhi_epi64(a6, a7);
317
973k
}
Unexecuted instantiation: highbd_idct16x16_add_sse4.c:transpose_32bit_4x4x2
Unexecuted instantiation: highbd_idct32x32_add_sse4.c:transpose_32bit_4x4x2
Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_32bit_4x4x2
Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_32bit_4x4x2
vp9_highbd_iht8x8_add_sse4.c:transpose_32bit_4x4x2
Line
Count
Source
272
441k
                                         __m128i *const out) {
273
  // Unpack 32 bit elements. Goes from:
274
  // in[0]: 00 01 02 03
275
  // in[1]: 10 11 12 13
276
  // in[2]: 20 21 22 23
277
  // in[3]: 30 31 32 33
278
  // in[4]: 04 05 06 07
279
  // in[5]: 14 15 16 17
280
  // in[6]: 24 25 26 27
281
  // in[7]: 34 35 36 37
282
  // to:
283
  // a0:    00 10 01 11
284
  // a1:    20 30 21 31
285
  // a2:    02 12 03 13
286
  // a3:    22 32 23 33
287
  // a4:    04 14 05 15
288
  // a5:    24 34 25 35
289
  // a6:    06 16 07 17
290
  // a7:    26 36 27 37
291
441k
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
292
441k
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
293
441k
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
294
441k
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
295
441k
  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
296
441k
  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
297
441k
  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
298
441k
  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
299
300
  // Unpack 64 bit elements resulting in:
301
  // out[0]: 00 10 20 30
302
  // out[1]: 01 11 21 31
303
  // out[2]: 02 12 22 32
304
  // out[3]: 03 13 23 33
305
  // out[4]: 04 14 24 34
306
  // out[5]: 05 15 25 35
307
  // out[6]: 06 16 26 36
308
  // out[7]: 07 17 27 37
309
441k
  out[0] = _mm_unpacklo_epi64(a0, a1);
310
441k
  out[1] = _mm_unpackhi_epi64(a0, a1);
311
441k
  out[2] = _mm_unpacklo_epi64(a2, a3);
312
441k
  out[3] = _mm_unpackhi_epi64(a2, a3);
313
441k
  out[4] = _mm_unpacklo_epi64(a4, a5);
314
441k
  out[5] = _mm_unpackhi_epi64(a4, a5);
315
441k
  out[6] = _mm_unpacklo_epi64(a6, a7);
316
441k
  out[7] = _mm_unpackhi_epi64(a6, a7);
317
441k
}
Unexecuted instantiation: vp9_highbd_iht16x16_add_sse4.c:transpose_32bit_4x4x2
Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_32bit_4x4x2
Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_32bit_4x4x2
318
319
static INLINE void transpose_32bit_8x4(const __m128i *const in,
320
6.35M
                                       __m128i *const out) {
321
  // Unpack 32 bit elements. Goes from:
322
  // in[0]: 00 01 02 03
323
  // in[1]: 04 05 06 07
324
  // in[2]: 10 11 12 13
325
  // in[3]: 14 15 16 17
326
  // in[4]: 20 21 22 23
327
  // in[5]: 24 25 26 27
328
  // in[6]: 30 31 32 33
329
  // in[7]: 34 35 36 37
330
  // to:
331
  // a0: 00 10 01 11
332
  // a1: 20 30 21 31
333
  // a2: 02 12 03 13
334
  // a3: 22 32 23 33
335
  // a4: 04 14 05 15
336
  // a5: 24 34 25 35
337
  // a6: 06 16 07 17
338
  // a7: 26 36 27 37
339
6.35M
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
340
6.35M
  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
341
6.35M
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
342
6.35M
  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
343
6.35M
  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
344
6.35M
  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
345
6.35M
  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
346
6.35M
  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
347
348
  // Unpack 64 bit elements resulting in:
349
  // out[0]: 00 10 20 30
350
  // out[1]: 01 11 21 31
351
  // out[2]: 02 12 22 32
352
  // out[3]: 03 13 23 33
353
  // out[4]: 04 14 24 34
354
  // out[5]: 05 15 25 35
355
  // out[6]: 06 16 26 36
356
  // out[7]: 07 17 27 37
357
6.35M
  out[0] = _mm_unpacklo_epi64(a0, a1);
358
6.35M
  out[1] = _mm_unpackhi_epi64(a0, a1);
359
6.35M
  out[2] = _mm_unpacklo_epi64(a2, a3);
360
6.35M
  out[3] = _mm_unpackhi_epi64(a2, a3);
361
6.35M
  out[4] = _mm_unpacklo_epi64(a4, a5);
362
6.35M
  out[5] = _mm_unpackhi_epi64(a4, a5);
363
6.35M
  out[6] = _mm_unpacklo_epi64(a6, a7);
364
6.35M
  out[7] = _mm_unpackhi_epi64(a6, a7);
365
6.35M
}
Unexecuted instantiation: vpx_subpixel_8t_intrin_ssse3.c:transpose_32bit_8x4
Unexecuted instantiation: inv_txfm_sse2.c:transpose_32bit_8x4
Unexecuted instantiation: inv_txfm_ssse3.c:transpose_32bit_8x4
Unexecuted instantiation: highbd_idct4x4_add_sse2.c:transpose_32bit_8x4
Unexecuted instantiation: highbd_idct8x8_add_sse2.c:transpose_32bit_8x4
Unexecuted instantiation: highbd_idct16x16_add_sse2.c:transpose_32bit_8x4
Unexecuted instantiation: highbd_idct32x32_add_sse2.c:transpose_32bit_8x4
Unexecuted instantiation: highbd_idct4x4_add_sse4.c:transpose_32bit_8x4
Unexecuted instantiation: highbd_idct8x8_add_sse4.c:transpose_32bit_8x4
highbd_idct16x16_add_sse4.c:transpose_32bit_8x4
Line
Count
Source
320
1.69M
                                       __m128i *const out) {
321
  // Unpack 32 bit elements. Goes from:
322
  // in[0]: 00 01 02 03
323
  // in[1]: 04 05 06 07
324
  // in[2]: 10 11 12 13
325
  // in[3]: 14 15 16 17
326
  // in[4]: 20 21 22 23
327
  // in[5]: 24 25 26 27
328
  // in[6]: 30 31 32 33
329
  // in[7]: 34 35 36 37
330
  // to:
331
  // a0: 00 10 01 11
332
  // a1: 20 30 21 31
333
  // a2: 02 12 03 13
334
  // a3: 22 32 23 33
335
  // a4: 04 14 05 15
336
  // a5: 24 34 25 35
337
  // a6: 06 16 07 17
338
  // a7: 26 36 27 37
339
1.69M
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
340
1.69M
  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
341
1.69M
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
342
1.69M
  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
343
1.69M
  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
344
1.69M
  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
345
1.69M
  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
346
1.69M
  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
347
348
  // Unpack 64 bit elements resulting in:
349
  // out[0]: 00 10 20 30
350
  // out[1]: 01 11 21 31
351
  // out[2]: 02 12 22 32
352
  // out[3]: 03 13 23 33
353
  // out[4]: 04 14 24 34
354
  // out[5]: 05 15 25 35
355
  // out[6]: 06 16 26 36
356
  // out[7]: 07 17 27 37
357
1.69M
  out[0] = _mm_unpacklo_epi64(a0, a1);
358
1.69M
  out[1] = _mm_unpackhi_epi64(a0, a1);
359
1.69M
  out[2] = _mm_unpacklo_epi64(a2, a3);
360
1.69M
  out[3] = _mm_unpackhi_epi64(a2, a3);
361
1.69M
  out[4] = _mm_unpacklo_epi64(a4, a5);
362
1.69M
  out[5] = _mm_unpackhi_epi64(a4, a5);
363
1.69M
  out[6] = _mm_unpacklo_epi64(a6, a7);
364
1.69M
  out[7] = _mm_unpackhi_epi64(a6, a7);
365
1.69M
}
highbd_idct32x32_add_sse4.c:transpose_32bit_8x4
Line
Count
Source
320
2.98M
                                       __m128i *const out) {
321
  // Unpack 32 bit elements. Goes from:
322
  // in[0]: 00 01 02 03
323
  // in[1]: 04 05 06 07
324
  // in[2]: 10 11 12 13
325
  // in[3]: 14 15 16 17
326
  // in[4]: 20 21 22 23
327
  // in[5]: 24 25 26 27
328
  // in[6]: 30 31 32 33
329
  // in[7]: 34 35 36 37
330
  // to:
331
  // a0: 00 10 01 11
332
  // a1: 20 30 21 31
333
  // a2: 02 12 03 13
334
  // a3: 22 32 23 33
335
  // a4: 04 14 05 15
336
  // a5: 24 34 25 35
337
  // a6: 06 16 07 17
338
  // a7: 26 36 27 37
339
2.98M
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
340
2.98M
  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
341
2.98M
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
342
2.98M
  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
343
2.98M
  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
344
2.98M
  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
345
2.98M
  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
346
2.98M
  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
347
348
  // Unpack 64 bit elements resulting in:
349
  // out[0]: 00 10 20 30
350
  // out[1]: 01 11 21 31
351
  // out[2]: 02 12 22 32
352
  // out[3]: 03 13 23 33
353
  // out[4]: 04 14 24 34
354
  // out[5]: 05 15 25 35
355
  // out[6]: 06 16 26 36
356
  // out[7]: 07 17 27 37
357
2.98M
  out[0] = _mm_unpacklo_epi64(a0, a1);
358
2.98M
  out[1] = _mm_unpackhi_epi64(a0, a1);
359
2.98M
  out[2] = _mm_unpacklo_epi64(a2, a3);
360
2.98M
  out[3] = _mm_unpackhi_epi64(a2, a3);
361
2.98M
  out[4] = _mm_unpacklo_epi64(a4, a5);
362
2.98M
  out[5] = _mm_unpackhi_epi64(a4, a5);
363
2.98M
  out[6] = _mm_unpacklo_epi64(a6, a7);
364
2.98M
  out[7] = _mm_unpackhi_epi64(a6, a7);
365
2.98M
}
Unexecuted instantiation: vp9_idct_intrin_sse2.c:transpose_32bit_8x4
Unexecuted instantiation: vp9_highbd_iht4x4_add_sse4.c:transpose_32bit_8x4
Unexecuted instantiation: vp9_highbd_iht8x8_add_sse4.c:transpose_32bit_8x4
vp9_highbd_iht16x16_add_sse4.c:transpose_32bit_8x4
Line
Count
Source
320
1.68M
                                       __m128i *const out) {
321
  // Unpack 32 bit elements. Goes from:
322
  // in[0]: 00 01 02 03
323
  // in[1]: 04 05 06 07
324
  // in[2]: 10 11 12 13
325
  // in[3]: 14 15 16 17
326
  // in[4]: 20 21 22 23
327
  // in[5]: 24 25 26 27
328
  // in[6]: 30 31 32 33
329
  // in[7]: 34 35 36 37
330
  // to:
331
  // a0: 00 10 01 11
332
  // a1: 20 30 21 31
333
  // a2: 02 12 03 13
334
  // a3: 22 32 23 33
335
  // a4: 04 14 05 15
336
  // a5: 24 34 25 35
337
  // a6: 06 16 07 17
338
  // a7: 26 36 27 37
339
1.68M
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
340
1.68M
  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
341
1.68M
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
342
1.68M
  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
343
1.68M
  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
344
1.68M
  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
345
1.68M
  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
346
1.68M
  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
347
348
  // Unpack 64 bit elements resulting in:
349
  // out[0]: 00 10 20 30
350
  // out[1]: 01 11 21 31
351
  // out[2]: 02 12 22 32
352
  // out[3]: 03 13 23 33
353
  // out[4]: 04 14 24 34
354
  // out[5]: 05 15 25 35
355
  // out[6]: 06 16 26 36
356
  // out[7]: 07 17 27 37
357
1.68M
  out[0] = _mm_unpacklo_epi64(a0, a1);
358
1.68M
  out[1] = _mm_unpackhi_epi64(a0, a1);
359
1.68M
  out[2] = _mm_unpacklo_epi64(a2, a3);
360
1.68M
  out[3] = _mm_unpackhi_epi64(a2, a3);
361
1.68M
  out[4] = _mm_unpacklo_epi64(a4, a5);
362
1.68M
  out[5] = _mm_unpackhi_epi64(a4, a5);
363
1.68M
  out[6] = _mm_unpacklo_epi64(a6, a7);
364
1.68M
  out[7] = _mm_unpackhi_epi64(a6, a7);
365
1.68M
}
Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_32bit_8x4
Unexecuted instantiation: vp9_frame_scale_ssse3.c:transpose_32bit_8x4
366
367
#endif  // VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_