Coverage Report

Created: 2024-09-06 07:53

/src/libvpx/vpx_dsp/x86/fwd_txfm_sse2.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3
 *
4
 *  Use of this source code is governed by a BSD-style license
5
 *  that can be found in the LICENSE file in the root of the source
6
 *  tree. An additional intellectual property rights grant can be found
7
 *  in the file PATENTS.  All contributing project authors may
8
 *  be found in the AUTHORS file in the root of the source tree.
9
 */
10
11
#ifndef VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
12
#define VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_
13
14
#ifdef __cplusplus
15
extern "C" {
16
#endif
17
18
#define pair_set_epi32(a, b) \
19
  _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
20
21
103M
static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
22
103M
  __m128i buf0, buf1;
23
103M
  buf0 = _mm_mul_epu32(a, b);
24
103M
  a = _mm_srli_epi64(a, 32);
25
103M
  b = _mm_srli_epi64(b, 32);
26
103M
  buf1 = _mm_mul_epu32(a, b);
27
103M
  return _mm_add_epi64(buf0, buf1);
28
103M
}
fwd_txfm_sse2.c:k_madd_epi32
Line
Count
Source
21
103M
static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
22
103M
  __m128i buf0, buf1;
23
103M
  buf0 = _mm_mul_epu32(a, b);
24
103M
  a = _mm_srli_epi64(a, 32);
25
103M
  b = _mm_srli_epi64(b, 32);
26
103M
  buf1 = _mm_mul_epu32(a, b);
27
103M
  return _mm_add_epi64(buf0, buf1);
28
103M
}
Unexecuted instantiation: vp9_dct_intrin_sse2.c:k_madd_epi32
29
30
51.7M
static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
31
51.7M
  __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
32
51.7M
  __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
33
51.7M
  return _mm_unpacklo_epi64(buf0, buf1);
34
51.7M
}
fwd_txfm_sse2.c:k_packs_epi64
Line
Count
Source
30
51.7M
static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
31
51.7M
  __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
32
51.7M
  __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
33
51.7M
  return _mm_unpacklo_epi64(buf0, buf1);
34
51.7M
}
Unexecuted instantiation: vp9_dct_intrin_sse2.c:k_packs_epi64
35
36
static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
37
0
                                          const __m128i *preg1) {
38
0
  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
39
0
  const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
40
0
  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
41
0
                              _mm_cmpeq_epi16(*preg0, min_overflow));
42
0
  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
43
0
                              _mm_cmpeq_epi16(*preg1, min_overflow));
44
0
  cmp0 = _mm_or_si128(cmp0, cmp1);
45
0
  return _mm_movemask_epi8(cmp0);
46
0
}
Unexecuted instantiation: fwd_txfm_sse2.c:check_epi16_overflow_x2
Unexecuted instantiation: vp9_dct_intrin_sse2.c:check_epi16_overflow_x2
47
48
static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
49
                                          const __m128i *preg1,
50
                                          const __m128i *preg2,
51
0
                                          const __m128i *preg3) {
52
0
  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
53
0
  const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
54
0
  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
55
0
                              _mm_cmpeq_epi16(*preg0, min_overflow));
56
0
  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
57
0
                              _mm_cmpeq_epi16(*preg1, min_overflow));
58
0
  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
59
0
                              _mm_cmpeq_epi16(*preg2, min_overflow));
60
0
  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
61
0
                              _mm_cmpeq_epi16(*preg3, min_overflow));
62
0
  cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
63
0
  return _mm_movemask_epi8(cmp0);
64
0
}
Unexecuted instantiation: fwd_txfm_sse2.c:check_epi16_overflow_x4
Unexecuted instantiation: vp9_dct_intrin_sse2.c:check_epi16_overflow_x4
65
66
static INLINE int check_epi16_overflow_x8(
67
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
68
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
69
0
    const __m128i *preg6, const __m128i *preg7) {
70
0
  int res0, res1;
71
0
  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
72
0
  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
73
0
  return res0 + res1;
74
0
}
Unexecuted instantiation: fwd_txfm_sse2.c:check_epi16_overflow_x8
Unexecuted instantiation: vp9_dct_intrin_sse2.c:check_epi16_overflow_x8
75
76
static INLINE int check_epi16_overflow_x12(
77
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
78
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
79
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
80
0
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
81
0
  int res0, res1;
82
0
  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
83
0
  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
84
0
  if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
85
0
  return res0 + res1;
86
0
}
Unexecuted instantiation: fwd_txfm_sse2.c:check_epi16_overflow_x12
Unexecuted instantiation: vp9_dct_intrin_sse2.c:check_epi16_overflow_x12
87
88
static INLINE int check_epi16_overflow_x16(
89
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
90
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
91
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
92
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
93
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
94
0
    const __m128i *preg15) {
95
0
  int res0, res1;
96
0
  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
97
0
  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
98
0
  if (!res0) {
99
0
    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
100
0
    if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
101
0
  }
102
0
  return res0 + res1;
103
0
}
Unexecuted instantiation: fwd_txfm_sse2.c:check_epi16_overflow_x16
Unexecuted instantiation: vp9_dct_intrin_sse2.c:check_epi16_overflow_x16
104
105
static INLINE int check_epi16_overflow_x32(
106
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
107
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
108
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
109
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
110
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
111
    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
112
    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
113
    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
114
    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
115
    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
116
0
    const __m128i *preg30, const __m128i *preg31) {
117
0
  int res0, res1;
118
0
  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
119
0
  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
120
0
  if (!res0) {
121
0
    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
122
0
    if (!res1) {
123
0
      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
124
0
      if (!res0) {
125
0
        res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
126
0
        if (!res1) {
127
0
          res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
128
0
          if (!res0) {
129
0
            res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
130
0
            if (!res1)
131
0
              res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
132
0
          }
133
0
        }
134
0
      }
135
0
    }
136
0
  }
137
0
  return res0 + res1;
138
0
}
Unexecuted instantiation: fwd_txfm_sse2.c:check_epi16_overflow_x32
Unexecuted instantiation: vp9_dct_intrin_sse2.c:check_epi16_overflow_x32
139
140
static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
141
                                           const __m128i *preg1,
142
                                           const __m128i *preg2,
143
                                           const __m128i *preg3,
144
0
                                           const __m128i *zero) {
145
0
  __m128i minus_one = _mm_set1_epi32(-1);
146
  // Check for overflows
147
0
  __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
148
0
  __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
149
0
  __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
150
0
  __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
151
0
  __m128i reg0_top_dwords =
152
0
      _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
153
0
  __m128i reg1_top_dwords =
154
0
      _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
155
0
  __m128i reg2_top_dwords =
156
0
      _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
157
0
  __m128i reg3_top_dwords =
158
0
      _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
159
0
  __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
160
0
  __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
161
0
  __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
162
0
  __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
163
0
  __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
164
0
  __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
165
0
  int overflow_01 =
166
0
      _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01));
167
0
  int overflow_23 =
168
0
      _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23));
169
0
  return (overflow_01 + overflow_23);
170
0
}
Unexecuted instantiation: fwd_txfm_sse2.c:k_check_epi32_overflow_4
Unexecuted instantiation: vp9_dct_intrin_sse2.c:k_check_epi32_overflow_4
171
172
static INLINE int k_check_epi32_overflow_8(
173
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
174
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
175
0
    const __m128i *preg6, const __m128i *preg7, const __m128i *zero) {
176
0
  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
177
0
  if (!overflow) {
178
0
    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
179
0
  }
180
0
  return overflow;
181
0
}
Unexecuted instantiation: fwd_txfm_sse2.c:k_check_epi32_overflow_8
Unexecuted instantiation: vp9_dct_intrin_sse2.c:k_check_epi32_overflow_8
182
183
static INLINE int k_check_epi32_overflow_16(
184
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
185
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
186
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
187
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
188
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
189
0
    const __m128i *preg15, const __m128i *zero) {
190
0
  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
191
0
  if (!overflow) {
192
0
    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
193
0
    if (!overflow) {
194
0
      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
195
0
      if (!overflow) {
196
0
        overflow =
197
0
            k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
198
0
      }
199
0
    }
200
0
  }
201
0
  return overflow;
202
0
}
Unexecuted instantiation: fwd_txfm_sse2.c:k_check_epi32_overflow_16
Unexecuted instantiation: vp9_dct_intrin_sse2.c:k_check_epi32_overflow_16
203
204
static INLINE int k_check_epi32_overflow_32(
205
    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
206
    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
207
    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
208
    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
209
    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
210
    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
211
    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
212
    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
213
    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
214
    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
215
0
    const __m128i *preg30, const __m128i *preg31, const __m128i *zero) {
216
0
  int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
217
0
  if (!overflow) {
218
0
    overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
219
0
    if (!overflow) {
220
0
      overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
221
0
      if (!overflow) {
222
0
        overflow =
223
0
            k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero);
224
0
        if (!overflow) {
225
0
          overflow =
226
0
              k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero);
227
0
          if (!overflow) {
228
0
            overflow =
229
0
                k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero);
230
0
            if (!overflow) {
231
0
              overflow = k_check_epi32_overflow_4(preg24, preg25, preg26,
232
0
                                                  preg27, zero);
233
0
              if (!overflow) {
234
0
                overflow = k_check_epi32_overflow_4(preg28, preg29, preg30,
235
0
                                                    preg31, zero);
236
0
              }
237
0
            }
238
0
          }
239
0
        }
240
0
      }
241
0
    }
242
0
  }
243
0
  return overflow;
244
0
}
Unexecuted instantiation: fwd_txfm_sse2.c:k_check_epi32_overflow_32
Unexecuted instantiation: vp9_dct_intrin_sse2.c:k_check_epi32_overflow_32
245
246
1.07G
static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
247
1.07G
#if CONFIG_VP9_HIGHBITDEPTH
248
1.07G
  const __m128i zero = _mm_setzero_si128();
249
1.07G
  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
250
1.07G
  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
251
1.07G
  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
252
1.07G
  _mm_store_si128((__m128i *)(dst_ptr), out0);
253
1.07G
  _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
254
#else
255
  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
256
#endif  // CONFIG_VP9_HIGHBITDEPTH
257
1.07G
}
fwd_txfm_sse2.c:store_output
Line
Count
Source
246
348M
static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
247
348M
#if CONFIG_VP9_HIGHBITDEPTH
248
348M
  const __m128i zero = _mm_setzero_si128();
249
348M
  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
250
348M
  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
251
348M
  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
252
348M
  _mm_store_si128((__m128i *)(dst_ptr), out0);
253
348M
  _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
254
#else
255
  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
256
#endif  // CONFIG_VP9_HIGHBITDEPTH
257
348M
}
vp9_dct_intrin_sse2.c:store_output
Line
Count
Source
246
722M
static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
247
722M
#if CONFIG_VP9_HIGHBITDEPTH
248
722M
  const __m128i zero = _mm_setzero_si128();
249
722M
  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
250
722M
  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
251
722M
  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
252
722M
  _mm_store_si128((__m128i *)(dst_ptr), out0);
253
722M
  _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
254
#else
255
  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
256
#endif  // CONFIG_VP9_HIGHBITDEPTH
257
722M
}
258
259
1.30G
static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
260
1.30G
#if CONFIG_VP9_HIGHBITDEPTH
261
1.30G
  const __m128i zero = _mm_setzero_si128();
262
1.30G
  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
263
1.30G
  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
264
1.30G
  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
265
1.30G
  _mm_storeu_si128((__m128i *)(dst_ptr), out0);
266
1.30G
  _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
267
#else
268
  _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
269
#endif  // CONFIG_VP9_HIGHBITDEPTH
270
1.30G
}
fwd_txfm_sse2.c:storeu_output
Line
Count
Source
259
1.30G
static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
260
1.30G
#if CONFIG_VP9_HIGHBITDEPTH
261
1.30G
  const __m128i zero = _mm_setzero_si128();
262
1.30G
  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
263
1.30G
  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
264
1.30G
  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
265
1.30G
  _mm_storeu_si128((__m128i *)(dst_ptr), out0);
266
1.30G
  _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
267
#else
268
  _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
269
#endif  // CONFIG_VP9_HIGHBITDEPTH
270
1.30G
}
Unexecuted instantiation: vp9_dct_intrin_sse2.c:storeu_output
271
272
static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
273
                                       const __m128i *pmultiplier,
274
                                       const __m128i *prounding,
275
894M
                                       const int shift) {
276
894M
  const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
277
894M
  const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
278
894M
  const __m128i v0 = _mm_add_epi32(u0, *prounding);
279
894M
  const __m128i v1 = _mm_add_epi32(u1, *prounding);
280
894M
  const __m128i w0 = _mm_srai_epi32(v0, shift);
281
894M
  const __m128i w1 = _mm_srai_epi32(v1, shift);
282
894M
  return _mm_packs_epi32(w0, w1);
283
894M
}
fwd_txfm_sse2.c:mult_round_shift
Line
Count
Source
275
894M
                                       const int shift) {
276
894M
  const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
277
894M
  const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
278
894M
  const __m128i v0 = _mm_add_epi32(u0, *prounding);
279
894M
  const __m128i v1 = _mm_add_epi32(u1, *prounding);
280
894M
  const __m128i w0 = _mm_srai_epi32(v0, shift);
281
894M
  const __m128i w1 = _mm_srai_epi32(v1, shift);
282
894M
  return _mm_packs_epi32(w0, w1);
283
894M
}
Unexecuted instantiation: vp9_dct_intrin_sse2.c:mult_round_shift
284
285
static INLINE void transpose_and_output8x8(
286
    const __m128i *pin00, const __m128i *pin01, const __m128i *pin02,
287
    const __m128i *pin03, const __m128i *pin04, const __m128i *pin05,
288
    const __m128i *pin06, const __m128i *pin07, const int pass,
289
68.7M
    int16_t *out0_ptr, tran_low_t *out1_ptr) {
290
  // 00 01 02 03 04 05 06 07
291
  // 10 11 12 13 14 15 16 17
292
  // 20 21 22 23 24 25 26 27
293
  // 30 31 32 33 34 35 36 37
294
  // 40 41 42 43 44 45 46 47
295
  // 50 51 52 53 54 55 56 57
296
  // 60 61 62 63 64 65 66 67
297
  // 70 71 72 73 74 75 76 77
298
68.7M
  const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
299
68.7M
  const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
300
68.7M
  const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
301
68.7M
  const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
302
68.7M
  const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
303
68.7M
  const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
304
68.7M
  const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
305
68.7M
  const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
306
  // 00 10 01 11 02 12 03 13
307
  // 20 30 21 31 22 32 23 33
308
  // 04 14 05 15 06 16 07 17
309
  // 24 34 25 35 26 36 27 37
310
  // 40 50 41 51 42 52 43 53
311
  // 60 70 61 71 62 72 63 73
312
  // 44 54 45 55 46 56 47 57
313
  // 64 74 65 75 66 76 67 77
314
68.7M
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
315
68.7M
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
316
68.7M
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
317
68.7M
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
318
68.7M
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
319
68.7M
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
320
68.7M
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
321
68.7M
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
322
  // 00 10 20 30 01 11 21 31
323
  // 40 50 60 70 41 51 61 71
324
  // 02 12 22 32 03 13 23 33
325
  // 42 52 62 72 43 53 63 73
326
  // 04 14 24 34 05 15 25 35
327
  // 44 54 64 74 45 55 65 75
328
  // 06 16 26 36 07 17 27 37
329
  // 46 56 66 76 47 57 67 77
330
68.7M
  const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
331
68.7M
  const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
332
68.7M
  const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
333
68.7M
  const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
334
68.7M
  const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
335
68.7M
  const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
336
68.7M
  const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
337
68.7M
  const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
338
  // 00 10 20 30 40 50 60 70
339
  // 01 11 21 31 41 51 61 71
340
  // 02 12 22 32 42 52 62 72
341
  // 03 13 23 33 43 53 63 73
342
  // 04 14 24 34 44 54 64 74
343
  // 05 15 25 35 45 55 65 75
344
  // 06 16 26 36 46 56 66 76
345
  // 07 17 27 37 47 57 67 77
346
68.7M
  if (pass == 0) {
347
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
348
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
349
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
350
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
351
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
352
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
353
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
354
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
355
34.3M
  } else {
356
34.3M
    storeu_output(&tr2_0, (out1_ptr + 0 * 16));
357
34.3M
    storeu_output(&tr2_1, (out1_ptr + 1 * 16));
358
34.3M
    storeu_output(&tr2_2, (out1_ptr + 2 * 16));
359
34.3M
    storeu_output(&tr2_3, (out1_ptr + 3 * 16));
360
34.3M
    storeu_output(&tr2_4, (out1_ptr + 4 * 16));
361
34.3M
    storeu_output(&tr2_5, (out1_ptr + 5 * 16));
362
34.3M
    storeu_output(&tr2_6, (out1_ptr + 6 * 16));
363
34.3M
    storeu_output(&tr2_7, (out1_ptr + 7 * 16));
364
34.3M
  }
365
68.7M
}
fwd_txfm_sse2.c:transpose_and_output8x8
Line
Count
Source
289
68.7M
    int16_t *out0_ptr, tran_low_t *out1_ptr) {
290
  // 00 01 02 03 04 05 06 07
291
  // 10 11 12 13 14 15 16 17
292
  // 20 21 22 23 24 25 26 27
293
  // 30 31 32 33 34 35 36 37
294
  // 40 41 42 43 44 45 46 47
295
  // 50 51 52 53 54 55 56 57
296
  // 60 61 62 63 64 65 66 67
297
  // 70 71 72 73 74 75 76 77
298
68.7M
  const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
299
68.7M
  const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
300
68.7M
  const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
301
68.7M
  const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
302
68.7M
  const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
303
68.7M
  const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
304
68.7M
  const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
305
68.7M
  const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
306
  // 00 10 01 11 02 12 03 13
307
  // 20 30 21 31 22 32 23 33
308
  // 04 14 05 15 06 16 07 17
309
  // 24 34 25 35 26 36 27 37
310
  // 40 50 41 51 42 52 43 53
311
  // 60 70 61 71 62 72 63 73
312
  // 44 54 45 55 46 56 47 57
313
  // 64 74 65 75 66 76 67 77
314
68.7M
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
315
68.7M
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
316
68.7M
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
317
68.7M
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
318
68.7M
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
319
68.7M
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
320
68.7M
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
321
68.7M
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
322
  // 00 10 20 30 01 11 21 31
323
  // 40 50 60 70 41 51 61 71
324
  // 02 12 22 32 03 13 23 33
325
  // 42 52 62 72 43 53 63 73
326
  // 04 14 24 34 05 15 25 35
327
  // 44 54 64 74 45 55 65 75
328
  // 06 16 26 36 07 17 27 37
329
  // 46 56 66 76 47 57 67 77
330
68.7M
  const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
331
68.7M
  const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
332
68.7M
  const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
333
68.7M
  const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
334
68.7M
  const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
335
68.7M
  const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
336
68.7M
  const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
337
68.7M
  const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
338
  // 00 10 20 30 40 50 60 70
339
  // 01 11 21 31 41 51 61 71
340
  // 02 12 22 32 42 52 62 72
341
  // 03 13 23 33 43 53 63 73
342
  // 04 14 24 34 44 54 64 74
343
  // 05 15 25 35 45 55 65 75
344
  // 06 16 26 36 46 56 66 76
345
  // 07 17 27 37 47 57 67 77
346
68.7M
  if (pass == 0) {
347
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0);
348
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1);
349
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2);
350
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3);
351
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4);
352
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5);
353
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6);
354
34.3M
    _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7);
355
34.3M
  } else {
356
34.3M
    storeu_output(&tr2_0, (out1_ptr + 0 * 16));
357
34.3M
    storeu_output(&tr2_1, (out1_ptr + 1 * 16));
358
34.3M
    storeu_output(&tr2_2, (out1_ptr + 2 * 16));
359
34.3M
    storeu_output(&tr2_3, (out1_ptr + 3 * 16));
360
34.3M
    storeu_output(&tr2_4, (out1_ptr + 4 * 16));
361
34.3M
    storeu_output(&tr2_5, (out1_ptr + 5 * 16));
362
34.3M
    storeu_output(&tr2_6, (out1_ptr + 6 * 16));
363
34.3M
    storeu_output(&tr2_7, (out1_ptr + 7 * 16));
364
34.3M
  }
365
68.7M
}
Unexecuted instantiation: vp9_dct_intrin_sse2.c:transpose_and_output8x8
366
367
#ifdef __cplusplus
368
}  // extern "C"
369
#endif
370
371
#endif  // VPX_VPX_DSP_X86_FWD_TXFM_SSE2_H_