Coverage Report

Created: 2025-06-13 07:07

/src/aom/aom_dsp/x86/common_avx2.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
12
#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
13
#define AOM_AOM_DSP_X86_COMMON_AVX2_H_
14
15
#include <immintrin.h>
16
17
#include "config/aom_config.h"
18
19
// Note: in and out may point to the same array (in-place transpose is safe,
// since every input row is read before any output row is written).
20
0
static inline void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
21
0
  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
22
0
  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
23
0
  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
24
0
  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
25
0
  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
26
0
  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
27
0
  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
28
0
  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
29
0
30
0
  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
31
0
  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
32
0
  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
33
0
  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
34
0
  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
35
0
  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
36
0
  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
37
0
  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
38
0
39
0
  // 00 10 01 11 02 12 03 13  08 18 09 19 0a 1a 0b 1b
40
0
  // 04 14 05 15 06 16 07 17  0c 1c 0d 1d 0e 1e 0f 1f
41
0
  // 20 30 21 31 22 32 23 33  28 38 29 39 2a 3a 2b 3b
42
0
  // 24 34 25 35 26 36 27 37  2c 3c 2d 3d 2e 3e 2f 3f
43
0
  // 40 50 41 51 42 52 43 53  48 58 49 59 4a 5a 4b 5b
44
0
  // 44 54 45 55 46 56 47 57  4c 5c 4d 5d 4e 5e 4f 5f
45
0
  // 60 70 61 71 62 72 63 73  68 78 69 79 6a 7a 6b 7b
46
0
  // 64 74 65 75 66 76 67 77  6c 7c 6d 7d 6e 7e 6f 7f
47
0
48
0
  // 80 90 81 91 82 92 83 93  88 98 89 99 8a 9a 8b 9b
49
0
  // 84 94 85 95 86 96 87 97  8c 9c 8d 9d 8e 9e 8f 9f
50
0
  // a0 b0 a1 b1 a2 b2 a3 b3  a8 b8 a9 b9 aa ba ab bb
51
0
  // a4 b4 a5 b5 a6 b6 a7 b7  ac bc ad bd ae be af bf
52
0
  // c0 d0 c1 d1 c2 d2 c3 d3  c8 d8 c9 d9 ca da cb db
53
0
  // c4 d4 c5 d5 c6 d6 c7 d7  cc dc cd dd ce de cf df
54
0
  // e0 f0 e1 f1 e2 f2 e3 f3  e8 f8 e9 f9 ea fa eb fb
55
0
  // e4 f4 e5 f5 e6 f6 e7 f7  ec fc ed fd ee fe ef ff
56
0
57
0
  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
58
0
  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
59
0
  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
60
0
  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
61
0
  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
62
0
  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
63
0
  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
64
0
  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
65
0
66
0
  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
67
0
  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
68
0
  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
69
0
  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
70
0
  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
71
0
  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
72
0
  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
73
0
  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
74
0
75
0
  // 00 10 20 30 01 11 21 31  08 18 28 38 09 19 29 39
76
0
  // 02 12 22 32 03 13 23 33  0a 1a 2a 3a 0b 1b 2b 3b
77
0
  // 04 14 24 34 05 15 25 35  0c 1c 2c 3c 0d 1d 2d 3d
78
0
  // 06 16 26 36 07 17 27 37  0e 1e 2e 3e 0f 1f 2f 3f
79
0
  // 40 50 60 70 41 51 61 71  48 58 68 78 49 59 69 79
80
0
  // 42 52 62 72 43 53 63 73  4a 5a 6a 7a 4b 5b 6b 7b
81
0
  // 44 54 64 74 45 55 65 75  4c 5c 6c 7c 4d 5d 6d 7d
82
0
  // 46 56 66 76 47 57 67 77  4e 5e 6e 7e 4f 5f 6f 7f
83
0
84
0
  // 80 90 a0 b0 81 91 a1 b1  88 98 a8 b8 89 99 a9 b9
85
0
  // 82 92 a2 b2 83 93 a3 b3  8a 9a aa ba 8b 9b ab bb
86
0
  // 84 94 a4 b4 85 95 a5 b5  8c 9c ac bc 8d 9d ad bd
87
0
  // 86 96 a6 b6 87 97 a7 b7  8e 9e ae be 8f 9f af bf
88
0
  // c0 d0 e0 f0 c1 d1 e1 f1  c8 d8 e8 f8 c9 d9 e9 f9
89
0
  // c2 d2 e2 f2 c3 d3 e3 f3  ca da ea fa cb db eb fb
90
0
  // c4 d4 e4 f4 c5 d5 e5 f5  cc dc ec fc cd dd ed fd
91
0
  // c6 d6 e6 f6 c7 d7 e7 f7  ce de ee fe cf df ef ff
92
0
93
0
  tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
94
0
  tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
95
0
  tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
96
0
  tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
97
0
  tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
98
0
  tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
99
0
  tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
100
0
  tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
101
0
102
0
  tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
103
0
  tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
104
0
  tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
105
0
  tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
106
0
  tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
107
0
  tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
108
0
  tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
109
0
  tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
110
0
111
0
  // 00 10 20 30 40 50 60 70  08 18 28 38 48 58 68 78
112
0
  // 01 11 21 31 41 51 61 71  09 19 29 39 49 59 69 79
113
0
  // 02 12 22 32 42 52 62 72  0a 1a 2a 3a 4a 5a 6a 7a
114
0
  // 03 13 23 33 43 53 63 73  0b 1b 2b 3b 4b 5b 6b 7b
115
0
  // 04 14 24 34 44 54 64 74  0c 1c 2c 3c 4c 5c 6c 7c
116
0
  // 05 15 25 35 45 55 65 75  0d 1d 2d 3d 4d 5d 6d 7d
117
0
  // 06 16 26 36 46 56 66 76  0e 1e 2e 3e 4e 5e 6e 7e
118
0
  // 07 17 27 37 47 57 67 77  0f 1f 2f 3f 4f 5f 6f 7f
119
0
120
0
  // 80 90 a0 b0 c0 d0 e0 f0  88 98 a8 b8 c8 d8 e8 f8
121
0
  // 81 91 a1 b1 c1 d1 e1 f1  89 99 a9 b9 c9 d9 e9 f9
122
0
  // 82 92 a2 b2 c2 d2 e2 f2  8a 9a aa ba ca da ea fa
123
0
  // 83 93 a3 b3 c3 d3 e3 f3  8b 9b ab bb cb db eb fb
124
0
  // 84 94 a4 b4 c4 d4 e4 f4  8c 9c ac bc cc dc ec fc
125
0
  // 85 95 a5 b5 c5 d5 e5 f5  8d 9d ad bd cd dd ed fd
126
0
  // 86 96 a6 b6 c6 d6 e6 f6  8e 9e ae be ce de ee fe
127
0
  // 87 97 a7 b7 c7 d7 e7 f7  8f 9f af bf cf df ef ff
128
0
129
0
  out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20);  // 0010 0000
130
0
  out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31);  // 0011 0001
131
0
  out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
132
0
  out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
133
0
  out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
134
0
  out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
135
0
  out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
136
0
  out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
137
0
138
0
  out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
139
0
  out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
140
0
  out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
141
0
  out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
142
0
  out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
143
0
  out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
144
0
  out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
145
0
  out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
146
0
}
147
#endif  // AOM_AOM_DSP_X86_COMMON_AVX2_H_