Coverage Report

Created: 2025-06-13 07:07

/src/aom/aom_dsp/x86/intrapred_utils.h
Line
Count
Source
1
/*
2
 * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
3
 *
4
 * This source code is subject to the terms of the BSD 2 Clause License and
5
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6
 * was not distributed with this source code in the LICENSE file, you can
7
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8
 * Media Patent License 1.0 was not distributed with this source code in the
9
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10
 */
11
#ifndef AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
12
#define AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
13
14
#include <emmintrin.h>  // SSE2
15
#include "aom/aom_integer.h"
16
#include "config/aom_config.h"
17
#include "config/aom_dsp_rtcd.h"
18
19
static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
20
  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
21
  { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
22
  { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
23
  { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
24
  { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
25
  { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
26
  { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
27
  { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
28
};
29
30
static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = {
31
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
32
  { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
33
  { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
34
  { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
35
  { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
36
  { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
37
  { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
38
  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
39
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 },
40
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 },
41
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 },
42
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 },
43
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 },
44
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 },
45
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
46
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
47
};
48
49
static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = {
50
  { -1, 0, 0, 0, 0, 0, 0, 0 },       { -1, -1, 0, 0, 0, 0, 0, 0 },
51
  { -1, -1, -1, 0, 0, 0, 0, 0 },     { -1, -1, -1, -1, 0, 0, 0, 0 },
52
  { -1, -1, -1, -1, -1, 0, 0, 0 },   { -1, -1, -1, -1, -1, -1, 0, 0 },
53
  { -1, -1, -1, -1, -1, -1, -1, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 },
54
};
55
56
17.3k
static inline void transpose4x16_sse2(__m128i *x, __m128i *d) {
57
17.3k
  __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
58
17.3k
  w0 = _mm_unpacklo_epi8(x[0], x[1]);
59
17.3k
  w1 = _mm_unpacklo_epi8(x[2], x[3]);
60
17.3k
  w2 = _mm_unpackhi_epi8(x[0], x[1]);
61
17.3k
  w3 = _mm_unpackhi_epi8(x[2], x[3]);
62
63
17.3k
  ww0 = _mm_unpacklo_epi16(w0, w1);
64
17.3k
  ww1 = _mm_unpacklo_epi16(w2, w3);
65
17.3k
  ww2 = _mm_unpackhi_epi16(w0, w1);
66
17.3k
  ww3 = _mm_unpackhi_epi16(w2, w3);
67
68
17.3k
  w0 = _mm_unpacklo_epi32(ww0, ww1);
69
17.3k
  w2 = _mm_unpacklo_epi32(ww2, ww3);
70
17.3k
  w1 = _mm_unpackhi_epi32(ww0, ww1);
71
17.3k
  w3 = _mm_unpackhi_epi32(ww2, ww3);
72
73
17.3k
  d[0] = _mm_unpacklo_epi64(w0, w2);
74
17.3k
  d[1] = _mm_unpackhi_epi64(w0, w2);
75
17.3k
  d[2] = _mm_unpacklo_epi64(w1, w3);
76
17.3k
  d[3] = _mm_unpackhi_epi64(w1, w3);
77
78
17.3k
  d[4] = _mm_srli_si128(d[0], 8);
79
17.3k
  d[5] = _mm_srli_si128(d[1], 8);
80
17.3k
  d[6] = _mm_srli_si128(d[2], 8);
81
17.3k
  d[7] = _mm_srli_si128(d[3], 8);
82
83
17.3k
  d[8] = _mm_srli_si128(d[0], 4);
84
17.3k
  d[9] = _mm_srli_si128(d[1], 4);
85
17.3k
  d[10] = _mm_srli_si128(d[2], 4);
86
17.3k
  d[11] = _mm_srli_si128(d[3], 4);
87
88
17.3k
  d[12] = _mm_srli_si128(d[0], 12);
89
17.3k
  d[13] = _mm_srli_si128(d[1], 12);
90
17.3k
  d[14] = _mm_srli_si128(d[2], 12);
91
17.3k
  d[15] = _mm_srli_si128(d[3], 12);
92
17.3k
}
Unexecuted instantiation: intrapred_sse4.c:transpose4x16_sse2
intrapred_avx2.c:transpose4x16_sse2
Line
Count
Source
56
17.3k
static inline void transpose4x16_sse2(__m128i *x, __m128i *d) {
57
17.3k
  __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
58
17.3k
  w0 = _mm_unpacklo_epi8(x[0], x[1]);
59
17.3k
  w1 = _mm_unpacklo_epi8(x[2], x[3]);
60
17.3k
  w2 = _mm_unpackhi_epi8(x[0], x[1]);
61
17.3k
  w3 = _mm_unpackhi_epi8(x[2], x[3]);
62
63
17.3k
  ww0 = _mm_unpacklo_epi16(w0, w1);
64
17.3k
  ww1 = _mm_unpacklo_epi16(w2, w3);
65
17.3k
  ww2 = _mm_unpackhi_epi16(w0, w1);
66
17.3k
  ww3 = _mm_unpackhi_epi16(w2, w3);
67
68
17.3k
  w0 = _mm_unpacklo_epi32(ww0, ww1);
69
17.3k
  w2 = _mm_unpacklo_epi32(ww2, ww3);
70
17.3k
  w1 = _mm_unpackhi_epi32(ww0, ww1);
71
17.3k
  w3 = _mm_unpackhi_epi32(ww2, ww3);
72
73
17.3k
  d[0] = _mm_unpacklo_epi64(w0, w2);
74
17.3k
  d[1] = _mm_unpackhi_epi64(w0, w2);
75
17.3k
  d[2] = _mm_unpacklo_epi64(w1, w3);
76
17.3k
  d[3] = _mm_unpackhi_epi64(w1, w3);
77
78
17.3k
  d[4] = _mm_srli_si128(d[0], 8);
79
17.3k
  d[5] = _mm_srli_si128(d[1], 8);
80
17.3k
  d[6] = _mm_srli_si128(d[2], 8);
81
17.3k
  d[7] = _mm_srli_si128(d[3], 8);
82
83
17.3k
  d[8] = _mm_srli_si128(d[0], 4);
84
17.3k
  d[9] = _mm_srli_si128(d[1], 4);
85
17.3k
  d[10] = _mm_srli_si128(d[2], 4);
86
17.3k
  d[11] = _mm_srli_si128(d[3], 4);
87
88
17.3k
  d[12] = _mm_srli_si128(d[0], 12);
89
17.3k
  d[13] = _mm_srli_si128(d[1], 12);
90
17.3k
  d[14] = _mm_srli_si128(d[2], 12);
91
17.3k
  d[15] = _mm_srli_si128(d[3], 12);
92
17.3k
}
93
94
615k
static inline void transpose16x16_sse2(__m128i *x, __m128i *d) {
95
615k
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
96
615k
  __m128i w10, w11, w12, w13, w14, w15;
97
98
615k
  w0 = _mm_unpacklo_epi8(x[0], x[1]);
99
615k
  w1 = _mm_unpacklo_epi8(x[2], x[3]);
100
615k
  w2 = _mm_unpacklo_epi8(x[4], x[5]);
101
615k
  w3 = _mm_unpacklo_epi8(x[6], x[7]);
102
103
615k
  w8 = _mm_unpacklo_epi8(x[8], x[9]);
104
615k
  w9 = _mm_unpacklo_epi8(x[10], x[11]);
105
615k
  w10 = _mm_unpacklo_epi8(x[12], x[13]);
106
615k
  w11 = _mm_unpacklo_epi8(x[14], x[15]);
107
108
615k
  w4 = _mm_unpacklo_epi16(w0, w1);
109
615k
  w5 = _mm_unpacklo_epi16(w2, w3);
110
615k
  w12 = _mm_unpacklo_epi16(w8, w9);
111
615k
  w13 = _mm_unpacklo_epi16(w10, w11);
112
113
615k
  w6 = _mm_unpacklo_epi32(w4, w5);
114
615k
  w7 = _mm_unpackhi_epi32(w4, w5);
115
615k
  w14 = _mm_unpacklo_epi32(w12, w13);
116
615k
  w15 = _mm_unpackhi_epi32(w12, w13);
117
118
  // Store first 4-line result
119
615k
  d[0] = _mm_unpacklo_epi64(w6, w14);
120
615k
  d[1] = _mm_unpackhi_epi64(w6, w14);
121
615k
  d[2] = _mm_unpacklo_epi64(w7, w15);
122
615k
  d[3] = _mm_unpackhi_epi64(w7, w15);
123
124
615k
  w4 = _mm_unpackhi_epi16(w0, w1);
125
615k
  w5 = _mm_unpackhi_epi16(w2, w3);
126
615k
  w12 = _mm_unpackhi_epi16(w8, w9);
127
615k
  w13 = _mm_unpackhi_epi16(w10, w11);
128
129
615k
  w6 = _mm_unpacklo_epi32(w4, w5);
130
615k
  w7 = _mm_unpackhi_epi32(w4, w5);
131
615k
  w14 = _mm_unpacklo_epi32(w12, w13);
132
615k
  w15 = _mm_unpackhi_epi32(w12, w13);
133
134
  // Store second 4-line result
135
615k
  d[4] = _mm_unpacklo_epi64(w6, w14);
136
615k
  d[5] = _mm_unpackhi_epi64(w6, w14);
137
615k
  d[6] = _mm_unpacklo_epi64(w7, w15);
138
615k
  d[7] = _mm_unpackhi_epi64(w7, w15);
139
140
  // upper half
141
615k
  w0 = _mm_unpackhi_epi8(x[0], x[1]);
142
615k
  w1 = _mm_unpackhi_epi8(x[2], x[3]);
143
615k
  w2 = _mm_unpackhi_epi8(x[4], x[5]);
144
615k
  w3 = _mm_unpackhi_epi8(x[6], x[7]);
145
146
615k
  w8 = _mm_unpackhi_epi8(x[8], x[9]);
147
615k
  w9 = _mm_unpackhi_epi8(x[10], x[11]);
148
615k
  w10 = _mm_unpackhi_epi8(x[12], x[13]);
149
615k
  w11 = _mm_unpackhi_epi8(x[14], x[15]);
150
151
615k
  w4 = _mm_unpacklo_epi16(w0, w1);
152
615k
  w5 = _mm_unpacklo_epi16(w2, w3);
153
615k
  w12 = _mm_unpacklo_epi16(w8, w9);
154
615k
  w13 = _mm_unpacklo_epi16(w10, w11);
155
156
615k
  w6 = _mm_unpacklo_epi32(w4, w5);
157
615k
  w7 = _mm_unpackhi_epi32(w4, w5);
158
615k
  w14 = _mm_unpacklo_epi32(w12, w13);
159
615k
  w15 = _mm_unpackhi_epi32(w12, w13);
160
161
  // Store first 4-line result
162
615k
  d[8] = _mm_unpacklo_epi64(w6, w14);
163
615k
  d[9] = _mm_unpackhi_epi64(w6, w14);
164
615k
  d[10] = _mm_unpacklo_epi64(w7, w15);
165
615k
  d[11] = _mm_unpackhi_epi64(w7, w15);
166
167
615k
  w4 = _mm_unpackhi_epi16(w0, w1);
168
615k
  w5 = _mm_unpackhi_epi16(w2, w3);
169
615k
  w12 = _mm_unpackhi_epi16(w8, w9);
170
615k
  w13 = _mm_unpackhi_epi16(w10, w11);
171
172
615k
  w6 = _mm_unpacklo_epi32(w4, w5);
173
615k
  w7 = _mm_unpackhi_epi32(w4, w5);
174
615k
  w14 = _mm_unpacklo_epi32(w12, w13);
175
615k
  w15 = _mm_unpackhi_epi32(w12, w13);
176
177
  // Store second 4-line result
178
615k
  d[12] = _mm_unpacklo_epi64(w6, w14);
179
615k
  d[13] = _mm_unpackhi_epi64(w6, w14);
180
615k
  d[14] = _mm_unpacklo_epi64(w7, w15);
181
615k
  d[15] = _mm_unpackhi_epi64(w7, w15);
182
615k
}
Unexecuted instantiation: intrapred_sse4.c:transpose16x16_sse2
intrapred_avx2.c:transpose16x16_sse2
Line
Count
Source
94
615k
static inline void transpose16x16_sse2(__m128i *x, __m128i *d) {
95
615k
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
96
615k
  __m128i w10, w11, w12, w13, w14, w15;
97
98
615k
  w0 = _mm_unpacklo_epi8(x[0], x[1]);
99
615k
  w1 = _mm_unpacklo_epi8(x[2], x[3]);
100
615k
  w2 = _mm_unpacklo_epi8(x[4], x[5]);
101
615k
  w3 = _mm_unpacklo_epi8(x[6], x[7]);
102
103
615k
  w8 = _mm_unpacklo_epi8(x[8], x[9]);
104
615k
  w9 = _mm_unpacklo_epi8(x[10], x[11]);
105
615k
  w10 = _mm_unpacklo_epi8(x[12], x[13]);
106
615k
  w11 = _mm_unpacklo_epi8(x[14], x[15]);
107
108
615k
  w4 = _mm_unpacklo_epi16(w0, w1);
109
615k
  w5 = _mm_unpacklo_epi16(w2, w3);
110
615k
  w12 = _mm_unpacklo_epi16(w8, w9);
111
615k
  w13 = _mm_unpacklo_epi16(w10, w11);
112
113
615k
  w6 = _mm_unpacklo_epi32(w4, w5);
114
615k
  w7 = _mm_unpackhi_epi32(w4, w5);
115
615k
  w14 = _mm_unpacklo_epi32(w12, w13);
116
615k
  w15 = _mm_unpackhi_epi32(w12, w13);
117
118
  // Store first 4-line result
119
615k
  d[0] = _mm_unpacklo_epi64(w6, w14);
120
615k
  d[1] = _mm_unpackhi_epi64(w6, w14);
121
615k
  d[2] = _mm_unpacklo_epi64(w7, w15);
122
615k
  d[3] = _mm_unpackhi_epi64(w7, w15);
123
124
615k
  w4 = _mm_unpackhi_epi16(w0, w1);
125
615k
  w5 = _mm_unpackhi_epi16(w2, w3);
126
615k
  w12 = _mm_unpackhi_epi16(w8, w9);
127
615k
  w13 = _mm_unpackhi_epi16(w10, w11);
128
129
615k
  w6 = _mm_unpacklo_epi32(w4, w5);
130
615k
  w7 = _mm_unpackhi_epi32(w4, w5);
131
615k
  w14 = _mm_unpacklo_epi32(w12, w13);
132
615k
  w15 = _mm_unpackhi_epi32(w12, w13);
133
134
  // Store second 4-line result
135
615k
  d[4] = _mm_unpacklo_epi64(w6, w14);
136
615k
  d[5] = _mm_unpackhi_epi64(w6, w14);
137
615k
  d[6] = _mm_unpacklo_epi64(w7, w15);
138
615k
  d[7] = _mm_unpackhi_epi64(w7, w15);
139
140
  // upper half
141
615k
  w0 = _mm_unpackhi_epi8(x[0], x[1]);
142
615k
  w1 = _mm_unpackhi_epi8(x[2], x[3]);
143
615k
  w2 = _mm_unpackhi_epi8(x[4], x[5]);
144
615k
  w3 = _mm_unpackhi_epi8(x[6], x[7]);
145
146
615k
  w8 = _mm_unpackhi_epi8(x[8], x[9]);
147
615k
  w9 = _mm_unpackhi_epi8(x[10], x[11]);
148
615k
  w10 = _mm_unpackhi_epi8(x[12], x[13]);
149
615k
  w11 = _mm_unpackhi_epi8(x[14], x[15]);
150
151
615k
  w4 = _mm_unpacklo_epi16(w0, w1);
152
615k
  w5 = _mm_unpacklo_epi16(w2, w3);
153
615k
  w12 = _mm_unpacklo_epi16(w8, w9);
154
615k
  w13 = _mm_unpacklo_epi16(w10, w11);
155
156
615k
  w6 = _mm_unpacklo_epi32(w4, w5);
157
615k
  w7 = _mm_unpackhi_epi32(w4, w5);
158
615k
  w14 = _mm_unpacklo_epi32(w12, w13);
159
615k
  w15 = _mm_unpackhi_epi32(w12, w13);
160
161
  // Store first 4-line result
162
615k
  d[8] = _mm_unpacklo_epi64(w6, w14);
163
615k
  d[9] = _mm_unpackhi_epi64(w6, w14);
164
615k
  d[10] = _mm_unpacklo_epi64(w7, w15);
165
615k
  d[11] = _mm_unpackhi_epi64(w7, w15);
166
167
615k
  w4 = _mm_unpackhi_epi16(w0, w1);
168
615k
  w5 = _mm_unpackhi_epi16(w2, w3);
169
615k
  w12 = _mm_unpackhi_epi16(w8, w9);
170
615k
  w13 = _mm_unpackhi_epi16(w10, w11);
171
172
615k
  w6 = _mm_unpacklo_epi32(w4, w5);
173
615k
  w7 = _mm_unpackhi_epi32(w4, w5);
174
615k
  w14 = _mm_unpacklo_epi32(w12, w13);
175
615k
  w15 = _mm_unpackhi_epi32(w12, w13);
176
177
  // Store second 4-line result
178
615k
  d[12] = _mm_unpacklo_epi64(w6, w14);
179
615k
  d[13] = _mm_unpackhi_epi64(w6, w14);
180
615k
  d[14] = _mm_unpacklo_epi64(w7, w15);
181
615k
  d[15] = _mm_unpackhi_epi64(w7, w15);
182
615k
}
183
184
static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
185
420k
                               uint8_t *dst, ptrdiff_t pitchDst) {
186
420k
  __m128i r[16];
187
420k
  __m128i d[16];
188
7.14M
  for (int j = 0; j < 16; j++) {
189
6.72M
    r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc));
190
6.72M
  }
191
420k
  transpose16x16_sse2(r, d);
192
7.14M
  for (int j = 0; j < 16; j++) {
193
6.72M
    _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
194
6.72M
  }
195
420k
}
Unexecuted instantiation: intrapred_sse4.c:transpose_TX_16X16
intrapred_avx2.c:transpose_TX_16X16
Line
Count
Source
185
420k
                               uint8_t *dst, ptrdiff_t pitchDst) {
186
420k
  __m128i r[16];
187
420k
  __m128i d[16];
188
7.14M
  for (int j = 0; j < 16; j++) {
189
6.72M
    r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc));
190
6.72M
  }
191
420k
  transpose16x16_sse2(r, d);
192
7.14M
  for (int j = 0; j < 16; j++) {
193
6.72M
    _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
194
6.72M
  }
195
420k
}
196
197
static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
198
31.6k
                      ptrdiff_t pitchDst, int width, int height) {
199
152k
  for (int j = 0; j < height; j += 16)
200
541k
    for (int i = 0; i < width; i += 16)
201
420k
      transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
202
420k
                         dst + j * pitchDst + i, pitchDst);
203
31.6k
}
Unexecuted instantiation: intrapred_sse4.c:transpose
intrapred_avx2.c:transpose
Line
Count
Source
198
31.6k
                      ptrdiff_t pitchDst, int width, int height) {
199
152k
  for (int j = 0; j < height; j += 16)
200
541k
    for (int i = 0; i < width; i += 16)
201
420k
      transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
202
420k
                         dst + j * pitchDst + i, pitchDst);
203
31.6k
}
204
205
#endif  // AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_