/src/libde265/libde265/x86/transform-avx2.cc
Line | Count | Source |
1 | | /* |
2 | | * H.265 video codec. |
3 | | * Copyright (c) 2026 Dirk Farin <dirk.farin@gmail.com> |
4 | | * |
5 | | * This file is part of libde265. |
6 | | * |
7 | | * libde265 is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as |
9 | | * published by the Free Software Foundation, either version 3 of |
10 | | * the License, or (at your option) any later version. |
11 | | * |
12 | | * libde265 is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libde265. If not, see <http://www.gnu.org/licenses/>. |
19 | | */ |
20 | | |
21 | | // AVX2 inverse transforms (16x16, 32x32) for 8-bit. |
22 | | // |
23 | | // These produce bit-identical results to the SSE4.1 / scalar kernels but |
24 | | // process all columns of a block in 256-bit registers (16 int16 per register) |
25 | | // instead of 8-column slabs. The per-column butterfly arithmetic matches the |
26 | | // SSE version: every 256-bit op acts independently on its two 128-bit halves, |
27 | | // so each half does exactly what the SSE code does for 8 columns. The only |
28 | | // genuinely new piece is a full 16x16 / 32x32 in-register transpose between the |
29 | | // vertical and horizontal passes (the SSE code only transposed 8x8 sub-blocks). |
30 | | // |
31 | | // The 32-point inverse transform's even part is itself a 16-point inverse |
32 | | // transform of the even-indexed input rows, so idct16_core() is reused for it. |
33 | | |
34 | | #include "x86/transform-avx2.h" |
35 | | |
36 | | #ifdef HAVE_CONFIG_H |
37 | | #include "config.h" |
38 | | #endif |
39 | | |
40 | | #if HAVE_AVX2 |
41 | | |
42 | | #include <immintrin.h> |
43 | | #include "x86/transform-dct-tables.h" // idct_T16_{1,2,3}, idct_T32 |
44 | | |
45 | | namespace { |
46 | | |
47 | 63.9k | #define shift_1st 7 /* right shift applied to the results of the first 1-D pass */ |
48 | 31.9k | #define add_1st (1 << (shift_1st - 1)) // 64: rounding offset added before the first-pass shift (half of 1 << shift_1st) |
49 | | // 8-bit: shift_2nd = 20 - 8 = 12, add_2nd = 1<<11 |
50 | 63.9k | #define shift_2nd 12 /* right shift applied to the results of the second 1-D pass */ |
51 | 31.9k | #define add_2nd (1 << (shift_2nd - 1)) // 2048: rounding offset added before the second-pass shift (half of 1 << shift_2nd) |
52 | | |
53 | 10.5M | static inline __m256i bc(const int16_t* p) { /* returns the 8 int16 values at p replicated into both 128-bit lanes of a ymm register */ |
54 | 10.5M | return _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*)p)); /* aligned 128-bit load: p must be 16-byte aligned */ |
55 | 10.5M | } |
56 | | |
57 | | // --- transposes ---------------------------------------------------------- |
58 | | |
59 | | // 8x8 int16 transpose performed independently in each 128-bit lane, in place: within a lane, a[i] holds row i of that lane's 8x8 block on entry and column i on return. |
60 | 188k | static inline void transpose8x8_lanes(__m256i a[8]) { |
61 | 188k | __m256i b0=_mm256_unpacklo_epi16(a[0],a[1]), b1=_mm256_unpackhi_epi16(a[0],a[1]); /* stage 1: interleave the int16 elements of adjacent row pairs */ |
62 | 188k | __m256i b2=_mm256_unpacklo_epi16(a[2],a[3]), b3=_mm256_unpackhi_epi16(a[2],a[3]); |
63 | 188k | __m256i b4=_mm256_unpacklo_epi16(a[4],a[5]), b5=_mm256_unpackhi_epi16(a[4],a[5]); |
64 | 188k | __m256i b6=_mm256_unpacklo_epi16(a[6],a[7]), b7=_mm256_unpackhi_epi16(a[6],a[7]); |
65 | 188k | __m256i c0=_mm256_unpacklo_epi32(b0,b2), c1=_mm256_unpackhi_epi32(b0,b2); /* stage 2: interleave 32-bit pairs, so each group of four int16 comes from rows 0-3 or rows 4-7 */ |
66 | 188k | __m256i c2=_mm256_unpacklo_epi32(b1,b3), c3=_mm256_unpackhi_epi32(b1,b3); |
67 | 188k | __m256i c4=_mm256_unpacklo_epi32(b4,b6), c5=_mm256_unpackhi_epi32(b4,b6); |
68 | 188k | __m256i c6=_mm256_unpacklo_epi32(b5,b7), c7=_mm256_unpackhi_epi32(b5,b7); |
69 | 188k | a[0]=_mm256_unpacklo_epi64(c0,c4); a[1]=_mm256_unpackhi_epi64(c0,c4); /* stage 3: join the rows 0-3 half with the rows 4-7 half, giving one full transposed row per register */ |
70 | 188k | a[2]=_mm256_unpacklo_epi64(c1,c5); a[3]=_mm256_unpackhi_epi64(c1,c5); |
71 | 188k | a[4]=_mm256_unpacklo_epi64(c2,c6); a[5]=_mm256_unpackhi_epi64(c2,c6); |
72 | 188k | a[6]=_mm256_unpacklo_epi64(c3,c7); a[7]=_mm256_unpackhi_epi64(c3,c7); |
73 | 188k | } |
74 | | |
75 | | // Full 16x16 int16 transpose of S[0..15] (each ymm = one row: lane0 cols0-7, |
76 | | // lane1 cols8-15): four 8x8 quadrant transposes + a lane swap. Operates in place on S. |
77 | 94.2k | static inline void transpose16x16(__m256i S[16]) { |
78 | 94.2k | __m256i top[8], bot[8]; |
79 | 847k | for (int r=0;r<8;r++){ top[r]=S[r]; bot[r]=S[r+8]; } /* top = rows 0-7, bot = rows 8-15 */ |
80 | 94.2k | transpose8x8_lanes(top); /* transposes the two upper 8x8 quadrants, one per lane */ |
81 | 94.2k | transpose8x8_lanes(bot); /* transposes the two lower 8x8 quadrants, one per lane */ |
82 | 847k | for (int r=0;r<8;r++){ |
83 | 753k | S[r] = _mm256_permute2x128_si256(top[r], bot[r], 0x20); /* 0x20: low lane of top followed by low lane of bot = transposed row r */ |
84 | 753k | S[r+8] = _mm256_permute2x128_si256(top[r], bot[r], 0x31); /* 0x31: high lane of top followed by high lane of bot = transposed row r+8 */ |
85 | 753k | } |
86 | 94.2k | } |
87 | | |
88 | | // Full 32x32 int16 transpose, in place. Lo[r]=cols0-15 of row r, Hi[r]=cols16-31. |
89 | | // Built from four 16x16 block transposes: out = [[A^T,C^T],[B^T,D^T]]. |
90 | 15.1k | static inline void transpose32x32(__m256i Lo[32], __m256i Hi[32]) { |
91 | 15.1k | __m256i A[16],B[16],C[16],D[16]; |
92 | 257k | for (int r=0;r<16;r++){ A[r]=Lo[r]; B[r]=Hi[r]; C[r]=Lo[r+16]; D[r]=Hi[r+16]; } /* split the input into quadrants [[A,B],[C,D]] */ |
93 | 15.1k | transpose16x16(A); transpose16x16(B); transpose16x16(C); transpose16x16(D); |
94 | 257k | for (int r=0;r<16;r++){ Lo[r]=A[r]; Hi[r]=C[r]; Lo[r+16]=B[r]; Hi[r+16]=D[r]; } /* the off-diagonal quadrants trade places: C^T goes top-right, B^T goes bottom-left */ |
95 | 15.1k | } |
96 | | |
97 | | // --- 16-point core ------------------------------------------------------- |
98 | | |
99 | | // Computes the 16 outputs of a 1-D 16-point inverse transform applied down the |
100 | | // 16 rows S[0..15], for 16 columns at once, as int32 (no rounding/shift). The |
101 | | // result lanes split lo = cols{0-3,8-11}, hi = cols{4-7,12-15} (recombined to |
102 | | // natural order by packs_epi32 in the caller). S is only read; output k is written to resL[k] / resH[k]. |
103 | 63.9k | static inline void idct16_core(const __m256i S[16], __m256i resL[16], __m256i resH[16]) { |
104 | 63.9k | const __m256i T00=bc(idct_T16_1[0][0]),T01=bc(idct_T16_1[0][1]),T02=bc(idct_T16_1[0][2]),T03=bc(idct_T16_1[0][3]); /* Tij: coefficients for odd row pair i (rows 4i+1, 4i+3) and output j; table layout is defined in transform-dct-tables.h, not visible here */ |
105 | 63.9k | const __m256i T04=bc(idct_T16_1[0][4]),T05=bc(idct_T16_1[0][5]),T06=bc(idct_T16_1[0][6]),T07=bc(idct_T16_1[0][7]); |
106 | 63.9k | const __m256i T10=bc(idct_T16_1[1][0]),T11=bc(idct_T16_1[1][1]),T12=bc(idct_T16_1[1][2]),T13=bc(idct_T16_1[1][3]); |
107 | 63.9k | const __m256i T14=bc(idct_T16_1[1][4]),T15=bc(idct_T16_1[1][5]),T16=bc(idct_T16_1[1][6]),T17=bc(idct_T16_1[1][7]); |
108 | 63.9k | const __m256i T20=bc(idct_T16_1[2][0]),T21=bc(idct_T16_1[2][1]),T22=bc(idct_T16_1[2][2]),T23=bc(idct_T16_1[2][3]); |
109 | 63.9k | const __m256i T24=bc(idct_T16_1[2][4]),T25=bc(idct_T16_1[2][5]),T26=bc(idct_T16_1[2][6]),T27=bc(idct_T16_1[2][7]); |
110 | 63.9k | const __m256i T30=bc(idct_T16_1[3][0]),T31=bc(idct_T16_1[3][1]),T32_=bc(idct_T16_1[3][2]),T33=bc(idct_T16_1[3][3]); |
111 | 63.9k | const __m256i T34=bc(idct_T16_1[3][4]),T35=bc(idct_T16_1[3][5]),T36=bc(idct_T16_1[3][6]),T37=bc(idct_T16_1[3][7]); |
112 | 63.9k | const __m256i U00=bc(idct_T16_2[0][0]),U01=bc(idct_T16_2[0][1]),U02=bc(idct_T16_2[0][2]),U03=bc(idct_T16_2[0][3]); /* Uij: coefficients for rows (2,6) when i=0 and rows (10,14) when i=1, even term j */ |
113 | 63.9k | const __m256i U10=bc(idct_T16_2[1][0]),U11=bc(idct_T16_2[1][1]),U12=bc(idct_T16_2[1][2]),U13=bc(idct_T16_2[1][3]); |
114 | 63.9k | const __m256i V00=bc(idct_T16_3[0][0]),V01=bc(idct_T16_3[0][1]),V10=bc(idct_T16_3[1][0]),V11=bc(idct_T16_3[1][1]); /* V0j applies to rows (4,12), V1j to rows (0,8) */ |
115 | | |
116 | 63.9k | __m256i m0,m1,m2,m3,m4,m5,m6,m7; |
117 | 63.9k | __m256i E0l,E1l,E2l,E3l,E0h,E1h,E2h,E3h; |
118 | 63.9k | __m256i O0l,O1l,O2l,O3l,O4l,O5l,O6l,O7l,O0h,O1h,O2h,O3h,O4h,O5h,O6h,O7h; |
119 | 63.9k | __m256i E00l,E01l,E00h,E01h,EE0l,EE1l,EE2l,EE3l,EE0h,EE1h,EE2h,EE3h; |
120 | 63.9k | __m256i E4l,E5l,E6l,E7l,E4h,E5h,E6h,E7h; |
121 | | |
122 | 63.9k | m0=_mm256_unpacklo_epi16(S[1],S[3]); E0l=_mm256_madd_epi16(m0,T00); /* odd part: interleave two odd rows so one madd yields rowA*cA + rowB*cB per column as int32; E0..E3 are scratch here */ |
123 | 63.9k | m1=_mm256_unpackhi_epi16(S[1],S[3]); E0h=_mm256_madd_epi16(m1,T00); |
124 | 63.9k | m2=_mm256_unpacklo_epi16(S[5],S[7]); E1l=_mm256_madd_epi16(m2,T10); |
125 | 63.9k | m3=_mm256_unpackhi_epi16(S[5],S[7]); E1h=_mm256_madd_epi16(m3,T10); |
126 | 63.9k | m4=_mm256_unpacklo_epi16(S[9],S[11]); E2l=_mm256_madd_epi16(m4,T20); |
127 | 63.9k | m5=_mm256_unpackhi_epi16(S[9],S[11]); E2h=_mm256_madd_epi16(m5,T20); |
128 | 63.9k | m6=_mm256_unpacklo_epi16(S[13],S[15]);E3l=_mm256_madd_epi16(m6,T30); |
129 | 63.9k | m7=_mm256_unpackhi_epi16(S[13],S[15]);E3h=_mm256_madd_epi16(m7,T30); |
130 | 63.9k | O0l=_mm256_add_epi32(_mm256_add_epi32(E0l,E1l),_mm256_add_epi32(E2l,E3l)); /* O0 = sum of the four row-pair products; O1..O7 below repeat this with the next coefficient column, reusing m0..m7 */ |
131 | 63.9k | O0h=_mm256_add_epi32(_mm256_add_epi32(E0h,E1h),_mm256_add_epi32(E2h,E3h)); |
132 | | |
133 | 63.9k | E0l=_mm256_madd_epi16(m0,T01);E0h=_mm256_madd_epi16(m1,T01);E1l=_mm256_madd_epi16(m2,T11);E1h=_mm256_madd_epi16(m3,T11); |
134 | 63.9k | E2l=_mm256_madd_epi16(m4,T21);E2h=_mm256_madd_epi16(m5,T21);E3l=_mm256_madd_epi16(m6,T31);E3h=_mm256_madd_epi16(m7,T31); |
135 | 63.9k | O1l=_mm256_add_epi32(_mm256_add_epi32(E0l,E1l),_mm256_add_epi32(E2l,E3l)); |
136 | 63.9k | O1h=_mm256_add_epi32(_mm256_add_epi32(E0h,E1h),_mm256_add_epi32(E2h,E3h)); |
137 | | |
138 | 63.9k | E0l=_mm256_madd_epi16(m0,T02);E0h=_mm256_madd_epi16(m1,T02);E1l=_mm256_madd_epi16(m2,T12);E1h=_mm256_madd_epi16(m3,T12); |
139 | 63.9k | E2l=_mm256_madd_epi16(m4,T22);E2h=_mm256_madd_epi16(m5,T22);E3l=_mm256_madd_epi16(m6,T32_);E3h=_mm256_madd_epi16(m7,T32_); |
140 | 63.9k | O2l=_mm256_add_epi32(_mm256_add_epi32(E0l,E1l),_mm256_add_epi32(E2l,E3l)); |
141 | 63.9k | O2h=_mm256_add_epi32(_mm256_add_epi32(E0h,E1h),_mm256_add_epi32(E2h,E3h)); |
142 | | |
143 | 63.9k | E0l=_mm256_madd_epi16(m0,T03);E0h=_mm256_madd_epi16(m1,T03);E1l=_mm256_madd_epi16(m2,T13);E1h=_mm256_madd_epi16(m3,T13); |
144 | 63.9k | E2l=_mm256_madd_epi16(m4,T23);E2h=_mm256_madd_epi16(m5,T23);E3l=_mm256_madd_epi16(m6,T33);E3h=_mm256_madd_epi16(m7,T33); |
145 | 63.9k | O3l=_mm256_add_epi32(_mm256_add_epi32(E0l,E1l),_mm256_add_epi32(E2l,E3l)); |
146 | 63.9k | O3h=_mm256_add_epi32(_mm256_add_epi32(E0h,E1h),_mm256_add_epi32(E2h,E3h)); |
147 | | |
148 | 63.9k | E0l=_mm256_madd_epi16(m0,T04);E0h=_mm256_madd_epi16(m1,T04);E1l=_mm256_madd_epi16(m2,T14);E1h=_mm256_madd_epi16(m3,T14); |
149 | 63.9k | E2l=_mm256_madd_epi16(m4,T24);E2h=_mm256_madd_epi16(m5,T24);E3l=_mm256_madd_epi16(m6,T34);E3h=_mm256_madd_epi16(m7,T34); |
150 | 63.9k | O4l=_mm256_add_epi32(_mm256_add_epi32(E0l,E1l),_mm256_add_epi32(E2l,E3l)); |
151 | 63.9k | O4h=_mm256_add_epi32(_mm256_add_epi32(E0h,E1h),_mm256_add_epi32(E2h,E3h)); |
152 | | |
153 | 63.9k | E0l=_mm256_madd_epi16(m0,T05);E0h=_mm256_madd_epi16(m1,T05);E1l=_mm256_madd_epi16(m2,T15);E1h=_mm256_madd_epi16(m3,T15); |
154 | 63.9k | E2l=_mm256_madd_epi16(m4,T25);E2h=_mm256_madd_epi16(m5,T25);E3l=_mm256_madd_epi16(m6,T35);E3h=_mm256_madd_epi16(m7,T35); |
155 | 63.9k | O5l=_mm256_add_epi32(_mm256_add_epi32(E0l,E1l),_mm256_add_epi32(E2l,E3l)); |
156 | 63.9k | O5h=_mm256_add_epi32(_mm256_add_epi32(E0h,E1h),_mm256_add_epi32(E2h,E3h)); |
157 | | |
158 | 63.9k | E0l=_mm256_madd_epi16(m0,T06);E0h=_mm256_madd_epi16(m1,T06);E1l=_mm256_madd_epi16(m2,T16);E1h=_mm256_madd_epi16(m3,T16); |
159 | 63.9k | E2l=_mm256_madd_epi16(m4,T26);E2h=_mm256_madd_epi16(m5,T26);E3l=_mm256_madd_epi16(m6,T36);E3h=_mm256_madd_epi16(m7,T36); |
160 | 63.9k | O6l=_mm256_add_epi32(_mm256_add_epi32(E0l,E1l),_mm256_add_epi32(E2l,E3l)); |
161 | 63.9k | O6h=_mm256_add_epi32(_mm256_add_epi32(E0h,E1h),_mm256_add_epi32(E2h,E3h)); |
162 | | |
163 | 63.9k | E0l=_mm256_madd_epi16(m0,T07);E0h=_mm256_madd_epi16(m1,T07);E1l=_mm256_madd_epi16(m2,T17);E1h=_mm256_madd_epi16(m3,T17); |
164 | 63.9k | E2l=_mm256_madd_epi16(m4,T27);E2h=_mm256_madd_epi16(m5,T27);E3l=_mm256_madd_epi16(m6,T37);E3h=_mm256_madd_epi16(m7,T37); |
165 | 63.9k | O7l=_mm256_add_epi32(_mm256_add_epi32(E0l,E1l),_mm256_add_epi32(E2l,E3l)); |
166 | 63.9k | O7h=_mm256_add_epi32(_mm256_add_epi32(E0h,E1h),_mm256_add_epi32(E2h,E3h)); |
167 | | |
168 | | // even part: rows 2,6,10,14 give four terms E0..E3; m0..m3 are reused from here on |
169 | 63.9k | m0=_mm256_unpacklo_epi16(S[2],S[6]); E0l=_mm256_madd_epi16(m0,U00); |
170 | 63.9k | m1=_mm256_unpackhi_epi16(S[2],S[6]); E0h=_mm256_madd_epi16(m1,U00); |
171 | 63.9k | m2=_mm256_unpacklo_epi16(S[10],S[14]);E0l=_mm256_add_epi32(E0l,_mm256_madd_epi16(m2,U10)); |
172 | 63.9k | m3=_mm256_unpackhi_epi16(S[10],S[14]);E0h=_mm256_add_epi32(E0h,_mm256_madd_epi16(m3,U10)); |
173 | 63.9k | E1l=_mm256_madd_epi16(m0,U01);E1h=_mm256_madd_epi16(m1,U01); |
174 | 63.9k | E1l=_mm256_add_epi32(E1l,_mm256_madd_epi16(m2,U11));E1h=_mm256_add_epi32(E1h,_mm256_madd_epi16(m3,U11)); |
175 | 63.9k | E2l=_mm256_madd_epi16(m0,U02);E2h=_mm256_madd_epi16(m1,U02); |
176 | 63.9k | E2l=_mm256_add_epi32(E2l,_mm256_madd_epi16(m2,U12));E2h=_mm256_add_epi32(E2h,_mm256_madd_epi16(m3,U12)); |
177 | 63.9k | E3l=_mm256_madd_epi16(m0,U03);E3h=_mm256_madd_epi16(m1,U03); |
178 | 63.9k | E3l=_mm256_add_epi32(E3l,_mm256_madd_epi16(m2,U13));E3h=_mm256_add_epi32(E3h,_mm256_madd_epi16(m3,U13)); |
179 | | |
180 | 63.9k | m0=_mm256_unpacklo_epi16(S[4],S[12]); E00l=_mm256_madd_epi16(m0,V00); /* rows 4,12 give E00/E01; rows 0,8 give EE0/EE1 */ |
181 | 63.9k | m1=_mm256_unpackhi_epi16(S[4],S[12]); E00h=_mm256_madd_epi16(m1,V00); |
182 | 63.9k | m2=_mm256_unpacklo_epi16(S[0],S[8]); EE0l=_mm256_madd_epi16(m2,V10); |
183 | 63.9k | m3=_mm256_unpackhi_epi16(S[0],S[8]); EE0h=_mm256_madd_epi16(m3,V10); |
184 | 63.9k | E01l=_mm256_madd_epi16(m0,V01);E01h=_mm256_madd_epi16(m1,V01); |
185 | 63.9k | EE1l=_mm256_madd_epi16(m2,V11);EE1h=_mm256_madd_epi16(m3,V11); |
186 | | |
187 | 63.9k | EE2l=_mm256_sub_epi32(EE1l,E01l);EE3l=_mm256_sub_epi32(EE0l,E00l); /* butterfly: the differences must be taken before EE0/EE1 are overwritten with the sums below */ |
188 | 63.9k | EE2h=_mm256_sub_epi32(EE1h,E01h);EE3h=_mm256_sub_epi32(EE0h,E00h); |
189 | 63.9k | EE0l=_mm256_add_epi32(EE0l,E00l);EE1l=_mm256_add_epi32(EE1l,E01l); |
190 | 63.9k | EE0h=_mm256_add_epi32(EE0h,E00h);EE1h=_mm256_add_epi32(EE1h,E01h); |
191 | | |
192 | 63.9k | E4l=_mm256_sub_epi32(EE3l,E3l); E5l=_mm256_sub_epi32(EE2l,E2l); /* second butterfly: E[7-k] = EE[k] - E[k] first, then E[k] = EE[k] + E[k] in place */ |
193 | 63.9k | E6l=_mm256_sub_epi32(EE1l,E1l); E7l=_mm256_sub_epi32(EE0l,E0l); |
194 | 63.9k | E4h=_mm256_sub_epi32(EE3h,E3h); E5h=_mm256_sub_epi32(EE2h,E2h); |
195 | 63.9k | E6h=_mm256_sub_epi32(EE1h,E1h); E7h=_mm256_sub_epi32(EE0h,E0h); |
196 | 63.9k | E0l=_mm256_add_epi32(EE0l,E0l); E1l=_mm256_add_epi32(EE1l,E1l); |
197 | 63.9k | E2l=_mm256_add_epi32(EE2l,E2l); E3l=_mm256_add_epi32(EE3l,E3l); |
198 | 63.9k | E0h=_mm256_add_epi32(EE0h,E0h); E1h=_mm256_add_epi32(EE1h,E1h); |
199 | 63.9k | E2h=_mm256_add_epi32(EE2h,E2h); E3h=_mm256_add_epi32(EE3h,E3h); |
200 | | |
201 | 63.9k | resL[0]=_mm256_add_epi32(E0l,O0l); resH[0]=_mm256_add_epi32(E0h,O0h); /* final butterfly: output k = E[k] + O[k] for k = 0..7 */ |
202 | 63.9k | resL[1]=_mm256_add_epi32(E1l,O1l); resH[1]=_mm256_add_epi32(E1h,O1h); |
203 | 63.9k | resL[2]=_mm256_add_epi32(E2l,O2l); resH[2]=_mm256_add_epi32(E2h,O2h); |
204 | 63.9k | resL[3]=_mm256_add_epi32(E3l,O3l); resH[3]=_mm256_add_epi32(E3h,O3h); |
205 | 63.9k | resL[4]=_mm256_add_epi32(E4l,O4l); resH[4]=_mm256_add_epi32(E4h,O4h); |
206 | 63.9k | resL[5]=_mm256_add_epi32(E5l,O5l); resH[5]=_mm256_add_epi32(E5h,O5h); |
207 | 63.9k | resL[6]=_mm256_add_epi32(E6l,O6l); resH[6]=_mm256_add_epi32(E6h,O6h); |
208 | 63.9k | resL[7]=_mm256_add_epi32(E7l,O7l); resH[7]=_mm256_add_epi32(E7h,O7h); |
209 | 63.9k | resL[15]=_mm256_sub_epi32(E0l,O0l); resH[15]=_mm256_sub_epi32(E0h,O0h); /* mirrored half: output 15-k = E[k] - O[k] */ |
210 | 63.9k | resL[14]=_mm256_sub_epi32(E1l,O1l); resH[14]=_mm256_sub_epi32(E1h,O1h); |
211 | 63.9k | resL[13]=_mm256_sub_epi32(E2l,O2l); resH[13]=_mm256_sub_epi32(E2h,O2h); |
212 | 63.9k | resL[12]=_mm256_sub_epi32(E3l,O3l); resH[12]=_mm256_sub_epi32(E3h,O3h); |
213 | 63.9k | resL[11]=_mm256_sub_epi32(E4l,O4l); resH[11]=_mm256_sub_epi32(E4h,O4h); |
214 | 63.9k | resL[10]=_mm256_sub_epi32(E5l,O5l); resH[10]=_mm256_sub_epi32(E5h,O5h); |
215 | 63.9k | resL[9] =_mm256_sub_epi32(E6l,O6l); resH[9] =_mm256_sub_epi32(E6h,O6h); |
216 | 63.9k | resL[8] =_mm256_sub_epi32(E7l,O7l); resH[8] =_mm256_sub_epi32(E7h,O7h); |
217 | 63.9k | } |
218 | | |
219 | 1.50M | static inline __m256i round_shift_pack(__m256i lo, __m256i hi, __m256i vr, int shift) { /* computes (x + vr) >> shift on the int32 values of lo and hi, then narrows both to int16 with signed saturation */ |
220 | 1.50M | return _mm256_packs_epi32( /* packs per 128-bit lane: four values from lo followed by four from hi, which restores natural column order for inputs produced by unpacklo/unpackhi */ |
221 | 1.50M | _mm256_srai_epi32(_mm256_add_epi32(lo,vr),shift), /* arithmetic shift keeps the sign */ |
222 | 1.50M | _mm256_srai_epi32(_mm256_add_epi32(hi,vr),shift)); |
223 | 1.50M | } |
224 | | |
225 | | // 1-D 16-point inverse transform in place on S[0..15] (16 columns): S[k] is replaced by output k, rounded with 'add', shifted right by 'shift' and saturated to int16. |
226 | 33.5k | static inline void idct16_vpass(__m256i S[16], int add, int shift) { |
227 | 33.5k | __m256i resL[16], resH[16]; |
228 | 33.5k | idct16_core(S, resL, resH); /* all int32 results are held in resL/resH, so overwriting S below is safe */ |
229 | 33.5k | const __m256i vr = _mm256_set1_epi32(add); /* rounding offset replicated to every int32 element */ |
230 | 570k | for (int k=0;k<16;k++) S[k] = round_shift_pack(resL[k], resH[k], vr, shift); |
231 | 33.5k | } |
232 | | |
233 | | // 1-D 32-point inverse transform in place on S[0..31] (16 columns). Even part |
234 | | // reuses idct16_core on the even rows; odd part uses the T32 coefficients. Results are rounded with 'add', shifted right by 'shift' and saturated to int16. |
235 | 30.3k | static inline void idct32_vpass(__m256i S[32], int add, int shift) { |
236 | 30.3k | __m256i ev[16]; |
237 | 515k | for (int i=0;i<16;i++) ev[i]=S[2*i]; /* gather the even-indexed rows 0,2,...,30 */ |
238 | 30.3k | __m256i EL[16], EH[16]; |
239 | 30.3k | idct16_core(ev, EL, EH); /* EL/EH[k] = even-part term k as int32, k = 0..15 */ |
240 | | |
241 | 30.3k | __m256i pl[8], ph[8]; |
242 | 272k | for (int g=0; g<8; g++) { /* interleave the 16 odd rows as 8 pairs (4g+1, 4g+3) so one madd handles two rows */ |
243 | 242k | pl[g]=_mm256_unpacklo_epi16(S[4*g+1], S[4*g+3]); |
244 | 242k | ph[g]=_mm256_unpackhi_epi16(S[4*g+1], S[4*g+3]); |
245 | 242k | } |
246 | | |
247 | 30.3k | const __m256i vr = _mm256_set1_epi32(add); |
248 | 515k | for (int k=0;k<16;k++) { /* every input row was copied into ev/pl/ph above, so writing S inside this loop is safe */ |
249 | 485k | __m256i OL=_mm256_madd_epi16(pl[0], bc(idct_T32[0][k])); /* odd-part term k: sum over the 8 row pairs with coefficients idct_T32[g][k] */ |
250 | 485k | __m256i OH=_mm256_madd_epi16(ph[0], bc(idct_T32[0][k])); |
251 | 3.88M | for (int g=1; g<8; g++) { |
252 | 3.39M | OL=_mm256_add_epi32(OL,_mm256_madd_epi16(pl[g], bc(idct_T32[g][k]))); |
253 | 3.39M | OH=_mm256_add_epi32(OH,_mm256_madd_epi16(ph[g], bc(idct_T32[g][k]))); |
254 | 3.39M | } |
255 | 485k | S[k] = round_shift_pack(_mm256_add_epi32(EL[k],OL), _mm256_add_epi32(EH[k],OH), vr, shift); /* output k = E[k] + O[k] */ |
256 | 485k | S[31-k] = round_shift_pack(_mm256_sub_epi32(EL[k],OL), _mm256_sub_epi32(EH[k],OH), vr, shift); /* mirrored output 31-k = E[k] - O[k] */ |
257 | 485k | } |
258 | 30.3k | } |
259 | | |
260 | | } // namespace |
261 | | |
262 | | |
263 | | void transform_16x16_add_8_avx2(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) |
264 | 16.7k | { /* Inverse-transforms the 16x16 int16 block at coeffs (row-major, 16 values per row) and adds the result to the 16x16 block of 8-bit samples at dst, whose rows are 'stride' bytes apart. dst is read and written in place; coeffs is only read. */ |
265 | 16.7k | __m256i S[16]; |
266 | 285k | for (int i=0;i<16;i++) S[i]=_mm256_loadu_si256((const __m256i*)(coeffs + i*16)); /* unaligned load, one coefficient row per register */ |
267 | | |
268 | 16.7k | idct16_vpass(S, add_1st, shift_1st); /* first pass: transform down the columns */ |
269 | 16.7k | transpose16x16(S); /* turn rows into columns so the same column kernel can do the second pass */ |
270 | 16.7k | idct16_vpass(S, add_2nd, shift_2nd); /* second pass */ |
271 | 16.7k | transpose16x16(S); /* back to row-major so S[r] lines up with dst row r */ |
272 | | |
273 | 285k | for (int r=0;r<16;r++) { |
274 | 268k | uint8_t* d = dst + r*stride; |
275 | 268k | __m256i pred = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)d)); /* zero-extend the 16 existing bytes of this row to int16 */ |
276 | 268k | __m256i sum = _mm256_adds_epi16(S[r], pred); /* saturating int16 add */ |
277 | 268k | __m256i pk = _mm256_packus_epi16(sum, sum); // [c0-7|c0-7 ; c8-15|c8-15], clamped to 0..255 by the unsigned-saturating pack |
278 | 268k | pk = _mm256_permute4x64_epi64(pk, 0x08); // low 128 = c0-7,c8-15 (qword 0 then qword 2) |
279 | 268k | _mm_storeu_si128((__m128i*)d, _mm256_castsi256_si128(pk)); /* only the low 128 bits are stored */ |
280 | 268k | } |
281 | 16.7k | } |
282 | | |
283 | | |
284 | | void transform_32x32_add_8_avx2(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride) |
285 | 7.57k | { /* Inverse-transforms the 32x32 int16 block at coeffs (row-major, 32 values per row) and adds the result to the 32x32 block of 8-bit samples at dst, whose rows are 'stride' bytes apart. dst is read and written in place; coeffs is only read. */ |
286 | 7.57k | __m256i Lo[32], Hi[32]; |
287 | 250k | for (int r=0;r<32;r++) { /* each row is split across two registers: Lo = columns 0-15, Hi = columns 16-31 */ |
288 | 242k | Lo[r]=_mm256_loadu_si256((const __m256i*)(coeffs + r*32)); |
289 | 242k | Hi[r]=_mm256_loadu_si256((const __m256i*)(coeffs + r*32 + 16)); |
290 | 242k | } |
291 | | |
292 | 7.57k | idct32_vpass(Lo, add_1st, shift_1st); /* first pass down the columns, 16 columns per call */ |
293 | 7.57k | idct32_vpass(Hi, add_1st, shift_1st); |
294 | 7.57k | transpose32x32(Lo, Hi); /* turn rows into columns so the same column kernel can do the second pass */ |
295 | 7.57k | idct32_vpass(Lo, add_2nd, shift_2nd); /* second pass */ |
296 | 7.57k | idct32_vpass(Hi, add_2nd, shift_2nd); |
297 | 7.57k | transpose32x32(Lo, Hi); /* back to row-major so Lo[r]/Hi[r] line up with dst row r */ |
298 | | |
299 | 250k | for (int r=0;r<32;r++) { |
300 | 242k | uint8_t* d = dst + r*stride; |
301 | 242k | __m256i predLo = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)d)); /* zero-extend existing bytes 0-15 of this row to int16 */ |
302 | 242k | __m256i predHi = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)(d+16))); /* same for bytes 16-31 */ |
303 | 242k | __m256i sumLo = _mm256_adds_epi16(Lo[r], predLo); /* saturating int16 adds */ |
304 | 242k | __m256i sumHi = _mm256_adds_epi16(Hi[r], predHi); |
305 | | // pack to bytes: qwords come out as [c0-7, c16-23, c8-15, c24-31] because the pack works per 128-bit lane |
306 | 242k | __m256i pk = _mm256_packus_epi16(sumLo, sumHi); /* unsigned saturation clamps each value to 0..255 */ |
307 | | pk = _mm256_permute4x64_epi64(pk, _MM_SHUFFLE(3,1,2,0)); // -> c0-31 (swaps the two middle qwords) |
308 | 242k | _mm256_storeu_si256((__m256i*)d, pk); /* unaligned 32-byte store of the full row */ |
309 | 242k | } |
310 | 7.57k | } |
311 | | |
312 | | // Note: an AVX2 inverse-quantization kernel was implemented and benchmarked too, |
313 | | // but inverse quantization is scatter-bound (the result is scattered to |
314 | | // coeffBuf[coeffPos[i]], and there is no 16-bit SIMD scatter), so the wider |
315 | | // arithmetic gave no benefit -- AVX2 measured ~equal-to-slightly-slower than the |
316 | | // SSE version. The SSE version (dequant_coeff_block_sse4) is used instead. |
317 | | |
318 | | #endif // HAVE_AVX2 |