/src/libde265/libde265/x86/sse-dct.cc
Line | Count | Source |
1 | | /* |
2 | | * H.265 video codec. |
3 | | * Copyright (c) 2013 openHEVC contributors |
4 | | * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de> |
5 | | * |
6 | | * This file is part of libde265. |
7 | | * |
8 | | * libde265 is free software: you can redistribute it and/or modify |
9 | | * it under the terms of the GNU Lesser General Public License as |
10 | | * published by the Free Software Foundation, either version 3 of |
11 | | * the License, or (at your option) any later version. |
12 | | * |
13 | | * libde265 is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | | * GNU Lesser General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU Lesser General Public License |
19 | | * along with libde265. If not, see <http://www.gnu.org/licenses/>. |
20 | | */ |
21 | | |
#include "x86/sse-dct.h"
#include "libde265/util.h"

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <string.h>    // memcpy: alias-safe 32-bit row stores

#include <emmintrin.h> // SSE2
#include <tmmintrin.h> // SSSE3

#if HAVE_SSE4_1
#include <smmintrin.h> // SSE4.1
#endif
35 | | |
36 | | |
// HEVC 4x4 "luma" (DST-VII) inverse-transform coefficients.
// Each row holds one (even, odd) coefficient pair replicated four times so
// that a single _mm_madd_epi16 against interleaved input computes four
// butterfly partial sums at once.
// NOTE(review): values appear to come from the H.265 DST basis {29,55,74,84};
// verify against ITU-T H.265 (8.6.4.2) before editing.
ALIGNED_16(static const int16_t) transform4x4_luma[8][8] =
{
    {   29, +84, 29,  +84,  29, +84,  29, +84 },
    {  +74, +55, +74,  +55, +74, +55, +74, +55 },
    {   55, -29,  55,  -29,  55, -29,  55, -29 },
    {  +74, -84, +74,  -84, +74, -84, +74, -84 },
    {   74, -74,  74,  -74,  74, -74,  74, -74 },
    {    0, +74,   0,  +74,   0, +74,   0, +74 },
    {   84, +55,  84,  +55,  84, +55,  84, +55 },
    {  -74, -29, -74,  -29, -74, -29, -74, -29 }
};
48 | | |
// HEVC 4x4 inverse DCT-II coefficients, interleaved in pairs for pmaddwd:
// rows 0-1 are the even part (basis rows 0/2), rows 2-3 the odd part (1/3).
ALIGNED_16(static const int16_t) transform4x4[4][8] = {
    { 64,  64, 64,  64, 64,  64, 64,  64 },
    { 64, -64, 64, -64, 64, -64, 64, -64 },
    { 83,  36, 83,  36, 83,  36, 83,  36 },
    { 36, -83, 36, -83, 36, -83, 36, -83 }
};
55 | | |
// HEVC 8x8 inverse DCT-II coefficients, interleaved (pair, pair, ...) for
// _mm_madd_epi16. Rows 0-7 are the odd part; rows 8-11 reuse the 4-point
// even part (same values as transform4x4).
ALIGNED_16(static const int16_t) transform8x8[12][8] =
{
    {  89,  75,  89,  75,  89,  75,  89,  75 },
    {  50,  18,  50,  18,  50,  18,  50,  18 },
    {  75, -18,  75, -18,  75, -18,  75, -18 },
    { -89, -50, -89, -50, -89, -50, -89, -50 },
    {  50, -89,  50, -89,  50, -89,  50, -89 },
    {  18,  75,  18,  75,  18,  75,  18,  75 },
    {  18, -50,  18, -50,  18, -50,  18, -50 },
    {  75, -89,  75, -89,  75, -89,  75, -89 },
    {  64,  64,  64,  64,  64,  64,  64,  64 },
    {  64, -64,  64, -64,  64, -64,  64, -64 },
    {  83,  36,  83,  36,  83,  36,  83,  36 },
    {  36, -83,  36, -83,  36, -83,  36, -83 }
};
71 | | |
// HEVC 16x16 inverse DCT odd-part coefficients, interleaved for pmaddwd.
// The inline markers (e.g. /*1-3*/ /*2-6*/) record which odd basis rows of
// the 16-point (and, reused, 32-point) DCT each group pairs — presumably
// rows (4k+1, 4k+3); confirm against the reference transform before editing.
ALIGNED_16(static const int16_t) transform16x16_1[4][8][8] =
{
    {/*1-3*/ /*2-6*/
        {  90,  87,  90,  87,  90,  87,  90,  87 },
        {  87,  57,  87,  57,  87,  57,  87,  57 },
        {  80,   9,  80,   9,  80,   9,  80,   9 },
        {  70, -43,  70, -43,  70, -43,  70, -43 },
        {  57, -80,  57, -80,  57, -80,  57, -80 },
        {  43, -90,  43, -90,  43, -90,  43, -90 },
        {  25, -70,  25, -70,  25, -70,  25, -70 },
        {   9, -25,   9, -25,   9, -25,   9, -25 },
    },{ /*5-7*/ /*10-14*/
        {  80,  70,  80,  70,  80,  70,  80,  70 },
        {   9, -43,   9, -43,   9, -43,   9, -43 },
        { -70, -87, -70, -87, -70, -87, -70, -87 },
        { -87,   9, -87,   9, -87,   9, -87,   9 },
        { -25,  90, -25,  90, -25,  90, -25,  90 },
        {  57,  25,  57,  25,  57,  25,  57,  25 },
        {  90, -80,  90, -80,  90, -80,  90, -80 },
        {  43, -57,  43, -57,  43, -57,  43, -57 },
    },{ /*9-11*/ /*18-22*/
        {  57,  43,  57,  43,  57,  43,  57,  43 },
        { -80, -90, -80, -90, -80, -90, -80, -90 },
        { -25,  57, -25,  57, -25,  57, -25,  57 },
        {  90,  25,  90,  25,  90,  25,  90,  25 },
        {  -9, -87,  -9, -87,  -9, -87,  -9, -87 },
        { -87,  70, -87,  70, -87,  70, -87,  70 },
        {  43,   9,  43,   9,  43,   9,  43,   9 },
        {  70, -80,  70, -80,  70, -80,  70, -80 },
    },{/*13-15*/ /* 26-30 */
        {  25,   9,  25,   9,  25,   9,  25,   9 },
        { -70, -25, -70, -25, -70, -25, -70, -25 },
        {  90,  43,  90,  43,  90,  43,  90,  43 },
        { -80, -57, -80, -57, -80, -57, -80, -57 },
        {  43,  70,  43,  70,  43,  70,  43,  70 },
        {   9, -80,   9, -80,   9, -80,   9, -80 },
        { -57,  87, -57,  87, -57,  87, -57,  87 },
        {  87, -90,  87, -90,  87, -90,  87, -90 },
    }
};
112 | | |
// HEVC 16x16 inverse DCT: second-level odd coefficients (the embedded
// 8-point odd part), interleaved pairs for pmaddwd. Inline markers name the
// basis rows each group covers at 16- and 32-point size.
ALIGNED_16(static const int16_t) transform16x16_2[2][4][8] =
{
    { /*2-6*/ /*4-12*/
        {  89,  75,  89,  75,  89,  75,  89,  75 },
        {  75, -18,  75, -18,  75, -18,  75, -18 },
        {  50, -89,  50, -89,  50, -89,  50, -89 },
        {  18, -50,  18, -50,  18, -50,  18, -50 },
    },{ /*10-14*/  /*20-28*/
        {  50,  18,  50,  18,  50,  18,  50,  18 },
        { -89, -50, -89, -50, -89, -50, -89, -50 },
        {  18,  75,  18,  75,  18,  75,  18,  75 },
        {  75, -89,  75, -89,  75, -89,  75, -89 },
    }
};
127 | | |
// HEVC 16x16 inverse DCT: innermost 4-point even part (same values as
// transform4x4), interleaved pairs for pmaddwd.
ALIGNED_16(static const int16_t) transform16x16_3[2][2][8] =
{
    {/*4-12*/ /*8-24*/
        {  83,  36,  83,  36,  83,  36,  83,  36 },
        {  36, -83,  36, -83,  36, -83,  36, -83 },
    },{ /*0-8*/ /*0-16*/
        {  64,  64,  64,  64,  64,  64,  64,  64 },
        {  64, -64,  64, -64,  64, -64,  64, -64 },
    }
};
138 | | |
139 | | |
// HEVC 32x32 inverse DCT odd-part coefficients: 8 groups of 16 rows, each
// row an interleaved coefficient pair replicated four times for pmaddwd.
// Per the inline markers, group k presumably pairs odd basis rows
// (4k+1, 4k+3) of the 32-point DCT-II — confirm against the H.265 spec
// before editing. Do not reorder rows: the kernels index them positionally.
ALIGNED_16(static const int16_t) transform32x32[8][16][8] =
{
    { /* 1-3 */
        {  90,  90,  90,  90,  90,  90,  90,  90 },
        {  90,  82,  90,  82,  90,  82,  90,  82 },
        {  88,  67,  88,  67,  88,  67,  88,  67 },
        {  85,  46,  85,  46,  85,  46,  85,  46 },
        {  82,  22,  82,  22,  82,  22,  82,  22 },
        {  78,  -4,  78,  -4,  78,  -4,  78,  -4 },
        {  73, -31,  73, -31,  73, -31,  73, -31 },
        {  67, -54,  67, -54,  67, -54,  67, -54 },
        {  61, -73,  61, -73,  61, -73,  61, -73 },
        {  54, -85,  54, -85,  54, -85,  54, -85 },
        {  46, -90,  46, -90,  46, -90,  46, -90 },
        {  38, -88,  38, -88,  38, -88,  38, -88 },
        {  31, -78,  31, -78,  31, -78,  31, -78 },
        {  22, -61,  22, -61,  22, -61,  22, -61 },
        {  13, -38,  13, -38,  13, -38,  13, -38 },
        {   4, -13,   4, -13,   4, -13,   4, -13 },
    },{/* 5-7 */
        {  88,  85,  88,  85,  88,  85,  88,  85 },
        {  67,  46,  67,  46,  67,  46,  67,  46 },
        {  31, -13,  31, -13,  31, -13,  31, -13 },
        { -13, -67, -13, -67, -13, -67, -13, -67 },
        { -54, -90, -54, -90, -54, -90, -54, -90 },
        { -82, -73, -82, -73, -82, -73, -82, -73 },
        { -90, -22, -90, -22, -90, -22, -90, -22 },
        { -78,  38, -78,  38, -78,  38, -78,  38 },
        { -46,  82, -46,  82, -46,  82, -46,  82 },
        {  -4,  88,  -4,  88,  -4,  88,  -4,  88 },
        {  38,  54,  38,  54,  38,  54,  38,  54 },
        {  73,  -4,  73,  -4,  73,  -4,  73,  -4 },
        {  90, -61,  90, -61,  90, -61,  90, -61 },
        {  85, -90,  85, -90,  85, -90,  85, -90 },
        {  61, -78,  61, -78,  61, -78,  61, -78 },
        {  22, -31,  22, -31,  22, -31,  22, -31 },
    },{/* 9-11 */
        {  82,  78,  82,  78,  82,  78,  82,  78 },
        {  22,  -4,  22,  -4,  22,  -4,  22,  -4 },
        { -54, -82, -54, -82, -54, -82, -54, -82 },
        { -90, -73, -90, -73, -90, -73, -90, -73 },
        { -61,  13, -61,  13, -61,  13, -61,  13 },
        {  13,  85,  13,  85,  13,  85,  13,  85 },
        {  78,  67,  78,  67,  78,  67,  78,  67 },
        {  85, -22,  85, -22,  85, -22,  85, -22 },
        {  31, -88,  31, -88,  31, -88,  31, -88 },
        { -46, -61, -46, -61, -46, -61, -46, -61 },
        { -90,  31, -90,  31, -90,  31, -90,  31 },
        { -67,  90, -67,  90, -67,  90, -67,  90 },
        {   4,  54,   4,  54,   4,  54,   4,  54 },
        {  73, -38,  73, -38,  73, -38,  73, -38 },
        {  88, -90,  88, -90,  88, -90,  88, -90 },
        {  38, -46,  38, -46,  38, -46,  38, -46 },
    },{/* 13-15 */
        {  73,  67,  73,  67,  73,  67,  73,  67 },
        { -31, -54, -31, -54, -31, -54, -31, -54 },
        { -90, -78, -90, -78, -90, -78, -90, -78 },
        { -22,  38, -22,  38, -22,  38, -22,  38 },
        {  78,  85,  78,  85,  78,  85,  78,  85 },
        {  67, -22,  67, -22,  67, -22,  67, -22 },
        { -38, -90, -38, -90, -38, -90, -38, -90 },
        { -90,   4, -90,   4, -90,   4, -90,   4 },
        { -13,  90, -13,  90, -13,  90, -13,  90 },
        {  82,  13,  82,  13,  82,  13,  82,  13 },
        {  61, -88,  61, -88,  61, -88,  61, -88 },
        { -46, -31, -46, -31, -46, -31, -46, -31 },
        { -88,  82, -88,  82, -88,  82, -88,  82 },
        {  -4,  46,  -4,  46,  -4,  46,  -4,  46 },
        {  85, -73,  85, -73,  85, -73,  85, -73 },
        {  54, -61,  54, -61,  54, -61,  54, -61 },
    },{/* 17-19 */
        {  61,  54,  61,  54,  61,  54,  61,  54 },
        { -73, -85, -73, -85, -73, -85, -73, -85 },
        { -46,  -4, -46,  -4, -46,  -4, -46,  -4 },
        {  82,  88,  82,  88,  82,  88,  82,  88 },
        {  31, -46,  31, -46,  31, -46,  31, -46 },
        { -88, -61, -88, -61, -88, -61, -88, -61 },
        { -13,  82, -13,  82, -13,  82, -13,  82 },
        {  90,  13,  90,  13,  90,  13,  90,  13 },
        {  -4, -90,  -4, -90,  -4, -90,  -4, -90 },
        { -90,  38, -90,  38, -90,  38, -90,  38 },
        {  22,  67,  22,  67,  22,  67,  22,  67 },
        {  85, -78,  85, -78,  85, -78,  85, -78 },
        { -38, -22, -38, -22, -38, -22, -38, -22 },
        { -78,  90, -78,  90, -78,  90, -78,  90 },
        {  54, -31,  54, -31,  54, -31,  54, -31 },
        {  67, -73,  67, -73,  67, -73,  67, -73 },
    },{ /* 21-23 */
        {  46,  38,  46,  38,  46,  38,  46,  38 },
        { -90, -88, -90, -88, -90, -88, -90, -88 },
        {  38,  73,  38,  73,  38,  73,  38,  73 },
        {  54,  -4,  54,  -4,  54,  -4,  54,  -4 },
        { -90, -67, -90, -67, -90, -67, -90, -67 },
        {  31,  90,  31,  90,  31,  90,  31,  90 },
        {  61, -46,  61, -46,  61, -46,  61, -46 },
        { -88, -31, -88, -31, -88, -31, -88, -31 },
        {  22,  85,  22,  85,  22,  85,  22,  85 },
        {  67, -78,  67, -78,  67, -78,  67, -78 },
        { -85,  13, -85,  13, -85,  13, -85,  13 },
        {  13,  61,  13,  61,  13,  61,  13,  61 },
        {  73, -90,  73, -90,  73, -90,  73, -90 },
        { -82,  54, -82,  54, -82,  54, -82,  54 },
        {   4,  22,   4,  22,   4,  22,   4,  22 },
        {  78, -82,  78, -82,  78, -82,  78, -82 },
    },{ /* 25-27 */
        {  31,  22,  31,  22,  31,  22,  31,  22 },
        { -78, -61, -78, -61, -78, -61, -78, -61 },
        {  90,  85,  90,  85,  90,  85,  90,  85 },
        { -61, -90, -61, -90, -61, -90, -61, -90 },
        {   4,  73,   4,  73,   4,  73,   4,  73 },
        {  54, -38,  54, -38,  54, -38,  54, -38 },
        { -88,  -4, -88,  -4, -88,  -4, -88,  -4 },
        {  82,  46,  82,  46,  82,  46,  82,  46 },
        { -38, -78, -38, -78, -38, -78, -38, -78 },
        { -22,  90, -22,  90, -22,  90, -22,  90 },
        {  73, -82,  73, -82,  73, -82,  73, -82 },
        { -90,  54, -90,  54, -90,  54, -90,  54 },
        {  67, -13,  67, -13,  67, -13,  67, -13 },
        { -13, -31, -13, -31, -13, -31, -13, -31 },
        { -46,  67, -46,  67, -46,  67, -46,  67 },
        {  85, -88,  85, -88,  85, -88,  85, -88 },
    },{/* 29-31 */
        {  13,   4,  13,   4,  13,   4,  13,   4 },
        { -38, -13, -38, -13, -38, -13, -38, -13 },
        {  61,  22,  61,  22,  61,  22,  61,  22 },
        { -78, -31, -78, -31, -78, -31, -78, -31 },
        {  88,  38,  88,  38,  88,  38,  88,  38 },
        { -90, -46, -90, -46, -90, -46, -90, -46 },
        {  85,  54,  85,  54,  85,  54,  85,  54 },
        { -73, -61, -73, -61, -73, -61, -73, -61 },
        {  54,  67,  54,  67,  54,  67,  54,  67 },
        { -31, -73, -31, -73, -31, -73, -31, -73 },
        {   4,  78,   4,  78,   4,  78,   4,  78 },
        {  22, -82,  22, -82,  22, -82,  22, -82 },
        { -46,  85, -46,  85, -46,  85, -46,  85 },
        {  67, -88,  67, -88,  67, -88,  67, -88 },
        { -82,  90, -82,  90, -82,  90, -82,  90 },
        {  90, -90,  90, -90,  90, -90,  90, -90 },
    }
};
280 | | |
// First inverse-transform pass parameters (8-bit content): the intermediate
// results are scaled down by 7 bits with round-to-nearest.
#define shift_1st 7
#define add_1st (1 << (shift_1st - 1))
283 | | |
284 | | |
/* Transform-skip reconstruction of a 4x4 block, 8-bit output.
 *
 * Each of the 16 coefficients is scaled by (c + 16) >> 5 and added (with
 * unsigned-8-bit saturation) to the 4x4 destination block at 'dst' with row
 * pitch 'stride'. 'coeffs' must be 16-byte aligned (aligned SSE loads).
 *
 * NOTE(review): each destination row is read with an 8-byte load although
 * only 4 pixels are used — callers presumably guarantee the frame buffer
 * extends past the block; confirm before reusing on tight buffers.
 */
void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride)
{
  uint8_t *dst = (uint8_t*)_dst;
  ptrdiff_t stride = _stride;
  const int shift  = 5;                 // transform-skip scaling for 8-bit depth
  const int offset = 1 << (shift - 1);  // rounding term (16)
  uint32_t row;                         // 4 packed output pixels per store
  __m128i r0,r1,r2,r3,r4,r5,r6,r9;

  r9 = _mm_setzero_si128();
  r2 = _mm_set1_epi16(offset);

  // 16 residual coefficients (rows 0-1 in r0, rows 2-3 in r1).
  r0 = _mm_load_si128((__m128i*)(coeffs));
  r1 = _mm_load_si128((__m128i*)(coeffs+8));

  // residual = (coeff + offset) >> shift  (saturating add, arithmetic shift)
  r0 = _mm_adds_epi16(r0,r2);
  r1 = _mm_adds_epi16(r1,r2);
  r0 = _mm_srai_epi16(r0,shift);
  r1 = _mm_srai_epi16(r1,shift);

  // Load the four destination rows and widen the pixels to 16 bit.
  r3 = _mm_loadl_epi64((__m128i*)(dst));
  r4 = _mm_loadl_epi64((__m128i*)(dst +   stride));
  r5 = _mm_loadl_epi64((__m128i*)(dst + 2*stride));
  r6 = _mm_loadl_epi64((__m128i*)(dst + 3*stride));

  r3 = _mm_unpacklo_epi8(r3,r9);
  r4 = _mm_unpacklo_epi8(r4,r9);
  r5 = _mm_unpacklo_epi8(r5,r9);
  r6 = _mm_unpacklo_epi8(r6,r9);

  r3 = _mm_unpacklo_epi64(r3,r4);   // rows 0+1
  r4 = _mm_unpacklo_epi64(r5,r6);   // rows 2+3

  // Add the residual and clamp the result to [0,255].
  r3 = _mm_adds_epi16(r3,r0);
  r4 = _mm_adds_epi16(r4,r1);
  r3 = _mm_packus_epi16(r3,r4);

  // Write 4 bytes per row. memcpy replaces the old *(uint32_t*)dst stores,
  // which were undefined behaviour (strict aliasing + possibly misaligned
  // access) on a uint8_t buffer; compilers emit a single 32-bit mov anyway.
  row = (uint32_t)_mm_cvtsi128_si32(r3);
  memcpy(dst, &row, 4);

  r3 = _mm_srli_si128(r3,4);
  row = (uint32_t)_mm_cvtsi128_si32(r3);
  memcpy(dst + stride, &row, 4);

  r3 = _mm_srli_si128(r3,4);
  row = (uint32_t)_mm_cvtsi128_si32(r3);
  memcpy(dst + 2*stride, &row, 4);

  r3 = _mm_srli_si128(r3,4);
  row = (uint32_t)_mm_cvtsi128_si32(r3);
  memcpy(dst + 3*stride, &row, 4);
}
341 | | |
342 | | |
343 | | |
344 | | #if HAVE_SSE4_1 |
345 | | void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, |
346 | 0 | ptrdiff_t _stride) { |
347 | |
|
348 | 0 | uint8_t shift_2nd = 12; // 20 - Bit depth |
349 | 0 | uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) |
350 | |
|
351 | 0 | uint8_t *dst = (uint8_t*) _dst; |
352 | 0 | ptrdiff_t stride = _stride; |
353 | 0 | const int16_t *src = coeffs; |
354 | 0 | __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA, |
355 | 0 | m128iD; |
356 | 0 | m128iAdd = _mm_set1_epi32(64); |
357 | |
|
358 | 0 | S0 = _mm_load_si128((__m128i *) (src)); |
359 | 0 | S8 = _mm_load_si128((__m128i *) (src + 8)); |
360 | |
|
361 | 0 | m128iAC = _mm_unpacklo_epi16(S0, S8); |
362 | 0 | m128iBD = _mm_unpackhi_epi16(S0, S8); |
363 | |
|
364 | 0 | m128iTmp1 = _mm_madd_epi16(m128iAC, |
365 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[0]))); |
366 | 0 | m128iTmp2 = _mm_madd_epi16(m128iBD, |
367 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[1]))); |
368 | 0 | S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); |
369 | 0 | S0 = _mm_add_epi32(S0, m128iAdd); |
370 | 0 | S0 = _mm_srai_epi32(S0, shift_1st); |
371 | |
|
372 | 0 | m128iTmp1 = _mm_madd_epi16(m128iAC, |
373 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[2]))); |
374 | 0 | m128iTmp2 = _mm_madd_epi16(m128iBD, |
375 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[3]))); |
376 | 0 | S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); |
377 | 0 | S8 = _mm_add_epi32(S8, m128iAdd); |
378 | 0 | S8 = _mm_srai_epi32(S8, shift_1st); |
379 | |
|
380 | 0 | m128iA = _mm_packs_epi32(S0, S8); |
381 | |
|
382 | 0 | m128iTmp1 = _mm_madd_epi16(m128iAC, |
383 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[4]))); |
384 | 0 | m128iTmp2 = _mm_madd_epi16(m128iBD, |
385 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[5]))); |
386 | 0 | S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); |
387 | 0 | S0 = _mm_add_epi32(S0, m128iAdd); |
388 | 0 | S0 = _mm_srai_epi32(S0, shift_1st); |
389 | |
|
390 | 0 | m128iTmp1 = _mm_madd_epi16(m128iAC, |
391 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[6]))); |
392 | 0 | m128iTmp2 = _mm_madd_epi16(m128iBD, |
393 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[7]))); |
394 | 0 | S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); |
395 | 0 | S8 = _mm_add_epi32(S8, m128iAdd); |
396 | 0 | S8 = _mm_srai_epi32(S8, shift_1st); |
397 | |
|
398 | 0 | m128iD = _mm_packs_epi32(S0, S8); |
399 | |
|
400 | 0 | S0 = _mm_unpacklo_epi16(m128iA, m128iD); |
401 | 0 | S8 = _mm_unpackhi_epi16(m128iA, m128iD); |
402 | |
|
403 | 0 | m128iA = _mm_unpacklo_epi16(S0, S8); |
404 | 0 | m128iD = _mm_unpackhi_epi16(S0, S8); |
405 | | |
406 | | /* ################### */ |
407 | 0 | m128iAdd = _mm_set1_epi32(add_2nd); |
408 | |
|
409 | 0 | m128iAC = _mm_unpacklo_epi16(m128iA, m128iD); |
410 | 0 | m128iBD = _mm_unpackhi_epi16(m128iA, m128iD); |
411 | |
|
412 | 0 | m128iTmp1 = _mm_madd_epi16(m128iAC, |
413 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[0]))); |
414 | 0 | m128iTmp2 = _mm_madd_epi16(m128iBD, |
415 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[1]))); |
416 | 0 | S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); |
417 | 0 | S0 = _mm_add_epi32(S0, m128iAdd); |
418 | 0 | S0 = _mm_srai_epi32(S0, shift_2nd); |
419 | |
|
420 | 0 | m128iTmp1 = _mm_madd_epi16(m128iAC, |
421 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[2]))); |
422 | 0 | m128iTmp2 = _mm_madd_epi16(m128iBD, |
423 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[3]))); |
424 | 0 | S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); |
425 | 0 | S8 = _mm_add_epi32(S8, m128iAdd); |
426 | 0 | S8 = _mm_srai_epi32(S8, shift_2nd); |
427 | |
|
428 | 0 | m128iA = _mm_packs_epi32(S0, S8); |
429 | |
|
430 | 0 | m128iTmp1 = _mm_madd_epi16(m128iAC, |
431 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[4]))); |
432 | 0 | m128iTmp2 = _mm_madd_epi16(m128iBD, |
433 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[5]))); |
434 | 0 | S0 = _mm_add_epi32(m128iTmp1, m128iTmp2); |
435 | 0 | S0 = _mm_add_epi32(S0, m128iAdd); |
436 | 0 | S0 = _mm_srai_epi32(S0, shift_2nd); |
437 | |
|
438 | 0 | m128iTmp1 = _mm_madd_epi16(m128iAC, |
439 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[6]))); |
440 | 0 | m128iTmp2 = _mm_madd_epi16(m128iBD, |
441 | 0 | _mm_load_si128((__m128i *) (transform4x4_luma[7]))); |
442 | 0 | S8 = _mm_add_epi32(m128iTmp1, m128iTmp2); |
443 | 0 | S8 = _mm_add_epi32(S8, m128iAdd); |
444 | 0 | S8 = _mm_srai_epi32(S8, shift_2nd); |
445 | |
|
446 | 0 | m128iD = _mm_packs_epi32(S0, S8); |
447 | | |
448 | | // _mm_storeu_si128((__m128i *) (src), m128iA); |
449 | | // _mm_storeu_si128((__m128i *) (src + 8), m128iD); |
450 | |
|
451 | 0 | S0 = _mm_move_epi64(m128iA); //contains row 0 |
452 | 0 | S8 = _mm_move_epi64(m128iD); //row 2 |
453 | 0 | m128iA = _mm_srli_si128(m128iA, 8); // row 1 |
454 | 0 | m128iD = _mm_srli_si128(m128iD, 8); // row 3 |
455 | 0 | m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA); |
456 | 0 | m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD); |
457 | 0 | S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2); |
458 | 0 | S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2); |
459 | | |
460 | | //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data |
461 | |
|
462 | 0 | m128iA = _mm_loadl_epi64((__m128i *) dst); |
463 | 0 | m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); |
464 | 0 | m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values |
465 | 0 | m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); |
466 | | //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); |
467 | 0 | *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); |
468 | |
|
469 | 0 | dst += stride; |
470 | |
|
471 | 0 | m128iA = _mm_loadl_epi64((__m128i *) dst); |
472 | 0 | m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); |
473 | 0 | m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA); |
474 | 0 | m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); |
475 | | //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); |
476 | 0 | *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); |
477 | |
|
478 | 0 | dst += stride; |
479 | |
|
480 | 0 | m128iA = _mm_loadl_epi64((__m128i *) dst); |
481 | 0 | m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); |
482 | 0 | m128iTmp1 = _mm_adds_epi16(S8, m128iA); |
483 | 0 | m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); |
484 | | //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); |
485 | 0 | *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); |
486 | |
|
487 | 0 | dst += stride; |
488 | |
|
489 | 0 | m128iA = _mm_loadl_epi64((__m128i *) dst); |
490 | 0 | m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); |
491 | 0 | m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA); |
492 | 0 | m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); |
493 | | //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); |
494 | 0 | *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); |
495 | 0 | } |
496 | | #endif // SSE4.1 |
497 | | |
#if 0
/* NOTE(review): disabled 10-bit variant of the 4x4 luma (DST-VII) inverse
 * transform. Before re-enabling, confirm:
 *  - 'int16_t *src = coeffs;' discards const, and the second-pass results
 *    are written back through it (_mm_storeu_si128 to src), clobbering the
 *    caller's coefficient buffer;
 *  - it depends on av_clip_uintp2(), which is not referenced elsewhere in
 *    this file;
 *  - the first pass uses unaligned loads while the second pass uses aligned
 *    loads — presumably unintentional inconsistency. */
void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
        ptrdiff_t _stride) {
    int i,j;
    uint8_t shift_2nd = 10; // 20 - Bit depth
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))

    uint16_t *dst = (uint16_t*) _dst;
    ptrdiff_t stride = _stride/(sizeof(uint16_t));
    int16_t *src = coeffs;
    __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
            m128iD;

    m128iAdd = _mm_set1_epi32(64);

    S0 = _mm_loadu_si128((__m128i *) (src));
    S8 = _mm_loadu_si128((__m128i *) (src + 8));

    m128iAC = _mm_unpacklo_epi16(S0, S8);
    m128iBD = _mm_unpackhi_epi16(S0, S8);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[0])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[1])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_1st);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[2])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[3])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_1st);

    m128iA = _mm_packs_epi32(S0, S8);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[4])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[5])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_1st);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[6])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[7])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_1st);

    m128iD = _mm_packs_epi32(S0, S8);

    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);

    m128iA = _mm_unpacklo_epi16(S0, S8);
    m128iD = _mm_unpackhi_epi16(S0, S8);

    /* second pass */
    m128iAdd = _mm_set1_epi32(add_2nd);

    m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
    m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_2nd);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_2nd);

    m128iA = _mm_packs_epi32(S0, S8);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_2nd);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_2nd);

    m128iD = _mm_packs_epi32(S0, S8);

    // NOTE(review): writes into the (const) coefficient buffer — see header.
    _mm_storeu_si128((__m128i *) (src), m128iA);
    _mm_storeu_si128((__m128i *) (src + 8), m128iD);
    j = 0;
    for (i = 0; i < 2; i++) {
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
        j += 1;
        dst += stride;
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
        j += 1;
        dst += stride;
    }

}
#endif
623 | | |
624 | | |
625 | | #if HAVE_SSE4_1 |
626 | | void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, |
627 | 0 | ptrdiff_t _stride) { |
628 | 0 | uint8_t shift_2nd = 12; // 20 - Bit depth |
629 | 0 | uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) |
630 | |
|
631 | 0 | uint8_t *dst = (uint8_t*) _dst; |
632 | 0 | ptrdiff_t stride = _stride; |
633 | 0 | const int16_t *src = coeffs; |
634 | |
|
635 | 0 | __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD, m128iTmp1,m128iTmp2; |
636 | 0 | S0 = _mm_load_si128((__m128i *) (src)); |
637 | 0 | S8 = _mm_load_si128((__m128i *) (src + 8)); |
638 | 0 | m128iAdd = _mm_set1_epi32(add_1st); |
639 | |
|
640 | 0 | m128Tmp = _mm_unpacklo_epi16(S0, S8); |
641 | 0 | E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0]))); |
642 | 0 | E1 = _mm_add_epi32(E1, m128iAdd); |
643 | |
|
644 | 0 | E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1]))); |
645 | 0 | E2 = _mm_add_epi32(E2, m128iAdd); |
646 | |
|
647 | 0 | m128Tmp = _mm_unpackhi_epi16(S0, S8); |
648 | 0 | O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2]))); |
649 | 0 | O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3]))); |
650 | |
|
651 | 0 | m128iA = _mm_add_epi32(E1, O1); |
652 | 0 | m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum |
653 | 0 | m128Tmp = _mm_add_epi32(E2, O2); |
654 | 0 | m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum |
655 | 0 | m128iA = _mm_packs_epi32(m128iA, m128Tmp); |
656 | |
|
657 | 0 | m128iD = _mm_sub_epi32(E2, O2); |
658 | 0 | m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum |
659 | |
|
660 | 0 | m128Tmp = _mm_sub_epi32(E1, O1); |
661 | 0 | m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum |
662 | |
|
663 | 0 | m128iD = _mm_packs_epi32(m128iD, m128Tmp); |
664 | |
|
665 | 0 | S0 = _mm_unpacklo_epi16(m128iA, m128iD); |
666 | 0 | S8 = _mm_unpackhi_epi16(m128iA, m128iD); |
667 | |
|
668 | 0 | m128iA = _mm_unpacklo_epi16(S0, S8); |
669 | 0 | m128iD = _mm_unpackhi_epi16(S0, S8); |
670 | | |
671 | | /* ########################## */ |
672 | |
|
673 | 0 | m128iAdd = _mm_set1_epi32(add_2nd); |
674 | 0 | m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD); |
675 | 0 | E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0]))); |
676 | 0 | E1 = _mm_add_epi32(E1, m128iAdd); |
677 | |
|
678 | 0 | E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1]))); |
679 | 0 | E2 = _mm_add_epi32(E2, m128iAdd); |
680 | |
|
681 | 0 | m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD); |
682 | 0 | O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2]))); |
683 | 0 | O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3]))); |
684 | |
|
685 | 0 | m128iA = _mm_add_epi32(E1, O1); |
686 | 0 | m128iA = _mm_srai_epi32(m128iA, shift_2nd); |
687 | 0 | m128Tmp = _mm_add_epi32(E2, O2); |
688 | 0 | m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd); |
689 | 0 | m128iA = _mm_packs_epi32(m128iA, m128Tmp); |
690 | |
|
691 | 0 | m128iD = _mm_sub_epi32(E2, O2); |
692 | 0 | m128iD = _mm_srai_epi32(m128iD, shift_2nd); |
693 | |
|
694 | 0 | m128Tmp = _mm_sub_epi32(E1, O1); |
695 | 0 | m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd); |
696 | |
|
697 | 0 | m128iD = _mm_packs_epi32(m128iD, m128Tmp); |
698 | |
|
699 | 0 | S0 = _mm_move_epi64(m128iA); //contains row 0 |
700 | 0 | S8 = _mm_move_epi64(m128iD); //row 2 |
701 | 0 | m128iA = _mm_srli_si128(m128iA, 8); // row 1 |
702 | 0 | m128iD = _mm_srli_si128(m128iD, 8); // row 3 |
703 | 0 | m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA); |
704 | 0 | m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD); |
705 | 0 | S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2); |
706 | 0 | S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2); |
707 | | |
708 | | //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data |
709 | |
|
710 | 0 | m128iA = _mm_loadl_epi64((__m128i *) dst); |
711 | 0 | m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); |
712 | 0 | m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values |
713 | 0 | m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); |
714 | | //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); |
715 | 0 | *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); |
716 | |
|
717 | 0 | dst += stride; |
718 | |
|
719 | 0 | m128iA = _mm_loadl_epi64((__m128i *) dst); |
720 | 0 | m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); |
721 | 0 | m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA); |
722 | 0 | m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); |
723 | | //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); |
724 | 0 | *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); |
725 | |
|
726 | 0 | dst += stride; |
727 | |
|
728 | 0 | m128iA = _mm_loadl_epi64((__m128i *) dst); |
729 | 0 | m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); |
730 | 0 | m128iTmp1 = _mm_adds_epi16(S8, m128iA); |
731 | 0 | m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); |
732 | | //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); |
733 | 0 | *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); |
734 | |
|
735 | 0 | dst += stride; |
736 | |
|
737 | 0 | m128iA = _mm_loadl_epi64((__m128i *) dst); |
738 | 0 | m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128()); |
739 | 0 | m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA); |
740 | 0 | m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128()); |
741 | | //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst); |
742 | 0 | *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1); |
743 | 0 | } |
744 | | #endif |
745 | | |
#if 0
/* 4x4 inverse DCT + residual add for 10-bit output (SSE4.1 path).
 *
 * DISABLED: compiled out with "#if 0" — kept only as reference code
 * (openHEVC heritage).
 *
 * Structure: first (column) pass using transform4x4 with rounding add_1st
 * and shift shift_1st (both defined earlier in this file, outside this
 * function), a 4x4 transpose via unpack, a second (row) pass with
 * shift 10 / rounding 1<<9 (20 minus 10-bit depth), then a scalar
 * add-and-clip of the residual into the 10-bit destination.
 *
 * NOTE(review): before this could ever be re-enabled it would need:
 *  - av_clip_uintp2(), an FFmpeg helper not provided by libde265;
 *  - a const fix: `int16_t *src = coeffs;` discards const from the
 *    `const int16_t *coeffs` parameter (hard error in C++), and the
 *    function later writes the second-pass result back through `src`.
 */
void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
        ptrdiff_t _stride) {
    int i;
    uint8_t shift_2nd = 10; // 20 - Bit depth
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))

    uint16_t *dst = (uint16_t*) _dst;     // 10-bit samples are stored in uint16_t
    ptrdiff_t stride = _stride/2;         // byte stride -> uint16_t element stride
    int16_t *src = coeffs;                // NOTE(review): drops const from coeffs

    int j;
    __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD;
    /* Load all 16 coefficients: rows 0/1 in S0, rows 2/3 in S8. */
    S0 = _mm_load_si128((__m128i *) (src));
    S8 = _mm_load_si128((__m128i *) (src + 8));
    m128iAdd = _mm_set1_epi32(add_1st);   // first-pass rounding constant

    /* First (column) pass: even part E1/E2, odd part O1/O2 via
     * 16x16->32 multiply-accumulate against the transform rows. */
    m128Tmp = _mm_unpacklo_epi16(S0, S8);
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
    E1 = _mm_add_epi32(E1, m128iAdd);

    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
    E2 = _mm_add_epi32(E2, m128iAdd);

    m128Tmp = _mm_unpackhi_epi16(S0, S8);
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));

    /* Butterfly combine, shift, and pack back to 16-bit with saturation. */
    m128iA = _mm_add_epi32(E1, O1);
    m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum
    m128Tmp = _mm_add_epi32(E2, O2);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);

    m128iD = _mm_sub_epi32(E2, O2);
    m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum

    m128Tmp = _mm_sub_epi32(E1, O1);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum

    m128iD = _mm_packs_epi32(m128iD, m128Tmp);

    /* 4x4 transpose via two interleave stages. */
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);

    m128iA = _mm_unpacklo_epi16(S0, S8);
    m128iD = _mm_unpackhi_epi16(S0, S8);

    /* ########################## */
    /* Second (row) pass: identical structure, rounding add_2nd and
     * shift shift_2nd for the 10-bit depth. */

    m128iAdd = _mm_set1_epi32(add_2nd);
    m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
    E1 = _mm_add_epi32(E1, m128iAdd);

    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
    E2 = _mm_add_epi32(E2, m128iAdd);

    m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));

    m128iA = _mm_add_epi32(E1, O1);
    m128iA = _mm_srai_epi32(m128iA, shift_2nd);
    m128Tmp = _mm_add_epi32(E2, O2);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);

    m128iD = _mm_sub_epi32(E2, O2);
    m128iD = _mm_srai_epi32(m128iD, shift_2nd);

    m128Tmp = _mm_sub_epi32(E1, O1);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);

    m128iD = _mm_packs_epi32(m128iD, m128Tmp);
    /* NOTE(review): stores the transform result back into the coefficient
     * buffer, then reads it with scalar code below. */
    _mm_storeu_si128((__m128i *) (src), m128iA);
    _mm_storeu_si128((__m128i *) (src + 8), m128iD);
    j = 0;
    /* Scalar reconstruction: add residual and clip each sample to the
     * unsigned 10-bit range; src is laid out column-major after pass 2,
     * hence the +4/+8/+12 element strides. */
    for (i = 0; i < 2; i++) {
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
        j += 1;
        dst += stride;
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
        j += 1;
        dst += stride;
    }
}
#endif
840 | | |
#if HAVE_SSE4_1
/* 8x8 inverse DCT + residual add for 8-bit output (SSE4.1 path).
 *
 * Applies the HEVC 8x8 inverse transform to `coeffs` (row-major, 64
 * int16 values, 16-byte aligned) and adds the result to the 8x8 pixel
 * block at `_dst` with row pitch `_stride` bytes, saturating each final
 * sample to [0,255].
 *
 * Two-pass structure:
 *  1. column pass: rounding add_1st, shift shift_1st (both defined
 *     earlier in this file, outside this function);
 *  2. transpose, then row pass: rounding 1<<11, shift 12
 *     (= 20 - 8-bit depth);
 *  3. transpose back and add/clip into dst row by row.
 *
 * Each pass computes the odd part O0..O3 from odd input rows 1/3/5/7 and
 * the even part E0..E3 from even rows 0/2/4/6 (via EE0/EE1 and E00/E01),
 * then forms output rows k and 7-k as (Ek +/- Ok) >> shift. Suffixes
 * l/h are the low/high four 32-bit lanes of each 8-wide row.
 */
void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
        ptrdiff_t _stride) {
    uint8_t shift_2nd = 12; // 20 - Bit depth
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))

    uint8_t *dst = (uint8_t*) _dst;
    ptrdiff_t stride = _stride / sizeof(uint8_t);
    const int16_t *src = coeffs;
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
        m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
        E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,

        O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h,
        T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11;
    /* Preload the twelve transform-coefficient rows once; both passes
     * reuse them. */
    T0= _mm_load_si128((__m128i *) (transform8x8[0]));
    T1= _mm_load_si128((__m128i *) (transform8x8[1]));
    T2= _mm_load_si128((__m128i *) (transform8x8[2]));
    T3= _mm_load_si128((__m128i *) (transform8x8[3]));
    T4= _mm_load_si128((__m128i *) (transform8x8[4]));
    T5= _mm_load_si128((__m128i *) (transform8x8[5]));
    T6= _mm_load_si128((__m128i *) (transform8x8[6]));
    T7= _mm_load_si128((__m128i *) (transform8x8[7]));
    T8= _mm_load_si128((__m128i *) (transform8x8[8]));
    T9= _mm_load_si128((__m128i *) (transform8x8[9]));
    T10= _mm_load_si128((__m128i *) (transform8x8[10]));
    T11= _mm_load_si128((__m128i *) (transform8x8[11]));

    m128iAdd = _mm_set1_epi32(add_1st);  // first-pass rounding constant

    /* ---- Pass 1, odd part: O0..O3 from coefficient rows 1,3,5,7. ---- */
    m128iS1 = _mm_load_si128((__m128i *) (src + 8));
    m128iS3 = _mm_load_si128((__m128i *) (src + 24));
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
    E1l = _mm_madd_epi16(m128Tmp0, T0);
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
    E1h = _mm_madd_epi16(m128Tmp1, T0);
    m128iS5 = _mm_load_si128((__m128i *) (src + 40));
    m128iS7 = _mm_load_si128((__m128i *) (src + 56));
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
    E2l = _mm_madd_epi16(m128Tmp2, T1);
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
    E2h = _mm_madd_epi16(m128Tmp3, T1);
    O0l = _mm_add_epi32(E1l, E2l);
    O0h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0, T2);
    E1h = _mm_madd_epi16(m128Tmp1, T2);
    E2l = _mm_madd_epi16(m128Tmp2, T3);
    E2h = _mm_madd_epi16(m128Tmp3, T3);

    O1l = _mm_add_epi32(E1l, E2l);
    O1h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0, T4);
    E1h = _mm_madd_epi16(m128Tmp1, T4);
    E2l = _mm_madd_epi16(m128Tmp2, T5);
    E2h = _mm_madd_epi16(m128Tmp3, T5);
    O2l = _mm_add_epi32(E1l, E2l);
    O2h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0, T6);
    E1h = _mm_madd_epi16(m128Tmp1, T6);
    E2l = _mm_madd_epi16(m128Tmp2, T7);
    E2h = _mm_madd_epi16(m128Tmp3, T7);
    O3h = _mm_add_epi32(E1h, E2h);
    O3l = _mm_add_epi32(E1l, E2l);

    /* ------- */
    /* ---- Pass 1, even part: EE0/EE1 from rows 0 and 4. ---- */

    m128iS0 = _mm_load_si128((__m128i *) (src + 0));
    m128iS4 = _mm_load_si128((__m128i *) (src + 32));
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
    EE0l = _mm_madd_epi16(m128Tmp0, T8);
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
    EE0h = _mm_madd_epi16(m128Tmp1, T8);

    EE1l = _mm_madd_epi16(m128Tmp0, T9);
    EE1h = _mm_madd_epi16(m128Tmp1, T9);

    /* ------- */
    /* ---- Pass 1, even part: E00/E01 from rows 2 and 6, then the even
     * butterfly E0..E3 = EE0/EE1 +/- E00/E01 with rounding folded in. ---- */

    m128iS2 = _mm_load_si128((__m128i *) (src + 16));
    m128iS6 = _mm_load_si128((__m128i *) (src + 48));
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E00l = _mm_madd_epi16(m128Tmp0, T10);
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
    E00h = _mm_madd_epi16(m128Tmp1, T10);
    E01l = _mm_madd_epi16(m128Tmp0, T11);
    E01h = _mm_madd_epi16(m128Tmp1, T11);
    E0l = _mm_add_epi32(EE0l, E00l);
    E0l = _mm_add_epi32(E0l, m128iAdd);
    E0h = _mm_add_epi32(EE0h, E00h);
    E0h = _mm_add_epi32(E0h, m128iAdd);
    E3l = _mm_sub_epi32(EE0l, E00l);
    E3l = _mm_add_epi32(E3l, m128iAdd);
    E3h = _mm_sub_epi32(EE0h, E00h);
    E3h = _mm_add_epi32(E3h, m128iAdd);

    E1l = _mm_add_epi32(EE1l, E01l);
    E1l = _mm_add_epi32(E1l, m128iAdd);
    E1h = _mm_add_epi32(EE1h, E01h);
    E1h = _mm_add_epi32(E1h, m128iAdd);
    E2l = _mm_sub_epi32(EE1l, E01l);
    E2l = _mm_add_epi32(E2l, m128iAdd);
    E2h = _mm_sub_epi32(EE1h, E01h);
    E2h = _mm_add_epi32(E2h, m128iAdd);
    /* Final butterfly of pass 1: row k = (Ek + Ok) >> shift_1st,
     * row 7-k = (Ek - Ok) >> shift_1st, packed to int16 with saturation. */
    m128iS0 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
        _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
    m128iS1 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
        _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
    m128iS2 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
        _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
    m128iS3 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
        _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
    m128iS4 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
        _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
    m128iS5 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
        _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
    m128iS6 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
        _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
    m128iS7 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
        _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
    /* Inverse (transpose) of the matrix: 8x8 int16 transpose via three
     * interleave stages so pass 2 can again operate on "rows". */

    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);

    /* ---- Pass 2: same odd/even computation on the transposed data,
     * now with the second-stage rounding/shift. ---- */
    m128iAdd = _mm_set1_epi32(add_2nd);

    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
    E1l = _mm_madd_epi16(m128Tmp0, T0);
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
    E1h = _mm_madd_epi16(m128Tmp1, T0);
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
    E2l = _mm_madd_epi16(m128Tmp2, T1);
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
    E2h = _mm_madd_epi16(m128Tmp3, T1);
    O0l = _mm_add_epi32(E1l, E2l);
    O0h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0, T2);
    E1h = _mm_madd_epi16(m128Tmp1, T2);
    E2l = _mm_madd_epi16(m128Tmp2, T3);
    E2h = _mm_madd_epi16(m128Tmp3, T3);
    O1l = _mm_add_epi32(E1l, E2l);
    O1h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0, T4);
    E1h = _mm_madd_epi16(m128Tmp1, T4);
    E2l = _mm_madd_epi16(m128Tmp2, T5);
    E2h = _mm_madd_epi16(m128Tmp3, T5);
    O2l = _mm_add_epi32(E1l, E2l);
    O2h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0, T6);
    E1h = _mm_madd_epi16(m128Tmp1, T6);
    E2l = _mm_madd_epi16(m128Tmp2, T7);
    E2h = _mm_madd_epi16(m128Tmp3, T7);
    O3h = _mm_add_epi32(E1h, E2h);
    O3l = _mm_add_epi32(E1l, E2l);

    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
    EE0l = _mm_madd_epi16(m128Tmp0, T8);
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
    EE0h = _mm_madd_epi16(m128Tmp1, T8);
    EE1l = _mm_madd_epi16(m128Tmp0, T9);
    EE1h = _mm_madd_epi16(m128Tmp1, T9);

    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E00l = _mm_madd_epi16(m128Tmp0, T10);
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
    E00h = _mm_madd_epi16(m128Tmp1, T10);
    E01l = _mm_madd_epi16(m128Tmp0, T11);
    E01h = _mm_madd_epi16(m128Tmp1, T11);
    E0l = _mm_add_epi32(EE0l, E00l);
    E0l = _mm_add_epi32(E0l, m128iAdd);
    E0h = _mm_add_epi32(EE0h, E00h);
    E0h = _mm_add_epi32(E0h, m128iAdd);
    E3l = _mm_sub_epi32(EE0l, E00l);
    E3l = _mm_add_epi32(E3l, m128iAdd);
    E3h = _mm_sub_epi32(EE0h, E00h);
    E3h = _mm_add_epi32(E3h, m128iAdd);
    E1l = _mm_add_epi32(EE1l, E01l);
    E1l = _mm_add_epi32(E1l, m128iAdd);
    E1h = _mm_add_epi32(EE1h, E01h);
    E1h = _mm_add_epi32(E1h, m128iAdd);
    E2l = _mm_sub_epi32(EE1l, E01l);
    E2l = _mm_add_epi32(E2l, m128iAdd);
    E2h = _mm_sub_epi32(EE1h, E01h);
    E2h = _mm_add_epi32(E2h, m128iAdd);

    /* Final butterfly of pass 2, packed to int16 residuals. */
    m128iS0 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
        _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
    m128iS1 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
        _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
    m128iS2 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
        _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
    m128iS3 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
        _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
    m128iS4 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
        _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
    m128iS5 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
        _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
    m128iS6 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
        _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
    m128iS7 = _mm_packs_epi32(
        _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
        _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));

    /* Second transpose: bring the result back to row-major order. */
    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);

    /* Reconstruction, one row at a time: load 8 dst bytes, widen to
     * int16, add the residual row with signed saturation, pack back to
     * bytes with unsigned saturation (clip to [0,255]), store 8 bytes. */
    E0l = _mm_loadl_epi64((__m128i *) dst);
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());

    E0l = _mm_adds_epi16(E0l, m128iS0);
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
    _mm_storel_epi64((__m128i *) dst, E0l);
    dst += stride;

    E0l = _mm_loadl_epi64((__m128i *) dst);
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());

    E0l = _mm_adds_epi16(E0l, m128iS1);
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
    _mm_storel_epi64((__m128i *) dst, E0l);
    dst += stride;

    E0l = _mm_loadl_epi64((__m128i *) dst);
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());

    E0l = _mm_adds_epi16(E0l, m128iS2);
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
    _mm_storel_epi64((__m128i *) dst, E0l);
    dst += stride;

    E0l = _mm_loadl_epi64((__m128i *) dst);
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());

    E0l = _mm_adds_epi16(E0l, m128iS3);
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
    _mm_storel_epi64((__m128i *) dst, E0l);
    dst += stride;

    E0l = _mm_loadl_epi64((__m128i *) dst);
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());

    E0l = _mm_adds_epi16(E0l, m128iS4);
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
    _mm_storel_epi64((__m128i *) dst, E0l);
    dst += stride;

    E0l = _mm_loadl_epi64((__m128i *) dst);
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());

    E0l = _mm_adds_epi16(E0l, m128iS5);
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
    _mm_storel_epi64((__m128i *) dst, E0l);
    dst += stride;

    E0l = _mm_loadl_epi64((__m128i *) dst);
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());

    E0l = _mm_adds_epi16(E0l, m128iS6);
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
    _mm_storel_epi64((__m128i *) dst, E0l);
    dst += stride;

    E0l = _mm_loadl_epi64((__m128i *) dst);
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());

    E0l = _mm_adds_epi16(E0l, m128iS7);
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
    _mm_storel_epi64((__m128i *) dst, E0l);
    dst += stride;

}
#endif
1175 | | |
1176 | | #if 0 |
1177 | | void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, |
1178 | | ptrdiff_t _stride) { |
1179 | | int i; |
1180 | | uint16_t *dst = (uint16_t*) _dst; |
1181 | | ptrdiff_t stride = _stride / sizeof(uint16_t); |
1182 | | int16_t *src = coeffs; |
1183 | | uint8_t shift_2nd = 10; // 20 - Bit depth |
1184 | | uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1)) |
1185 | | |
1186 | | __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, |
1187 | | m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h, |
1188 | | E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l, |
1189 | | O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h; |
1190 | | int j; |
1191 | | m128iAdd = _mm_set1_epi32(add_1st); |
1192 | | |
1193 | | m128iS1 = _mm_load_si128((__m128i *) (src + 8)); |
1194 | | m128iS3 = _mm_load_si128((__m128i *) (src + 24)); |
1195 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); |
1196 | | E1l = _mm_madd_epi16(m128Tmp0, |
1197 | | _mm_load_si128((__m128i *) (transform8x8[0]))); |
1198 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); |
1199 | | E1h = _mm_madd_epi16(m128Tmp1, |
1200 | | _mm_load_si128((__m128i *) (transform8x8[0]))); |
1201 | | m128iS5 = _mm_load_si128((__m128i *) (src + 40)); |
1202 | | m128iS7 = _mm_load_si128((__m128i *) (src + 56)); |
1203 | | m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); |
1204 | | E2l = _mm_madd_epi16(m128Tmp2, |
1205 | | _mm_load_si128((__m128i *) (transform8x8[1]))); |
1206 | | m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); |
1207 | | E2h = _mm_madd_epi16(m128Tmp3, |
1208 | | _mm_load_si128((__m128i *) (transform8x8[1]))); |
1209 | | O0l = _mm_add_epi32(E1l, E2l); |
1210 | | O0h = _mm_add_epi32(E1h, E2h); |
1211 | | |
1212 | | E1l = _mm_madd_epi16(m128Tmp0, |
1213 | | _mm_load_si128((__m128i *) (transform8x8[2]))); |
1214 | | E1h = _mm_madd_epi16(m128Tmp1, |
1215 | | _mm_load_si128((__m128i *) (transform8x8[2]))); |
1216 | | E2l = _mm_madd_epi16(m128Tmp2, |
1217 | | _mm_load_si128((__m128i *) (transform8x8[3]))); |
1218 | | E2h = _mm_madd_epi16(m128Tmp3, |
1219 | | _mm_load_si128((__m128i *) (transform8x8[3]))); |
1220 | | |
1221 | | O1l = _mm_add_epi32(E1l, E2l); |
1222 | | O1h = _mm_add_epi32(E1h, E2h); |
1223 | | |
1224 | | E1l = _mm_madd_epi16(m128Tmp0, |
1225 | | _mm_load_si128((__m128i *) (transform8x8[4]))); |
1226 | | E1h = _mm_madd_epi16(m128Tmp1, |
1227 | | _mm_load_si128((__m128i *) (transform8x8[4]))); |
1228 | | E2l = _mm_madd_epi16(m128Tmp2, |
1229 | | _mm_load_si128((__m128i *) (transform8x8[5]))); |
1230 | | E2h = _mm_madd_epi16(m128Tmp3, |
1231 | | _mm_load_si128((__m128i *) (transform8x8[5]))); |
1232 | | O2l = _mm_add_epi32(E1l, E2l); |
1233 | | O2h = _mm_add_epi32(E1h, E2h); |
1234 | | |
1235 | | E1l = _mm_madd_epi16(m128Tmp0, |
1236 | | _mm_load_si128((__m128i *) (transform8x8[6]))); |
1237 | | E1h = _mm_madd_epi16(m128Tmp1, |
1238 | | _mm_load_si128((__m128i *) (transform8x8[6]))); |
1239 | | E2l = _mm_madd_epi16(m128Tmp2, |
1240 | | _mm_load_si128((__m128i *) (transform8x8[7]))); |
1241 | | E2h = _mm_madd_epi16(m128Tmp3, |
1242 | | _mm_load_si128((__m128i *) (transform8x8[7]))); |
1243 | | O3h = _mm_add_epi32(E1h, E2h); |
1244 | | O3l = _mm_add_epi32(E1l, E2l); |
1245 | | |
1246 | | /* ------- */ |
1247 | | |
1248 | | m128iS0 = _mm_load_si128((__m128i *) (src + 0)); |
1249 | | m128iS4 = _mm_load_si128((__m128i *) (src + 32)); |
1250 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); |
1251 | | EE0l = _mm_madd_epi16(m128Tmp0, |
1252 | | _mm_load_si128((__m128i *) (transform8x8[8]))); |
1253 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); |
1254 | | EE0h = _mm_madd_epi16(m128Tmp1, |
1255 | | _mm_load_si128((__m128i *) (transform8x8[8]))); |
1256 | | |
1257 | | EE1l = _mm_madd_epi16(m128Tmp0, |
1258 | | _mm_load_si128((__m128i *) (transform8x8[9]))); |
1259 | | EE1h = _mm_madd_epi16(m128Tmp1, |
1260 | | _mm_load_si128((__m128i *) (transform8x8[9]))); |
1261 | | |
1262 | | /* ------- */ |
1263 | | |
1264 | | m128iS2 = _mm_load_si128((__m128i *) (src + 16)); |
1265 | | m128iS6 = _mm_load_si128((__m128i *) (src + 48)); |
1266 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); |
1267 | | E00l = _mm_madd_epi16(m128Tmp0, |
1268 | | _mm_load_si128((__m128i *) (transform8x8[10]))); |
1269 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); |
1270 | | E00h = _mm_madd_epi16(m128Tmp1, |
1271 | | _mm_load_si128((__m128i *) (transform8x8[10]))); |
1272 | | E01l = _mm_madd_epi16(m128Tmp0, |
1273 | | _mm_load_si128((__m128i *) (transform8x8[11]))); |
1274 | | E01h = _mm_madd_epi16(m128Tmp1, |
1275 | | _mm_load_si128((__m128i *) (transform8x8[11]))); |
1276 | | E0l = _mm_add_epi32(EE0l, E00l); |
1277 | | E0l = _mm_add_epi32(E0l, m128iAdd); |
1278 | | E0h = _mm_add_epi32(EE0h, E00h); |
1279 | | E0h = _mm_add_epi32(E0h, m128iAdd); |
1280 | | E3l = _mm_sub_epi32(EE0l, E00l); |
1281 | | E3l = _mm_add_epi32(E3l, m128iAdd); |
1282 | | E3h = _mm_sub_epi32(EE0h, E00h); |
1283 | | E3h = _mm_add_epi32(E3h, m128iAdd); |
1284 | | |
1285 | | E1l = _mm_add_epi32(EE1l, E01l); |
1286 | | E1l = _mm_add_epi32(E1l, m128iAdd); |
1287 | | E1h = _mm_add_epi32(EE1h, E01h); |
1288 | | E1h = _mm_add_epi32(E1h, m128iAdd); |
1289 | | E2l = _mm_sub_epi32(EE1l, E01l); |
1290 | | E2l = _mm_add_epi32(E2l, m128iAdd); |
1291 | | E2h = _mm_sub_epi32(EE1h, E01h); |
1292 | | E2h = _mm_add_epi32(E2h, m128iAdd); |
1293 | | m128iS0 = _mm_packs_epi32( |
1294 | | _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st), |
1295 | | _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st)); |
1296 | | m128iS1 = _mm_packs_epi32( |
1297 | | _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st), |
1298 | | _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st)); |
1299 | | m128iS2 = _mm_packs_epi32( |
1300 | | _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st), |
1301 | | _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st)); |
1302 | | m128iS3 = _mm_packs_epi32( |
1303 | | _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st), |
1304 | | _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st)); |
1305 | | m128iS4 = _mm_packs_epi32( |
1306 | | _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st), |
1307 | | _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st)); |
1308 | | m128iS5 = _mm_packs_epi32( |
1309 | | _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st), |
1310 | | _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st)); |
1311 | | m128iS6 = _mm_packs_epi32( |
1312 | | _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st), |
1313 | | _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st)); |
1314 | | m128iS7 = _mm_packs_epi32( |
1315 | | _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st), |
1316 | | _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st)); |
1317 | | /* Invers matrix */ |
1318 | | |
1319 | | E0l = _mm_unpacklo_epi16(m128iS0, m128iS4); |
1320 | | E1l = _mm_unpacklo_epi16(m128iS1, m128iS5); |
1321 | | E2l = _mm_unpacklo_epi16(m128iS2, m128iS6); |
1322 | | E3l = _mm_unpacklo_epi16(m128iS3, m128iS7); |
1323 | | O0l = _mm_unpackhi_epi16(m128iS0, m128iS4); |
1324 | | O1l = _mm_unpackhi_epi16(m128iS1, m128iS5); |
1325 | | O2l = _mm_unpackhi_epi16(m128iS2, m128iS6); |
1326 | | O3l = _mm_unpackhi_epi16(m128iS3, m128iS7); |
1327 | | m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l); |
1328 | | m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l); |
1329 | | m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); |
1330 | | m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); |
1331 | | m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l); |
1332 | | m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l); |
1333 | | m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); |
1334 | | m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); |
1335 | | m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l); |
1336 | | m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l); |
1337 | | m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1); |
1338 | | m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1); |
1339 | | m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l); |
1340 | | m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l); |
1341 | | m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3); |
1342 | | m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3); |
1343 | | |
1344 | | m128iAdd = _mm_set1_epi32(add_2nd); |
1345 | | |
1346 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); |
1347 | | E1l = _mm_madd_epi16(m128Tmp0, |
1348 | | _mm_load_si128((__m128i *) (transform8x8[0]))); |
1349 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); |
1350 | | E1h = _mm_madd_epi16(m128Tmp1, |
1351 | | _mm_load_si128((__m128i *) (transform8x8[0]))); |
1352 | | m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); |
1353 | | E2l = _mm_madd_epi16(m128Tmp2, |
1354 | | _mm_load_si128((__m128i *) (transform8x8[1]))); |
1355 | | m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); |
1356 | | E2h = _mm_madd_epi16(m128Tmp3, |
1357 | | _mm_load_si128((__m128i *) (transform8x8[1]))); |
1358 | | O0l = _mm_add_epi32(E1l, E2l); |
1359 | | O0h = _mm_add_epi32(E1h, E2h); |
1360 | | E1l = _mm_madd_epi16(m128Tmp0, |
1361 | | _mm_load_si128((__m128i *) (transform8x8[2]))); |
1362 | | E1h = _mm_madd_epi16(m128Tmp1, |
1363 | | _mm_load_si128((__m128i *) (transform8x8[2]))); |
1364 | | E2l = _mm_madd_epi16(m128Tmp2, |
1365 | | _mm_load_si128((__m128i *) (transform8x8[3]))); |
1366 | | E2h = _mm_madd_epi16(m128Tmp3, |
1367 | | _mm_load_si128((__m128i *) (transform8x8[3]))); |
1368 | | O1l = _mm_add_epi32(E1l, E2l); |
1369 | | O1h = _mm_add_epi32(E1h, E2h); |
1370 | | E1l = _mm_madd_epi16(m128Tmp0, |
1371 | | _mm_load_si128((__m128i *) (transform8x8[4]))); |
1372 | | E1h = _mm_madd_epi16(m128Tmp1, |
1373 | | _mm_load_si128((__m128i *) (transform8x8[4]))); |
1374 | | E2l = _mm_madd_epi16(m128Tmp2, |
1375 | | _mm_load_si128((__m128i *) (transform8x8[5]))); |
1376 | | E2h = _mm_madd_epi16(m128Tmp3, |
1377 | | _mm_load_si128((__m128i *) (transform8x8[5]))); |
1378 | | O2l = _mm_add_epi32(E1l, E2l); |
1379 | | O2h = _mm_add_epi32(E1h, E2h); |
1380 | | E1l = _mm_madd_epi16(m128Tmp0, |
1381 | | _mm_load_si128((__m128i *) (transform8x8[6]))); |
1382 | | E1h = _mm_madd_epi16(m128Tmp1, |
1383 | | _mm_load_si128((__m128i *) (transform8x8[6]))); |
1384 | | E2l = _mm_madd_epi16(m128Tmp2, |
1385 | | _mm_load_si128((__m128i *) (transform8x8[7]))); |
1386 | | E2h = _mm_madd_epi16(m128Tmp3, |
1387 | | _mm_load_si128((__m128i *) (transform8x8[7]))); |
1388 | | O3h = _mm_add_epi32(E1h, E2h); |
1389 | | O3l = _mm_add_epi32(E1l, E2l); |
1390 | | |
1391 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4); |
1392 | | EE0l = _mm_madd_epi16(m128Tmp0, |
1393 | | _mm_load_si128((__m128i *) (transform8x8[8]))); |
1394 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4); |
1395 | | EE0h = _mm_madd_epi16(m128Tmp1, |
1396 | | _mm_load_si128((__m128i *) (transform8x8[8]))); |
1397 | | EE1l = _mm_madd_epi16(m128Tmp0, |
1398 | | _mm_load_si128((__m128i *) (transform8x8[9]))); |
1399 | | EE1h = _mm_madd_epi16(m128Tmp1, |
1400 | | _mm_load_si128((__m128i *) (transform8x8[9]))); |
1401 | | |
1402 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); |
1403 | | E00l = _mm_madd_epi16(m128Tmp0, |
1404 | | _mm_load_si128((__m128i *) (transform8x8[10]))); |
1405 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); |
1406 | | E00h = _mm_madd_epi16(m128Tmp1, |
1407 | | _mm_load_si128((__m128i *) (transform8x8[10]))); |
1408 | | E01l = _mm_madd_epi16(m128Tmp0, |
1409 | | _mm_load_si128((__m128i *) (transform8x8[11]))); |
1410 | | E01h = _mm_madd_epi16(m128Tmp1, |
1411 | | _mm_load_si128((__m128i *) (transform8x8[11]))); |
1412 | | E0l = _mm_add_epi32(EE0l, E00l); |
1413 | | E0l = _mm_add_epi32(E0l, m128iAdd); |
1414 | | E0h = _mm_add_epi32(EE0h, E00h); |
1415 | | E0h = _mm_add_epi32(E0h, m128iAdd); |
1416 | | E3l = _mm_sub_epi32(EE0l, E00l); |
1417 | | E3l = _mm_add_epi32(E3l, m128iAdd); |
1418 | | E3h = _mm_sub_epi32(EE0h, E00h); |
1419 | | E3h = _mm_add_epi32(E3h, m128iAdd); |
1420 | | E1l = _mm_add_epi32(EE1l, E01l); |
1421 | | E1l = _mm_add_epi32(E1l, m128iAdd); |
1422 | | E1h = _mm_add_epi32(EE1h, E01h); |
1423 | | E1h = _mm_add_epi32(E1h, m128iAdd); |
1424 | | E2l = _mm_sub_epi32(EE1l, E01l); |
1425 | | E2l = _mm_add_epi32(E2l, m128iAdd); |
1426 | | E2h = _mm_sub_epi32(EE1h, E01h); |
1427 | | E2h = _mm_add_epi32(E2h, m128iAdd); |
1428 | | |
1429 | | m128iS0 = _mm_packs_epi32( |
1430 | | _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd), |
1431 | | _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd)); |
1432 | | m128iS1 = _mm_packs_epi32( |
1433 | | _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd), |
1434 | | _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd)); |
1435 | | m128iS2 = _mm_packs_epi32( |
1436 | | _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd), |
1437 | | _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd)); |
1438 | | m128iS3 = _mm_packs_epi32( |
1439 | | _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd), |
1440 | | _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd)); |
1441 | | m128iS4 = _mm_packs_epi32( |
1442 | | _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd), |
1443 | | _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd)); |
1444 | | m128iS5 = _mm_packs_epi32( |
1445 | | _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd), |
1446 | | _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd)); |
1447 | | m128iS6 = _mm_packs_epi32( |
1448 | | _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd), |
1449 | | _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd)); |
1450 | | m128iS7 = _mm_packs_epi32( |
1451 | | _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd), |
1452 | | _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd)); |
1453 | | |
1454 | | _mm_store_si128((__m128i *) (src), m128iS0); |
1455 | | _mm_store_si128((__m128i *) (src + 8), m128iS1); |
1456 | | _mm_store_si128((__m128i *) (src + 16), m128iS2); |
1457 | | _mm_store_si128((__m128i *) (src + 24), m128iS3); |
1458 | | _mm_store_si128((__m128i *) (src + 32), m128iS4); |
1459 | | _mm_store_si128((__m128i *) (src + 40), m128iS5); |
1460 | | _mm_store_si128((__m128i *) (src + 48), m128iS6); |
1461 | | _mm_store_si128((__m128i *) (src + 56), m128iS7); |
1462 | | |
1463 | | j = 0; |
1464 | | for (i = 0; i < 4; i++) { |
1465 | | dst[0] = av_clip_uintp2(dst[0] + src[j],10); |
1466 | | dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10); |
1467 | | dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10); |
1468 | | dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10); |
1469 | | dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10); |
1470 | | dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10); |
1471 | | dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10); |
1472 | | dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10); |
1473 | | j += 1; |
1474 | | dst += stride; |
1475 | | dst[0] = av_clip_uintp2(dst[0] + src[j],10); |
1476 | | dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10); |
1477 | | dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10); |
1478 | | dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10); |
1479 | | dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10); |
1480 | | dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10); |
1481 | | dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10); |
1482 | | dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10); |
1483 | | j += 1; |
1484 | | dst += stride; |
1485 | | } |
1486 | | |
1487 | | } |
1488 | | #endif |
1489 | | |
1490 | | |
1491 | | #if HAVE_SSE4_1 |
1492 | | void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, |
1493 | 10.6k | ptrdiff_t _stride) { |
1494 | 10.6k | uint8_t shift_2nd = 12; // 20 - Bit depth |
1495 | 10.6k | uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) |
1496 | 10.6k | int i; |
1497 | 10.6k | uint8_t *dst = (uint8_t*) _dst; |
1498 | 10.6k | ptrdiff_t stride = _stride / sizeof(uint8_t); |
1499 | 10.6k | const int16_t *src = coeffs; |
1500 | 10.6k | int32_t shift; |
1501 | 10.6k | __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, |
1502 | 10.6k | m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, |
1503 | 10.6k | m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, |
1504 | 10.6k | m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, |
1505 | 10.6k | E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, |
1506 | 10.6k | O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, |
1507 | 10.6k | E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; |
1508 | 10.6k | __m128i E4l, E5l, E6l, E7l; |
1509 | 10.6k | __m128i E4h, E5h, E6h, E7h; |
1510 | 10.6k | __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15; |
1511 | 10.6k | __m128i r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31; |
1512 | | |
1513 | | |
1514 | | /*__m128i T00,T01, T02, T03, T04, T05, T06, T07; |
1515 | | __m128i T10,T11, T12, T13, T14, T15, T16, T17; |
1516 | | __m128i T20,T21, T22, T23, T24, T25, T26, T27; |
1517 | | __m128i T30,T31, T32, T33, T34, T35, T36, T37; |
1518 | | |
1519 | | __m128i U00,U01, U02, U03, U10, U11, U12, U13; |
1520 | | |
1521 | | __m128i V00,V01, V10, V11;*/ |
1522 | | |
1523 | | |
1524 | 10.6k | const __m128i T00 = _mm_load_si128((__m128i *) (transform16x16_1[0][0])); |
1525 | 10.6k | const __m128i T01 = _mm_load_si128((__m128i *) (transform16x16_1[0][1])); |
1526 | 10.6k | const __m128i T02 = _mm_load_si128((__m128i *) (transform16x16_1[0][2])); |
1527 | 10.6k | const __m128i T03 = _mm_load_si128((__m128i *) (transform16x16_1[0][3])); |
1528 | 10.6k | const __m128i T04 = _mm_load_si128((__m128i *) (transform16x16_1[0][4])); |
1529 | 10.6k | const __m128i T05 = _mm_load_si128((__m128i *) (transform16x16_1[0][5])); |
1530 | 10.6k | const __m128i T06 = _mm_load_si128((__m128i *) (transform16x16_1[0][6])); |
1531 | 10.6k | const __m128i T07 = _mm_load_si128((__m128i *) (transform16x16_1[0][7])); |
1532 | 10.6k | const __m128i T10 = _mm_load_si128((__m128i *) (transform16x16_1[1][0])); |
1533 | 10.6k | const __m128i T11 = _mm_load_si128((__m128i *) (transform16x16_1[1][1])); |
1534 | 10.6k | const __m128i T12 = _mm_load_si128((__m128i *) (transform16x16_1[1][2])); |
1535 | 10.6k | const __m128i T13 = _mm_load_si128((__m128i *) (transform16x16_1[1][3])); |
1536 | 10.6k | const __m128i T14 = _mm_load_si128((__m128i *) (transform16x16_1[1][4])); |
1537 | 10.6k | const __m128i T15 = _mm_load_si128((__m128i *) (transform16x16_1[1][5])); |
1538 | 10.6k | const __m128i T16 = _mm_load_si128((__m128i *) (transform16x16_1[1][6])); |
1539 | 10.6k | const __m128i T17 = _mm_load_si128((__m128i *) (transform16x16_1[1][7])); |
1540 | 10.6k | const __m128i T20 = _mm_load_si128((__m128i *) (transform16x16_1[2][0])); |
1541 | 10.6k | const __m128i T21 = _mm_load_si128((__m128i *) (transform16x16_1[2][1])); |
1542 | 10.6k | const __m128i T22 = _mm_load_si128((__m128i *) (transform16x16_1[2][2])); |
1543 | 10.6k | const __m128i T23 = _mm_load_si128((__m128i *) (transform16x16_1[2][3])); |
1544 | 10.6k | const __m128i T24 = _mm_load_si128((__m128i *) (transform16x16_1[2][4])); |
1545 | 10.6k | const __m128i T25 = _mm_load_si128((__m128i *) (transform16x16_1[2][5])); |
1546 | 10.6k | const __m128i T26 = _mm_load_si128((__m128i *) (transform16x16_1[2][6])); |
1547 | 10.6k | const __m128i T27 = _mm_load_si128((__m128i *) (transform16x16_1[2][7])); |
1548 | 10.6k | const __m128i T30 = _mm_load_si128((__m128i *) (transform16x16_1[3][0])); |
1549 | 10.6k | const __m128i T31 = _mm_load_si128((__m128i *) (transform16x16_1[3][1])); |
1550 | 10.6k | const __m128i T32 = _mm_load_si128((__m128i *) (transform16x16_1[3][2])); |
1551 | 10.6k | const __m128i T33 = _mm_load_si128((__m128i *) (transform16x16_1[3][3])); |
1552 | 10.6k | const __m128i T34 = _mm_load_si128((__m128i *) (transform16x16_1[3][4])); |
1553 | 10.6k | const __m128i T35 = _mm_load_si128((__m128i *) (transform16x16_1[3][5])); |
1554 | 10.6k | const __m128i T36 = _mm_load_si128((__m128i *) (transform16x16_1[3][6])); |
1555 | 10.6k | const __m128i T37 = _mm_load_si128((__m128i *) (transform16x16_1[3][7])); |
1556 | | |
1557 | 10.6k | const __m128i U00 = _mm_load_si128((__m128i *) (transform16x16_2[0][0])); |
1558 | 10.6k | const __m128i U01 = _mm_load_si128((__m128i *) (transform16x16_2[0][1])); |
1559 | 10.6k | const __m128i U02 = _mm_load_si128((__m128i *) (transform16x16_2[0][2])); |
1560 | 10.6k | const __m128i U03 = _mm_load_si128((__m128i *) (transform16x16_2[0][3])); |
1561 | 10.6k | const __m128i U10 = _mm_load_si128((__m128i *) (transform16x16_2[1][0])); |
1562 | 10.6k | const __m128i U11 = _mm_load_si128((__m128i *) (transform16x16_2[1][1])); |
1563 | 10.6k | const __m128i U12 = _mm_load_si128((__m128i *) (transform16x16_2[1][2])); |
1564 | 10.6k | const __m128i U13 = _mm_load_si128((__m128i *) (transform16x16_2[1][3])); |
1565 | | |
1566 | 10.6k | const __m128i V00 = _mm_load_si128((__m128i *) (transform16x16_3[0][0])); |
1567 | 10.6k | const __m128i V01 = _mm_load_si128((__m128i *) (transform16x16_3[0][1])); |
1568 | 10.6k | const __m128i V10 = _mm_load_si128((__m128i *) (transform16x16_3[1][0])); |
1569 | 10.6k | const __m128i V11 = _mm_load_si128((__m128i *) (transform16x16_3[1][1])); |
1570 | | |
1571 | | |
1572 | | |
1573 | 10.6k | int j; |
1574 | 10.6k | m128iS0 = _mm_load_si128((__m128i *) (src)); |
1575 | 10.6k | m128iS1 = _mm_load_si128((__m128i *) (src + 16)); |
1576 | 10.6k | m128iS2 = _mm_load_si128((__m128i *) (src + 32)); |
1577 | 10.6k | m128iS3 = _mm_load_si128((__m128i *) (src + 48)); |
1578 | 10.6k | m128iS4 = _mm_loadu_si128((__m128i *) (src + 64)); |
1579 | 10.6k | m128iS5 = _mm_load_si128((__m128i *) (src + 80)); |
1580 | 10.6k | m128iS6 = _mm_load_si128((__m128i *) (src + 96)); |
1581 | 10.6k | m128iS7 = _mm_load_si128((__m128i *) (src + 112)); |
1582 | 10.6k | m128iS8 = _mm_load_si128((__m128i *) (src + 128)); |
1583 | 10.6k | m128iS9 = _mm_load_si128((__m128i *) (src + 144)); |
1584 | 10.6k | m128iS10 = _mm_load_si128((__m128i *) (src + 160)); |
1585 | 10.6k | m128iS11 = _mm_load_si128((__m128i *) (src + 176)); |
1586 | 10.6k | m128iS12 = _mm_load_si128((__m128i *) (src + 192)); |
1587 | 10.6k | m128iS13 = _mm_load_si128((__m128i *) (src + 208)); |
1588 | 10.6k | m128iS14 = _mm_load_si128((__m128i *) (src + 224)); |
1589 | 10.6k | m128iS15 = _mm_load_si128((__m128i *) (src + 240)); |
1590 | 10.6k | shift = shift_1st; |
1591 | 10.6k | m128iAdd = _mm_set1_epi32(add_1st); |
1592 | | |
1593 | 31.9k | for (j = 0; j < 2; j++) { |
1594 | 63.9k | for (i = 0; i < 16; i += 8) { |
1595 | | |
1596 | 42.6k | m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); |
1597 | 42.6k | E0l = _mm_madd_epi16(m128Tmp0,T00); |
1598 | 42.6k | m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); |
1599 | 42.6k | E0h = _mm_madd_epi16(m128Tmp1,T00); |
1600 | | |
1601 | 42.6k | m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); |
1602 | 42.6k | E1l = _mm_madd_epi16(m128Tmp2,T10); |
1603 | 42.6k | m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); |
1604 | 42.6k | E1h = _mm_madd_epi16(m128Tmp3,T10); |
1605 | | |
1606 | 42.6k | m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); |
1607 | 42.6k | E2l = _mm_madd_epi16(m128Tmp4,T20); |
1608 | 42.6k | m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); |
1609 | 42.6k | E2h = _mm_madd_epi16(m128Tmp5,T20); |
1610 | | |
1611 | 42.6k | m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); |
1612 | 42.6k | E3l = _mm_madd_epi16(m128Tmp6,T30); |
1613 | 42.6k | m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); |
1614 | 42.6k | E3h = _mm_madd_epi16(m128Tmp7,T30); |
1615 | | |
1616 | 42.6k | O0l = _mm_add_epi32(E0l, E1l); |
1617 | 42.6k | O0l = _mm_add_epi32(O0l, E2l); |
1618 | 42.6k | O0l = _mm_add_epi32(O0l, E3l); |
1619 | | |
1620 | 42.6k | O0h = _mm_add_epi32(E0h, E1h); |
1621 | 42.6k | O0h = _mm_add_epi32(O0h, E2h); |
1622 | 42.6k | O0h = _mm_add_epi32(O0h, E3h); |
1623 | | |
1624 | | /* Compute O1*/ |
1625 | 42.6k | E0l = _mm_madd_epi16(m128Tmp0,T01); |
1626 | 42.6k | E0h = _mm_madd_epi16(m128Tmp1,T01); |
1627 | 42.6k | E1l = _mm_madd_epi16(m128Tmp2,T11); |
1628 | 42.6k | E1h = _mm_madd_epi16(m128Tmp3,T11); |
1629 | 42.6k | E2l = _mm_madd_epi16(m128Tmp4,T21); |
1630 | 42.6k | E2h = _mm_madd_epi16(m128Tmp5,T21); |
1631 | 42.6k | E3l = _mm_madd_epi16(m128Tmp6,T31); |
1632 | 42.6k | E3h = _mm_madd_epi16(m128Tmp7,T31); |
1633 | 42.6k | O1l = _mm_add_epi32(E0l, E1l); |
1634 | 42.6k | O1l = _mm_add_epi32(O1l, E2l); |
1635 | 42.6k | O1l = _mm_add_epi32(O1l, E3l); |
1636 | 42.6k | O1h = _mm_add_epi32(E0h, E1h); |
1637 | 42.6k | O1h = _mm_add_epi32(O1h, E2h); |
1638 | 42.6k | O1h = _mm_add_epi32(O1h, E3h); |
1639 | | |
1640 | | /* Compute O2*/ |
1641 | 42.6k | E0l = _mm_madd_epi16(m128Tmp0,T02); |
1642 | 42.6k | E0h = _mm_madd_epi16(m128Tmp1,T02); |
1643 | 42.6k | E1l = _mm_madd_epi16(m128Tmp2,T12); |
1644 | 42.6k | E1h = _mm_madd_epi16(m128Tmp3,T12); |
1645 | 42.6k | E2l = _mm_madd_epi16(m128Tmp4,T22); |
1646 | 42.6k | E2h = _mm_madd_epi16(m128Tmp5,T22); |
1647 | 42.6k | E3l = _mm_madd_epi16(m128Tmp6,T32); |
1648 | 42.6k | E3h = _mm_madd_epi16(m128Tmp7,T32); |
1649 | 42.6k | O2l = _mm_add_epi32(E0l, E1l); |
1650 | 42.6k | O2l = _mm_add_epi32(O2l, E2l); |
1651 | 42.6k | O2l = _mm_add_epi32(O2l, E3l); |
1652 | | |
1653 | 42.6k | O2h = _mm_add_epi32(E0h, E1h); |
1654 | 42.6k | O2h = _mm_add_epi32(O2h, E2h); |
1655 | 42.6k | O2h = _mm_add_epi32(O2h, E3h); |
1656 | | |
1657 | | /* Compute O3*/ |
1658 | 42.6k | E0l = _mm_madd_epi16(m128Tmp0,T03); |
1659 | 42.6k | E0h = _mm_madd_epi16(m128Tmp1,T03); |
1660 | 42.6k | E1l = _mm_madd_epi16(m128Tmp2,T13); |
1661 | 42.6k | E1h = _mm_madd_epi16(m128Tmp3,T13); |
1662 | 42.6k | E2l = _mm_madd_epi16(m128Tmp4,T23); |
1663 | 42.6k | E2h = _mm_madd_epi16(m128Tmp5,T23); |
1664 | 42.6k | E3l = _mm_madd_epi16(m128Tmp6,T33); |
1665 | 42.6k | E3h = _mm_madd_epi16(m128Tmp7,T33); |
1666 | | |
1667 | 42.6k | O3l = _mm_add_epi32(E0l, E1l); |
1668 | 42.6k | O3l = _mm_add_epi32(O3l, E2l); |
1669 | 42.6k | O3l = _mm_add_epi32(O3l, E3l); |
1670 | | |
1671 | 42.6k | O3h = _mm_add_epi32(E0h, E1h); |
1672 | 42.6k | O3h = _mm_add_epi32(O3h, E2h); |
1673 | 42.6k | O3h = _mm_add_epi32(O3h, E3h); |
1674 | | |
1675 | | /* Compute O4*/ |
1676 | | |
1677 | 42.6k | E0l = _mm_madd_epi16(m128Tmp0,T04); |
1678 | 42.6k | E0h = _mm_madd_epi16(m128Tmp1,T04); |
1679 | 42.6k | E1l = _mm_madd_epi16(m128Tmp2,T14); |
1680 | 42.6k | E1h = _mm_madd_epi16(m128Tmp3,T14); |
1681 | 42.6k | E2l = _mm_madd_epi16(m128Tmp4,T24); |
1682 | 42.6k | E2h = _mm_madd_epi16(m128Tmp5,T24); |
1683 | 42.6k | E3l = _mm_madd_epi16(m128Tmp6,T34); |
1684 | 42.6k | E3h = _mm_madd_epi16(m128Tmp7,T34); |
1685 | | |
1686 | 42.6k | O4l = _mm_add_epi32(E0l, E1l); |
1687 | 42.6k | O4l = _mm_add_epi32(O4l, E2l); |
1688 | 42.6k | O4l = _mm_add_epi32(O4l, E3l); |
1689 | | |
1690 | 42.6k | O4h = _mm_add_epi32(E0h, E1h); |
1691 | 42.6k | O4h = _mm_add_epi32(O4h, E2h); |
1692 | 42.6k | O4h = _mm_add_epi32(O4h, E3h); |
1693 | | |
1694 | | /* Compute O5*/ |
1695 | 42.6k | E0l = _mm_madd_epi16(m128Tmp0,T05); |
1696 | 42.6k | E0h = _mm_madd_epi16(m128Tmp1,T05); |
1697 | 42.6k | E1l = _mm_madd_epi16(m128Tmp2,T15); |
1698 | 42.6k | E1h = _mm_madd_epi16(m128Tmp3,T15); |
1699 | 42.6k | E2l = _mm_madd_epi16(m128Tmp4,T25); |
1700 | 42.6k | E2h = _mm_madd_epi16(m128Tmp5,T25); |
1701 | 42.6k | E3l = _mm_madd_epi16(m128Tmp6,T35); |
1702 | 42.6k | E3h = _mm_madd_epi16(m128Tmp7,T35); |
1703 | | |
1704 | 42.6k | O5l = _mm_add_epi32(E0l, E1l); |
1705 | 42.6k | O5l = _mm_add_epi32(O5l, E2l); |
1706 | 42.6k | O5l = _mm_add_epi32(O5l, E3l); |
1707 | | |
1708 | 42.6k | O5h = _mm_add_epi32(E0h, E1h); |
1709 | 42.6k | O5h = _mm_add_epi32(O5h, E2h); |
1710 | 42.6k | O5h = _mm_add_epi32(O5h, E3h); |
1711 | | |
1712 | | /* Compute O6*/ |
1713 | | |
1714 | 42.6k | E0l = _mm_madd_epi16(m128Tmp0,T06); |
1715 | 42.6k | E0h = _mm_madd_epi16(m128Tmp1,T06); |
1716 | 42.6k | E1l = _mm_madd_epi16(m128Tmp2,T16); |
1717 | 42.6k | E1h = _mm_madd_epi16(m128Tmp3,T16); |
1718 | 42.6k | E2l = _mm_madd_epi16(m128Tmp4,T26); |
1719 | 42.6k | E2h = _mm_madd_epi16(m128Tmp5,T26); |
1720 | 42.6k | E3l = _mm_madd_epi16(m128Tmp6,T36); |
1721 | 42.6k | E3h = _mm_madd_epi16(m128Tmp7,T36); |
1722 | | |
1723 | 42.6k | O6l = _mm_add_epi32(E0l, E1l); |
1724 | 42.6k | O6l = _mm_add_epi32(O6l, E2l); |
1725 | 42.6k | O6l = _mm_add_epi32(O6l, E3l); |
1726 | | |
1727 | 42.6k | O6h = _mm_add_epi32(E0h, E1h); |
1728 | 42.6k | O6h = _mm_add_epi32(O6h, E2h); |
1729 | 42.6k | O6h = _mm_add_epi32(O6h, E3h); |
1730 | | |
1731 | | /* Compute O7*/ |
1732 | | |
1733 | 42.6k | E0l = _mm_madd_epi16(m128Tmp0,T07); |
1734 | 42.6k | E0h = _mm_madd_epi16(m128Tmp1,T07); |
1735 | 42.6k | E1l = _mm_madd_epi16(m128Tmp2,T17); |
1736 | 42.6k | E1h = _mm_madd_epi16(m128Tmp3,T17); |
1737 | 42.6k | E2l = _mm_madd_epi16(m128Tmp4,T27); |
1738 | 42.6k | E2h = _mm_madd_epi16(m128Tmp5,T27); |
1739 | 42.6k | E3l = _mm_madd_epi16(m128Tmp6,T37); |
1740 | 42.6k | E3h = _mm_madd_epi16(m128Tmp7,T37); |
1741 | | |
1742 | 42.6k | O7l = _mm_add_epi32(E0l, E1l); |
1743 | 42.6k | O7l = _mm_add_epi32(O7l, E2l); |
1744 | 42.6k | O7l = _mm_add_epi32(O7l, E3l); |
1745 | | |
1746 | 42.6k | O7h = _mm_add_epi32(E0h, E1h); |
1747 | 42.6k | O7h = _mm_add_epi32(O7h, E2h); |
1748 | 42.6k | O7h = _mm_add_epi32(O7h, E3h); |
1749 | | |
1750 | | /* Compute E0 */ |
1751 | | |
1752 | | |
1753 | | |
1754 | 42.6k | m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); |
1755 | 42.6k | E0l = _mm_madd_epi16(m128Tmp0,U00); |
1756 | 42.6k | m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); |
1757 | 42.6k | E0h = _mm_madd_epi16(m128Tmp1,U00); |
1758 | | |
1759 | 42.6k | m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); |
1760 | 42.6k | E0l = _mm_add_epi32(E0l, |
1761 | 42.6k | _mm_madd_epi16(m128Tmp2,U10)); |
1762 | 42.6k | m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); |
1763 | 42.6k | E0h = _mm_add_epi32(E0h, |
1764 | 42.6k | _mm_madd_epi16(m128Tmp3,U10)); |
1765 | | |
1766 | | /* Compute E1 */ |
1767 | 42.6k | E1l = _mm_madd_epi16(m128Tmp0,U01); |
1768 | 42.6k | E1h = _mm_madd_epi16(m128Tmp1,U01); |
1769 | 42.6k | E1l = _mm_add_epi32(E1l, |
1770 | 42.6k | _mm_madd_epi16(m128Tmp2,U11)); |
1771 | 42.6k | E1h = _mm_add_epi32(E1h, |
1772 | 42.6k | _mm_madd_epi16(m128Tmp3,U11)); |
1773 | | |
1774 | | /* Compute E2 */ |
1775 | 42.6k | E2l = _mm_madd_epi16(m128Tmp0,U02); |
1776 | 42.6k | E2h = _mm_madd_epi16(m128Tmp1,U02); |
1777 | 42.6k | E2l = _mm_add_epi32(E2l, |
1778 | 42.6k | _mm_madd_epi16(m128Tmp2,U12)); |
1779 | 42.6k | E2h = _mm_add_epi32(E2h, |
1780 | 42.6k | _mm_madd_epi16(m128Tmp3,U12)); |
1781 | | /* Compute E3 */ |
1782 | 42.6k | E3l = _mm_madd_epi16(m128Tmp0,U03); |
1783 | 42.6k | E3h = _mm_madd_epi16(m128Tmp1,U03); |
1784 | 42.6k | E3l = _mm_add_epi32(E3l, |
1785 | 42.6k | _mm_madd_epi16(m128Tmp2,U13)); |
1786 | 42.6k | E3h = _mm_add_epi32(E3h, |
1787 | 42.6k | _mm_madd_epi16(m128Tmp3,U13)); |
1788 | | |
1789 | | /* Compute EE0 and EEE */ |
1790 | | |
1791 | 42.6k | m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); |
1792 | 42.6k | E00l = _mm_madd_epi16(m128Tmp0,V00); |
1793 | 42.6k | m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); |
1794 | 42.6k | E00h = _mm_madd_epi16(m128Tmp1,V00); |
1795 | | |
1796 | 42.6k | m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8); |
1797 | 42.6k | EE0l = _mm_madd_epi16(m128Tmp2,V10); |
1798 | 42.6k | m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8); |
1799 | 42.6k | EE0h = _mm_madd_epi16(m128Tmp3,V10); |
1800 | | |
1801 | 42.6k | E01l = _mm_madd_epi16(m128Tmp0,V01); |
1802 | 42.6k | E01h = _mm_madd_epi16(m128Tmp1,V01); |
1803 | | |
1804 | 42.6k | EE1l = _mm_madd_epi16(m128Tmp2,V11); |
1805 | 42.6k | EE1h = _mm_madd_epi16(m128Tmp3,V11); |
1806 | | |
1807 | | /* Compute EE */ |
1808 | 42.6k | EE2l = _mm_sub_epi32(EE1l, E01l); |
1809 | 42.6k | EE3l = _mm_sub_epi32(EE0l, E00l); |
1810 | 42.6k | EE2h = _mm_sub_epi32(EE1h, E01h); |
1811 | 42.6k | EE3h = _mm_sub_epi32(EE0h, E00h); |
1812 | | |
1813 | 42.6k | EE0l = _mm_add_epi32(EE0l, E00l); |
1814 | 42.6k | EE1l = _mm_add_epi32(EE1l, E01l); |
1815 | 42.6k | EE0h = _mm_add_epi32(EE0h, E00h); |
1816 | 42.6k | EE1h = _mm_add_epi32(EE1h, E01h); |
1817 | | |
1818 | | /* Compute E */ |
1819 | | |
1820 | 42.6k | E4l = _mm_sub_epi32(EE3l, E3l); |
1821 | 42.6k | E4l = _mm_add_epi32(E4l, m128iAdd); |
1822 | | |
1823 | 42.6k | E5l = _mm_sub_epi32(EE2l, E2l); |
1824 | 42.6k | E5l = _mm_add_epi32(E5l, m128iAdd); |
1825 | | |
1826 | 42.6k | E6l = _mm_sub_epi32(EE1l, E1l); |
1827 | 42.6k | E6l = _mm_add_epi32(E6l, m128iAdd); |
1828 | | |
1829 | 42.6k | E7l = _mm_sub_epi32(EE0l, E0l); |
1830 | 42.6k | E7l = _mm_add_epi32(E7l, m128iAdd); |
1831 | | |
1832 | 42.6k | E4h = _mm_sub_epi32(EE3h, E3h); |
1833 | 42.6k | E4h = _mm_add_epi32(E4h, m128iAdd); |
1834 | | |
1835 | 42.6k | E5h = _mm_sub_epi32(EE2h, E2h); |
1836 | 42.6k | E5h = _mm_add_epi32(E5h, m128iAdd); |
1837 | | |
1838 | 42.6k | E6h = _mm_sub_epi32(EE1h, E1h); |
1839 | 42.6k | E6h = _mm_add_epi32(E6h, m128iAdd); |
1840 | | |
1841 | 42.6k | E7h = _mm_sub_epi32(EE0h, E0h); |
1842 | 42.6k | E7h = _mm_add_epi32(E7h, m128iAdd); |
1843 | | |
1844 | 42.6k | E0l = _mm_add_epi32(EE0l, E0l); |
1845 | 42.6k | E0l = _mm_add_epi32(E0l, m128iAdd); |
1846 | | |
1847 | 42.6k | E1l = _mm_add_epi32(EE1l, E1l); |
1848 | 42.6k | E1l = _mm_add_epi32(E1l, m128iAdd); |
1849 | | |
1850 | 42.6k | E2l = _mm_add_epi32(EE2l, E2l); |
1851 | 42.6k | E2l = _mm_add_epi32(E2l, m128iAdd); |
1852 | | |
1853 | 42.6k | E3l = _mm_add_epi32(EE3l, E3l); |
1854 | 42.6k | E3l = _mm_add_epi32(E3l, m128iAdd); |
1855 | | |
1856 | 42.6k | E0h = _mm_add_epi32(EE0h, E0h); |
1857 | 42.6k | E0h = _mm_add_epi32(E0h, m128iAdd); |
1858 | | |
1859 | 42.6k | E1h = _mm_add_epi32(EE1h, E1h); |
1860 | 42.6k | E1h = _mm_add_epi32(E1h, m128iAdd); |
1861 | | |
1862 | 42.6k | E2h = _mm_add_epi32(EE2h, E2h); |
1863 | 42.6k | E2h = _mm_add_epi32(E2h, m128iAdd); |
1864 | | |
1865 | 42.6k | E3h = _mm_add_epi32(EE3h, E3h); |
1866 | 42.6k | E3h = _mm_add_epi32(E3h, m128iAdd); |
1867 | | |
1868 | 42.6k | m128iS0 = _mm_packs_epi32( |
1869 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), |
1870 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); |
1871 | 42.6k | m128iS1 = _mm_packs_epi32( |
1872 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), |
1873 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); |
1874 | 42.6k | m128iS2 = _mm_packs_epi32( |
1875 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), |
1876 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); |
1877 | 42.6k | m128iS3 = _mm_packs_epi32( |
1878 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), |
1879 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); |
1880 | | |
1881 | 42.6k | m128iS4 = _mm_packs_epi32( |
1882 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), |
1883 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); |
1884 | 42.6k | m128iS5 = _mm_packs_epi32( |
1885 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), |
1886 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); |
1887 | 42.6k | m128iS6 = _mm_packs_epi32( |
1888 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), |
1889 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); |
1890 | 42.6k | m128iS7 = _mm_packs_epi32( |
1891 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), |
1892 | 42.6k | _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); |
1893 | | |
1894 | 42.6k | m128iS15 = _mm_packs_epi32( |
1895 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), |
1896 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); |
1897 | 42.6k | m128iS14 = _mm_packs_epi32( |
1898 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), |
1899 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); |
1900 | 42.6k | m128iS13 = _mm_packs_epi32( |
1901 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), |
1902 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); |
1903 | 42.6k | m128iS12 = _mm_packs_epi32( |
1904 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), |
1905 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); |
1906 | | |
1907 | 42.6k | m128iS11 = _mm_packs_epi32( |
1908 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), |
1909 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); |
1910 | 42.6k | m128iS10 = _mm_packs_epi32( |
1911 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), |
1912 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); |
1913 | 42.6k | m128iS9 = _mm_packs_epi32( |
1914 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), |
1915 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); |
1916 | 42.6k | m128iS8 = _mm_packs_epi32( |
1917 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), |
1918 | 42.6k | _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); |
1919 | | |
1920 | | |
1921 | | |
1922 | 42.6k | if (!j) { //first pass |
1923 | | |
1924 | | /* Inverse the matrix */ |
1925 | 21.3k | E0l = _mm_unpacklo_epi16(m128iS0, m128iS8); |
1926 | 21.3k | E1l = _mm_unpacklo_epi16(m128iS1, m128iS9); |
1927 | 21.3k | E2l = _mm_unpacklo_epi16(m128iS2, m128iS10); |
1928 | 21.3k | E3l = _mm_unpacklo_epi16(m128iS3, m128iS11); |
1929 | 21.3k | E4l = _mm_unpacklo_epi16(m128iS4, m128iS12); |
1930 | 21.3k | E5l = _mm_unpacklo_epi16(m128iS5, m128iS13); |
1931 | 21.3k | E6l = _mm_unpacklo_epi16(m128iS6, m128iS14); |
1932 | 21.3k | E7l = _mm_unpacklo_epi16(m128iS7, m128iS15); |
1933 | | |
1934 | 21.3k | E0h = _mm_unpackhi_epi16(m128iS0, m128iS8); |
1935 | 21.3k | E1h = _mm_unpackhi_epi16(m128iS1, m128iS9); |
1936 | 21.3k | E2h = _mm_unpackhi_epi16(m128iS2, m128iS10); |
1937 | 21.3k | E3h = _mm_unpackhi_epi16(m128iS3, m128iS11); |
1938 | 21.3k | E4h = _mm_unpackhi_epi16(m128iS4, m128iS12); |
1939 | 21.3k | E5h = _mm_unpackhi_epi16(m128iS5, m128iS13); |
1940 | 21.3k | E6h = _mm_unpackhi_epi16(m128iS6, m128iS14); |
1941 | 21.3k | E7h = _mm_unpackhi_epi16(m128iS7, m128iS15); |
1942 | | |
1943 | 21.3k | m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l); |
1944 | 21.3k | m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l); |
1945 | 21.3k | m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l); |
1946 | 21.3k | m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l); |
1947 | | |
1948 | 21.3k | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
1949 | 21.3k | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
1950 | 21.3k | m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
1951 | 21.3k | m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
1952 | | |
1953 | 21.3k | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
1954 | 21.3k | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
1955 | 21.3k | m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
1956 | 21.3k | m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
1957 | | |
1958 | 21.3k | m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l); |
1959 | 21.3k | m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l); |
1960 | 21.3k | m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l); |
1961 | 21.3k | m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l); |
1962 | | |
1963 | 21.3k | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
1964 | 21.3k | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
1965 | 21.3k | m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
1966 | 21.3k | m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
1967 | | |
1968 | 21.3k | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
1969 | 21.3k | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
1970 | 21.3k | m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
1971 | 21.3k | m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
1972 | | |
1973 | 21.3k | m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); |
1974 | 21.3k | m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); |
1975 | 21.3k | m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); |
1976 | 21.3k | m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); |
1977 | | |
1978 | 21.3k | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
1979 | 21.3k | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
1980 | 21.3k | m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
1981 | 21.3k | m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
1982 | | |
1983 | 21.3k | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
1984 | 21.3k | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
1985 | 21.3k | m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
1986 | 21.3k | m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
1987 | | |
1988 | 21.3k | m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); |
1989 | 21.3k | m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); |
1990 | 21.3k | m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); |
1991 | 21.3k | m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); |
1992 | | |
1993 | 21.3k | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
1994 | 21.3k | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
1995 | 21.3k | m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
1996 | 21.3k | m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
1997 | | |
1998 | 21.3k | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
1999 | 21.3k | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
2000 | 21.3k | m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
2001 | 21.3k | m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
2002 | | |
2003 | 21.3k | if (!i) { |
2004 | | |
2005 | 10.6k | r0= m128iS0; //0 |
2006 | 10.6k | r1= m128iS1; //16 |
2007 | 10.6k | r2= m128iS2; //32 |
2008 | 10.6k | r3= m128iS3; //48 |
2009 | 10.6k | r4= m128iS4; //64 |
2010 | 10.6k | r5= m128iS5; //80 |
2011 | 10.6k | r6= m128iS6; //96 |
2012 | 10.6k | r7= m128iS7; //112 |
2013 | 10.6k | r8= m128iS8; //128 |
2014 | 10.6k | r9= m128iS9; //144 |
2015 | 10.6k | r10= m128iS10; //160 |
2016 | 10.6k | r11= m128iS11; //176 |
2017 | 10.6k | r12= m128iS12; //192 |
2018 | 10.6k | r13= m128iS13; //208 |
2019 | 10.6k | r14= m128iS14; //224 |
2020 | 10.6k | r15= m128iS15; //240 |
2021 | | |
2022 | | |
2023 | | |
2024 | 10.6k | m128iS0 = _mm_load_si128((__m128i *) (src + 8)); |
2025 | 10.6k | m128iS1 = _mm_load_si128((__m128i *) (src + 24)); |
2026 | 10.6k | m128iS2 = _mm_load_si128((__m128i *) (src + 40)); |
2027 | 10.6k | m128iS3 = _mm_load_si128((__m128i *) (src + 56)); |
2028 | 10.6k | m128iS4 = _mm_loadu_si128((__m128i *) (src + 72)); |
2029 | 10.6k | m128iS5 = _mm_load_si128((__m128i *) (src + 88)); |
2030 | 10.6k | m128iS6 = _mm_load_si128((__m128i *) (src + 104)); |
2031 | 10.6k | m128iS7 = _mm_load_si128((__m128i *) (src + 120)); |
2032 | 10.6k | m128iS8 = _mm_load_si128((__m128i *) (src + 136)); |
2033 | 10.6k | m128iS9 = _mm_load_si128((__m128i *) (src + 152)); |
2034 | 10.6k | m128iS10 = _mm_load_si128((__m128i *) (src + 168)); |
2035 | 10.6k | m128iS11 = _mm_load_si128((__m128i *) (src + 184)); |
2036 | 10.6k | m128iS12 = _mm_load_si128((__m128i *) (src + 200)); |
2037 | 10.6k | m128iS13 = _mm_load_si128((__m128i *) (src + 216)); |
2038 | 10.6k | m128iS14 = _mm_load_si128((__m128i *) (src + 232)); |
2039 | 10.6k | m128iS15 = _mm_load_si128((__m128i *) (src + 248)); |
2040 | 10.6k | } else { |
2041 | | |
2042 | 10.6k | r16= m128iS0; //8 |
2043 | 10.6k | r17= m128iS1; //24 |
2044 | 10.6k | r18= m128iS2; //40 |
2045 | 10.6k | r19= m128iS3; //56 |
2046 | 10.6k | r20= m128iS4; //72 |
2047 | 10.6k | r21= m128iS5; //88 |
2048 | 10.6k | r22= m128iS6; //104 |
2049 | 10.6k | r23= m128iS7; //120 |
2050 | 10.6k | r24= m128iS8; //136 |
2051 | 10.6k | r25= m128iS9; //152 |
2052 | 10.6k | r26= m128iS10; //168 |
2053 | 10.6k | r27= m128iS11; //184 |
2054 | 10.6k | r28= m128iS12; //200 |
2055 | 10.6k | r29= m128iS13; //216 |
2056 | 10.6k | r30= m128iS14; //232 |
2057 | 10.6k | r31= m128iS15; //248 |
2058 | | |
2059 | | //prepare next iteration : |
2060 | | |
2061 | 10.6k | m128iS0= r0; |
2062 | 10.6k | m128iS1= r2; |
2063 | 10.6k | m128iS2= r4; |
2064 | 10.6k | m128iS3= r6; |
2065 | 10.6k | m128iS4= r8; |
2066 | 10.6k | m128iS5= r10; |
2067 | 10.6k | m128iS6= r12; |
2068 | 10.6k | m128iS7= r14; |
2069 | 10.6k | m128iS8= r16; |
2070 | 10.6k | m128iS9= r18; |
2071 | 10.6k | m128iS10=r20; |
2072 | 10.6k | m128iS11=r22; |
2073 | 10.6k | m128iS12=r24; |
2074 | 10.6k | m128iS13=r26; |
2075 | 10.6k | m128iS14=r28; |
2076 | 10.6k | m128iS15=r30; |
2077 | | |
2078 | 10.6k | shift = shift_2nd; |
2079 | 10.6k | m128iAdd = _mm_set1_epi32(add_2nd); |
2080 | 10.6k | } |
2081 | | |
2082 | 21.3k | } else { |
2083 | | |
2084 | | //transpose half matrix : |
2085 | | //instead of having 1 register = 1 half-column, |
2086 | | //1 register = 1 half-row. |
2087 | 21.3k | E0l = _mm_unpacklo_epi16(m128iS0, m128iS1); |
2088 | 21.3k | E1l = _mm_unpacklo_epi16(m128iS2, m128iS3); |
2089 | 21.3k | E2l = _mm_unpacklo_epi16(m128iS4, m128iS5); |
2090 | 21.3k | E3l = _mm_unpacklo_epi16(m128iS6, m128iS7); |
2091 | 21.3k | E4l = _mm_unpacklo_epi16(m128iS8, m128iS9); |
2092 | 21.3k | E5l = _mm_unpacklo_epi16(m128iS10, m128iS11); |
2093 | 21.3k | E6l = _mm_unpacklo_epi16(m128iS12, m128iS13); |
2094 | 21.3k | E7l = _mm_unpacklo_epi16(m128iS14, m128iS15); |
2095 | | |
2096 | 21.3k | O0l = _mm_unpackhi_epi16(m128iS0, m128iS1); |
2097 | 21.3k | O1l = _mm_unpackhi_epi16(m128iS2, m128iS3); |
2098 | 21.3k | O2l = _mm_unpackhi_epi16(m128iS4, m128iS5); |
2099 | 21.3k | O3l = _mm_unpackhi_epi16(m128iS6, m128iS7); |
2100 | 21.3k | O4l = _mm_unpackhi_epi16(m128iS8, m128iS9); |
2101 | 21.3k | O5l = _mm_unpackhi_epi16(m128iS10, m128iS11); |
2102 | 21.3k | O6l = _mm_unpackhi_epi16(m128iS12, m128iS13); |
2103 | 21.3k | O7l = _mm_unpackhi_epi16(m128iS14, m128iS15); |
2104 | | |
2105 | | |
2106 | 21.3k | m128Tmp0 = _mm_unpacklo_epi32(E0l, E1l); |
2107 | 21.3k | m128Tmp1 = _mm_unpacklo_epi32(E2l, E3l); |
2108 | | |
2109 | 21.3k | m128Tmp2 = _mm_unpacklo_epi32(E4l, E5l); |
2110 | 21.3k | m128Tmp3 = _mm_unpacklo_epi32(E6l, E7l); |
2111 | | |
2112 | 21.3k | r0 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); //1st half 1st row |
2113 | 21.3k | r2 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); //2nd half 1st row |
2114 | | |
2115 | | |
2116 | 21.3k | r4 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); //1st half 2nd row |
2117 | 21.3k | r6 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); //2nd half 2nd row |
2118 | | |
2119 | 21.3k | m128Tmp0 = _mm_unpackhi_epi32(E0l, E1l); |
2120 | 21.3k | m128Tmp1 = _mm_unpackhi_epi32(E2l, E3l); |
2121 | 21.3k | m128Tmp2 = _mm_unpackhi_epi32(E4l, E5l); |
2122 | 21.3k | m128Tmp3 = _mm_unpackhi_epi32(E6l, E7l); |
2123 | | |
2124 | | |
2125 | 21.3k | r8 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); |
2126 | 21.3k | r10 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); |
2127 | | |
2128 | 21.3k | r12 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); |
2129 | 21.3k | r14 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); |
2130 | | |
2131 | 21.3k | m128Tmp0 = _mm_unpacklo_epi32(O0l, O1l); |
2132 | 21.3k | m128Tmp1 = _mm_unpacklo_epi32(O2l, O3l); |
2133 | 21.3k | m128Tmp2 = _mm_unpacklo_epi32(O4l, O5l); |
2134 | 21.3k | m128Tmp3 = _mm_unpacklo_epi32(O6l, O7l); |
2135 | | |
2136 | 21.3k | r16 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); |
2137 | 21.3k | r18 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); |
2138 | | |
2139 | | |
2140 | 21.3k | r20 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); |
2141 | 21.3k | r22 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); |
2142 | | |
2143 | 21.3k | m128Tmp0 = _mm_unpackhi_epi32(O0l, O1l); |
2144 | 21.3k | m128Tmp1 = _mm_unpackhi_epi32(O2l, O3l); |
2145 | 21.3k | m128Tmp2 = _mm_unpackhi_epi32(O4l, O5l); |
2146 | 21.3k | m128Tmp3 = _mm_unpackhi_epi32(O6l, O7l); |
2147 | | |
2148 | 21.3k | r24 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); |
2149 | 21.3k | r26 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); |
2150 | | |
2151 | | |
2152 | 21.3k | r28 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); |
2153 | 21.3k | r30 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); |
2154 | | |
2155 | 21.3k | dst = (uint8_t*) (_dst + (i*stride)); |
2156 | 21.3k | m128Tmp0= _mm_setzero_si128(); |
2157 | 21.3k | m128Tmp1= _mm_load_si128((__m128i*)dst); |
2158 | 21.3k | m128Tmp2= _mm_load_si128((__m128i*)(dst+stride)); |
2159 | 21.3k | m128Tmp3= _mm_load_si128((__m128i*)(dst+2*stride)); |
2160 | 21.3k | m128Tmp4= _mm_load_si128((__m128i*)(dst+3*stride)); |
2161 | 21.3k | m128Tmp5= _mm_load_si128((__m128i*)(dst+4*stride)); |
2162 | 21.3k | m128Tmp6= _mm_load_si128((__m128i*)(dst+5*stride)); |
2163 | 21.3k | m128Tmp7= _mm_load_si128((__m128i*)(dst+6*stride)); |
2164 | 21.3k | E0l= _mm_load_si128((__m128i*)(dst+7*stride)); |
2165 | | |
2166 | | |
2167 | 21.3k | r0= _mm_adds_epi16(r0,_mm_unpacklo_epi8(m128Tmp1,m128Tmp0)); |
2168 | 21.3k | r2= _mm_adds_epi16(r2,_mm_unpackhi_epi8(m128Tmp1,m128Tmp0)); |
2169 | 21.3k | r0= _mm_packus_epi16(r0,r2); |
2170 | | |
2171 | | |
2172 | | |
2173 | | |
2174 | 21.3k | r4= _mm_adds_epi16(r4,_mm_unpacklo_epi8(m128Tmp2,m128Tmp0)); |
2175 | 21.3k | r6= _mm_adds_epi16(r6,_mm_unpackhi_epi8(m128Tmp2,m128Tmp0)); |
2176 | 21.3k | r4= _mm_packus_epi16(r4,r6); |
2177 | | |
2178 | | |
2179 | 21.3k | r8= _mm_adds_epi16(r8,_mm_unpacklo_epi8(m128Tmp3,m128Tmp0)); |
2180 | 21.3k | r10= _mm_adds_epi16(r10,_mm_unpackhi_epi8(m128Tmp3,m128Tmp0)); |
2181 | 21.3k | r8= _mm_packus_epi16(r8,r10); |
2182 | | |
2183 | | |
2184 | 21.3k | r12= _mm_adds_epi16(r12,_mm_unpacklo_epi8(m128Tmp4,m128Tmp0)); |
2185 | 21.3k | r14= _mm_adds_epi16(r14,_mm_unpackhi_epi8(m128Tmp4,m128Tmp0)); |
2186 | 21.3k | r12= _mm_packus_epi16(r12,r14); |
2187 | | |
2188 | | |
2189 | 21.3k | r16= _mm_adds_epi16(r16,_mm_unpacklo_epi8(m128Tmp5,m128Tmp0)); |
2190 | 21.3k | r18= _mm_adds_epi16(r18,_mm_unpackhi_epi8(m128Tmp5,m128Tmp0)); |
2191 | 21.3k | r16= _mm_packus_epi16(r16,r18); |
2192 | | |
2193 | | |
2194 | 21.3k | r20= _mm_adds_epi16(r20,_mm_unpacklo_epi8(m128Tmp6,m128Tmp0)); |
2195 | 21.3k | r22= _mm_adds_epi16(r22,_mm_unpackhi_epi8(m128Tmp6,m128Tmp0)); |
2196 | 21.3k | r20= _mm_packus_epi16(r20,r22); |
2197 | | |
2198 | | |
2199 | 21.3k | r24= _mm_adds_epi16(r24,_mm_unpacklo_epi8(m128Tmp7,m128Tmp0)); |
2200 | 21.3k | r26= _mm_adds_epi16(r26,_mm_unpackhi_epi8(m128Tmp7,m128Tmp0)); |
2201 | 21.3k | r24= _mm_packus_epi16(r24,r26); |
2202 | | |
2203 | | |
2204 | | |
2205 | 21.3k | r28= _mm_adds_epi16(r28,_mm_unpacklo_epi8(E0l,m128Tmp0)); |
2206 | 21.3k | r30= _mm_adds_epi16(r30,_mm_unpackhi_epi8(E0l,m128Tmp0)); |
2207 | 21.3k | r28= _mm_packus_epi16(r28,r30); |
2208 | | |
2209 | 21.3k | _mm_store_si128((__m128i*)dst,r0); |
2210 | 21.3k | _mm_store_si128((__m128i*)(dst+stride),r4); |
2211 | 21.3k | _mm_store_si128((__m128i*)(dst+2*stride),r8); |
2212 | 21.3k | _mm_store_si128((__m128i*)(dst+3*stride),r12); |
2213 | 21.3k | _mm_store_si128((__m128i*)(dst+4*stride),r16); |
2214 | 21.3k | _mm_store_si128((__m128i*)(dst+5*stride),r20); |
2215 | 21.3k | _mm_store_si128((__m128i*)(dst+6*stride),r24); |
2216 | 21.3k | _mm_store_si128((__m128i*)(dst+7*stride),r28); |
2217 | | |
2218 | | |
2219 | | |
2220 | 21.3k | if (!i) { |
2221 | | //first half done, can store ! |
2222 | | |
2223 | | |
2224 | 10.6k | m128iS0= r1; |
2225 | 10.6k | m128iS1= r3; |
2226 | 10.6k | m128iS2= r5; |
2227 | 10.6k | m128iS3= r7; |
2228 | 10.6k | m128iS4= r9; |
2229 | 10.6k | m128iS5= r11; |
2230 | 10.6k | m128iS6= r13; |
2231 | 10.6k | m128iS7= r15; |
2232 | 10.6k | m128iS8= r17; |
2233 | 10.6k | m128iS9= r19; |
2234 | 10.6k | m128iS10=r21; |
2235 | 10.6k | m128iS11=r23; |
2236 | 10.6k | m128iS12=r25; |
2237 | 10.6k | m128iS13=r27; |
2238 | 10.6k | m128iS14=r29; |
2239 | 10.6k | m128iS15=r31; |
2240 | 10.6k | } |
2241 | 21.3k | } |
2242 | 42.6k | } |
2243 | 21.3k | } |
2244 | 10.6k | } |
2245 | | #endif |
2246 | | |
2247 | | |
2248 | | #if 0 |
2249 | | void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, |
2250 | | ptrdiff_t _stride) { |
2251 | | int i; |
2252 | | uint16_t *dst = (uint16_t*) _dst; |
2253 | | ptrdiff_t stride = _stride / 2; |
2254 | | int16_t *src = coeffs; |
2255 | | int32_t shift; |
2256 | | uint8_t shift_2nd = 10; //20 - bit depth |
2257 | | uint16_t add_2nd = 1 << 9; //shift - 1; |
2258 | | __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, |
2259 | | m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, |
2260 | | m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, |
2261 | | m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, |
2262 | | E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, |
2263 | | O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, |
2264 | | E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; |
2265 | | __m128i E4l, E5l, E6l, E7l; |
2266 | | __m128i E4h, E5h, E6h, E7h; |
2267 | | int j; |
2268 | | m128iS0 = _mm_load_si128((__m128i *) (src)); |
2269 | | m128iS1 = _mm_load_si128((__m128i *) (src + 16)); |
2270 | | m128iS2 = _mm_load_si128((__m128i *) (src + 32)); |
2271 | | m128iS3 = _mm_load_si128((__m128i *) (src + 48)); |
2272 | | m128iS4 = _mm_loadu_si128((__m128i *) (src + 64)); |
2273 | | m128iS5 = _mm_load_si128((__m128i *) (src + 80)); |
2274 | | m128iS6 = _mm_load_si128((__m128i *) (src + 96)); |
2275 | | m128iS7 = _mm_load_si128((__m128i *) (src + 112)); |
2276 | | m128iS8 = _mm_load_si128((__m128i *) (src + 128)); |
2277 | | m128iS9 = _mm_load_si128((__m128i *) (src + 144)); |
2278 | | m128iS10 = _mm_load_si128((__m128i *) (src + 160)); |
2279 | | m128iS11 = _mm_load_si128((__m128i *) (src + 176)); |
2280 | | m128iS12 = _mm_loadu_si128((__m128i *) (src + 192)); |
2281 | | m128iS13 = _mm_load_si128((__m128i *) (src + 208)); |
2282 | | m128iS14 = _mm_load_si128((__m128i *) (src + 224)); |
2283 | | m128iS15 = _mm_load_si128((__m128i *) (src + 240)); |
2284 | | shift = shift_1st; |
2285 | | m128iAdd = _mm_set1_epi32(add_1st); |
2286 | | |
2287 | | for (j = 0; j < 2; j++) { |
2288 | | for (i = 0; i < 16; i += 8) { |
2289 | | |
2290 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); |
2291 | | E0l = _mm_madd_epi16(m128Tmp0, |
2292 | | _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); |
2293 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); |
2294 | | E0h = _mm_madd_epi16(m128Tmp1, |
2295 | | _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); |
2296 | | |
2297 | | m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); |
2298 | | E1l = _mm_madd_epi16(m128Tmp2, |
2299 | | _mm_load_si128((__m128i *) (transform16x16_1[1][0]))); |
2300 | | m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); |
2301 | | E1h = _mm_madd_epi16(m128Tmp3, |
2302 | | _mm_load_si128((__m128i *) (transform16x16_1[1][0]))); |
2303 | | |
2304 | | m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); |
2305 | | E2l = _mm_madd_epi16(m128Tmp4, |
2306 | | _mm_load_si128((__m128i *) (transform16x16_1[2][0]))); |
2307 | | m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); |
2308 | | E2h = _mm_madd_epi16(m128Tmp5, |
2309 | | _mm_load_si128((__m128i *) (transform16x16_1[2][0]))); |
2310 | | |
2311 | | m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); |
2312 | | E3l = _mm_madd_epi16(m128Tmp6, |
2313 | | _mm_load_si128((__m128i *) (transform16x16_1[3][0]))); |
2314 | | m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); |
2315 | | E3h = _mm_madd_epi16(m128Tmp7, |
2316 | | _mm_load_si128((__m128i *) (transform16x16_1[3][0]))); |
2317 | | |
2318 | | O0l = _mm_add_epi32(E0l, E1l); |
2319 | | O0l = _mm_add_epi32(O0l, E2l); |
2320 | | O0l = _mm_add_epi32(O0l, E3l); |
2321 | | |
2322 | | O0h = _mm_add_epi32(E0h, E1h); |
2323 | | O0h = _mm_add_epi32(O0h, E2h); |
2324 | | O0h = _mm_add_epi32(O0h, E3h); |
2325 | | |
2326 | | /* Compute O1*/ |
2327 | | E0l = _mm_madd_epi16(m128Tmp0, |
2328 | | _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); |
2329 | | E0h = _mm_madd_epi16(m128Tmp1, |
2330 | | _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); |
2331 | | E1l = _mm_madd_epi16(m128Tmp2, |
2332 | | _mm_load_si128((__m128i *) (transform16x16_1[1][1]))); |
2333 | | E1h = _mm_madd_epi16(m128Tmp3, |
2334 | | _mm_load_si128((__m128i *) (transform16x16_1[1][1]))); |
2335 | | E2l = _mm_madd_epi16(m128Tmp4, |
2336 | | _mm_load_si128((__m128i *) (transform16x16_1[2][1]))); |
2337 | | E2h = _mm_madd_epi16(m128Tmp5, |
2338 | | _mm_load_si128((__m128i *) (transform16x16_1[2][1]))); |
2339 | | E3l = _mm_madd_epi16(m128Tmp6, |
2340 | | _mm_load_si128((__m128i *) (transform16x16_1[3][1]))); |
2341 | | E3h = _mm_madd_epi16(m128Tmp7, |
2342 | | _mm_load_si128((__m128i *) (transform16x16_1[3][1]))); |
2343 | | O1l = _mm_add_epi32(E0l, E1l); |
2344 | | O1l = _mm_add_epi32(O1l, E2l); |
2345 | | O1l = _mm_add_epi32(O1l, E3l); |
2346 | | O1h = _mm_add_epi32(E0h, E1h); |
2347 | | O1h = _mm_add_epi32(O1h, E2h); |
2348 | | O1h = _mm_add_epi32(O1h, E3h); |
2349 | | |
2350 | | /* Compute O2*/ |
2351 | | E0l = _mm_madd_epi16(m128Tmp0, |
2352 | | _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); |
2353 | | E0h = _mm_madd_epi16(m128Tmp1, |
2354 | | _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); |
2355 | | E1l = _mm_madd_epi16(m128Tmp2, |
2356 | | _mm_load_si128((__m128i *) (transform16x16_1[1][2]))); |
2357 | | E1h = _mm_madd_epi16(m128Tmp3, |
2358 | | _mm_load_si128((__m128i *) (transform16x16_1[1][2]))); |
2359 | | E2l = _mm_madd_epi16(m128Tmp4, |
2360 | | _mm_load_si128((__m128i *) (transform16x16_1[2][2]))); |
2361 | | E2h = _mm_madd_epi16(m128Tmp5, |
2362 | | _mm_load_si128((__m128i *) (transform16x16_1[2][2]))); |
2363 | | E3l = _mm_madd_epi16(m128Tmp6, |
2364 | | _mm_load_si128((__m128i *) (transform16x16_1[3][2]))); |
2365 | | E3h = _mm_madd_epi16(m128Tmp7, |
2366 | | _mm_load_si128((__m128i *) (transform16x16_1[3][2]))); |
2367 | | O2l = _mm_add_epi32(E0l, E1l); |
2368 | | O2l = _mm_add_epi32(O2l, E2l); |
2369 | | O2l = _mm_add_epi32(O2l, E3l); |
2370 | | |
2371 | | O2h = _mm_add_epi32(E0h, E1h); |
2372 | | O2h = _mm_add_epi32(O2h, E2h); |
2373 | | O2h = _mm_add_epi32(O2h, E3h); |
2374 | | |
2375 | | /* Compute O3*/ |
2376 | | E0l = _mm_madd_epi16(m128Tmp0, |
2377 | | _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); |
2378 | | E0h = _mm_madd_epi16(m128Tmp1, |
2379 | | _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); |
2380 | | E1l = _mm_madd_epi16(m128Tmp2, |
2381 | | _mm_load_si128((__m128i *) (transform16x16_1[1][3]))); |
2382 | | E1h = _mm_madd_epi16(m128Tmp3, |
2383 | | _mm_load_si128((__m128i *) (transform16x16_1[1][3]))); |
2384 | | E2l = _mm_madd_epi16(m128Tmp4, |
2385 | | _mm_load_si128((__m128i *) (transform16x16_1[2][3]))); |
2386 | | E2h = _mm_madd_epi16(m128Tmp5, |
2387 | | _mm_load_si128((__m128i *) (transform16x16_1[2][3]))); |
2388 | | E3l = _mm_madd_epi16(m128Tmp6, |
2389 | | _mm_load_si128((__m128i *) (transform16x16_1[3][3]))); |
2390 | | E3h = _mm_madd_epi16(m128Tmp7, |
2391 | | _mm_load_si128((__m128i *) (transform16x16_1[3][3]))); |
2392 | | |
2393 | | O3l = _mm_add_epi32(E0l, E1l); |
2394 | | O3l = _mm_add_epi32(O3l, E2l); |
2395 | | O3l = _mm_add_epi32(O3l, E3l); |
2396 | | |
2397 | | O3h = _mm_add_epi32(E0h, E1h); |
2398 | | O3h = _mm_add_epi32(O3h, E2h); |
2399 | | O3h = _mm_add_epi32(O3h, E3h); |
2400 | | |
2401 | | /* Compute O4*/ |
2402 | | |
2403 | | E0l = _mm_madd_epi16(m128Tmp0, |
2404 | | _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); |
2405 | | E0h = _mm_madd_epi16(m128Tmp1, |
2406 | | _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); |
2407 | | E1l = _mm_madd_epi16(m128Tmp2, |
2408 | | _mm_load_si128((__m128i *) (transform16x16_1[1][4]))); |
2409 | | E1h = _mm_madd_epi16(m128Tmp3, |
2410 | | _mm_load_si128((__m128i *) (transform16x16_1[1][4]))); |
2411 | | E2l = _mm_madd_epi16(m128Tmp4, |
2412 | | _mm_load_si128((__m128i *) (transform16x16_1[2][4]))); |
2413 | | E2h = _mm_madd_epi16(m128Tmp5, |
2414 | | _mm_load_si128((__m128i *) (transform16x16_1[2][4]))); |
2415 | | E3l = _mm_madd_epi16(m128Tmp6, |
2416 | | _mm_load_si128((__m128i *) (transform16x16_1[3][4]))); |
2417 | | E3h = _mm_madd_epi16(m128Tmp7, |
2418 | | _mm_load_si128((__m128i *) (transform16x16_1[3][4]))); |
2419 | | |
2420 | | O4l = _mm_add_epi32(E0l, E1l); |
2421 | | O4l = _mm_add_epi32(O4l, E2l); |
2422 | | O4l = _mm_add_epi32(O4l, E3l); |
2423 | | |
2424 | | O4h = _mm_add_epi32(E0h, E1h); |
2425 | | O4h = _mm_add_epi32(O4h, E2h); |
2426 | | O4h = _mm_add_epi32(O4h, E3h); |
2427 | | |
2428 | | /* Compute O5*/ |
2429 | | E0l = _mm_madd_epi16(m128Tmp0, |
2430 | | _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); |
2431 | | E0h = _mm_madd_epi16(m128Tmp1, |
2432 | | _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); |
2433 | | E1l = _mm_madd_epi16(m128Tmp2, |
2434 | | _mm_load_si128((__m128i *) (transform16x16_1[1][5]))); |
2435 | | E1h = _mm_madd_epi16(m128Tmp3, |
2436 | | _mm_load_si128((__m128i *) (transform16x16_1[1][5]))); |
2437 | | E2l = _mm_madd_epi16(m128Tmp4, |
2438 | | _mm_load_si128((__m128i *) (transform16x16_1[2][5]))); |
2439 | | E2h = _mm_madd_epi16(m128Tmp5, |
2440 | | _mm_load_si128((__m128i *) (transform16x16_1[2][5]))); |
2441 | | E3l = _mm_madd_epi16(m128Tmp6, |
2442 | | _mm_load_si128((__m128i *) (transform16x16_1[3][5]))); |
2443 | | E3h = _mm_madd_epi16(m128Tmp7, |
2444 | | _mm_load_si128((__m128i *) (transform16x16_1[3][5]))); |
2445 | | |
2446 | | O5l = _mm_add_epi32(E0l, E1l); |
2447 | | O5l = _mm_add_epi32(O5l, E2l); |
2448 | | O5l = _mm_add_epi32(O5l, E3l); |
2449 | | |
2450 | | O5h = _mm_add_epi32(E0h, E1h); |
2451 | | O5h = _mm_add_epi32(O5h, E2h); |
2452 | | O5h = _mm_add_epi32(O5h, E3h); |
2453 | | |
2454 | | /* Compute O6*/ |
2455 | | |
2456 | | E0l = _mm_madd_epi16(m128Tmp0, |
2457 | | _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); |
2458 | | E0h = _mm_madd_epi16(m128Tmp1, |
2459 | | _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); |
2460 | | E1l = _mm_madd_epi16(m128Tmp2, |
2461 | | _mm_load_si128((__m128i *) (transform16x16_1[1][6]))); |
2462 | | E1h = _mm_madd_epi16(m128Tmp3, |
2463 | | _mm_load_si128((__m128i *) (transform16x16_1[1][6]))); |
2464 | | E2l = _mm_madd_epi16(m128Tmp4, |
2465 | | _mm_load_si128((__m128i *) (transform16x16_1[2][6]))); |
2466 | | E2h = _mm_madd_epi16(m128Tmp5, |
2467 | | _mm_load_si128((__m128i *) (transform16x16_1[2][6]))); |
2468 | | E3l = _mm_madd_epi16(m128Tmp6, |
2469 | | _mm_load_si128((__m128i *) (transform16x16_1[3][6]))); |
2470 | | E3h = _mm_madd_epi16(m128Tmp7, |
2471 | | _mm_load_si128((__m128i *) (transform16x16_1[3][6]))); |
2472 | | |
2473 | | O6l = _mm_add_epi32(E0l, E1l); |
2474 | | O6l = _mm_add_epi32(O6l, E2l); |
2475 | | O6l = _mm_add_epi32(O6l, E3l); |
2476 | | |
2477 | | O6h = _mm_add_epi32(E0h, E1h); |
2478 | | O6h = _mm_add_epi32(O6h, E2h); |
2479 | | O6h = _mm_add_epi32(O6h, E3h); |
2480 | | |
2481 | | /* Compute O7*/ |
2482 | | |
2483 | | E0l = _mm_madd_epi16(m128Tmp0, |
2484 | | _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); |
2485 | | E0h = _mm_madd_epi16(m128Tmp1, |
2486 | | _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); |
2487 | | E1l = _mm_madd_epi16(m128Tmp2, |
2488 | | _mm_load_si128((__m128i *) (transform16x16_1[1][7]))); |
2489 | | E1h = _mm_madd_epi16(m128Tmp3, |
2490 | | _mm_load_si128((__m128i *) (transform16x16_1[1][7]))); |
2491 | | E2l = _mm_madd_epi16(m128Tmp4, |
2492 | | _mm_load_si128((__m128i *) (transform16x16_1[2][7]))); |
2493 | | E2h = _mm_madd_epi16(m128Tmp5, |
2494 | | _mm_load_si128((__m128i *) (transform16x16_1[2][7]))); |
2495 | | E3l = _mm_madd_epi16(m128Tmp6, |
2496 | | _mm_load_si128((__m128i *) (transform16x16_1[3][7]))); |
2497 | | E3h = _mm_madd_epi16(m128Tmp7, |
2498 | | _mm_load_si128((__m128i *) (transform16x16_1[3][7]))); |
2499 | | |
2500 | | O7l = _mm_add_epi32(E0l, E1l); |
2501 | | O7l = _mm_add_epi32(O7l, E2l); |
2502 | | O7l = _mm_add_epi32(O7l, E3l); |
2503 | | |
2504 | | O7h = _mm_add_epi32(E0h, E1h); |
2505 | | O7h = _mm_add_epi32(O7h, E2h); |
2506 | | O7h = _mm_add_epi32(O7h, E3h); |
2507 | | |
2508 | | /* Compute E0 */ |
2509 | | |
2510 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); |
2511 | | E0l = _mm_madd_epi16(m128Tmp0, |
2512 | | _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); |
2513 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); |
2514 | | E0h = _mm_madd_epi16(m128Tmp1, |
2515 | | _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); |
2516 | | |
2517 | | m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); |
2518 | | E0l = _mm_add_epi32(E0l, |
2519 | | _mm_madd_epi16(m128Tmp2, |
2520 | | _mm_load_si128( |
2521 | | (__m128i *) (transform16x16_2[1][0])))); |
2522 | | m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); |
2523 | | E0h = _mm_add_epi32(E0h, |
2524 | | _mm_madd_epi16(m128Tmp3, |
2525 | | _mm_load_si128( |
2526 | | (__m128i *) (transform16x16_2[1][0])))); |
2527 | | |
2528 | | /* Compute E1 */ |
2529 | | E1l = _mm_madd_epi16(m128Tmp0, |
2530 | | _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); |
2531 | | E1h = _mm_madd_epi16(m128Tmp1, |
2532 | | _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); |
2533 | | E1l = _mm_add_epi32(E1l, |
2534 | | _mm_madd_epi16(m128Tmp2, |
2535 | | _mm_load_si128( |
2536 | | (__m128i *) (transform16x16_2[1][1])))); |
2537 | | E1h = _mm_add_epi32(E1h, |
2538 | | _mm_madd_epi16(m128Tmp3, |
2539 | | _mm_load_si128( |
2540 | | (__m128i *) (transform16x16_2[1][1])))); |
2541 | | |
2542 | | /* Compute E2 */ |
2543 | | E2l = _mm_madd_epi16(m128Tmp0, |
2544 | | _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); |
2545 | | E2h = _mm_madd_epi16(m128Tmp1, |
2546 | | _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); |
2547 | | E2l = _mm_add_epi32(E2l, |
2548 | | _mm_madd_epi16(m128Tmp2, |
2549 | | _mm_load_si128( |
2550 | | (__m128i *) (transform16x16_2[1][2])))); |
2551 | | E2h = _mm_add_epi32(E2h, |
2552 | | _mm_madd_epi16(m128Tmp3, |
2553 | | _mm_load_si128( |
2554 | | (__m128i *) (transform16x16_2[1][2])))); |
2555 | | /* Compute E3 */ |
2556 | | E3l = _mm_madd_epi16(m128Tmp0, |
2557 | | _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); |
2558 | | E3h = _mm_madd_epi16(m128Tmp1, |
2559 | | _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); |
2560 | | E3l = _mm_add_epi32(E3l, |
2561 | | _mm_madd_epi16(m128Tmp2, |
2562 | | _mm_load_si128( |
2563 | | (__m128i *) (transform16x16_2[1][3])))); |
2564 | | E3h = _mm_add_epi32(E3h, |
2565 | | _mm_madd_epi16(m128Tmp3, |
2566 | | _mm_load_si128( |
2567 | | (__m128i *) (transform16x16_2[1][3])))); |
2568 | | |
2569 | | /* Compute EE0 and EEE */ |
2570 | | |
2571 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); |
2572 | | E00l = _mm_madd_epi16(m128Tmp0, |
2573 | | _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); |
2574 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); |
2575 | | E00h = _mm_madd_epi16(m128Tmp1, |
2576 | | _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); |
2577 | | |
2578 | | m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8); |
2579 | | EE0l = _mm_madd_epi16(m128Tmp2, |
2580 | | _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); |
2581 | | m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8); |
2582 | | EE0h = _mm_madd_epi16(m128Tmp3, |
2583 | | _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); |
2584 | | |
2585 | | E01l = _mm_madd_epi16(m128Tmp0, |
2586 | | _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); |
2587 | | E01h = _mm_madd_epi16(m128Tmp1, |
2588 | | _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); |
2589 | | |
2590 | | EE1l = _mm_madd_epi16(m128Tmp2, |
2591 | | _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); |
2592 | | EE1h = _mm_madd_epi16(m128Tmp3, |
2593 | | _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); |
2594 | | |
2595 | | /* Compute EE */ |
2596 | | EE2l = _mm_sub_epi32(EE1l, E01l); |
2597 | | EE3l = _mm_sub_epi32(EE0l, E00l); |
2598 | | EE2h = _mm_sub_epi32(EE1h, E01h); |
2599 | | EE3h = _mm_sub_epi32(EE0h, E00h); |
2600 | | |
2601 | | EE0l = _mm_add_epi32(EE0l, E00l); |
2602 | | EE1l = _mm_add_epi32(EE1l, E01l); |
2603 | | EE0h = _mm_add_epi32(EE0h, E00h); |
2604 | | EE1h = _mm_add_epi32(EE1h, E01h); |
2605 | | |
2606 | | /* Compute E */ |
2607 | | |
2608 | | E4l = _mm_sub_epi32(EE3l, E3l); |
2609 | | E4l = _mm_add_epi32(E4l, m128iAdd); |
2610 | | |
2611 | | E5l = _mm_sub_epi32(EE2l, E2l); |
2612 | | E5l = _mm_add_epi32(E5l, m128iAdd); |
2613 | | |
2614 | | E6l = _mm_sub_epi32(EE1l, E1l); |
2615 | | E6l = _mm_add_epi32(E6l, m128iAdd); |
2616 | | |
2617 | | E7l = _mm_sub_epi32(EE0l, E0l); |
2618 | | E7l = _mm_add_epi32(E7l, m128iAdd); |
2619 | | |
2620 | | E4h = _mm_sub_epi32(EE3h, E3h); |
2621 | | E4h = _mm_add_epi32(E4h, m128iAdd); |
2622 | | |
2623 | | E5h = _mm_sub_epi32(EE2h, E2h); |
2624 | | E5h = _mm_add_epi32(E5h, m128iAdd); |
2625 | | |
2626 | | E6h = _mm_sub_epi32(EE1h, E1h); |
2627 | | E6h = _mm_add_epi32(E6h, m128iAdd); |
2628 | | |
2629 | | E7h = _mm_sub_epi32(EE0h, E0h); |
2630 | | E7h = _mm_add_epi32(E7h, m128iAdd); |
2631 | | |
2632 | | E0l = _mm_add_epi32(EE0l, E0l); |
2633 | | E0l = _mm_add_epi32(E0l, m128iAdd); |
2634 | | |
2635 | | E1l = _mm_add_epi32(EE1l, E1l); |
2636 | | E1l = _mm_add_epi32(E1l, m128iAdd); |
2637 | | |
2638 | | E2l = _mm_add_epi32(EE2l, E2l); |
2639 | | E2l = _mm_add_epi32(E2l, m128iAdd); |
2640 | | |
2641 | | E3l = _mm_add_epi32(EE3l, E3l); |
2642 | | E3l = _mm_add_epi32(E3l, m128iAdd); |
2643 | | |
2644 | | E0h = _mm_add_epi32(EE0h, E0h); |
2645 | | E0h = _mm_add_epi32(E0h, m128iAdd); |
2646 | | |
2647 | | E1h = _mm_add_epi32(EE1h, E1h); |
2648 | | E1h = _mm_add_epi32(E1h, m128iAdd); |
2649 | | |
2650 | | E2h = _mm_add_epi32(EE2h, E2h); |
2651 | | E2h = _mm_add_epi32(E2h, m128iAdd); |
2652 | | |
2653 | | E3h = _mm_add_epi32(EE3h, E3h); |
2654 | | E3h = _mm_add_epi32(E3h, m128iAdd); |
2655 | | |
2656 | | m128iS0 = _mm_packs_epi32( |
2657 | | _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), |
2658 | | _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); |
2659 | | m128iS1 = _mm_packs_epi32( |
2660 | | _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), |
2661 | | _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); |
2662 | | m128iS2 = _mm_packs_epi32( |
2663 | | _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), |
2664 | | _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); |
2665 | | m128iS3 = _mm_packs_epi32( |
2666 | | _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), |
2667 | | _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); |
2668 | | |
2669 | | m128iS4 = _mm_packs_epi32( |
2670 | | _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), |
2671 | | _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); |
2672 | | m128iS5 = _mm_packs_epi32( |
2673 | | _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), |
2674 | | _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); |
2675 | | m128iS6 = _mm_packs_epi32( |
2676 | | _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), |
2677 | | _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); |
2678 | | m128iS7 = _mm_packs_epi32( |
2679 | | _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), |
2680 | | _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); |
2681 | | |
2682 | | m128iS15 = _mm_packs_epi32( |
2683 | | _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), |
2684 | | _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); |
2685 | | m128iS14 = _mm_packs_epi32( |
2686 | | _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), |
2687 | | _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); |
2688 | | m128iS13 = _mm_packs_epi32( |
2689 | | _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), |
2690 | | _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); |
2691 | | m128iS12 = _mm_packs_epi32( |
2692 | | _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), |
2693 | | _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); |
2694 | | |
2695 | | m128iS11 = _mm_packs_epi32( |
2696 | | _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), |
2697 | | _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); |
2698 | | m128iS10 = _mm_packs_epi32( |
2699 | | _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), |
2700 | | _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); |
2701 | | m128iS9 = _mm_packs_epi32( |
2702 | | _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), |
2703 | | _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); |
2704 | | m128iS8 = _mm_packs_epi32( |
2705 | | _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), |
2706 | | _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); |
2707 | | |
2708 | | if (!j) { |
2709 | | /* Inverse the matrix */ |
2710 | | E0l = _mm_unpacklo_epi16(m128iS0, m128iS8); |
2711 | | E1l = _mm_unpacklo_epi16(m128iS1, m128iS9); |
2712 | | E2l = _mm_unpacklo_epi16(m128iS2, m128iS10); |
2713 | | E3l = _mm_unpacklo_epi16(m128iS3, m128iS11); |
2714 | | E4l = _mm_unpacklo_epi16(m128iS4, m128iS12); |
2715 | | E5l = _mm_unpacklo_epi16(m128iS5, m128iS13); |
2716 | | E6l = _mm_unpacklo_epi16(m128iS6, m128iS14); |
2717 | | E7l = _mm_unpacklo_epi16(m128iS7, m128iS15); |
2718 | | |
2719 | | O0l = _mm_unpackhi_epi16(m128iS0, m128iS8); |
2720 | | O1l = _mm_unpackhi_epi16(m128iS1, m128iS9); |
2721 | | O2l = _mm_unpackhi_epi16(m128iS2, m128iS10); |
2722 | | O3l = _mm_unpackhi_epi16(m128iS3, m128iS11); |
2723 | | O4l = _mm_unpackhi_epi16(m128iS4, m128iS12); |
2724 | | O5l = _mm_unpackhi_epi16(m128iS5, m128iS13); |
2725 | | O6l = _mm_unpackhi_epi16(m128iS6, m128iS14); |
2726 | | O7l = _mm_unpackhi_epi16(m128iS7, m128iS15); |
2727 | | |
2728 | | m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l); |
2729 | | m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l); |
2730 | | m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l); |
2731 | | m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l); |
2732 | | |
2733 | | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
2734 | | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
2735 | | m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
2736 | | m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
2737 | | |
2738 | | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
2739 | | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
2740 | | m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
2741 | | m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
2742 | | |
2743 | | m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l); |
2744 | | m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l); |
2745 | | m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l); |
2746 | | m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l); |
2747 | | |
2748 | | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
2749 | | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
2750 | | m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
2751 | | m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
2752 | | |
2753 | | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
2754 | | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
2755 | | m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
2756 | | m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
2757 | | |
2758 | | m128Tmp0 = _mm_unpacklo_epi16(O0l, O4l); |
2759 | | m128Tmp1 = _mm_unpacklo_epi16(O1l, O5l); |
2760 | | m128Tmp2 = _mm_unpacklo_epi16(O2l, O6l); |
2761 | | m128Tmp3 = _mm_unpacklo_epi16(O3l, O7l); |
2762 | | |
2763 | | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
2764 | | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
2765 | | m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
2766 | | m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
2767 | | |
2768 | | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
2769 | | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
2770 | | m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
2771 | | m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
2772 | | |
2773 | | m128Tmp0 = _mm_unpackhi_epi16(O0l, O4l); |
2774 | | m128Tmp1 = _mm_unpackhi_epi16(O1l, O5l); |
2775 | | m128Tmp2 = _mm_unpackhi_epi16(O2l, O6l); |
2776 | | m128Tmp3 = _mm_unpackhi_epi16(O3l, O7l); |
2777 | | |
2778 | | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
2779 | | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
2780 | | m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
2781 | | m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
2782 | | |
2783 | | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
2784 | | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
2785 | | m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
2786 | | m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
2787 | | |
2788 | | /* */ |
2789 | | _mm_store_si128((__m128i *) (src + i), m128iS0); |
2790 | | _mm_store_si128((__m128i *) (src + 16 + i), m128iS1); |
2791 | | _mm_store_si128((__m128i *) (src + 32 + i), m128iS2); |
2792 | | _mm_store_si128((__m128i *) (src + 48 + i), m128iS3); |
2793 | | _mm_store_si128((__m128i *) (src + 64 + i), m128iS4); |
2794 | | _mm_store_si128((__m128i *) (src + 80 + i), m128iS5); |
2795 | | _mm_store_si128((__m128i *) (src + 96 + i), m128iS6); |
2796 | | _mm_store_si128((__m128i *) (src + 112 + i), m128iS7); |
2797 | | _mm_store_si128((__m128i *) (src + 128 + i), m128iS8); |
2798 | | _mm_store_si128((__m128i *) (src + 144 + i), m128iS9); |
2799 | | _mm_store_si128((__m128i *) (src + 160 + i), m128iS10); |
2800 | | _mm_store_si128((__m128i *) (src + 176 + i), m128iS11); |
2801 | | _mm_store_si128((__m128i *) (src + 192 + i), m128iS12); |
2802 | | _mm_store_si128((__m128i *) (src + 208 + i), m128iS13); |
2803 | | _mm_store_si128((__m128i *) (src + 224 + i), m128iS14); |
2804 | | _mm_store_si128((__m128i *) (src + 240 + i), m128iS15); |
2805 | | |
2806 | | if (!i) { |
2807 | | m128iS0 = _mm_load_si128((__m128i *) (src + 8)); |
2808 | | m128iS1 = _mm_load_si128((__m128i *) (src + 24)); |
2809 | | m128iS2 = _mm_load_si128((__m128i *) (src + 40)); |
2810 | | m128iS3 = _mm_load_si128((__m128i *) (src + 56)); |
2811 | | m128iS4 = _mm_loadu_si128((__m128i *) (src + 72)); |
2812 | | m128iS5 = _mm_load_si128((__m128i *) (src + 88)); |
2813 | | m128iS6 = _mm_load_si128((__m128i *) (src + 104)); |
2814 | | m128iS7 = _mm_load_si128((__m128i *) (src + 120)); |
2815 | | m128iS8 = _mm_load_si128((__m128i *) (src + 136)); |
2816 | | m128iS9 = _mm_load_si128((__m128i *) (src + 152)); |
2817 | | m128iS10 = _mm_load_si128((__m128i *) (src + 168)); |
2818 | | m128iS11 = _mm_load_si128((__m128i *) (src + 184)); |
2819 | | m128iS12 = _mm_loadu_si128((__m128i *) (src + 200)); |
2820 | | m128iS13 = _mm_load_si128((__m128i *) (src + 216)); |
2821 | | m128iS14 = _mm_load_si128((__m128i *) (src + 232)); |
2822 | | m128iS15 = _mm_load_si128((__m128i *) (src + 248)); |
2823 | | } else { |
2824 | | m128iS0 = _mm_load_si128((__m128i *) (src)); |
2825 | | m128iS1 = _mm_load_si128((__m128i *) (src + 32)); |
2826 | | m128iS2 = _mm_load_si128((__m128i *) (src + 64)); |
2827 | | m128iS3 = _mm_load_si128((__m128i *) (src + 96)); |
2828 | | m128iS4 = _mm_loadu_si128((__m128i *) (src + 128)); |
2829 | | m128iS5 = _mm_load_si128((__m128i *) (src + 160)); |
2830 | | m128iS6 = _mm_load_si128((__m128i *) (src + 192)); |
2831 | | m128iS7 = _mm_load_si128((__m128i *) (src + 224)); |
2832 | | m128iS8 = _mm_load_si128((__m128i *) (src + 8)); |
2833 | | m128iS9 = _mm_load_si128((__m128i *) (src + 32 + 8)); |
2834 | | m128iS10 = _mm_load_si128((__m128i *) (src + 64 + 8)); |
2835 | | m128iS11 = _mm_load_si128((__m128i *) (src + 96 + 8)); |
2836 | | m128iS12 = _mm_loadu_si128((__m128i *) (src + 128 + 8)); |
2837 | | m128iS13 = _mm_load_si128((__m128i *) (src + 160 + 8)); |
2838 | | m128iS14 = _mm_load_si128((__m128i *) (src + 192 + 8)); |
2839 | | m128iS15 = _mm_load_si128((__m128i *) (src + 224 + 8)); |
2840 | | shift = shift_2nd; |
2841 | | m128iAdd = _mm_set1_epi32(add_2nd); |
2842 | | } |
2843 | | |
2844 | | } else { |
2845 | | int k, m = 0; |
2846 | | _mm_storeu_si128((__m128i *) (src), m128iS0); |
2847 | | _mm_storeu_si128((__m128i *) (src + 8), m128iS1); |
2848 | | _mm_storeu_si128((__m128i *) (src + 32), m128iS2); |
2849 | | _mm_storeu_si128((__m128i *) (src + 40), m128iS3); |
2850 | | _mm_storeu_si128((__m128i *) (src + 64), m128iS4); |
2851 | | _mm_storeu_si128((__m128i *) (src + 72), m128iS5); |
2852 | | _mm_storeu_si128((__m128i *) (src + 96), m128iS6); |
2853 | | _mm_storeu_si128((__m128i *) (src + 104), m128iS7); |
2854 | | _mm_storeu_si128((__m128i *) (src + 128), m128iS8); |
2855 | | _mm_storeu_si128((__m128i *) (src + 136), m128iS9); |
2856 | | _mm_storeu_si128((__m128i *) (src + 160), m128iS10); |
2857 | | _mm_storeu_si128((__m128i *) (src + 168), m128iS11); |
2858 | | _mm_storeu_si128((__m128i *) (src + 192), m128iS12); |
2859 | | _mm_storeu_si128((__m128i *) (src + 200), m128iS13); |
2860 | | _mm_storeu_si128((__m128i *) (src + 224), m128iS14); |
2861 | | _mm_storeu_si128((__m128i *) (src + 232), m128iS15); |
2862 | | dst = (uint16_t*) _dst + (i * stride); |
2863 | | |
2864 | | for (k = 0; k < 8; k++) { |
2865 | | dst[0] = av_clip_uintp2(dst[0] + src[m],10); |
2866 | | dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10); |
2867 | | dst[2] = av_clip_uintp2(dst[2] + src[m + 32],10); |
2868 | | dst[3] = av_clip_uintp2(dst[3] + src[m + 40],10); |
2869 | | dst[4] = av_clip_uintp2(dst[4] + src[m + 64],10); |
2870 | | dst[5] = av_clip_uintp2(dst[5] + src[m + 72],10); |
2871 | | dst[6] = av_clip_uintp2(dst[6] + src[m + 96],10); |
2872 | | dst[7] = av_clip_uintp2(dst[7] + src[m + 104],10); |
2873 | | |
2874 | | dst[8] = av_clip_uintp2(dst[8] + src[m + 128],10); |
2875 | | dst[9] = av_clip_uintp2(dst[9] + src[m + 136],10); |
2876 | | dst[10] = av_clip_uintp2(dst[10] + src[m + 160],10); |
2877 | | dst[11] = av_clip_uintp2(dst[11] + src[m + 168],10); |
2878 | | dst[12] = av_clip_uintp2(dst[12] + src[m + 192],10); |
2879 | | dst[13] = av_clip_uintp2(dst[13] + src[m + 200],10); |
2880 | | dst[14] = av_clip_uintp2(dst[14] + src[m + 224],10); |
2881 | | dst[15] = av_clip_uintp2(dst[15] + src[m + 232],10); |
2882 | | m += 1; |
2883 | | dst += stride; |
2884 | | } |
2885 | | if (!i) { |
2886 | | m128iS0 = _mm_load_si128((__m128i *) (src + 16)); |
2887 | | m128iS1 = _mm_load_si128((__m128i *) (src + 48)); |
2888 | | m128iS2 = _mm_load_si128((__m128i *) (src + 80)); |
2889 | | m128iS3 = _mm_loadu_si128((__m128i *) (src + 112)); |
2890 | | m128iS4 = _mm_load_si128((__m128i *) (src + 144)); |
2891 | | m128iS5 = _mm_load_si128((__m128i *) (src + 176)); |
2892 | | m128iS6 = _mm_load_si128((__m128i *) (src + 208)); |
2893 | | m128iS7 = _mm_load_si128((__m128i *) (src + 240)); |
2894 | | m128iS8 = _mm_load_si128((__m128i *) (src + 24)); |
2895 | | m128iS9 = _mm_load_si128((__m128i *) (src + 56)); |
2896 | | m128iS10 = _mm_load_si128((__m128i *) (src + 88)); |
2897 | | m128iS11 = _mm_loadu_si128((__m128i *) (src + 120)); |
2898 | | m128iS12 = _mm_load_si128((__m128i *) (src + 152)); |
2899 | | m128iS13 = _mm_load_si128((__m128i *) (src + 184)); |
2900 | | m128iS14 = _mm_load_si128((__m128i *) (src + 216)); |
2901 | | m128iS15 = _mm_load_si128((__m128i *) (src + 248)); |
2902 | | } |
2903 | | } |
2904 | | } |
2905 | | } |
2906 | | |
2907 | | } |
2908 | | #endif |
2909 | | |
2910 | | |
2911 | | #if HAVE_SSE4_1 |
2912 | | void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, const int16_t *coeffs, |
2913 | 6.39k | ptrdiff_t _stride) { |
2914 | 6.39k | uint8_t shift_2nd = 12; // 20 - Bit depth |
2915 | 6.39k | uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1)) |
2916 | 6.39k | int i, j; |
2917 | 6.39k | uint8_t *dst = (uint8_t*) _dst; |
2918 | 6.39k | ptrdiff_t stride = _stride / sizeof(uint8_t); |
2919 | 6.39k | int shift; |
2920 | 6.39k | const int16_t *src = coeffs; |
2921 | | |
2922 | 6.39k | __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, |
2923 | 6.39k | m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, |
2924 | 6.39k | m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, |
2925 | 6.39k | m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, |
2926 | 6.39k | E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, |
2927 | 6.39k | O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, |
2928 | 6.39k | E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; |
2929 | 6.39k | __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l; |
2930 | 6.39k | __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h, |
2931 | 6.39k | EEE0l, EEE1l, EEE0h, EEE1h; |
2932 | 6.39k | __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21, |
2933 | 6.39k | m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27, |
2934 | 6.39k | m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9, |
2935 | 6.39k | m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15, |
2936 | 6.39k | O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l, |
2937 | 6.39k | O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l, |
2938 | 6.39k | EE4l, EE7h, EE6h, EE5h, EE4h; |
2939 | | |
2940 | 6.39k | __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31; |
2941 | 6.39k | __m128i r32,r33,r34,r35,r36,r37,r38,r39,r40,r41,r42,r43,r44,r45,r46,r47,r48,r49,r50,r51,r52,r53,r54,r55,r56,r57,r58,r59,r60,r61,r62,r63; |
2942 | 6.39k | __m128i r64,r65,r66,r67,r68,r69,r70,r71,r72,r73,r74,r75,r76,r77,r78,r79,r80,r81,r82,r83,r84,r85,r86,r87,r88,r89,r90,r91,r92,r93,r94,r95; |
2943 | 6.39k | __m128i r96,r97,r98,r99,r100,r101,r102,r103,r104,r105,r106,r107,r108,r109,r110,r111,r112,r113,r114,r115,r116,r117,r118,r119,r120,r121,r122,r123,r124,r125,r126,r127; |
2944 | | |
2945 | | |
2946 | 6.39k | m128iS0 = _mm_load_si128((__m128i *) (src)); |
2947 | 6.39k | m128iS1 = _mm_load_si128((__m128i *) (src + 32)); |
2948 | 6.39k | m128iS2 = _mm_load_si128((__m128i *) (src + 64)); |
2949 | 6.39k | m128iS3 = _mm_load_si128((__m128i *) (src + 96)); |
2950 | 6.39k | m128iS4 = _mm_loadu_si128((__m128i *) (src + 128)); |
2951 | 6.39k | m128iS5 = _mm_load_si128((__m128i *) (src + 160)); |
2952 | 6.39k | m128iS6 = _mm_load_si128((__m128i *) (src + 192)); |
2953 | 6.39k | m128iS7 = _mm_load_si128((__m128i *) (src + 224)); |
2954 | 6.39k | m128iS8 = _mm_load_si128((__m128i *) (src + 256)); |
2955 | 6.39k | m128iS9 = _mm_load_si128((__m128i *) (src + 288)); |
2956 | 6.39k | m128iS10 = _mm_load_si128((__m128i *) (src + 320)); |
2957 | 6.39k | m128iS11 = _mm_load_si128((__m128i *) (src + 352)); |
2958 | 6.39k | m128iS12 = _mm_load_si128((__m128i *) (src + 384)); |
2959 | 6.39k | m128iS13 = _mm_load_si128((__m128i *) (src + 416)); |
2960 | 6.39k | m128iS14 = _mm_load_si128((__m128i *) (src + 448)); |
2961 | 6.39k | m128iS15 = _mm_load_si128((__m128i *) (src + 480)); |
2962 | 6.39k | m128iS16 = _mm_load_si128((__m128i *) (src + 512)); |
2963 | 6.39k | m128iS17 = _mm_load_si128((__m128i *) (src + 544)); |
2964 | 6.39k | m128iS18 = _mm_load_si128((__m128i *) (src + 576)); |
2965 | 6.39k | m128iS19 = _mm_load_si128((__m128i *) (src + 608)); |
2966 | 6.39k | m128iS20 = _mm_load_si128((__m128i *) (src + 640)); |
2967 | 6.39k | m128iS21 = _mm_load_si128((__m128i *) (src + 672)); |
2968 | 6.39k | m128iS22 = _mm_load_si128((__m128i *) (src + 704)); |
2969 | 6.39k | m128iS23 = _mm_load_si128((__m128i *) (src + 736)); |
2970 | 6.39k | m128iS24 = _mm_load_si128((__m128i *) (src + 768)); |
2971 | 6.39k | m128iS25 = _mm_load_si128((__m128i *) (src + 800)); |
2972 | 6.39k | m128iS26 = _mm_load_si128((__m128i *) (src + 832)); |
2973 | 6.39k | m128iS27 = _mm_load_si128((__m128i *) (src + 864)); |
2974 | 6.39k | m128iS28 = _mm_load_si128((__m128i *) (src + 896)); |
2975 | 6.39k | m128iS29 = _mm_load_si128((__m128i *) (src + 928)); |
2976 | 6.39k | m128iS30 = _mm_load_si128((__m128i *) (src + 960)); |
2977 | 6.39k | m128iS31 = _mm_load_si128((__m128i *) (src + 992)); |
2978 | | |
2979 | 6.39k | shift = shift_1st; |
2980 | 6.39k | m128iAdd = _mm_set1_epi32(add_1st); |
2981 | | |
2982 | 19.1k | for (j = 0; j < 2; j++) { |
2983 | 63.9k | for (i = 0; i < 32; i += 8) { |
2984 | 51.1k | m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); |
2985 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
2986 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][0]))); |
2987 | 51.1k | m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); |
2988 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
2989 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][0]))); |
2990 | | |
2991 | 51.1k | m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); |
2992 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
2993 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][0]))); |
2994 | 51.1k | m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); |
2995 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
2996 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][0]))); |
2997 | | |
2998 | 51.1k | m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); |
2999 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3000 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][0]))); |
3001 | 51.1k | m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); |
3002 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3003 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][0]))); |
3004 | | |
3005 | 51.1k | m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); |
3006 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3007 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][0]))); |
3008 | 51.1k | m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); |
3009 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3010 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][0]))); |
3011 | | |
3012 | 51.1k | m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19); |
3013 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3014 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][0]))); |
3015 | 51.1k | m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19); |
3016 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3017 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][0]))); |
3018 | | |
3019 | 51.1k | m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23); |
3020 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3021 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][0]))); |
3022 | 51.1k | m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23); |
3023 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3024 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][0]))); |
3025 | | |
3026 | 51.1k | m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27); |
3027 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3028 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][0]))); |
3029 | 51.1k | m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27); |
3030 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3031 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][0]))); |
3032 | | |
3033 | 51.1k | m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31); |
3034 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3035 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][0]))); |
3036 | 51.1k | m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31); |
3037 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3038 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][0]))); |
3039 | | |
3040 | 51.1k | O0l = _mm_add_epi32(E0l, E1l); |
3041 | 51.1k | O0l = _mm_add_epi32(O0l, E2l); |
3042 | 51.1k | O0l = _mm_add_epi32(O0l, E3l); |
3043 | 51.1k | O0l = _mm_add_epi32(O0l, E4l); |
3044 | 51.1k | O0l = _mm_add_epi32(O0l, E5l); |
3045 | 51.1k | O0l = _mm_add_epi32(O0l, E6l); |
3046 | 51.1k | O0l = _mm_add_epi32(O0l, E7l); |
3047 | | |
3048 | 51.1k | O0h = _mm_add_epi32(E0h, E1h); |
3049 | 51.1k | O0h = _mm_add_epi32(O0h, E2h); |
3050 | 51.1k | O0h = _mm_add_epi32(O0h, E3h); |
3051 | 51.1k | O0h = _mm_add_epi32(O0h, E4h); |
3052 | 51.1k | O0h = _mm_add_epi32(O0h, E5h); |
3053 | 51.1k | O0h = _mm_add_epi32(O0h, E6h); |
3054 | 51.1k | O0h = _mm_add_epi32(O0h, E7h); |
3055 | | |
3056 | | /* Compute O1*/ |
3057 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3058 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][1]))); |
3059 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3060 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][1]))); |
3061 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3062 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][1]))); |
3063 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3064 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][1]))); |
3065 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3066 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][1]))); |
3067 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3068 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][1]))); |
3069 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3070 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][1]))); |
3071 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3072 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][1]))); |
3073 | | |
3074 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3075 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][1]))); |
3076 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3077 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][1]))); |
3078 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3079 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][1]))); |
3080 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3081 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][1]))); |
3082 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3083 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][1]))); |
3084 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3085 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][1]))); |
3086 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3087 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][1]))); |
3088 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3089 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][1]))); |
3090 | | |
3091 | 51.1k | O1l = _mm_add_epi32(E0l, E1l); |
3092 | 51.1k | O1l = _mm_add_epi32(O1l, E2l); |
3093 | 51.1k | O1l = _mm_add_epi32(O1l, E3l); |
3094 | 51.1k | O1l = _mm_add_epi32(O1l, E4l); |
3095 | 51.1k | O1l = _mm_add_epi32(O1l, E5l); |
3096 | 51.1k | O1l = _mm_add_epi32(O1l, E6l); |
3097 | 51.1k | O1l = _mm_add_epi32(O1l, E7l); |
3098 | | |
3099 | 51.1k | O1h = _mm_add_epi32(E0h, E1h); |
3100 | 51.1k | O1h = _mm_add_epi32(O1h, E2h); |
3101 | 51.1k | O1h = _mm_add_epi32(O1h, E3h); |
3102 | 51.1k | O1h = _mm_add_epi32(O1h, E4h); |
3103 | 51.1k | O1h = _mm_add_epi32(O1h, E5h); |
3104 | 51.1k | O1h = _mm_add_epi32(O1h, E6h); |
3105 | 51.1k | O1h = _mm_add_epi32(O1h, E7h); |
3106 | | /* Compute O2*/ |
3107 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3108 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][2]))); |
3109 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3110 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][2]))); |
3111 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3112 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][2]))); |
3113 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3114 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][2]))); |
3115 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3116 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][2]))); |
3117 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3118 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][2]))); |
3119 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3120 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][2]))); |
3121 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3122 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][2]))); |
3123 | | |
3124 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3125 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][2]))); |
3126 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3127 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][2]))); |
3128 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3129 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][2]))); |
3130 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3131 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][2]))); |
3132 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3133 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][2]))); |
3134 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3135 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][2]))); |
3136 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3137 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][2]))); |
3138 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3139 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][2]))); |
3140 | | |
3141 | 51.1k | O2l = _mm_add_epi32(E0l, E1l); |
3142 | 51.1k | O2l = _mm_add_epi32(O2l, E2l); |
3143 | 51.1k | O2l = _mm_add_epi32(O2l, E3l); |
3144 | 51.1k | O2l = _mm_add_epi32(O2l, E4l); |
3145 | 51.1k | O2l = _mm_add_epi32(O2l, E5l); |
3146 | 51.1k | O2l = _mm_add_epi32(O2l, E6l); |
3147 | 51.1k | O2l = _mm_add_epi32(O2l, E7l); |
3148 | | |
3149 | 51.1k | O2h = _mm_add_epi32(E0h, E1h); |
3150 | 51.1k | O2h = _mm_add_epi32(O2h, E2h); |
3151 | 51.1k | O2h = _mm_add_epi32(O2h, E3h); |
3152 | 51.1k | O2h = _mm_add_epi32(O2h, E4h); |
3153 | 51.1k | O2h = _mm_add_epi32(O2h, E5h); |
3154 | 51.1k | O2h = _mm_add_epi32(O2h, E6h); |
3155 | 51.1k | O2h = _mm_add_epi32(O2h, E7h); |
3156 | | /* Compute O3*/ |
3157 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3158 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][3]))); |
3159 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3160 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][3]))); |
3161 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3162 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][3]))); |
3163 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3164 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][3]))); |
3165 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3166 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][3]))); |
3167 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3168 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][3]))); |
3169 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3170 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][3]))); |
3171 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3172 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][3]))); |
3173 | | |
3174 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3175 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][3]))); |
3176 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3177 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][3]))); |
3178 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3179 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][3]))); |
3180 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3181 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][3]))); |
3182 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3183 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][3]))); |
3184 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3185 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][3]))); |
3186 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3187 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][3]))); |
3188 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3189 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][3]))); |
3190 | | |
3191 | 51.1k | O3l = _mm_add_epi32(E0l, E1l); |
3192 | 51.1k | O3l = _mm_add_epi32(O3l, E2l); |
3193 | 51.1k | O3l = _mm_add_epi32(O3l, E3l); |
3194 | 51.1k | O3l = _mm_add_epi32(O3l, E4l); |
3195 | 51.1k | O3l = _mm_add_epi32(O3l, E5l); |
3196 | 51.1k | O3l = _mm_add_epi32(O3l, E6l); |
3197 | 51.1k | O3l = _mm_add_epi32(O3l, E7l); |
3198 | | |
3199 | 51.1k | O3h = _mm_add_epi32(E0h, E1h); |
3200 | 51.1k | O3h = _mm_add_epi32(O3h, E2h); |
3201 | 51.1k | O3h = _mm_add_epi32(O3h, E3h); |
3202 | 51.1k | O3h = _mm_add_epi32(O3h, E4h); |
3203 | 51.1k | O3h = _mm_add_epi32(O3h, E5h); |
3204 | 51.1k | O3h = _mm_add_epi32(O3h, E6h); |
3205 | 51.1k | O3h = _mm_add_epi32(O3h, E7h); |
3206 | | /* Compute O4*/ |
3207 | | |
3208 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3209 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][4]))); |
3210 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3211 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][4]))); |
3212 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3213 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][4]))); |
3214 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3215 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][4]))); |
3216 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3217 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][4]))); |
3218 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3219 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][4]))); |
3220 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3221 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][4]))); |
3222 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3223 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][4]))); |
3224 | | |
3225 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3226 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][4]))); |
3227 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3228 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][4]))); |
3229 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3230 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][4]))); |
3231 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3232 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][4]))); |
3233 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3234 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][4]))); |
3235 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3236 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][4]))); |
3237 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3238 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][4]))); |
3239 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3240 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][4]))); |
3241 | | |
3242 | 51.1k | O4l = _mm_add_epi32(E0l, E1l); |
3243 | 51.1k | O4l = _mm_add_epi32(O4l, E2l); |
3244 | 51.1k | O4l = _mm_add_epi32(O4l, E3l); |
3245 | 51.1k | O4l = _mm_add_epi32(O4l, E4l); |
3246 | 51.1k | O4l = _mm_add_epi32(O4l, E5l); |
3247 | 51.1k | O4l = _mm_add_epi32(O4l, E6l); |
3248 | 51.1k | O4l = _mm_add_epi32(O4l, E7l); |
3249 | | |
3250 | 51.1k | O4h = _mm_add_epi32(E0h, E1h); |
3251 | 51.1k | O4h = _mm_add_epi32(O4h, E2h); |
3252 | 51.1k | O4h = _mm_add_epi32(O4h, E3h); |
3253 | 51.1k | O4h = _mm_add_epi32(O4h, E4h); |
3254 | 51.1k | O4h = _mm_add_epi32(O4h, E5h); |
3255 | 51.1k | O4h = _mm_add_epi32(O4h, E6h); |
3256 | 51.1k | O4h = _mm_add_epi32(O4h, E7h); |
3257 | | |
3258 | | /* Compute O5*/ |
3259 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3260 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][5]))); |
3261 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3262 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][5]))); |
3263 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3264 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][5]))); |
3265 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3266 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][5]))); |
3267 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3268 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][5]))); |
3269 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3270 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][5]))); |
3271 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3272 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][5]))); |
3273 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3274 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][5]))); |
3275 | | |
3276 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3277 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][5]))); |
3278 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3279 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][5]))); |
3280 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3281 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][5]))); |
3282 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3283 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][5]))); |
3284 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3285 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][5]))); |
3286 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3287 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][5]))); |
3288 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3289 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][5]))); |
3290 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3291 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][5]))); |
3292 | | |
3293 | 51.1k | O5l = _mm_add_epi32(E0l, E1l); |
3294 | 51.1k | O5l = _mm_add_epi32(O5l, E2l); |
3295 | 51.1k | O5l = _mm_add_epi32(O5l, E3l); |
3296 | 51.1k | O5l = _mm_add_epi32(O5l, E4l); |
3297 | 51.1k | O5l = _mm_add_epi32(O5l, E5l); |
3298 | 51.1k | O5l = _mm_add_epi32(O5l, E6l); |
3299 | 51.1k | O5l = _mm_add_epi32(O5l, E7l); |
3300 | | |
3301 | 51.1k | O5h = _mm_add_epi32(E0h, E1h); |
3302 | 51.1k | O5h = _mm_add_epi32(O5h, E2h); |
3303 | 51.1k | O5h = _mm_add_epi32(O5h, E3h); |
3304 | 51.1k | O5h = _mm_add_epi32(O5h, E4h); |
3305 | 51.1k | O5h = _mm_add_epi32(O5h, E5h); |
3306 | 51.1k | O5h = _mm_add_epi32(O5h, E6h); |
3307 | 51.1k | O5h = _mm_add_epi32(O5h, E7h); |
3308 | | |
3309 | | /* Compute O6*/ |
3310 | | |
3311 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3312 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][6]))); |
3313 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3314 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][6]))); |
3315 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3316 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][6]))); |
3317 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3318 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][6]))); |
3319 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3320 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][6]))); |
3321 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3322 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][6]))); |
3323 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3324 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][6]))); |
3325 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3326 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][6]))); |
3327 | | |
3328 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3329 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][6]))); |
3330 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3331 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][6]))); |
3332 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3333 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][6]))); |
3334 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3335 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][6]))); |
3336 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3337 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][6]))); |
3338 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3339 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][6]))); |
3340 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3341 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][6]))); |
3342 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3343 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][6]))); |
3344 | | |
3345 | 51.1k | O6l = _mm_add_epi32(E0l, E1l); |
3346 | 51.1k | O6l = _mm_add_epi32(O6l, E2l); |
3347 | 51.1k | O6l = _mm_add_epi32(O6l, E3l); |
3348 | 51.1k | O6l = _mm_add_epi32(O6l, E4l); |
3349 | 51.1k | O6l = _mm_add_epi32(O6l, E5l); |
3350 | 51.1k | O6l = _mm_add_epi32(O6l, E6l); |
3351 | 51.1k | O6l = _mm_add_epi32(O6l, E7l); |
3352 | | |
3353 | 51.1k | O6h = _mm_add_epi32(E0h, E1h); |
3354 | 51.1k | O6h = _mm_add_epi32(O6h, E2h); |
3355 | 51.1k | O6h = _mm_add_epi32(O6h, E3h); |
3356 | 51.1k | O6h = _mm_add_epi32(O6h, E4h); |
3357 | 51.1k | O6h = _mm_add_epi32(O6h, E5h); |
3358 | 51.1k | O6h = _mm_add_epi32(O6h, E6h); |
3359 | 51.1k | O6h = _mm_add_epi32(O6h, E7h); |
3360 | | |
3361 | | /* Compute O7*/ |
3362 | | |
3363 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3364 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][7]))); |
3365 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3366 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][7]))); |
3367 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3368 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][7]))); |
3369 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3370 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][7]))); |
3371 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3372 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][7]))); |
3373 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3374 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][7]))); |
3375 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3376 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][7]))); |
3377 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3378 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][7]))); |
3379 | | |
3380 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3381 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][7]))); |
3382 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3383 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][7]))); |
3384 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3385 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][7]))); |
3386 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3387 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][7]))); |
3388 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3389 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][7]))); |
3390 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3391 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][7]))); |
3392 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3393 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][7]))); |
3394 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3395 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][7]))); |
3396 | | |
3397 | 51.1k | O7l = _mm_add_epi32(E0l, E1l); |
3398 | 51.1k | O7l = _mm_add_epi32(O7l, E2l); |
3399 | 51.1k | O7l = _mm_add_epi32(O7l, E3l); |
3400 | 51.1k | O7l = _mm_add_epi32(O7l, E4l); |
3401 | 51.1k | O7l = _mm_add_epi32(O7l, E5l); |
3402 | 51.1k | O7l = _mm_add_epi32(O7l, E6l); |
3403 | 51.1k | O7l = _mm_add_epi32(O7l, E7l); |
3404 | | |
3405 | 51.1k | O7h = _mm_add_epi32(E0h, E1h); |
3406 | 51.1k | O7h = _mm_add_epi32(O7h, E2h); |
3407 | 51.1k | O7h = _mm_add_epi32(O7h, E3h); |
3408 | 51.1k | O7h = _mm_add_epi32(O7h, E4h); |
3409 | 51.1k | O7h = _mm_add_epi32(O7h, E5h); |
3410 | 51.1k | O7h = _mm_add_epi32(O7h, E6h); |
3411 | 51.1k | O7h = _mm_add_epi32(O7h, E7h); |
3412 | | |
3413 | | /* Compute O8*/ |
3414 | | |
3415 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3416 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][8]))); |
3417 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3418 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][8]))); |
3419 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3420 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][8]))); |
3421 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3422 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][8]))); |
3423 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3424 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][8]))); |
3425 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3426 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][8]))); |
3427 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3428 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][8]))); |
3429 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3430 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][8]))); |
3431 | | |
3432 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3433 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][8]))); |
3434 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3435 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][8]))); |
3436 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3437 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][8]))); |
3438 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3439 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][8]))); |
3440 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3441 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][8]))); |
3442 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3443 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][8]))); |
3444 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3445 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][8]))); |
3446 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3447 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][8]))); |
3448 | | |
3449 | 51.1k | O8l = _mm_add_epi32(E0l, E1l); |
3450 | 51.1k | O8l = _mm_add_epi32(O8l, E2l); |
3451 | 51.1k | O8l = _mm_add_epi32(O8l, E3l); |
3452 | 51.1k | O8l = _mm_add_epi32(O8l, E4l); |
3453 | 51.1k | O8l = _mm_add_epi32(O8l, E5l); |
3454 | 51.1k | O8l = _mm_add_epi32(O8l, E6l); |
3455 | 51.1k | O8l = _mm_add_epi32(O8l, E7l); |
3456 | | |
3457 | 51.1k | O8h = _mm_add_epi32(E0h, E1h); |
3458 | 51.1k | O8h = _mm_add_epi32(O8h, E2h); |
3459 | 51.1k | O8h = _mm_add_epi32(O8h, E3h); |
3460 | 51.1k | O8h = _mm_add_epi32(O8h, E4h); |
3461 | 51.1k | O8h = _mm_add_epi32(O8h, E5h); |
3462 | 51.1k | O8h = _mm_add_epi32(O8h, E6h); |
3463 | 51.1k | O8h = _mm_add_epi32(O8h, E7h); |
3464 | | |
3465 | | /* Compute O9*/ |
3466 | | |
3467 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3468 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][9]))); |
3469 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3470 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][9]))); |
3471 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3472 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][9]))); |
3473 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3474 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][9]))); |
3475 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3476 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][9]))); |
3477 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3478 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][9]))); |
3479 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3480 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][9]))); |
3481 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3482 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][9]))); |
3483 | | |
3484 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3485 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][9]))); |
3486 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3487 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][9]))); |
3488 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3489 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][9]))); |
3490 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3491 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][9]))); |
3492 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3493 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][9]))); |
3494 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3495 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][9]))); |
3496 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3497 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][9]))); |
3498 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3499 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][9]))); |
3500 | | |
3501 | 51.1k | O9l = _mm_add_epi32(E0l, E1l); |
3502 | 51.1k | O9l = _mm_add_epi32(O9l, E2l); |
3503 | 51.1k | O9l = _mm_add_epi32(O9l, E3l); |
3504 | 51.1k | O9l = _mm_add_epi32(O9l, E4l); |
3505 | 51.1k | O9l = _mm_add_epi32(O9l, E5l); |
3506 | 51.1k | O9l = _mm_add_epi32(O9l, E6l); |
3507 | 51.1k | O9l = _mm_add_epi32(O9l, E7l); |
3508 | | |
3509 | 51.1k | O9h = _mm_add_epi32(E0h, E1h); |
3510 | 51.1k | O9h = _mm_add_epi32(O9h, E2h); |
3511 | 51.1k | O9h = _mm_add_epi32(O9h, E3h); |
3512 | 51.1k | O9h = _mm_add_epi32(O9h, E4h); |
3513 | 51.1k | O9h = _mm_add_epi32(O9h, E5h); |
3514 | 51.1k | O9h = _mm_add_epi32(O9h, E6h); |
3515 | 51.1k | O9h = _mm_add_epi32(O9h, E7h); |
3516 | | |
3517 | |   /* Compute O10*/ |
3518 | | |
3519 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3520 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][10]))); |
3521 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3522 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][10]))); |
3523 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3524 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][10]))); |
3525 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3526 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][10]))); |
3527 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3528 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][10]))); |
3529 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3530 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][10]))); |
3531 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3532 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][10]))); |
3533 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3534 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][10]))); |
3535 | | |
3536 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3537 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][10]))); |
3538 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3539 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][10]))); |
3540 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3541 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][10]))); |
3542 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3543 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][10]))); |
3544 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3545 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][10]))); |
3546 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3547 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][10]))); |
3548 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3549 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][10]))); |
3550 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3551 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][10]))); |
3552 | | |
3553 | 51.1k | O10l = _mm_add_epi32(E0l, E1l); |
3554 | 51.1k | O10l = _mm_add_epi32(O10l, E2l); |
3555 | 51.1k | O10l = _mm_add_epi32(O10l, E3l); |
3556 | 51.1k | O10l = _mm_add_epi32(O10l, E4l); |
3557 | 51.1k | O10l = _mm_add_epi32(O10l, E5l); |
3558 | 51.1k | O10l = _mm_add_epi32(O10l, E6l); |
3559 | 51.1k | O10l = _mm_add_epi32(O10l, E7l); |
3560 | | |
3561 | 51.1k | O10h = _mm_add_epi32(E0h, E1h); |
3562 | 51.1k | O10h = _mm_add_epi32(O10h, E2h); |
3563 | 51.1k | O10h = _mm_add_epi32(O10h, E3h); |
3564 | 51.1k | O10h = _mm_add_epi32(O10h, E4h); |
3565 | 51.1k | O10h = _mm_add_epi32(O10h, E5h); |
3566 | 51.1k | O10h = _mm_add_epi32(O10h, E6h); |
3567 | 51.1k | O10h = _mm_add_epi32(O10h, E7h); |
3568 | | |
3569 | |   /* Compute O11*/ |
3570 | | |
3571 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3572 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][11]))); |
3573 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3574 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][11]))); |
3575 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3576 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][11]))); |
3577 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3578 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][11]))); |
3579 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3580 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][11]))); |
3581 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3582 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][11]))); |
3583 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3584 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][11]))); |
3585 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3586 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][11]))); |
3587 | | |
3588 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3589 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][11]))); |
3590 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3591 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][11]))); |
3592 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3593 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][11]))); |
3594 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3595 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][11]))); |
3596 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3597 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][11]))); |
3598 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3599 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][11]))); |
3600 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3601 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][11]))); |
3602 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3603 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][11]))); |
3604 | | |
3605 | 51.1k | O11l = _mm_add_epi32(E0l, E1l); |
3606 | 51.1k | O11l = _mm_add_epi32(O11l, E2l); |
3607 | 51.1k | O11l = _mm_add_epi32(O11l, E3l); |
3608 | 51.1k | O11l = _mm_add_epi32(O11l, E4l); |
3609 | 51.1k | O11l = _mm_add_epi32(O11l, E5l); |
3610 | 51.1k | O11l = _mm_add_epi32(O11l, E6l); |
3611 | 51.1k | O11l = _mm_add_epi32(O11l, E7l); |
3612 | | |
3613 | 51.1k | O11h = _mm_add_epi32(E0h, E1h); |
3614 | 51.1k | O11h = _mm_add_epi32(O11h, E2h); |
3615 | 51.1k | O11h = _mm_add_epi32(O11h, E3h); |
3616 | 51.1k | O11h = _mm_add_epi32(O11h, E4h); |
3617 | 51.1k | O11h = _mm_add_epi32(O11h, E5h); |
3618 | 51.1k | O11h = _mm_add_epi32(O11h, E6h); |
3619 | 51.1k | O11h = _mm_add_epi32(O11h, E7h); |
3620 | | |
3621 | |   /* Compute O12*/ |
3622 | | |
3623 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3624 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][12]))); |
3625 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3626 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][12]))); |
3627 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3628 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][12]))); |
3629 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3630 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][12]))); |
3631 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3632 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][12]))); |
3633 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3634 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][12]))); |
3635 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3636 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][12]))); |
3637 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3638 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][12]))); |
3639 | | |
3640 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3641 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][12]))); |
3642 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3643 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][12]))); |
3644 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3645 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][12]))); |
3646 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3647 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][12]))); |
3648 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3649 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][12]))); |
3650 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3651 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][12]))); |
3652 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3653 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][12]))); |
3654 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3655 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][12]))); |
3656 | | |
3657 | 51.1k | O12l = _mm_add_epi32(E0l, E1l); |
3658 | 51.1k | O12l = _mm_add_epi32(O12l, E2l); |
3659 | 51.1k | O12l = _mm_add_epi32(O12l, E3l); |
3660 | 51.1k | O12l = _mm_add_epi32(O12l, E4l); |
3661 | 51.1k | O12l = _mm_add_epi32(O12l, E5l); |
3662 | 51.1k | O12l = _mm_add_epi32(O12l, E6l); |
3663 | 51.1k | O12l = _mm_add_epi32(O12l, E7l); |
3664 | | |
3665 | 51.1k | O12h = _mm_add_epi32(E0h, E1h); |
3666 | 51.1k | O12h = _mm_add_epi32(O12h, E2h); |
3667 | 51.1k | O12h = _mm_add_epi32(O12h, E3h); |
3668 | 51.1k | O12h = _mm_add_epi32(O12h, E4h); |
3669 | 51.1k | O12h = _mm_add_epi32(O12h, E5h); |
3670 | 51.1k | O12h = _mm_add_epi32(O12h, E6h); |
3671 | 51.1k | O12h = _mm_add_epi32(O12h, E7h); |
3672 | | |
3673 | |   /* Compute O13*/ |
3674 | | |
3675 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3676 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][13]))); |
3677 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3678 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][13]))); |
3679 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3680 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][13]))); |
3681 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3682 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][13]))); |
3683 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3684 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][13]))); |
3685 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3686 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][13]))); |
3687 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3688 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][13]))); |
3689 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3690 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][13]))); |
3691 | | |
3692 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3693 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][13]))); |
3694 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3695 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][13]))); |
3696 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3697 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][13]))); |
3698 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3699 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][13]))); |
3700 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3701 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][13]))); |
3702 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3703 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][13]))); |
3704 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3705 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][13]))); |
3706 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3707 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][13]))); |
3708 | | |
3709 | 51.1k | O13l = _mm_add_epi32(E0l, E1l); |
3710 | 51.1k | O13l = _mm_add_epi32(O13l, E2l); |
3711 | 51.1k | O13l = _mm_add_epi32(O13l, E3l); |
3712 | 51.1k | O13l = _mm_add_epi32(O13l, E4l); |
3713 | 51.1k | O13l = _mm_add_epi32(O13l, E5l); |
3714 | 51.1k | O13l = _mm_add_epi32(O13l, E6l); |
3715 | 51.1k | O13l = _mm_add_epi32(O13l, E7l); |
3716 | | |
3717 | 51.1k | O13h = _mm_add_epi32(E0h, E1h); |
3718 | 51.1k | O13h = _mm_add_epi32(O13h, E2h); |
3719 | 51.1k | O13h = _mm_add_epi32(O13h, E3h); |
3720 | 51.1k | O13h = _mm_add_epi32(O13h, E4h); |
3721 | 51.1k | O13h = _mm_add_epi32(O13h, E5h); |
3722 | 51.1k | O13h = _mm_add_epi32(O13h, E6h); |
3723 | 51.1k | O13h = _mm_add_epi32(O13h, E7h); |
3724 | | |
3725 | | /* Compute O14 */ |
3726 | | |
3727 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3728 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][14]))); |
3729 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3730 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][14]))); |
3731 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3732 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][14]))); |
3733 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3734 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][14]))); |
3735 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3736 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][14]))); |
3737 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3738 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][14]))); |
3739 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3740 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][14]))); |
3741 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3742 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][14]))); |
3743 | | |
3744 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3745 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][14]))); |
3746 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3747 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][14]))); |
3748 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3749 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][14]))); |
3750 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3751 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][14]))); |
3752 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3753 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][14]))); |
3754 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3755 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][14]))); |
3756 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3757 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][14]))); |
3758 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3759 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][14]))); |
3760 | | |
3761 | 51.1k | O14l = _mm_add_epi32(E0l, E1l); |
3762 | 51.1k | O14l = _mm_add_epi32(O14l, E2l); |
3763 | 51.1k | O14l = _mm_add_epi32(O14l, E3l); |
3764 | 51.1k | O14l = _mm_add_epi32(O14l, E4l); |
3765 | 51.1k | O14l = _mm_add_epi32(O14l, E5l); |
3766 | 51.1k | O14l = _mm_add_epi32(O14l, E6l); |
3767 | 51.1k | O14l = _mm_add_epi32(O14l, E7l); |
3768 | | |
3769 | 51.1k | O14h = _mm_add_epi32(E0h, E1h); |
3770 | 51.1k | O14h = _mm_add_epi32(O14h, E2h); |
3771 | 51.1k | O14h = _mm_add_epi32(O14h, E3h); |
3772 | 51.1k | O14h = _mm_add_epi32(O14h, E4h); |
3773 | 51.1k | O14h = _mm_add_epi32(O14h, E5h); |
3774 | 51.1k | O14h = _mm_add_epi32(O14h, E6h); |
3775 | 51.1k | O14h = _mm_add_epi32(O14h, E7h); |
3776 | | |
3777 | | /* Compute O15*/ |
3778 | | |
3779 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3780 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][15]))); |
3781 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3782 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[0][15]))); |
3783 | 51.1k | E1l = _mm_madd_epi16(m128Tmp2, |
3784 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][15]))); |
3785 | 51.1k | E1h = _mm_madd_epi16(m128Tmp3, |
3786 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[1][15]))); |
3787 | 51.1k | E2l = _mm_madd_epi16(m128Tmp4, |
3788 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][15]))); |
3789 | 51.1k | E2h = _mm_madd_epi16(m128Tmp5, |
3790 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[2][15]))); |
3791 | 51.1k | E3l = _mm_madd_epi16(m128Tmp6, |
3792 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][15]))); |
3793 | 51.1k | E3h = _mm_madd_epi16(m128Tmp7, |
3794 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[3][15]))); |
3795 | | |
3796 | 51.1k | E4l = _mm_madd_epi16(m128Tmp8, |
3797 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][15]))); |
3798 | 51.1k | E4h = _mm_madd_epi16(m128Tmp9, |
3799 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[4][15]))); |
3800 | 51.1k | E5l = _mm_madd_epi16(m128Tmp10, |
3801 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][15]))); |
3802 | 51.1k | E5h = _mm_madd_epi16(m128Tmp11, |
3803 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[5][15]))); |
3804 | 51.1k | E6l = _mm_madd_epi16(m128Tmp12, |
3805 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][15]))); |
3806 | 51.1k | E6h = _mm_madd_epi16(m128Tmp13, |
3807 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[6][15]))); |
3808 | 51.1k | E7l = _mm_madd_epi16(m128Tmp14, |
3809 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][15]))); |
3810 | 51.1k | E7h = _mm_madd_epi16(m128Tmp15, |
3811 | 51.1k | _mm_load_si128((__m128i *) (transform32x32[7][15]))); |
3812 | | |
3813 | 51.1k | O15l = _mm_add_epi32(E0l, E1l); |
3814 | 51.1k | O15l = _mm_add_epi32(O15l, E2l); |
3815 | 51.1k | O15l = _mm_add_epi32(O15l, E3l); |
3816 | 51.1k | O15l = _mm_add_epi32(O15l, E4l); |
3817 | 51.1k | O15l = _mm_add_epi32(O15l, E5l); |
3818 | 51.1k | O15l = _mm_add_epi32(O15l, E6l); |
3819 | 51.1k | O15l = _mm_add_epi32(O15l, E7l); |
3820 | | |
3821 | 51.1k | O15h = _mm_add_epi32(E0h, E1h); |
3822 | 51.1k | O15h = _mm_add_epi32(O15h, E2h); |
3823 | 51.1k | O15h = _mm_add_epi32(O15h, E3h); |
3824 | 51.1k | O15h = _mm_add_epi32(O15h, E4h); |
3825 | 51.1k | O15h = _mm_add_epi32(O15h, E5h); |
3826 | 51.1k | O15h = _mm_add_epi32(O15h, E6h); |
3827 | 51.1k | O15h = _mm_add_epi32(O15h, E7h); |
3828 | | /* Compute E0 */ |
3829 | | |
3830 | 51.1k | m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); |
3831 | 51.1k | E0l = _mm_madd_epi16(m128Tmp0, |
3832 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); |
3833 | 51.1k | m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); |
3834 | 51.1k | E0h = _mm_madd_epi16(m128Tmp1, |
3835 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); |
3836 | | |
3837 | 51.1k | m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); |
3838 | 51.1k | E0l = _mm_add_epi32(E0l, |
3839 | 51.1k | _mm_madd_epi16(m128Tmp2, |
3840 | 51.1k | _mm_load_si128( |
3841 | 51.1k | (__m128i *) (transform16x16_1[1][0])))); |
3842 | 51.1k | m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); |
3843 | 51.1k | E0h = _mm_add_epi32(E0h, |
3844 | 51.1k | _mm_madd_epi16(m128Tmp3, |
3845 | 51.1k | _mm_load_si128( |
3846 | 51.1k | (__m128i *) (transform16x16_1[1][0])))); |
3847 | | |
3848 | 51.1k | m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22); |
3849 | 51.1k | E0l = _mm_add_epi32(E0l, |
3850 | 51.1k | _mm_madd_epi16(m128Tmp4, |
3851 | 51.1k | _mm_load_si128( |
3852 | 51.1k | (__m128i *) (transform16x16_1[2][0])))); |
3853 | 51.1k | m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22); |
3854 | 51.1k | E0h = _mm_add_epi32(E0h, |
3855 | 51.1k | _mm_madd_epi16(m128Tmp5, |
3856 | 51.1k | _mm_load_si128( |
3857 | 51.1k | (__m128i *) (transform16x16_1[2][0])))); |
3858 | | |
3859 | 51.1k | m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30); |
3860 | 51.1k | E0l = _mm_add_epi32(E0l, |
3861 | 51.1k | _mm_madd_epi16(m128Tmp6, |
3862 | 51.1k | _mm_load_si128( |
3863 | 51.1k | (__m128i *) (transform16x16_1[3][0])))); |
3864 | 51.1k | m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30); |
3865 | 51.1k | E0h = _mm_add_epi32(E0h, |
3866 | 51.1k | _mm_madd_epi16(m128Tmp7, |
3867 | 51.1k | _mm_load_si128( |
3868 | 51.1k | (__m128i *) (transform16x16_1[3][0])))); |
3869 | | |
3870 | | /* Compute E1 */ |
3871 | 51.1k | E1l = _mm_madd_epi16(m128Tmp0, |
3872 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); |
3873 | 51.1k | E1h = _mm_madd_epi16(m128Tmp1, |
3874 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); |
3875 | 51.1k | E1l = _mm_add_epi32(E1l, |
3876 | 51.1k | _mm_madd_epi16(m128Tmp2, |
3877 | 51.1k | _mm_load_si128( |
3878 | 51.1k | (__m128i *) (transform16x16_1[1][1])))); |
3879 | 51.1k | E1h = _mm_add_epi32(E1h, |
3880 | 51.1k | _mm_madd_epi16(m128Tmp3, |
3881 | 51.1k | _mm_load_si128( |
3882 | 51.1k | (__m128i *) (transform16x16_1[1][1])))); |
3883 | 51.1k | E1l = _mm_add_epi32(E1l, |
3884 | 51.1k | _mm_madd_epi16(m128Tmp4, |
3885 | 51.1k | _mm_load_si128( |
3886 | 51.1k | (__m128i *) (transform16x16_1[2][1])))); |
3887 | 51.1k | E1h = _mm_add_epi32(E1h, |
3888 | 51.1k | _mm_madd_epi16(m128Tmp5, |
3889 | 51.1k | _mm_load_si128( |
3890 | 51.1k | (__m128i *) (transform16x16_1[2][1])))); |
3891 | 51.1k | E1l = _mm_add_epi32(E1l, |
3892 | 51.1k | _mm_madd_epi16(m128Tmp6, |
3893 | 51.1k | _mm_load_si128( |
3894 | 51.1k | (__m128i *) (transform16x16_1[3][1])))); |
3895 | 51.1k | E1h = _mm_add_epi32(E1h, |
3896 | 51.1k | _mm_madd_epi16(m128Tmp7, |
3897 | 51.1k | _mm_load_si128( |
3898 | 51.1k | (__m128i *) (transform16x16_1[3][1])))); |
3899 | | |
3900 | | /* Compute E2 */ |
3901 | 51.1k | E2l = _mm_madd_epi16(m128Tmp0, |
3902 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); |
3903 | 51.1k | E2h = _mm_madd_epi16(m128Tmp1, |
3904 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); |
3905 | 51.1k | E2l = _mm_add_epi32(E2l, |
3906 | 51.1k | _mm_madd_epi16(m128Tmp2, |
3907 | 51.1k | _mm_load_si128( |
3908 | 51.1k | (__m128i *) (transform16x16_1[1][2])))); |
3909 | 51.1k | E2h = _mm_add_epi32(E2h, |
3910 | 51.1k | _mm_madd_epi16(m128Tmp3, |
3911 | 51.1k | _mm_load_si128( |
3912 | 51.1k | (__m128i *) (transform16x16_1[1][2])))); |
3913 | 51.1k | E2l = _mm_add_epi32(E2l, |
3914 | 51.1k | _mm_madd_epi16(m128Tmp4, |
3915 | 51.1k | _mm_load_si128( |
3916 | 51.1k | (__m128i *) (transform16x16_1[2][2])))); |
3917 | 51.1k | E2h = _mm_add_epi32(E2h, |
3918 | 51.1k | _mm_madd_epi16(m128Tmp5, |
3919 | 51.1k | _mm_load_si128( |
3920 | 51.1k | (__m128i *) (transform16x16_1[2][2])))); |
3921 | 51.1k | E2l = _mm_add_epi32(E2l, |
3922 | 51.1k | _mm_madd_epi16(m128Tmp6, |
3923 | 51.1k | _mm_load_si128( |
3924 | 51.1k | (__m128i *) (transform16x16_1[3][2])))); |
3925 | 51.1k | E2h = _mm_add_epi32(E2h, |
3926 | 51.1k | _mm_madd_epi16(m128Tmp7, |
3927 | 51.1k | _mm_load_si128( |
3928 | 51.1k | (__m128i *) (transform16x16_1[3][2])))); |
3929 | | |
3930 | | /* Compute E3 */ |
3931 | 51.1k | E3l = _mm_madd_epi16(m128Tmp0, |
3932 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); |
3933 | 51.1k | E3h = _mm_madd_epi16(m128Tmp1, |
3934 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); |
3935 | 51.1k | E3l = _mm_add_epi32(E3l, |
3936 | 51.1k | _mm_madd_epi16(m128Tmp2, |
3937 | 51.1k | _mm_load_si128( |
3938 | 51.1k | (__m128i *) (transform16x16_1[1][3])))); |
3939 | 51.1k | E3h = _mm_add_epi32(E3h, |
3940 | 51.1k | _mm_madd_epi16(m128Tmp3, |
3941 | 51.1k | _mm_load_si128( |
3942 | 51.1k | (__m128i *) (transform16x16_1[1][3])))); |
3943 | 51.1k | E3l = _mm_add_epi32(E3l, |
3944 | 51.1k | _mm_madd_epi16(m128Tmp4, |
3945 | 51.1k | _mm_load_si128( |
3946 | 51.1k | (__m128i *) (transform16x16_1[2][3])))); |
3947 | 51.1k | E3h = _mm_add_epi32(E3h, |
3948 | 51.1k | _mm_madd_epi16(m128Tmp5, |
3949 | 51.1k | _mm_load_si128( |
3950 | 51.1k | (__m128i *) (transform16x16_1[2][3])))); |
3951 | 51.1k | E3l = _mm_add_epi32(E3l, |
3952 | 51.1k | _mm_madd_epi16(m128Tmp6, |
3953 | 51.1k | _mm_load_si128( |
3954 | 51.1k | (__m128i *) (transform16x16_1[3][3])))); |
3955 | 51.1k | E3h = _mm_add_epi32(E3h, |
3956 | 51.1k | _mm_madd_epi16(m128Tmp7, |
3957 | 51.1k | _mm_load_si128( |
3958 | 51.1k | (__m128i *) (transform16x16_1[3][3])))); |
3959 | | |
3960 | | /* Compute E4 */ |
3961 | 51.1k | E4l = _mm_madd_epi16(m128Tmp0, |
3962 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); |
3963 | 51.1k | E4h = _mm_madd_epi16(m128Tmp1, |
3964 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); |
3965 | 51.1k | E4l = _mm_add_epi32(E4l, |
3966 | 51.1k | _mm_madd_epi16(m128Tmp2, |
3967 | 51.1k | _mm_load_si128( |
3968 | 51.1k | (__m128i *) (transform16x16_1[1][4])))); |
3969 | 51.1k | E4h = _mm_add_epi32(E4h, |
3970 | 51.1k | _mm_madd_epi16(m128Tmp3, |
3971 | 51.1k | _mm_load_si128( |
3972 | 51.1k | (__m128i *) (transform16x16_1[1][4])))); |
3973 | 51.1k | E4l = _mm_add_epi32(E4l, |
3974 | 51.1k | _mm_madd_epi16(m128Tmp4, |
3975 | 51.1k | _mm_load_si128( |
3976 | 51.1k | (__m128i *) (transform16x16_1[2][4])))); |
3977 | 51.1k | E4h = _mm_add_epi32(E4h, |
3978 | 51.1k | _mm_madd_epi16(m128Tmp5, |
3979 | 51.1k | _mm_load_si128( |
3980 | 51.1k | (__m128i *) (transform16x16_1[2][4])))); |
3981 | 51.1k | E4l = _mm_add_epi32(E4l, |
3982 | 51.1k | _mm_madd_epi16(m128Tmp6, |
3983 | 51.1k | _mm_load_si128( |
3984 | 51.1k | (__m128i *) (transform16x16_1[3][4])))); |
3985 | 51.1k | E4h = _mm_add_epi32(E4h, |
3986 | 51.1k | _mm_madd_epi16(m128Tmp7, |
3987 | 51.1k | _mm_load_si128( |
3988 | 51.1k | (__m128i *) (transform16x16_1[3][4])))); |
3989 | | |
3990 | |   /* Compute E5 */ |
3991 | 51.1k | E5l = _mm_madd_epi16(m128Tmp0, |
3992 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); |
3993 | 51.1k | E5h = _mm_madd_epi16(m128Tmp1, |
3994 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); |
3995 | 51.1k | E5l = _mm_add_epi32(E5l, |
3996 | 51.1k | _mm_madd_epi16(m128Tmp2, |
3997 | 51.1k | _mm_load_si128( |
3998 | 51.1k | (__m128i *) (transform16x16_1[1][5])))); |
3999 | 51.1k | E5h = _mm_add_epi32(E5h, |
4000 | 51.1k | _mm_madd_epi16(m128Tmp3, |
4001 | 51.1k | _mm_load_si128( |
4002 | 51.1k | (__m128i *) (transform16x16_1[1][5])))); |
4003 | 51.1k | E5l = _mm_add_epi32(E5l, |
4004 | 51.1k | _mm_madd_epi16(m128Tmp4, |
4005 | 51.1k | _mm_load_si128( |
4006 | 51.1k | (__m128i *) (transform16x16_1[2][5])))); |
4007 | 51.1k | E5h = _mm_add_epi32(E5h, |
4008 | 51.1k | _mm_madd_epi16(m128Tmp5, |
4009 | 51.1k | _mm_load_si128( |
4010 | 51.1k | (__m128i *) (transform16x16_1[2][5])))); |
4011 | 51.1k | E5l = _mm_add_epi32(E5l, |
4012 | 51.1k | _mm_madd_epi16(m128Tmp6, |
4013 | 51.1k | _mm_load_si128( |
4014 | 51.1k | (__m128i *) (transform16x16_1[3][5])))); |
4015 | 51.1k | E5h = _mm_add_epi32(E5h, |
4016 | 51.1k | _mm_madd_epi16(m128Tmp7, |
4017 | 51.1k | _mm_load_si128( |
4018 | 51.1k | (__m128i *) (transform16x16_1[3][5])))); |
4019 | | |
4020 | | /* Compute E6 */ |
4021 | 51.1k | E6l = _mm_madd_epi16(m128Tmp0, |
4022 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); |
4023 | 51.1k | E6h = _mm_madd_epi16(m128Tmp1, |
4024 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); |
4025 | 51.1k | E6l = _mm_add_epi32(E6l, |
4026 | 51.1k | _mm_madd_epi16(m128Tmp2, |
4027 | 51.1k | _mm_load_si128( |
4028 | 51.1k | (__m128i *) (transform16x16_1[1][6])))); |
4029 | 51.1k | E6h = _mm_add_epi32(E6h, |
4030 | 51.1k | _mm_madd_epi16(m128Tmp3, |
4031 | 51.1k | _mm_load_si128( |
4032 | 51.1k | (__m128i *) (transform16x16_1[1][6])))); |
4033 | 51.1k | E6l = _mm_add_epi32(E6l, |
4034 | 51.1k | _mm_madd_epi16(m128Tmp4, |
4035 | 51.1k | _mm_load_si128( |
4036 | 51.1k | (__m128i *) (transform16x16_1[2][6])))); |
4037 | 51.1k | E6h = _mm_add_epi32(E6h, |
4038 | 51.1k | _mm_madd_epi16(m128Tmp5, |
4039 | 51.1k | _mm_load_si128( |
4040 | 51.1k | (__m128i *) (transform16x16_1[2][6])))); |
4041 | 51.1k | E6l = _mm_add_epi32(E6l, |
4042 | 51.1k | _mm_madd_epi16(m128Tmp6, |
4043 | 51.1k | _mm_load_si128( |
4044 | 51.1k | (__m128i *) (transform16x16_1[3][6])))); |
4045 | 51.1k | E6h = _mm_add_epi32(E6h, |
4046 | 51.1k | _mm_madd_epi16(m128Tmp7, |
4047 | 51.1k | _mm_load_si128( |
4048 | 51.1k | (__m128i *) (transform16x16_1[3][6])))); |
4049 | | |
4050 | | /* Compute E7 */ |
4051 | 51.1k | E7l = _mm_madd_epi16(m128Tmp0, |
4052 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); |
4053 | 51.1k | E7h = _mm_madd_epi16(m128Tmp1, |
4054 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); |
4055 | 51.1k | E7l = _mm_add_epi32(E7l, |
4056 | 51.1k | _mm_madd_epi16(m128Tmp2, |
4057 | 51.1k | _mm_load_si128( |
4058 | 51.1k | (__m128i *) (transform16x16_1[1][7])))); |
4059 | 51.1k | E7h = _mm_add_epi32(E7h, |
4060 | 51.1k | _mm_madd_epi16(m128Tmp3, |
4061 | 51.1k | _mm_load_si128( |
4062 | 51.1k | (__m128i *) (transform16x16_1[1][7])))); |
4063 | 51.1k | E7l = _mm_add_epi32(E7l, |
4064 | 51.1k | _mm_madd_epi16(m128Tmp4, |
4065 | 51.1k | _mm_load_si128( |
4066 | 51.1k | (__m128i *) (transform16x16_1[2][7])))); |
4067 | 51.1k | E7h = _mm_add_epi32(E7h, |
4068 | 51.1k | _mm_madd_epi16(m128Tmp5, |
4069 | 51.1k | _mm_load_si128( |
4070 | 51.1k | (__m128i *) (transform16x16_1[2][7])))); |
4071 | 51.1k | E7l = _mm_add_epi32(E7l, |
4072 | 51.1k | _mm_madd_epi16(m128Tmp6, |
4073 | 51.1k | _mm_load_si128( |
4074 | 51.1k | (__m128i *) (transform16x16_1[3][7])))); |
4075 | 51.1k | E7h = _mm_add_epi32(E7h, |
4076 | 51.1k | _mm_madd_epi16(m128Tmp7, |
4077 | 51.1k | _mm_load_si128( |
4078 | 51.1k | (__m128i *) (transform16x16_1[3][7])))); |
4079 | | |
4080 |      |     /* Compute E00, E01, E02 and E03 */ |
4081 | | |
4082 | 51.1k | m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); |
4083 | 51.1k | E00l = _mm_madd_epi16(m128Tmp0, |
4084 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); |
4085 | 51.1k | m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); |
4086 | 51.1k | E00h = _mm_madd_epi16(m128Tmp1, |
4087 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); |
4088 | | |
4089 | 51.1k | m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28); |
4090 | 51.1k | E00l = _mm_add_epi32(E00l, |
4091 | 51.1k | _mm_madd_epi16(m128Tmp2, |
4092 | 51.1k | _mm_load_si128( |
4093 | 51.1k | (__m128i *) (transform16x16_2[1][0])))); |
4094 | 51.1k | m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28); |
4095 | 51.1k | E00h = _mm_add_epi32(E00h, |
4096 | 51.1k | _mm_madd_epi16(m128Tmp3, |
4097 | 51.1k | _mm_load_si128( |
4098 | 51.1k | (__m128i *) (transform16x16_2[1][0])))); |
4099 | | |
4100 | 51.1k | E01l = _mm_madd_epi16(m128Tmp0, |
4101 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); |
4102 | 51.1k | E01h = _mm_madd_epi16(m128Tmp1, |
4103 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); |
4104 | 51.1k | E01l = _mm_add_epi32(E01l, |
4105 | 51.1k | _mm_madd_epi16(m128Tmp2, |
4106 | 51.1k | _mm_load_si128( |
4107 | 51.1k | (__m128i *) (transform16x16_2[1][1])))); |
4108 | 51.1k | E01h = _mm_add_epi32(E01h, |
4109 | 51.1k | _mm_madd_epi16(m128Tmp3, |
4110 | 51.1k | _mm_load_si128( |
4111 | 51.1k | (__m128i *) (transform16x16_2[1][1])))); |
4112 | | |
4113 | 51.1k | E02l = _mm_madd_epi16(m128Tmp0, |
4114 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); |
4115 | 51.1k | E02h = _mm_madd_epi16(m128Tmp1, |
4116 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); |
4117 | 51.1k | E02l = _mm_add_epi32(E02l, |
4118 | 51.1k | _mm_madd_epi16(m128Tmp2, |
4119 | 51.1k | _mm_load_si128( |
4120 | 51.1k | (__m128i *) (transform16x16_2[1][2])))); |
4121 | 51.1k | E02h = _mm_add_epi32(E02h, |
4122 | 51.1k | _mm_madd_epi16(m128Tmp3, |
4123 | 51.1k | _mm_load_si128( |
4124 | 51.1k | (__m128i *) (transform16x16_2[1][2])))); |
4125 | | |
4126 | 51.1k | E03l = _mm_madd_epi16(m128Tmp0, |
4127 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); |
4128 | 51.1k | E03h = _mm_madd_epi16(m128Tmp1, |
4129 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); |
4130 | 51.1k | E03l = _mm_add_epi32(E03l, |
4131 | 51.1k | _mm_madd_epi16(m128Tmp2, |
4132 | 51.1k | _mm_load_si128( |
4133 | 51.1k | (__m128i *) (transform16x16_2[1][3])))); |
4134 | 51.1k | E03h = _mm_add_epi32(E03h, |
4135 | 51.1k | _mm_madd_epi16(m128Tmp3, |
4136 | 51.1k | _mm_load_si128( |
4137 | 51.1k | (__m128i *) (transform16x16_2[1][3])))); |
4138 | | |
4139 | | /* Compute EE0 and EEE */ |
4140 | | |
4141 | 51.1k | m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24); |
4142 | 51.1k | EE0l = _mm_madd_epi16(m128Tmp0, |
4143 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); |
4144 | 51.1k | m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24); |
4145 | 51.1k | EE0h = _mm_madd_epi16(m128Tmp1, |
4146 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); |
4147 | | |
4148 | 51.1k | m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16); |
4149 | 51.1k | EEE0l = _mm_madd_epi16(m128Tmp2, |
4150 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); |
4151 | 51.1k | m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16); |
4152 | 51.1k | EEE0h = _mm_madd_epi16(m128Tmp3, |
4153 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); |
4154 | | |
4155 | 51.1k | EE1l = _mm_madd_epi16(m128Tmp0, |
4156 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); |
4157 | 51.1k | EE1h = _mm_madd_epi16(m128Tmp1, |
4158 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); |
4159 | | |
4160 | 51.1k | EEE1l = _mm_madd_epi16(m128Tmp2, |
4161 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); |
4162 | 51.1k | EEE1h = _mm_madd_epi16(m128Tmp3, |
4163 | 51.1k | _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); |
4164 | | |
4165 | | /* Compute EE */ |
4166 | | |
4167 | 51.1k | EE2l = _mm_sub_epi32(EEE1l, EE1l); |
4168 | 51.1k | EE3l = _mm_sub_epi32(EEE0l, EE0l); |
4169 | 51.1k | EE2h = _mm_sub_epi32(EEE1h, EE1h); |
4170 | 51.1k | EE3h = _mm_sub_epi32(EEE0h, EE0h); |
4171 | | |
4172 | 51.1k | EE0l = _mm_add_epi32(EEE0l, EE0l); |
4173 | 51.1k | EE1l = _mm_add_epi32(EEE1l, EE1l); |
4174 | 51.1k | EE0h = _mm_add_epi32(EEE0h, EE0h); |
4175 | 51.1k | EE1h = _mm_add_epi32(EEE1h, EE1h); |
4176 | | /**/ |
4177 | | |
4178 | 51.1k | EE7l = _mm_sub_epi32(EE0l, E00l); |
4179 | 51.1k | EE6l = _mm_sub_epi32(EE1l, E01l); |
4180 | 51.1k | EE5l = _mm_sub_epi32(EE2l, E02l); |
4181 | 51.1k | EE4l = _mm_sub_epi32(EE3l, E03l); |
4182 | | |
4183 | 51.1k | EE7h = _mm_sub_epi32(EE0h, E00h); |
4184 | 51.1k | EE6h = _mm_sub_epi32(EE1h, E01h); |
4185 | 51.1k | EE5h = _mm_sub_epi32(EE2h, E02h); |
4186 | 51.1k | EE4h = _mm_sub_epi32(EE3h, E03h); |
4187 | | |
4188 | 51.1k | EE0l = _mm_add_epi32(EE0l, E00l); |
4189 | 51.1k | EE1l = _mm_add_epi32(EE1l, E01l); |
4190 | 51.1k | EE2l = _mm_add_epi32(EE2l, E02l); |
4191 | 51.1k | EE3l = _mm_add_epi32(EE3l, E03l); |
4192 | | |
4193 | 51.1k | EE0h = _mm_add_epi32(EE0h, E00h); |
4194 | 51.1k | EE1h = _mm_add_epi32(EE1h, E01h); |
4195 | 51.1k | EE2h = _mm_add_epi32(EE2h, E02h); |
4196 | 51.1k | EE3h = _mm_add_epi32(EE3h, E03h); |
4197 | | /* Compute E */ |
4198 | | |
4199 | 51.1k | E15l = _mm_sub_epi32(EE0l, E0l); |
4200 | 51.1k | E15l = _mm_add_epi32(E15l, m128iAdd); |
4201 | 51.1k | E14l = _mm_sub_epi32(EE1l, E1l); |
4202 | 51.1k | E14l = _mm_add_epi32(E14l, m128iAdd); |
4203 | 51.1k | E13l = _mm_sub_epi32(EE2l, E2l); |
4204 | 51.1k | E13l = _mm_add_epi32(E13l, m128iAdd); |
4205 | 51.1k | E12l = _mm_sub_epi32(EE3l, E3l); |
4206 | 51.1k | E12l = _mm_add_epi32(E12l, m128iAdd); |
4207 | 51.1k | E11l = _mm_sub_epi32(EE4l, E4l); |
4208 | 51.1k | E11l = _mm_add_epi32(E11l, m128iAdd); |
4209 | 51.1k | E10l = _mm_sub_epi32(EE5l, E5l); |
4210 | 51.1k | E10l = _mm_add_epi32(E10l, m128iAdd); |
4211 | 51.1k | E9l = _mm_sub_epi32(EE6l, E6l); |
4212 | 51.1k | E9l = _mm_add_epi32(E9l, m128iAdd); |
4213 | 51.1k | E8l = _mm_sub_epi32(EE7l, E7l); |
4214 | 51.1k | E8l = _mm_add_epi32(E8l, m128iAdd); |
4215 | | |
4216 | 51.1k | E0l = _mm_add_epi32(EE0l, E0l); |
4217 | 51.1k | E0l = _mm_add_epi32(E0l, m128iAdd); |
4218 | 51.1k | E1l = _mm_add_epi32(EE1l, E1l); |
4219 | 51.1k | E1l = _mm_add_epi32(E1l, m128iAdd); |
4220 | 51.1k | E2l = _mm_add_epi32(EE2l, E2l); |
4221 | 51.1k | E2l = _mm_add_epi32(E2l, m128iAdd); |
4222 | 51.1k | E3l = _mm_add_epi32(EE3l, E3l); |
4223 | 51.1k | E3l = _mm_add_epi32(E3l, m128iAdd); |
4224 | 51.1k | E4l = _mm_add_epi32(EE4l, E4l); |
4225 | 51.1k | E4l = _mm_add_epi32(E4l, m128iAdd); |
4226 | 51.1k | E5l = _mm_add_epi32(EE5l, E5l); |
4227 | 51.1k | E5l = _mm_add_epi32(E5l, m128iAdd); |
4228 | 51.1k | E6l = _mm_add_epi32(EE6l, E6l); |
4229 | 51.1k | E6l = _mm_add_epi32(E6l, m128iAdd); |
4230 | 51.1k | E7l = _mm_add_epi32(EE7l, E7l); |
4231 | 51.1k | E7l = _mm_add_epi32(E7l, m128iAdd); |
4232 | | |
4233 | 51.1k | E15h = _mm_sub_epi32(EE0h, E0h); |
4234 | 51.1k | E15h = _mm_add_epi32(E15h, m128iAdd); |
4235 | 51.1k | E14h = _mm_sub_epi32(EE1h, E1h); |
4236 | 51.1k | E14h = _mm_add_epi32(E14h, m128iAdd); |
4237 | 51.1k | E13h = _mm_sub_epi32(EE2h, E2h); |
4238 | 51.1k | E13h = _mm_add_epi32(E13h, m128iAdd); |
4239 | 51.1k | E12h = _mm_sub_epi32(EE3h, E3h); |
4240 | 51.1k | E12h = _mm_add_epi32(E12h, m128iAdd); |
4241 | 51.1k | E11h = _mm_sub_epi32(EE4h, E4h); |
4242 | 51.1k | E11h = _mm_add_epi32(E11h, m128iAdd); |
4243 | 51.1k | E10h = _mm_sub_epi32(EE5h, E5h); |
4244 | 51.1k | E10h = _mm_add_epi32(E10h, m128iAdd); |
4245 | 51.1k | E9h = _mm_sub_epi32(EE6h, E6h); |
4246 | 51.1k | E9h = _mm_add_epi32(E9h, m128iAdd); |
4247 | 51.1k | E8h = _mm_sub_epi32(EE7h, E7h); |
4248 | 51.1k | E8h = _mm_add_epi32(E8h, m128iAdd); |
4249 | | |
4250 | 51.1k | E0h = _mm_add_epi32(EE0h, E0h); |
4251 | 51.1k | E0h = _mm_add_epi32(E0h, m128iAdd); |
4252 | 51.1k | E1h = _mm_add_epi32(EE1h, E1h); |
4253 | 51.1k | E1h = _mm_add_epi32(E1h, m128iAdd); |
4254 | 51.1k | E2h = _mm_add_epi32(EE2h, E2h); |
4255 | 51.1k | E2h = _mm_add_epi32(E2h, m128iAdd); |
4256 | 51.1k | E3h = _mm_add_epi32(EE3h, E3h); |
4257 | 51.1k | E3h = _mm_add_epi32(E3h, m128iAdd); |
4258 | 51.1k | E4h = _mm_add_epi32(EE4h, E4h); |
4259 | 51.1k | E4h = _mm_add_epi32(E4h, m128iAdd); |
4260 | 51.1k | E5h = _mm_add_epi32(EE5h, E5h); |
4261 | 51.1k | E5h = _mm_add_epi32(E5h, m128iAdd); |
4262 | 51.1k | E6h = _mm_add_epi32(EE6h, E6h); |
4263 | 51.1k | E6h = _mm_add_epi32(E6h, m128iAdd); |
4264 | 51.1k | E7h = _mm_add_epi32(EE7h, E7h); |
4265 | 51.1k | E7h = _mm_add_epi32(E7h, m128iAdd); |
4266 | | |
4267 | 51.1k | m128iS0 = _mm_packs_epi32( |
4268 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), |
4269 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); |
4270 | 51.1k | m128iS1 = _mm_packs_epi32( |
4271 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), |
4272 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); |
4273 | 51.1k | m128iS2 = _mm_packs_epi32( |
4274 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), |
4275 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); |
4276 | 51.1k | m128iS3 = _mm_packs_epi32( |
4277 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), |
4278 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); |
4279 | 51.1k | m128iS4 = _mm_packs_epi32( |
4280 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), |
4281 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); |
4282 | 51.1k | m128iS5 = _mm_packs_epi32( |
4283 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), |
4284 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); |
4285 | 51.1k | m128iS6 = _mm_packs_epi32( |
4286 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), |
4287 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); |
4288 | 51.1k | m128iS7 = _mm_packs_epi32( |
4289 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), |
4290 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); |
4291 | 51.1k | m128iS8 = _mm_packs_epi32( |
4292 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift), |
4293 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift)); |
4294 | 51.1k | m128iS9 = _mm_packs_epi32( |
4295 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift), |
4296 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift)); |
4297 | 51.1k | m128iS10 = _mm_packs_epi32( |
4298 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift), |
4299 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift)); |
4300 | 51.1k | m128iS11 = _mm_packs_epi32( |
4301 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift), |
4302 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift)); |
4303 | 51.1k | m128iS12 = _mm_packs_epi32( |
4304 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift), |
4305 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift)); |
4306 | 51.1k | m128iS13 = _mm_packs_epi32( |
4307 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift), |
4308 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift)); |
4309 | 51.1k | m128iS14 = _mm_packs_epi32( |
4310 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift), |
4311 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift)); |
4312 | 51.1k | m128iS15 = _mm_packs_epi32( |
4313 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift), |
4314 | 51.1k | _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift)); |
4315 | | |
4316 | 51.1k | m128iS31 = _mm_packs_epi32( |
4317 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), |
4318 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); |
4319 | 51.1k | m128iS30 = _mm_packs_epi32( |
4320 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), |
4321 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); |
4322 | 51.1k | m128iS29 = _mm_packs_epi32( |
4323 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), |
4324 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); |
4325 | 51.1k | m128iS28 = _mm_packs_epi32( |
4326 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), |
4327 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); |
4328 | 51.1k | m128iS27 = _mm_packs_epi32( |
4329 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), |
4330 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); |
4331 | 51.1k | m128iS26 = _mm_packs_epi32( |
4332 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), |
4333 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); |
4334 | 51.1k | m128iS25 = _mm_packs_epi32( |
4335 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), |
4336 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); |
4337 | 51.1k | m128iS24 = _mm_packs_epi32( |
4338 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), |
4339 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); |
4340 | 51.1k | m128iS23 = _mm_packs_epi32( |
4341 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift), |
4342 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift)); |
4343 | 51.1k | m128iS22 = _mm_packs_epi32( |
4344 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift), |
4345 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift)); |
4346 | 51.1k | m128iS21 = _mm_packs_epi32( |
4347 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift), |
4348 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift)); |
4349 | 51.1k | m128iS20 = _mm_packs_epi32( |
4350 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift), |
4351 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift)); |
4352 | 51.1k | m128iS19 = _mm_packs_epi32( |
4353 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift), |
4354 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift)); |
4355 | 51.1k | m128iS18 = _mm_packs_epi32( |
4356 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift), |
4357 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift)); |
4358 | 51.1k | m128iS17 = _mm_packs_epi32( |
4359 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift), |
4360 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift)); |
4361 | 51.1k | m128iS16 = _mm_packs_epi32( |
4362 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift), |
4363 | 51.1k | _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift)); |
4364 | | |
4365 | 51.1k | if (!j) { |
4366 | | /* Inverse the matrix */ |
4367 | 25.5k | E0l = _mm_unpacklo_epi16(m128iS0, m128iS16); |
4368 | 25.5k | E1l = _mm_unpacklo_epi16(m128iS1, m128iS17); |
4369 | 25.5k | E2l = _mm_unpacklo_epi16(m128iS2, m128iS18); |
4370 | 25.5k | E3l = _mm_unpacklo_epi16(m128iS3, m128iS19); |
4371 | 25.5k | E4l = _mm_unpacklo_epi16(m128iS4, m128iS20); |
4372 | 25.5k | E5l = _mm_unpacklo_epi16(m128iS5, m128iS21); |
4373 | 25.5k | E6l = _mm_unpacklo_epi16(m128iS6, m128iS22); |
4374 | 25.5k | E7l = _mm_unpacklo_epi16(m128iS7, m128iS23); |
4375 | 25.5k | E8l = _mm_unpacklo_epi16(m128iS8, m128iS24); |
4376 | 25.5k | E9l = _mm_unpacklo_epi16(m128iS9, m128iS25); |
4377 | 25.5k | E10l = _mm_unpacklo_epi16(m128iS10, m128iS26); |
4378 | 25.5k | E11l = _mm_unpacklo_epi16(m128iS11, m128iS27); |
4379 | 25.5k | E12l = _mm_unpacklo_epi16(m128iS12, m128iS28); |
4380 | 25.5k | E13l = _mm_unpacklo_epi16(m128iS13, m128iS29); |
4381 | 25.5k | E14l = _mm_unpacklo_epi16(m128iS14, m128iS30); |
4382 | 25.5k | E15l = _mm_unpacklo_epi16(m128iS15, m128iS31); |
4383 | | |
4384 | 25.5k | O0l = _mm_unpackhi_epi16(m128iS0, m128iS16); |
4385 | 25.5k | O1l = _mm_unpackhi_epi16(m128iS1, m128iS17); |
4386 | 25.5k | O2l = _mm_unpackhi_epi16(m128iS2, m128iS18); |
4387 | 25.5k | O3l = _mm_unpackhi_epi16(m128iS3, m128iS19); |
4388 | 25.5k | O4l = _mm_unpackhi_epi16(m128iS4, m128iS20); |
4389 | 25.5k | O5l = _mm_unpackhi_epi16(m128iS5, m128iS21); |
4390 | 25.5k | O6l = _mm_unpackhi_epi16(m128iS6, m128iS22); |
4391 | 25.5k | O7l = _mm_unpackhi_epi16(m128iS7, m128iS23); |
4392 | 25.5k | O8l = _mm_unpackhi_epi16(m128iS8, m128iS24); |
4393 | 25.5k | O9l = _mm_unpackhi_epi16(m128iS9, m128iS25); |
4394 | 25.5k | O10l = _mm_unpackhi_epi16(m128iS10, m128iS26); |
4395 | 25.5k | O11l = _mm_unpackhi_epi16(m128iS11, m128iS27); |
4396 | 25.5k | O12l = _mm_unpackhi_epi16(m128iS12, m128iS28); |
4397 | 25.5k | O13l = _mm_unpackhi_epi16(m128iS13, m128iS29); |
4398 | 25.5k | O14l = _mm_unpackhi_epi16(m128iS14, m128iS30); |
4399 | 25.5k | O15l = _mm_unpackhi_epi16(m128iS15, m128iS31); |
4400 | | |
4401 | 25.5k | E0h = _mm_unpacklo_epi16(E0l, E8l); |
4402 | 25.5k | E1h = _mm_unpacklo_epi16(E1l, E9l); |
4403 | 25.5k | E2h = _mm_unpacklo_epi16(E2l, E10l); |
4404 | 25.5k | E3h = _mm_unpacklo_epi16(E3l, E11l); |
4405 | 25.5k | E4h = _mm_unpacklo_epi16(E4l, E12l); |
4406 | 25.5k | E5h = _mm_unpacklo_epi16(E5l, E13l); |
4407 | 25.5k | E6h = _mm_unpacklo_epi16(E6l, E14l); |
4408 | 25.5k | E7h = _mm_unpacklo_epi16(E7l, E15l); |
4409 | | |
4410 | 25.5k | E8h = _mm_unpackhi_epi16(E0l, E8l); |
4411 | 25.5k | E9h = _mm_unpackhi_epi16(E1l, E9l); |
4412 | 25.5k | E10h = _mm_unpackhi_epi16(E2l, E10l); |
4413 | 25.5k | E11h = _mm_unpackhi_epi16(E3l, E11l); |
4414 | 25.5k | E12h = _mm_unpackhi_epi16(E4l, E12l); |
4415 | 25.5k | E13h = _mm_unpackhi_epi16(E5l, E13l); |
4416 | 25.5k | E14h = _mm_unpackhi_epi16(E6l, E14l); |
4417 | 25.5k | E15h = _mm_unpackhi_epi16(E7l, E15l); |
4418 | | |
4419 | 25.5k | m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); |
4420 | 25.5k | m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); |
4421 | 25.5k | m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); |
4422 | 25.5k | m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); |
4423 | | |
4424 | 25.5k | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
4425 | 25.5k | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
4426 | 25.5k | m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4427 | 25.5k | m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4428 | | |
4429 | 25.5k | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
4430 | 25.5k | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
4431 | 25.5k | m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4432 | 25.5k | m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4433 | | |
4434 | 25.5k | m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); |
4435 | 25.5k | m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); |
4436 | 25.5k | m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); |
4437 | 25.5k | m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); |
4438 | | |
4439 | 25.5k | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
4440 | 25.5k | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
4441 | 25.5k | m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4442 | 25.5k | m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4443 | | |
4444 | 25.5k | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
4445 | 25.5k | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
4446 | 25.5k | m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4447 | 25.5k | m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4448 | | |
4449 | 25.5k | m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); |
4450 | 25.5k | m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); |
4451 | 25.5k | m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); |
4452 | 25.5k | m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); |
4453 | | |
4454 | 25.5k | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
4455 | 25.5k | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
4456 | 25.5k | m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4457 | 25.5k | m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4458 | | |
4459 | 25.5k | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
4460 | 25.5k | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
4461 | 25.5k | m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4462 | 25.5k | m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4463 | | |
4464 | 25.5k | m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); |
4465 | 25.5k | m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); |
4466 | 25.5k | m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); |
4467 | 25.5k | m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); |
4468 | | |
4469 | 25.5k | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
4470 | 25.5k | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
4471 | 25.5k | m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4472 | 25.5k | m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4473 | | |
4474 | 25.5k | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
4475 | 25.5k | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
4476 | 25.5k | m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4477 | 25.5k | m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4478 | | |
4479 | | /* */ |
4480 | 25.5k | E0h = _mm_unpacklo_epi16(O0l, O8l); |
4481 | 25.5k | E1h = _mm_unpacklo_epi16(O1l, O9l); |
4482 | 25.5k | E2h = _mm_unpacklo_epi16(O2l, O10l); |
4483 | 25.5k | E3h = _mm_unpacklo_epi16(O3l, O11l); |
4484 | 25.5k | E4h = _mm_unpacklo_epi16(O4l, O12l); |
4485 | 25.5k | E5h = _mm_unpacklo_epi16(O5l, O13l); |
4486 | 25.5k | E6h = _mm_unpacklo_epi16(O6l, O14l); |
4487 | 25.5k | E7h = _mm_unpacklo_epi16(O7l, O15l); |
4488 | | |
4489 | 25.5k | E8h = _mm_unpackhi_epi16(O0l, O8l); |
4490 | 25.5k | E9h = _mm_unpackhi_epi16(O1l, O9l); |
4491 | 25.5k | E10h = _mm_unpackhi_epi16(O2l, O10l); |
4492 | 25.5k | E11h = _mm_unpackhi_epi16(O3l, O11l); |
4493 | 25.5k | E12h = _mm_unpackhi_epi16(O4l, O12l); |
4494 | 25.5k | E13h = _mm_unpackhi_epi16(O5l, O13l); |
4495 | 25.5k | E14h = _mm_unpackhi_epi16(O6l, O14l); |
4496 | 25.5k | E15h = _mm_unpackhi_epi16(O7l, O15l); |
4497 | | |
4498 | 25.5k | m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); |
4499 | 25.5k | m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); |
4500 | 25.5k | m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); |
4501 | 25.5k | m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); |
4502 | | |
4503 | 25.5k | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
4504 | 25.5k | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
4505 | 25.5k | m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4506 | 25.5k | m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4507 | | |
4508 | 25.5k | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
4509 | 25.5k | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
4510 | 25.5k | m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4511 | 25.5k | m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4512 | | |
4513 | 25.5k | m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); |
4514 | 25.5k | m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); |
4515 | 25.5k | m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); |
4516 | 25.5k | m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); |
4517 | | |
4518 | 25.5k | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
4519 | 25.5k | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
4520 | 25.5k | m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4521 | 25.5k | m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4522 | | |
4523 | 25.5k | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
4524 | 25.5k | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
4525 | 25.5k | m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4526 | 25.5k | m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4527 | | |
4528 | 25.5k | m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); |
4529 | 25.5k | m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); |
4530 | 25.5k | m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); |
4531 | 25.5k | m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); |
4532 | | |
4533 | 25.5k | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
4534 | 25.5k | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
4535 | 25.5k | m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4536 | 25.5k | m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4537 | | |
4538 | 25.5k | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
4539 | 25.5k | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
4540 | 25.5k | m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4541 | 25.5k | m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4542 | | |
4543 | 25.5k | m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); |
4544 | 25.5k | m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); |
4545 | 25.5k | m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); |
4546 | 25.5k | m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); |
4547 | | |
4548 | 25.5k | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
4549 | 25.5k | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
4550 | 25.5k | m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4551 | 25.5k | m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4552 | | |
4553 | 25.5k | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
4554 | 25.5k | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
4555 | 25.5k | m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
4556 | 25.5k | m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
4557 | | |
4558 | 25.5k | if(i==0){ |
4559 | 6.39k | int k = 8; |
4560 | 6.39k | r0=m128iS0; |
4561 | 6.39k | r1=m128iS1; |
4562 | 6.39k | r2=m128iS2; |
4563 | 6.39k | r3=m128iS3; |
4564 | 6.39k | r4=m128iS4; |
4565 | 6.39k | r5=m128iS5; |
4566 | 6.39k | r6=m128iS6; |
4567 | 6.39k | r7=m128iS7; |
4568 | 6.39k | r8=m128iS8; |
4569 | 6.39k | r9=m128iS9; |
4570 | 6.39k | r10=m128iS10; |
4571 | 6.39k | r11=m128iS11; |
4572 | 6.39k | r12=m128iS12; |
4573 | 6.39k | r13=m128iS13; |
4574 | 6.39k | r14=m128iS14; |
4575 | 6.39k | r15=m128iS15; |
4576 | 6.39k | r16=m128iS16; |
4577 | 6.39k | r17=m128iS17; |
4578 | 6.39k | r18=m128iS18; |
4579 | 6.39k | r19=m128iS19; |
4580 | 6.39k | r20=m128iS20; |
4581 | 6.39k | r21=m128iS21; |
4582 | 6.39k | r22=m128iS22; |
4583 | 6.39k | r23=m128iS23; |
4584 | 6.39k | r24=m128iS24; |
4585 | 6.39k | r25=m128iS25; |
4586 | 6.39k | r26=m128iS26; |
4587 | 6.39k | r27=m128iS27; |
4588 | 6.39k | r28=m128iS28; |
4589 | 6.39k | r29=m128iS29; |
4590 | 6.39k | r30=m128iS30; |
4591 | 6.39k | r31=m128iS31; |
4592 | 6.39k | m128iS0 = _mm_load_si128((__m128i *) (src + k)); |
4593 | 6.39k | m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k)); |
4594 | 6.39k | m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k)); |
4595 | 6.39k | m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k)); |
4596 | 6.39k | m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k)); |
4597 | 6.39k | m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k)); |
4598 | 6.39k | m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k)); |
4599 | 6.39k | m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k)); |
4600 | 6.39k | m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k)); |
4601 | 6.39k | m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k)); |
4602 | 6.39k | m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k)); |
4603 | 6.39k | m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k)); |
4604 | 6.39k | m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k)); |
4605 | 6.39k | m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k)); |
4606 | 6.39k | m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k)); |
4607 | 6.39k | m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k)); |
4608 | | |
4609 | 6.39k | m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k)); |
4610 | 6.39k | m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k)); |
4611 | 6.39k | m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k)); |
4612 | 6.39k | m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k)); |
4613 | 6.39k | m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k)); |
4614 | 6.39k | m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k)); |
4615 | 6.39k | m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k)); |
4616 | 6.39k | m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k)); |
4617 | 6.39k | m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k)); |
4618 | 6.39k | m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k)); |
4619 | 6.39k | m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k)); |
4620 | 6.39k | m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k)); |
4621 | 6.39k | m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k)); |
4622 | 6.39k | m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k)); |
4623 | 6.39k | m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k)); |
4624 | 6.39k | m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k)); |
4625 | | |
4626 | 19.1k | }else if(i ==8){ |
4627 | | |
4628 | 6.39k | r32=m128iS0; |
4629 | 6.39k | r33=m128iS1; |
4630 | 6.39k | r34=m128iS2; |
4631 | 6.39k | r35=m128iS3; |
4632 | 6.39k | r36=m128iS4; |
4633 | 6.39k | r37=m128iS5; |
4634 | 6.39k | r38=m128iS6; |
4635 | 6.39k | r39=m128iS7; |
4636 | 6.39k | r40=m128iS8; |
4637 | 6.39k | r41=m128iS9; |
4638 | 6.39k | r42=m128iS10; |
4639 | 6.39k | r43=m128iS11; |
4640 | 6.39k | r44=m128iS12; |
4641 | 6.39k | r45=m128iS13; |
4642 | 6.39k | r46=m128iS14; |
4643 | 6.39k | r47=m128iS15; |
4644 | 6.39k | r48=m128iS16; |
4645 | 6.39k | r49=m128iS17; |
4646 | 6.39k | r50=m128iS18; |
4647 | 6.39k | r51=m128iS19; |
4648 | 6.39k | r52=m128iS20; |
4649 | 6.39k | r53=m128iS21; |
4650 | 6.39k | r54=m128iS22; |
4651 | 6.39k | r55=m128iS23; |
4652 | 6.39k | r56=m128iS24; |
4653 | 6.39k | r57=m128iS25; |
4654 | 6.39k | r58=m128iS26; |
4655 | 6.39k | r59=m128iS27; |
4656 | 6.39k | r60=m128iS28; |
4657 | 6.39k | r61=m128iS29; |
4658 | 6.39k | r62=m128iS30; |
4659 | 6.39k | r63=m128iS31; |
4660 | | |
4661 | 6.39k | m128iS0 = _mm_load_si128((__m128i *) (src + 16)); |
4662 | 6.39k | m128iS1 = _mm_load_si128((__m128i *) (src + 48)); |
4663 | 6.39k | m128iS2 = _mm_load_si128((__m128i *) (src + 80)); |
4664 | 6.39k | m128iS3 = _mm_load_si128((__m128i *) (src + 112)); |
4665 | 6.39k | m128iS4 = _mm_load_si128((__m128i *) (src + 144)); |
4666 | 6.39k | m128iS5 = _mm_load_si128((__m128i *) (src + 176)); |
4667 | 6.39k | m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 16)); |
4668 | 6.39k | m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 16)); |
4669 | 6.39k | m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 16)); |
4670 | 6.39k | m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 16)); |
4671 | 6.39k | m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 16)); |
4672 | 6.39k | m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 16)); |
4673 | 6.39k | m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 16)); |
4674 | 6.39k | m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 16)); |
4675 | 6.39k | m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 16)); |
4676 | 6.39k | m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 16)); |
4677 | | |
4678 | 6.39k | m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 16)); |
4679 | 6.39k | m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 16)); |
4680 | 6.39k | m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 16)); |
4681 | 6.39k | m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 16)); |
4682 | 6.39k | m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 16)); |
4683 | 6.39k | m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 16)); |
4684 | 6.39k | m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 16)); |
4685 | 6.39k | m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 16)); |
4686 | 6.39k | m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 16)); |
4687 | 6.39k | m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 16)); |
4688 | 6.39k | m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 16)); |
4689 | 6.39k | m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 16)); |
4690 | 6.39k | m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 16)); |
4691 | 6.39k | m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 16)); |
4692 | 6.39k | m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 16)); |
4693 | 6.39k | m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 16)); |
4694 | | |
4695 | | |
4696 | 12.7k | }else if(i ==16){ |
4697 | | |
4698 | 6.39k | r64=m128iS0; |
4699 | 6.39k | r65=m128iS1; |
4700 | 6.39k | r66=m128iS2; |
4701 | 6.39k | r67=m128iS3; |
4702 | 6.39k | r68=m128iS4; |
4703 | 6.39k | r69=m128iS5; |
4704 | 6.39k | r70=m128iS6; |
4705 | 6.39k | r71=m128iS7; |
4706 | 6.39k | r72=m128iS8; |
4707 | 6.39k | r73=m128iS9; |
4708 | 6.39k | r74=m128iS10; |
4709 | 6.39k | r75=m128iS11; |
4710 | 6.39k | r76=m128iS12; |
4711 | 6.39k | r77=m128iS13; |
4712 | 6.39k | r78=m128iS14; |
4713 | 6.39k | r79=m128iS15; |
4714 | 6.39k | r80=m128iS16; |
4715 | 6.39k | r81=m128iS17; |
4716 | 6.39k | r82=m128iS18; |
4717 | 6.39k | r83=m128iS19; |
4718 | 6.39k | r84=m128iS20; |
4719 | 6.39k | r85=m128iS21; |
4720 | 6.39k | r86=m128iS22; |
4721 | 6.39k | r87=m128iS23; |
4722 | 6.39k | r88=m128iS24; |
4723 | 6.39k | r89=m128iS25; |
4724 | 6.39k | r90=m128iS26; |
4725 | 6.39k | r91=m128iS27; |
4726 | 6.39k | r92=m128iS28; |
4727 | 6.39k | r93=m128iS29; |
4728 | 6.39k | r94=m128iS30; |
4729 | 6.39k | r95=m128iS31; |
4730 | | |
4731 | 6.39k | m128iS0 = _mm_load_si128((__m128i *) (src + 24)); |
4732 | 6.39k | m128iS1 = _mm_load_si128((__m128i *) (src + 56)); |
4733 | 6.39k | m128iS2 = _mm_load_si128((__m128i *) (src + 64 + 24)); |
4734 | 6.39k | m128iS3 = _mm_load_si128((__m128i *) (src + 96 + 24)); |
4735 | 6.39k | m128iS4 = _mm_load_si128((__m128i *) (src + 128 + 24)); |
4736 | 6.39k | m128iS5 = _mm_load_si128((__m128i *) (src + 160 + 24)); |
4737 | 6.39k | m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 24)); |
4738 | 6.39k | m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 24)); |
4739 | 6.39k | m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 24)); |
4740 | 6.39k | m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 24)); |
4741 | 6.39k | m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 24)); |
4742 | 6.39k | m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 24)); |
4743 | 6.39k | m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 24)); |
4744 | 6.39k | m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 24)); |
4745 | 6.39k | m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 24)); |
4746 | 6.39k | m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 24)); |
4747 | | |
4748 | 6.39k | m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 24)); |
4749 | 6.39k | m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 24)); |
4750 | 6.39k | m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 24)); |
4751 | 6.39k | m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 24)); |
4752 | 6.39k | m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 24)); |
4753 | 6.39k | m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 24)); |
4754 | 6.39k | m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 24)); |
4755 | 6.39k | m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 24)); |
4756 | 6.39k | m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 24)); |
4757 | 6.39k | m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 24)); |
4758 | 6.39k | m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 24)); |
4759 | 6.39k | m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 24)); |
4760 | 6.39k | m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 24)); |
4761 | 6.39k | m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 24)); |
4762 | 6.39k | m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 24)); |
4763 | 6.39k | m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 24)); |
4764 | | |
4765 | 6.39k | }else{ |
4766 | 6.39k | r96=m128iS0; |
4767 | 6.39k | r97=m128iS1; |
4768 | 6.39k | r98=m128iS2; |
4769 | 6.39k | r99=m128iS3; |
4770 | 6.39k | r100=m128iS4; |
4771 | 6.39k | r101=m128iS5; |
4772 | 6.39k | r102=m128iS6; |
4773 | 6.39k | r103=m128iS7; |
4774 | 6.39k | r104=m128iS8; |
4775 | 6.39k | r105=m128iS9; |
4776 | 6.39k | r106=m128iS10; |
4777 | 6.39k | r107=m128iS11; |
4778 | 6.39k | r108=m128iS12; |
4779 | 6.39k | r109=m128iS13; |
4780 | 6.39k | r110=m128iS14; |
4781 | 6.39k | r111=m128iS15; |
4782 | 6.39k | r112=m128iS16; |
4783 | 6.39k | r113=m128iS17; |
4784 | 6.39k | r114=m128iS18; |
4785 | 6.39k | r115=m128iS19; |
4786 | 6.39k | r116=m128iS20; |
4787 | 6.39k | r117=m128iS21; |
4788 | 6.39k | r118=m128iS22; |
4789 | 6.39k | r119=m128iS23; |
4790 | 6.39k | r120=m128iS24; |
4791 | 6.39k | r121=m128iS25; |
4792 | 6.39k | r122=m128iS26; |
4793 | 6.39k | r123=m128iS27; |
4794 | 6.39k | r124=m128iS28; |
4795 | 6.39k | r125=m128iS29; |
4796 | 6.39k | r126=m128iS30; |
4797 | 6.39k | r127=m128iS31; |
4798 | | |
4799 | | //load data for next j : |
4800 | 6.39k | m128iS0 = r0; |
4801 | 6.39k | m128iS1 = r4; |
4802 | 6.39k | m128iS2 = r8; |
4803 | 6.39k | m128iS3 = r12; |
4804 | 6.39k | m128iS4 = r16; |
4805 | 6.39k | m128iS5 = r20; |
4806 | 6.39k | m128iS6 = r24; |
4807 | 6.39k | m128iS7 = r28; |
4808 | 6.39k | m128iS8 = r32; |
4809 | 6.39k | m128iS9 = r36; |
4810 | 6.39k | m128iS10 = r40; |
4811 | 6.39k | m128iS11 = r44; |
4812 | 6.39k | m128iS12 = r48; |
4813 | 6.39k | m128iS13 = r52; |
4814 | 6.39k | m128iS14 = r56; |
4815 | 6.39k | m128iS15 = r60; |
4816 | 6.39k | m128iS16 = r64; |
4817 | 6.39k | m128iS17 = r68; |
4818 | 6.39k | m128iS18 = r72; |
4819 | 6.39k | m128iS19 = r76; |
4820 | 6.39k | m128iS20 = r80; |
4821 | 6.39k | m128iS21 = r84; |
4822 | 6.39k | m128iS22 = r88; |
4823 | 6.39k | m128iS23 = r92; |
4824 | 6.39k | m128iS24 = r96; |
4825 | 6.39k | m128iS25 = r100; |
4826 | 6.39k | m128iS26 = r104; |
4827 | 6.39k | m128iS27 = r108; |
4828 | 6.39k | m128iS28 = r112; |
4829 | 6.39k | m128iS29 = r116; |
4830 | 6.39k | m128iS30 = r120; |
4831 | 6.39k | m128iS31 =r124; |
4832 | 6.39k | shift = shift_2nd; |
4833 | 6.39k | m128iAdd = _mm_set1_epi32(add_2nd); |
4834 | | |
4835 | | |
4836 | 6.39k | } |
4837 | | |
4838 | 25.5k | } else { |
4839 | | |
4840 | | //Transpose Matrix |
4841 | | |
4842 | 25.5k | E0l= _mm_unpacklo_epi16(m128iS0,m128iS1); |
4843 | 25.5k | E1l= _mm_unpacklo_epi16(m128iS2,m128iS3); |
4844 | 25.5k | E2l= _mm_unpacklo_epi16(m128iS4,m128iS5); |
4845 | 25.5k | E3l= _mm_unpacklo_epi16(m128iS6,m128iS7); |
4846 | 25.5k | E4l= _mm_unpacklo_epi16(m128iS8,m128iS9); |
4847 | 25.5k | E5l= _mm_unpacklo_epi16(m128iS10,m128iS11); |
4848 | 25.5k | E6l= _mm_unpacklo_epi16(m128iS12,m128iS13); |
4849 | 25.5k | E7l= _mm_unpacklo_epi16(m128iS14,m128iS15); |
4850 | 25.5k | E8l= _mm_unpacklo_epi16(m128iS16,m128iS17); |
4851 | 25.5k | E9l= _mm_unpacklo_epi16(m128iS18,m128iS19); |
4852 | 25.5k | E10l= _mm_unpacklo_epi16(m128iS20,m128iS21); |
4853 | 25.5k | E11l= _mm_unpacklo_epi16(m128iS22,m128iS23); |
4854 | 25.5k | E12l= _mm_unpacklo_epi16(m128iS24,m128iS25); |
4855 | 25.5k | E13l= _mm_unpacklo_epi16(m128iS26,m128iS27); |
4856 | 25.5k | E14l= _mm_unpacklo_epi16(m128iS28,m128iS29); |
4857 | 25.5k | E15l= _mm_unpacklo_epi16(m128iS30,m128iS31); |
4858 | | |
4859 | | |
4860 | 25.5k | E0h= _mm_unpackhi_epi16(m128iS0,m128iS1); |
4861 | 25.5k | E1h= _mm_unpackhi_epi16(m128iS2,m128iS3); |
4862 | 25.5k | E2h= _mm_unpackhi_epi16(m128iS4,m128iS5); |
4863 | 25.5k | E3h= _mm_unpackhi_epi16(m128iS6,m128iS7); |
4864 | 25.5k | E4h= _mm_unpackhi_epi16(m128iS8,m128iS9); |
4865 | 25.5k | E5h= _mm_unpackhi_epi16(m128iS10,m128iS11); |
4866 | 25.5k | E6h= _mm_unpackhi_epi16(m128iS12,m128iS13); |
4867 | 25.5k | E7h= _mm_unpackhi_epi16(m128iS14,m128iS15); |
4868 | 25.5k | E8h= _mm_unpackhi_epi16(m128iS16,m128iS17); |
4869 | 25.5k | E9h= _mm_unpackhi_epi16(m128iS18,m128iS19); |
4870 | 25.5k | E10h= _mm_unpackhi_epi16(m128iS20,m128iS21); |
4871 | 25.5k | E11h= _mm_unpackhi_epi16(m128iS22,m128iS23); |
4872 | 25.5k | E12h= _mm_unpackhi_epi16(m128iS24,m128iS25); |
4873 | 25.5k | E13h= _mm_unpackhi_epi16(m128iS26,m128iS27); |
4874 | 25.5k | E14h= _mm_unpackhi_epi16(m128iS28,m128iS29); |
4875 | 25.5k | E15h= _mm_unpackhi_epi16(m128iS30,m128iS31); |
4876 | | |
4877 | 25.5k | m128Tmp0= _mm_unpacklo_epi32(E0l,E1l); |
4878 | 25.5k | m128Tmp1= _mm_unpacklo_epi32(E2l,E3l); |
4879 | 25.5k | m128Tmp2= _mm_unpacklo_epi32(E4l,E5l); |
4880 | 25.5k | m128Tmp3= _mm_unpacklo_epi32(E6l,E7l); |
4881 | 25.5k | m128Tmp4= _mm_unpacklo_epi32(E8l,E9l); |
4882 | 25.5k | m128Tmp5= _mm_unpacklo_epi32(E10l,E11l); |
4883 | 25.5k | m128Tmp6= _mm_unpacklo_epi32(E12l,E13l); |
4884 | 25.5k | m128Tmp7= _mm_unpacklo_epi32(E14l,E15l); |
4885 | | |
4886 | 25.5k | m128iS0= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter 1st row |
4887 | 25.5k | m128iS1= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter 1st row |
4888 | | |
4889 | | |
4890 | 25.5k | m128iS2= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter 1st row |
4891 | 25.5k | m128iS3= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter 1st row |
4892 | | |
4893 | | //second row |
4894 | | |
4895 | 25.5k | m128iS4= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter |
4896 | 25.5k | m128iS5= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter |
4897 | | |
4898 | 25.5k | m128iS6= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter |
4899 | 25.5k | m128iS7= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter |
4900 | | |
4901 | | //third row |
4902 | | |
4903 | 25.5k | m128Tmp0= _mm_unpackhi_epi32(E0l,E1l); |
4904 | 25.5k | m128Tmp1= _mm_unpackhi_epi32(E2l,E3l); |
4905 | 25.5k | m128Tmp2= _mm_unpackhi_epi32(E4l,E5l); |
4906 | 25.5k | m128Tmp3= _mm_unpackhi_epi32(E6l,E7l); |
4907 | 25.5k | m128Tmp4= _mm_unpackhi_epi32(E8l,E9l); |
4908 | 25.5k | m128Tmp5= _mm_unpackhi_epi32(E10l,E11l); |
4909 | 25.5k | m128Tmp6= _mm_unpackhi_epi32(E12l,E13l); |
4910 | 25.5k | m128Tmp7= _mm_unpackhi_epi32(E14l,E15l); |
4911 | | |
4912 | | |
4913 | 25.5k | m128iS8= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter |
4914 | 25.5k | m128iS9= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter |
4915 | | |
4916 | 25.5k | m128iS10= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter |
4917 | 25.5k | m128iS11= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter |
4918 | | |
4919 | | //fourth row |
4920 | | |
4921 | 25.5k | m128iS12= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter |
4922 | 25.5k | m128iS13= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter |
4923 | | |
4924 | 25.5k | m128iS14= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter |
4925 | 25.5k | m128iS15= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter |
4926 | | |
4927 | | //fifth row |
4928 | | |
4929 | 25.5k | m128Tmp0= _mm_unpacklo_epi32(E0h,E1h); |
4930 | 25.5k | m128Tmp1= _mm_unpacklo_epi32(E2h,E3h); |
4931 | 25.5k | m128Tmp2= _mm_unpacklo_epi32(E4h,E5h); |
4932 | 25.5k | m128Tmp3= _mm_unpacklo_epi32(E6h,E7h); |
4933 | 25.5k | m128Tmp4= _mm_unpacklo_epi32(E8h,E9h); |
4934 | 25.5k | m128Tmp5= _mm_unpacklo_epi32(E10h,E11h); |
4935 | 25.5k | m128Tmp6= _mm_unpacklo_epi32(E12h,E13h); |
4936 | 25.5k | m128Tmp7= _mm_unpacklo_epi32(E14h,E15h); |
4937 | | |
4938 | 25.5k | m128iS16= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter |
4939 | 25.5k | m128iS17= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter |
4940 | | |
4941 | | |
4942 | 25.5k | m128iS18= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter |
4943 | 25.5k | m128iS19= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); |
4944 | | |
4945 | | //sixth row |
4946 | | |
4947 | 25.5k | m128iS20= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter |
4948 | 25.5k | m128iS21= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter |
4949 | | |
4950 | | |
4951 | 25.5k | m128iS22= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter |
4952 | 25.5k | m128iS23= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter |
4953 | | |
4954 | | //seventh row |
4955 | | |
4956 | 25.5k | m128Tmp0= _mm_unpackhi_epi32(E0h,E1h); |
4957 | 25.5k | m128Tmp1= _mm_unpackhi_epi32(E2h,E3h); |
4958 | 25.5k | m128Tmp2= _mm_unpackhi_epi32(E4h,E5h); |
4959 | 25.5k | m128Tmp3= _mm_unpackhi_epi32(E6h,E7h); |
4960 | 25.5k | m128Tmp4= _mm_unpackhi_epi32(E8h,E9h); |
4961 | 25.5k | m128Tmp5= _mm_unpackhi_epi32(E10h,E11h); |
4962 | 25.5k | m128Tmp6= _mm_unpackhi_epi32(E12h,E13h); |
4963 | 25.5k | m128Tmp7= _mm_unpackhi_epi32(E14h,E15h); |
4964 | | |
4965 | | |
4966 | 25.5k | m128iS24= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter |
4967 | 25.5k | m128iS25= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter |
4968 | | |
4969 | | |
4970 | 25.5k | m128iS26= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter |
4971 | 25.5k | m128iS27= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter |
4972 | | |
4973 | | //last row |
4974 | | |
4975 | | |
4976 | 25.5k | m128iS28= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter |
4977 | 25.5k | m128iS29= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter |
4978 | | |
4979 | 25.5k | m128iS30= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter |
4980 | 25.5k | m128iS31= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter |
4981 | | |
4982 | | |
4983 | 25.5k | m128Tmp0=_mm_setzero_si128(); |
4984 | | |
4985 | | |
4986 | | //store |
4987 | 25.5k | dst = (uint8_t*) _dst + i*stride; |
4988 | | |
4989 | | |
4990 | 25.5k | E0l= _mm_load_si128((__m128i*)dst); //16 values |
4991 | 25.5k | E1l= _mm_load_si128((__m128i*)(dst+16)); |
4992 | 25.5k | E2l= _mm_load_si128((__m128i*)(dst+stride)); |
4993 | 25.5k | E3l= _mm_load_si128((__m128i*)(dst+stride+16)); |
4994 | 25.5k | E4l= _mm_load_si128((__m128i*)(dst+2*stride)); |
4995 | 25.5k | E5l= _mm_load_si128((__m128i*)(dst+2*stride+16)); |
4996 | 25.5k | E6l= _mm_load_si128((__m128i*)(dst+3*stride)); |
4997 | 25.5k | E7l= _mm_load_si128((__m128i*)(dst+3*stride+16)); |
4998 | 25.5k | E8l= _mm_load_si128((__m128i*)(dst+4*stride)); |
4999 | 25.5k | E9l= _mm_load_si128((__m128i*)(dst+4*stride+16)); |
5000 | 25.5k | E10l= _mm_load_si128((__m128i*)(dst+5*stride)); |
5001 | 25.5k | E11l= _mm_load_si128((__m128i*)(dst+5*stride+16)); |
5002 | 25.5k | E12l= _mm_load_si128((__m128i*)(dst+6*stride)); |
5003 | 25.5k | E13l= _mm_load_si128((__m128i*)(dst+6*stride+16)); |
5004 | 25.5k | E14l= _mm_load_si128((__m128i*)(dst+7*stride)); |
5005 | 25.5k | E15l= _mm_load_si128((__m128i*)(dst+7*stride+16)); |
5006 | | |
5007 | 25.5k | m128iS0= _mm_adds_epi16(m128iS0,_mm_unpacklo_epi8(E0l,m128Tmp0)); |
5008 | 25.5k | m128iS1= _mm_adds_epi16(m128iS1,_mm_unpackhi_epi8(E0l,m128Tmp0)); |
5009 | 25.5k | m128iS0= _mm_packus_epi16(m128iS0,m128iS1); |
5010 | | |
5011 | 25.5k | m128iS2= _mm_adds_epi16(m128iS2,_mm_unpacklo_epi8(E1l,m128Tmp0)); |
5012 | 25.5k | m128iS3= _mm_adds_epi16(m128iS3,_mm_unpackhi_epi8(E1l,m128Tmp0)); |
5013 | 25.5k | m128iS2= _mm_packus_epi16(m128iS2,m128iS3); |
5014 | | |
5015 | 25.5k | m128iS4= _mm_adds_epi16(m128iS4,_mm_unpacklo_epi8(E2l,m128Tmp0)); |
5016 | 25.5k | m128iS5= _mm_adds_epi16(m128iS5,_mm_unpackhi_epi8(E2l,m128Tmp0)); |
5017 | 25.5k | m128iS4= _mm_packus_epi16(m128iS4,m128iS5); |
5018 | | |
5019 | 25.5k | m128iS6= _mm_adds_epi16(m128iS6,_mm_unpacklo_epi8(E3l,m128Tmp0)); |
5020 | 25.5k | m128iS7= _mm_adds_epi16(m128iS7,_mm_unpackhi_epi8(E3l,m128Tmp0)); |
5021 | 25.5k | m128iS6= _mm_packus_epi16(m128iS6,m128iS7); |
5022 | | |
5023 | 25.5k | m128iS8= _mm_adds_epi16(m128iS8,_mm_unpacklo_epi8(E4l,m128Tmp0)); |
5024 | 25.5k | m128iS9= _mm_adds_epi16(m128iS9,_mm_unpackhi_epi8(E4l,m128Tmp0)); |
5025 | 25.5k | m128iS8= _mm_packus_epi16(m128iS8,m128iS9); |
5026 | | |
5027 | 25.5k | m128iS10= _mm_adds_epi16(m128iS10,_mm_unpacklo_epi8(E5l,m128Tmp0)); |
5028 | 25.5k | m128iS11= _mm_adds_epi16(m128iS11,_mm_unpackhi_epi8(E5l,m128Tmp0)); |
5029 | 25.5k | m128iS10= _mm_packus_epi16(m128iS10,m128iS11); |
5030 | | |
5031 | 25.5k | m128iS12= _mm_adds_epi16(m128iS12,_mm_unpacklo_epi8(E6l,m128Tmp0)); |
5032 | 25.5k | m128iS13= _mm_adds_epi16(m128iS13,_mm_unpackhi_epi8(E6l,m128Tmp0)); |
5033 | 25.5k | m128iS12= _mm_packus_epi16(m128iS12,m128iS13); |
5034 | | |
5035 | 25.5k | m128iS14= _mm_adds_epi16(m128iS14,_mm_unpacklo_epi8(E7l,m128Tmp0)); |
5036 | 25.5k | m128iS15= _mm_adds_epi16(m128iS15,_mm_unpackhi_epi8(E7l,m128Tmp0)); |
5037 | 25.5k | m128iS14= _mm_packus_epi16(m128iS14,m128iS15); |
5038 | | |
5039 | 25.5k | m128iS16= _mm_adds_epi16(m128iS16,_mm_unpacklo_epi8(E8l,m128Tmp0)); |
5040 | 25.5k | m128iS17= _mm_adds_epi16(m128iS17,_mm_unpackhi_epi8(E8l,m128Tmp0)); |
5041 | 25.5k | m128iS16= _mm_packus_epi16(m128iS16,m128iS17); |
5042 | | |
5043 | 25.5k | m128iS18= _mm_adds_epi16(m128iS18,_mm_unpacklo_epi8(E9l,m128Tmp0)); |
5044 | 25.5k | m128iS19= _mm_adds_epi16(m128iS19,_mm_unpackhi_epi8(E9l,m128Tmp0)); |
5045 | 25.5k | m128iS18= _mm_packus_epi16(m128iS18,m128iS19); |
5046 | | |
5047 | 25.5k | m128iS20= _mm_adds_epi16(m128iS20,_mm_unpacklo_epi8(E10l,m128Tmp0)); |
5048 | 25.5k | m128iS21= _mm_adds_epi16(m128iS21,_mm_unpackhi_epi8(E10l,m128Tmp0)); |
5049 | 25.5k | m128iS20= _mm_packus_epi16(m128iS20,m128iS21); |
5050 | | |
5051 | 25.5k | m128iS22= _mm_adds_epi16(m128iS22,_mm_unpacklo_epi8(E11l,m128Tmp0)); |
5052 | 25.5k | m128iS23= _mm_adds_epi16(m128iS23,_mm_unpackhi_epi8(E11l,m128Tmp0)); |
5053 | 25.5k | m128iS22= _mm_packus_epi16(m128iS22,m128iS23); |
5054 | | |
5055 | 25.5k | m128iS24= _mm_adds_epi16(m128iS24,_mm_unpacklo_epi8(E12l,m128Tmp0)); |
5056 | 25.5k | m128iS25= _mm_adds_epi16(m128iS25,_mm_unpackhi_epi8(E12l,m128Tmp0)); |
5057 | 25.5k | m128iS24= _mm_packus_epi16(m128iS24,m128iS25); |
5058 | | |
5059 | 25.5k | m128iS26= _mm_adds_epi16(m128iS26,_mm_unpacklo_epi8(E13l,m128Tmp0)); |
5060 | 25.5k | m128iS27= _mm_adds_epi16(m128iS27,_mm_unpackhi_epi8(E13l,m128Tmp0)); |
5061 | 25.5k | m128iS26= _mm_packus_epi16(m128iS26,m128iS27); |
5062 | | |
5063 | 25.5k | m128iS28= _mm_adds_epi16(m128iS28,_mm_unpacklo_epi8(E14l,m128Tmp0)); |
5064 | 25.5k | m128iS29= _mm_adds_epi16(m128iS29,_mm_unpackhi_epi8(E14l,m128Tmp0)); |
5065 | 25.5k | m128iS28= _mm_packus_epi16(m128iS28,m128iS29); |
5066 | | |
5067 | 25.5k | m128iS30= _mm_adds_epi16(m128iS30,_mm_unpacklo_epi8(E15l,m128Tmp0)); |
5068 | 25.5k | m128iS31= _mm_adds_epi16(m128iS31,_mm_unpackhi_epi8(E15l,m128Tmp0)); |
5069 | 25.5k | m128iS30= _mm_packus_epi16(m128iS30,m128iS31); |
5070 | | |
5071 | | |
5072 | 25.5k | _mm_store_si128((__m128i*)dst,m128iS0); |
5073 | 25.5k | _mm_store_si128((__m128i*)(dst+16),m128iS2); |
5074 | 25.5k | _mm_store_si128((__m128i*)(dst+stride),m128iS4); |
5075 | 25.5k | _mm_store_si128((__m128i*)(dst+stride+16),m128iS6); |
5076 | 25.5k | _mm_store_si128((__m128i*)(dst+2*stride),m128iS8); |
5077 | 25.5k | _mm_store_si128((__m128i*)(dst+2*stride+16),m128iS10); |
5078 | 25.5k | _mm_store_si128((__m128i*)(dst+3*stride),m128iS12); |
5079 | 25.5k | _mm_store_si128((__m128i*)(dst+3*stride+16),m128iS14); |
5080 | 25.5k | _mm_store_si128((__m128i*)(dst+4*stride),m128iS16); |
5081 | 25.5k | _mm_store_si128((__m128i*)(dst+4*stride+16),m128iS18); |
5082 | 25.5k | _mm_store_si128((__m128i*)(dst+5*stride),m128iS20); |
5083 | 25.5k | _mm_store_si128((__m128i*)(dst+5*stride+16),m128iS22); |
5084 | 25.5k | _mm_store_si128((__m128i*)(dst+6*stride),m128iS24); |
5085 | 25.5k | _mm_store_si128((__m128i*)(dst+6*stride+16),m128iS26); |
5086 | 25.5k | _mm_store_si128((__m128i*)(dst+7*stride),m128iS28); |
5087 | 25.5k | _mm_store_si128((__m128i*)(dst+7*stride+16),m128iS30); |
5088 | | |
5089 | | |
5090 | 25.5k | if(i==0){ |
5091 | | //load next values : |
5092 | 6.39k | m128iS0 = r1; |
5093 | 6.39k | m128iS1 = r5; |
5094 | 6.39k | m128iS2 = r9; |
5095 | 6.39k | m128iS3 = r13; |
5096 | 6.39k | m128iS4 = r17; |
5097 | 6.39k | m128iS5 = r21; |
5098 | 6.39k | m128iS6 = r25; |
5099 | 6.39k | m128iS7 = r29; |
5100 | 6.39k | m128iS8 = r33; |
5101 | 6.39k | m128iS9 = r37; |
5102 | 6.39k | m128iS10 = r41; |
5103 | 6.39k | m128iS11 = r45; |
5104 | 6.39k | m128iS12 = r49; |
5105 | 6.39k | m128iS13 = r53; |
5106 | 6.39k | m128iS14 = r57; |
5107 | 6.39k | m128iS15 = r61; |
5108 | 6.39k | m128iS16 = r65; |
5109 | 6.39k | m128iS17 = r69; |
5110 | 6.39k | m128iS18 = r73; |
5111 | 6.39k | m128iS19 = r77; |
5112 | 6.39k | m128iS20 = r81; |
5113 | 6.39k | m128iS21 = r85; |
5114 | 6.39k | m128iS22 = r89; |
5115 | 6.39k | m128iS23 = r93; |
5116 | 6.39k | m128iS24 = r97; |
5117 | 6.39k | m128iS25 = r101; |
5118 | 6.39k | m128iS26 = r105; |
5119 | 6.39k | m128iS27 = r109; |
5120 | 6.39k | m128iS28 = r113; |
5121 | 6.39k | m128iS29 = r117; |
5122 | 6.39k | m128iS30 = r121; |
5123 | 6.39k | m128iS31 =r125; |
5124 | | |
5125 | 19.1k | }else if(i ==8){ |
5126 | | //load next values : |
5127 | 6.39k | m128iS0 = r2; |
5128 | 6.39k | m128iS1 = r6; |
5129 | 6.39k | m128iS2 = r10; |
5130 | 6.39k | m128iS3 = r14; |
5131 | 6.39k | m128iS4 = r18; |
5132 | 6.39k | m128iS5 = r22; |
5133 | 6.39k | m128iS6 = r26; |
5134 | 6.39k | m128iS7 = r30; |
5135 | 6.39k | m128iS8 = r34; |
5136 | 6.39k | m128iS9 = r38; |
5137 | 6.39k | m128iS10 = r42; |
5138 | 6.39k | m128iS11 = r46; |
5139 | 6.39k | m128iS12 = r50; |
5140 | 6.39k | m128iS13 = r54; |
5141 | 6.39k | m128iS14 = r58; |
5142 | 6.39k | m128iS15 = r62; |
5143 | 6.39k | m128iS16 = r66; |
5144 | 6.39k | m128iS17 = r70; |
5145 | 6.39k | m128iS18 = r74; |
5146 | 6.39k | m128iS19 = r78; |
5147 | 6.39k | m128iS20 = r82; |
5148 | 6.39k | m128iS21 = r86; |
5149 | 6.39k | m128iS22 = r90; |
5150 | 6.39k | m128iS23 = r94; |
5151 | 6.39k | m128iS24 = r98; |
5152 | 6.39k | m128iS25 = r102; |
5153 | 6.39k | m128iS26 = r106; |
5154 | 6.39k | m128iS27 = r110; |
5155 | 6.39k | m128iS28 = r114; |
5156 | 6.39k | m128iS29 = r118; |
5157 | 6.39k | m128iS30 = r122; |
5158 | 6.39k | m128iS31 =r126; |
5159 | | |
5160 | 12.7k | }else if(i==16) |
5161 | 6.39k | { |
5162 | | //load next values : |
5163 | 6.39k | m128iS0 = r3; |
5164 | 6.39k | m128iS1 = r7; |
5165 | 6.39k | m128iS2 = r11; |
5166 | 6.39k | m128iS3 = r15; |
5167 | 6.39k | m128iS4 = r19; |
5168 | 6.39k | m128iS5 = r23; |
5169 | 6.39k | m128iS6 = r27; |
5170 | 6.39k | m128iS7 = r31; |
5171 | 6.39k | m128iS8 = r35; |
5172 | 6.39k | m128iS9 = r39; |
5173 | 6.39k | m128iS10 = r43; |
5174 | 6.39k | m128iS11 = r47; |
5175 | 6.39k | m128iS12 = r51; |
5176 | 6.39k | m128iS13 = r55; |
5177 | 6.39k | m128iS14 = r59; |
5178 | 6.39k | m128iS15 = r63; |
5179 | 6.39k | m128iS16 = r67; |
5180 | 6.39k | m128iS17 = r71; |
5181 | 6.39k | m128iS18 = r75; |
5182 | 6.39k | m128iS19 = r79; |
5183 | 6.39k | m128iS20 = r83; |
5184 | 6.39k | m128iS21 = r87; |
5185 | 6.39k | m128iS22 = r91; |
5186 | 6.39k | m128iS23 = r95; |
5187 | 6.39k | m128iS24 = r99; |
5188 | 6.39k | m128iS25 = r103; |
5189 | 6.39k | m128iS26 = r107; |
5190 | 6.39k | m128iS27 = r111; |
5191 | 6.39k | m128iS28 = r115; |
5192 | 6.39k | m128iS29 = r119; |
5193 | 6.39k | m128iS30 = r123; |
5194 | 6.39k | m128iS31 =r127; |
5195 | 6.39k | } |
5196 | 25.5k | } |
5197 | 51.1k | } |
5198 | 12.7k | } |
5199 | 6.39k | } |
5200 | | #endif |
5201 | | |
5202 | | |
5203 | | #if 0 |
5204 | | void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, const int16_t *coeffs, |
5205 | | ptrdiff_t _stride) { |
5206 | | int i, j; |
5207 | | uint16_t *dst = (uint16_t*) _dst; |
5208 | | ptrdiff_t stride = _stride / 2; |
5209 | | int shift; |
5210 | | uint8_t shift_2nd = 10; //20 - bit depth |
5211 | | uint16_t add_2nd = 1<<9; //shift2 - 1 |
5212 | | int16_t *src = coeffs; |
5213 | | |
5214 | | __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6, |
5215 | | m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13, |
5216 | | m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, |
5217 | | m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h, |
5218 | | E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h, |
5219 | | O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l, |
5220 | | E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h; |
5221 | | __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l; |
5222 | | __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h, |
5223 | | EEE0l, EEE1l, EEE0h, EEE1h; |
5224 | | __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21, |
5225 | | m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27, |
5226 | | m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9, |
5227 | | m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15, |
5228 | | O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l, |
5229 | | O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l, |
5230 | | EE4l, EE7h, EE6h, EE5h, EE4h; |
5231 | | m128iS0 = _mm_load_si128((__m128i *) (src)); |
5232 | | m128iS1 = _mm_load_si128((__m128i *) (src + 32)); |
5233 | | m128iS2 = _mm_load_si128((__m128i *) (src + 64)); |
5234 | | m128iS3 = _mm_load_si128((__m128i *) (src + 96)); |
5235 | | m128iS4 = _mm_loadu_si128((__m128i *) (src + 128)); |
5236 | | m128iS5 = _mm_load_si128((__m128i *) (src + 160)); |
5237 | | m128iS6 = _mm_load_si128((__m128i *) (src + 192)); |
5238 | | m128iS7 = _mm_load_si128((__m128i *) (src + 224)); |
5239 | | m128iS8 = _mm_load_si128((__m128i *) (src + 256)); |
5240 | | m128iS9 = _mm_load_si128((__m128i *) (src + 288)); |
5241 | | m128iS10 = _mm_load_si128((__m128i *) (src + 320)); |
5242 | | m128iS11 = _mm_load_si128((__m128i *) (src + 352)); |
5243 | | m128iS12 = _mm_loadu_si128((__m128i *) (src + 384)); |
5244 | | m128iS13 = _mm_load_si128((__m128i *) (src + 416)); |
5245 | | m128iS14 = _mm_load_si128((__m128i *) (src + 448)); |
5246 | | m128iS15 = _mm_load_si128((__m128i *) (src + 480)); |
5247 | | m128iS16 = _mm_load_si128((__m128i *) (src + 512)); |
5248 | | m128iS17 = _mm_load_si128((__m128i *) (src + 544)); |
5249 | | m128iS18 = _mm_load_si128((__m128i *) (src + 576)); |
5250 | | m128iS19 = _mm_load_si128((__m128i *) (src + 608)); |
5251 | | m128iS20 = _mm_load_si128((__m128i *) (src + 640)); |
5252 | | m128iS21 = _mm_load_si128((__m128i *) (src + 672)); |
5253 | | m128iS22 = _mm_load_si128((__m128i *) (src + 704)); |
5254 | | m128iS23 = _mm_load_si128((__m128i *) (src + 736)); |
5255 | | m128iS24 = _mm_load_si128((__m128i *) (src + 768)); |
5256 | | m128iS25 = _mm_load_si128((__m128i *) (src + 800)); |
5257 | | m128iS26 = _mm_load_si128((__m128i *) (src + 832)); |
5258 | | m128iS27 = _mm_load_si128((__m128i *) (src + 864)); |
5259 | | m128iS28 = _mm_load_si128((__m128i *) (src + 896)); |
5260 | | m128iS29 = _mm_load_si128((__m128i *) (src + 928)); |
5261 | | m128iS30 = _mm_load_si128((__m128i *) (src + 960)); |
5262 | | m128iS31 = _mm_load_si128((__m128i *) (src + 992)); |
5263 | | |
5264 | | shift = shift_1st; |
5265 | | m128iAdd = _mm_set1_epi32(add_1st); |
5266 | | |
5267 | | for (j = 0; j < 2; j++) { |
5268 | | for (i = 0; i < 32; i += 8) { |
5269 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3); |
5270 | | E0l = _mm_madd_epi16(m128Tmp0, |
5271 | | _mm_load_si128((__m128i *) (transform32x32[0][0]))); |
5272 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3); |
5273 | | E0h = _mm_madd_epi16(m128Tmp1, |
5274 | | _mm_load_si128((__m128i *) (transform32x32[0][0]))); |
5275 | | |
5276 | | m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7); |
5277 | | E1l = _mm_madd_epi16(m128Tmp2, |
5278 | | _mm_load_si128((__m128i *) (transform32x32[1][0]))); |
5279 | | m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7); |
5280 | | E1h = _mm_madd_epi16(m128Tmp3, |
5281 | | _mm_load_si128((__m128i *) (transform32x32[1][0]))); |
5282 | | |
5283 | | m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11); |
5284 | | E2l = _mm_madd_epi16(m128Tmp4, |
5285 | | _mm_load_si128((__m128i *) (transform32x32[2][0]))); |
5286 | | m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11); |
5287 | | E2h = _mm_madd_epi16(m128Tmp5, |
5288 | | _mm_load_si128((__m128i *) (transform32x32[2][0]))); |
5289 | | |
5290 | | m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15); |
5291 | | E3l = _mm_madd_epi16(m128Tmp6, |
5292 | | _mm_load_si128((__m128i *) (transform32x32[3][0]))); |
5293 | | m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15); |
5294 | | E3h = _mm_madd_epi16(m128Tmp7, |
5295 | | _mm_load_si128((__m128i *) (transform32x32[3][0]))); |
5296 | | |
5297 | | m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19); |
5298 | | E4l = _mm_madd_epi16(m128Tmp8, |
5299 | | _mm_load_si128((__m128i *) (transform32x32[4][0]))); |
5300 | | m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19); |
5301 | | E4h = _mm_madd_epi16(m128Tmp9, |
5302 | | _mm_load_si128((__m128i *) (transform32x32[4][0]))); |
5303 | | |
5304 | | m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23); |
5305 | | E5l = _mm_madd_epi16(m128Tmp10, |
5306 | | _mm_load_si128((__m128i *) (transform32x32[5][0]))); |
5307 | | m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23); |
5308 | | E5h = _mm_madd_epi16(m128Tmp11, |
5309 | | _mm_load_si128((__m128i *) (transform32x32[5][0]))); |
5310 | | |
5311 | | m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27); |
5312 | | E6l = _mm_madd_epi16(m128Tmp12, |
5313 | | _mm_load_si128((__m128i *) (transform32x32[6][0]))); |
5314 | | m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27); |
5315 | | E6h = _mm_madd_epi16(m128Tmp13, |
5316 | | _mm_load_si128((__m128i *) (transform32x32[6][0]))); |
5317 | | |
5318 | | m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31); |
5319 | | E7l = _mm_madd_epi16(m128Tmp14, |
5320 | | _mm_load_si128((__m128i *) (transform32x32[7][0]))); |
5321 | | m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31); |
5322 | | E7h = _mm_madd_epi16(m128Tmp15, |
5323 | | _mm_load_si128((__m128i *) (transform32x32[7][0]))); |
5324 | | |
5325 | | O0l = _mm_add_epi32(E0l, E1l); |
5326 | | O0l = _mm_add_epi32(O0l, E2l); |
5327 | | O0l = _mm_add_epi32(O0l, E3l); |
5328 | | O0l = _mm_add_epi32(O0l, E4l); |
5329 | | O0l = _mm_add_epi32(O0l, E5l); |
5330 | | O0l = _mm_add_epi32(O0l, E6l); |
5331 | | O0l = _mm_add_epi32(O0l, E7l); |
5332 | | |
5333 | | O0h = _mm_add_epi32(E0h, E1h); |
5334 | | O0h = _mm_add_epi32(O0h, E2h); |
5335 | | O0h = _mm_add_epi32(O0h, E3h); |
5336 | | O0h = _mm_add_epi32(O0h, E4h); |
5337 | | O0h = _mm_add_epi32(O0h, E5h); |
5338 | | O0h = _mm_add_epi32(O0h, E6h); |
5339 | | O0h = _mm_add_epi32(O0h, E7h); |
5340 | | |
5341 | | /* Compute O1*/ |
5342 | | E0l = _mm_madd_epi16(m128Tmp0, |
5343 | | _mm_load_si128((__m128i *) (transform32x32[0][1]))); |
5344 | | E0h = _mm_madd_epi16(m128Tmp1, |
5345 | | _mm_load_si128((__m128i *) (transform32x32[0][1]))); |
5346 | | E1l = _mm_madd_epi16(m128Tmp2, |
5347 | | _mm_load_si128((__m128i *) (transform32x32[1][1]))); |
5348 | | E1h = _mm_madd_epi16(m128Tmp3, |
5349 | | _mm_load_si128((__m128i *) (transform32x32[1][1]))); |
5350 | | E2l = _mm_madd_epi16(m128Tmp4, |
5351 | | _mm_load_si128((__m128i *) (transform32x32[2][1]))); |
5352 | | E2h = _mm_madd_epi16(m128Tmp5, |
5353 | | _mm_load_si128((__m128i *) (transform32x32[2][1]))); |
5354 | | E3l = _mm_madd_epi16(m128Tmp6, |
5355 | | _mm_load_si128((__m128i *) (transform32x32[3][1]))); |
5356 | | E3h = _mm_madd_epi16(m128Tmp7, |
5357 | | _mm_load_si128((__m128i *) (transform32x32[3][1]))); |
5358 | | |
5359 | | E4l = _mm_madd_epi16(m128Tmp8, |
5360 | | _mm_load_si128((__m128i *) (transform32x32[4][1]))); |
5361 | | E4h = _mm_madd_epi16(m128Tmp9, |
5362 | | _mm_load_si128((__m128i *) (transform32x32[4][1]))); |
5363 | | E5l = _mm_madd_epi16(m128Tmp10, |
5364 | | _mm_load_si128((__m128i *) (transform32x32[5][1]))); |
5365 | | E5h = _mm_madd_epi16(m128Tmp11, |
5366 | | _mm_load_si128((__m128i *) (transform32x32[5][1]))); |
5367 | | E6l = _mm_madd_epi16(m128Tmp12, |
5368 | | _mm_load_si128((__m128i *) (transform32x32[6][1]))); |
5369 | | E6h = _mm_madd_epi16(m128Tmp13, |
5370 | | _mm_load_si128((__m128i *) (transform32x32[6][1]))); |
5371 | | E7l = _mm_madd_epi16(m128Tmp14, |
5372 | | _mm_load_si128((__m128i *) (transform32x32[7][1]))); |
5373 | | E7h = _mm_madd_epi16(m128Tmp15, |
5374 | | _mm_load_si128((__m128i *) (transform32x32[7][1]))); |
5375 | | |
5376 | | O1l = _mm_add_epi32(E0l, E1l); |
5377 | | O1l = _mm_add_epi32(O1l, E2l); |
5378 | | O1l = _mm_add_epi32(O1l, E3l); |
5379 | | O1l = _mm_add_epi32(O1l, E4l); |
5380 | | O1l = _mm_add_epi32(O1l, E5l); |
5381 | | O1l = _mm_add_epi32(O1l, E6l); |
5382 | | O1l = _mm_add_epi32(O1l, E7l); |
5383 | | |
5384 | | O1h = _mm_add_epi32(E0h, E1h); |
5385 | | O1h = _mm_add_epi32(O1h, E2h); |
5386 | | O1h = _mm_add_epi32(O1h, E3h); |
5387 | | O1h = _mm_add_epi32(O1h, E4h); |
5388 | | O1h = _mm_add_epi32(O1h, E5h); |
5389 | | O1h = _mm_add_epi32(O1h, E6h); |
5390 | | O1h = _mm_add_epi32(O1h, E7h); |
5391 | | /* Compute O2*/ |
5392 | | E0l = _mm_madd_epi16(m128Tmp0, |
5393 | | _mm_load_si128((__m128i *) (transform32x32[0][2]))); |
5394 | | E0h = _mm_madd_epi16(m128Tmp1, |
5395 | | _mm_load_si128((__m128i *) (transform32x32[0][2]))); |
5396 | | E1l = _mm_madd_epi16(m128Tmp2, |
5397 | | _mm_load_si128((__m128i *) (transform32x32[1][2]))); |
5398 | | E1h = _mm_madd_epi16(m128Tmp3, |
5399 | | _mm_load_si128((__m128i *) (transform32x32[1][2]))); |
5400 | | E2l = _mm_madd_epi16(m128Tmp4, |
5401 | | _mm_load_si128((__m128i *) (transform32x32[2][2]))); |
5402 | | E2h = _mm_madd_epi16(m128Tmp5, |
5403 | | _mm_load_si128((__m128i *) (transform32x32[2][2]))); |
5404 | | E3l = _mm_madd_epi16(m128Tmp6, |
5405 | | _mm_load_si128((__m128i *) (transform32x32[3][2]))); |
5406 | | E3h = _mm_madd_epi16(m128Tmp7, |
5407 | | _mm_load_si128((__m128i *) (transform32x32[3][2]))); |
5408 | | |
5409 | | E4l = _mm_madd_epi16(m128Tmp8, |
5410 | | _mm_load_si128((__m128i *) (transform32x32[4][2]))); |
5411 | | E4h = _mm_madd_epi16(m128Tmp9, |
5412 | | _mm_load_si128((__m128i *) (transform32x32[4][2]))); |
5413 | | E5l = _mm_madd_epi16(m128Tmp10, |
5414 | | _mm_load_si128((__m128i *) (transform32x32[5][2]))); |
5415 | | E5h = _mm_madd_epi16(m128Tmp11, |
5416 | | _mm_load_si128((__m128i *) (transform32x32[5][2]))); |
5417 | | E6l = _mm_madd_epi16(m128Tmp12, |
5418 | | _mm_load_si128((__m128i *) (transform32x32[6][2]))); |
5419 | | E6h = _mm_madd_epi16(m128Tmp13, |
5420 | | _mm_load_si128((__m128i *) (transform32x32[6][2]))); |
5421 | | E7l = _mm_madd_epi16(m128Tmp14, |
5422 | | _mm_load_si128((__m128i *) (transform32x32[7][2]))); |
5423 | | E7h = _mm_madd_epi16(m128Tmp15, |
5424 | | _mm_load_si128((__m128i *) (transform32x32[7][2]))); |
5425 | | |
5426 | | O2l = _mm_add_epi32(E0l, E1l); |
5427 | | O2l = _mm_add_epi32(O2l, E2l); |
5428 | | O2l = _mm_add_epi32(O2l, E3l); |
5429 | | O2l = _mm_add_epi32(O2l, E4l); |
5430 | | O2l = _mm_add_epi32(O2l, E5l); |
5431 | | O2l = _mm_add_epi32(O2l, E6l); |
5432 | | O2l = _mm_add_epi32(O2l, E7l); |
5433 | | |
5434 | | O2h = _mm_add_epi32(E0h, E1h); |
5435 | | O2h = _mm_add_epi32(O2h, E2h); |
5436 | | O2h = _mm_add_epi32(O2h, E3h); |
5437 | | O2h = _mm_add_epi32(O2h, E4h); |
5438 | | O2h = _mm_add_epi32(O2h, E5h); |
5439 | | O2h = _mm_add_epi32(O2h, E6h); |
5440 | | O2h = _mm_add_epi32(O2h, E7h); |
5441 | | /* Compute O3*/ |
5442 | | E0l = _mm_madd_epi16(m128Tmp0, |
5443 | | _mm_load_si128((__m128i *) (transform32x32[0][3]))); |
5444 | | E0h = _mm_madd_epi16(m128Tmp1, |
5445 | | _mm_load_si128((__m128i *) (transform32x32[0][3]))); |
5446 | | E1l = _mm_madd_epi16(m128Tmp2, |
5447 | | _mm_load_si128((__m128i *) (transform32x32[1][3]))); |
5448 | | E1h = _mm_madd_epi16(m128Tmp3, |
5449 | | _mm_load_si128((__m128i *) (transform32x32[1][3]))); |
5450 | | E2l = _mm_madd_epi16(m128Tmp4, |
5451 | | _mm_load_si128((__m128i *) (transform32x32[2][3]))); |
5452 | | E2h = _mm_madd_epi16(m128Tmp5, |
5453 | | _mm_load_si128((__m128i *) (transform32x32[2][3]))); |
5454 | | E3l = _mm_madd_epi16(m128Tmp6, |
5455 | | _mm_load_si128((__m128i *) (transform32x32[3][3]))); |
5456 | | E3h = _mm_madd_epi16(m128Tmp7, |
5457 | | _mm_load_si128((__m128i *) (transform32x32[3][3]))); |
5458 | | |
5459 | | E4l = _mm_madd_epi16(m128Tmp8, |
5460 | | _mm_load_si128((__m128i *) (transform32x32[4][3]))); |
5461 | | E4h = _mm_madd_epi16(m128Tmp9, |
5462 | | _mm_load_si128((__m128i *) (transform32x32[4][3]))); |
5463 | | E5l = _mm_madd_epi16(m128Tmp10, |
5464 | | _mm_load_si128((__m128i *) (transform32x32[5][3]))); |
5465 | | E5h = _mm_madd_epi16(m128Tmp11, |
5466 | | _mm_load_si128((__m128i *) (transform32x32[5][3]))); |
5467 | | E6l = _mm_madd_epi16(m128Tmp12, |
5468 | | _mm_load_si128((__m128i *) (transform32x32[6][3]))); |
5469 | | E6h = _mm_madd_epi16(m128Tmp13, |
5470 | | _mm_load_si128((__m128i *) (transform32x32[6][3]))); |
5471 | | E7l = _mm_madd_epi16(m128Tmp14, |
5472 | | _mm_load_si128((__m128i *) (transform32x32[7][3]))); |
5473 | | E7h = _mm_madd_epi16(m128Tmp15, |
5474 | | _mm_load_si128((__m128i *) (transform32x32[7][3]))); |
5475 | | |
5476 | | O3l = _mm_add_epi32(E0l, E1l); |
5477 | | O3l = _mm_add_epi32(O3l, E2l); |
5478 | | O3l = _mm_add_epi32(O3l, E3l); |
5479 | | O3l = _mm_add_epi32(O3l, E4l); |
5480 | | O3l = _mm_add_epi32(O3l, E5l); |
5481 | | O3l = _mm_add_epi32(O3l, E6l); |
5482 | | O3l = _mm_add_epi32(O3l, E7l); |
5483 | | |
5484 | | O3h = _mm_add_epi32(E0h, E1h); |
5485 | | O3h = _mm_add_epi32(O3h, E2h); |
5486 | | O3h = _mm_add_epi32(O3h, E3h); |
5487 | | O3h = _mm_add_epi32(O3h, E4h); |
5488 | | O3h = _mm_add_epi32(O3h, E5h); |
5489 | | O3h = _mm_add_epi32(O3h, E6h); |
5490 | | O3h = _mm_add_epi32(O3h, E7h); |
5491 | | /* Compute O4*/ |
5492 | | |
5493 | | E0l = _mm_madd_epi16(m128Tmp0, |
5494 | | _mm_load_si128((__m128i *) (transform32x32[0][4]))); |
5495 | | E0h = _mm_madd_epi16(m128Tmp1, |
5496 | | _mm_load_si128((__m128i *) (transform32x32[0][4]))); |
5497 | | E1l = _mm_madd_epi16(m128Tmp2, |
5498 | | _mm_load_si128((__m128i *) (transform32x32[1][4]))); |
5499 | | E1h = _mm_madd_epi16(m128Tmp3, |
5500 | | _mm_load_si128((__m128i *) (transform32x32[1][4]))); |
5501 | | E2l = _mm_madd_epi16(m128Tmp4, |
5502 | | _mm_load_si128((__m128i *) (transform32x32[2][4]))); |
5503 | | E2h = _mm_madd_epi16(m128Tmp5, |
5504 | | _mm_load_si128((__m128i *) (transform32x32[2][4]))); |
5505 | | E3l = _mm_madd_epi16(m128Tmp6, |
5506 | | _mm_load_si128((__m128i *) (transform32x32[3][4]))); |
5507 | | E3h = _mm_madd_epi16(m128Tmp7, |
5508 | | _mm_load_si128((__m128i *) (transform32x32[3][4]))); |
5509 | | |
5510 | | E4l = _mm_madd_epi16(m128Tmp8, |
5511 | | _mm_load_si128((__m128i *) (transform32x32[4][4]))); |
5512 | | E4h = _mm_madd_epi16(m128Tmp9, |
5513 | | _mm_load_si128((__m128i *) (transform32x32[4][4]))); |
5514 | | E5l = _mm_madd_epi16(m128Tmp10, |
5515 | | _mm_load_si128((__m128i *) (transform32x32[5][4]))); |
5516 | | E5h = _mm_madd_epi16(m128Tmp11, |
5517 | | _mm_load_si128((__m128i *) (transform32x32[5][4]))); |
5518 | | E6l = _mm_madd_epi16(m128Tmp12, |
5519 | | _mm_load_si128((__m128i *) (transform32x32[6][4]))); |
5520 | | E6h = _mm_madd_epi16(m128Tmp13, |
5521 | | _mm_load_si128((__m128i *) (transform32x32[6][4]))); |
5522 | | E7l = _mm_madd_epi16(m128Tmp14, |
5523 | | _mm_load_si128((__m128i *) (transform32x32[7][4]))); |
5524 | | E7h = _mm_madd_epi16(m128Tmp15, |
5525 | | _mm_load_si128((__m128i *) (transform32x32[7][4]))); |
5526 | | |
5527 | | O4l = _mm_add_epi32(E0l, E1l); |
5528 | | O4l = _mm_add_epi32(O4l, E2l); |
5529 | | O4l = _mm_add_epi32(O4l, E3l); |
5530 | | O4l = _mm_add_epi32(O4l, E4l); |
5531 | | O4l = _mm_add_epi32(O4l, E5l); |
5532 | | O4l = _mm_add_epi32(O4l, E6l); |
5533 | | O4l = _mm_add_epi32(O4l, E7l); |
5534 | | |
5535 | | O4h = _mm_add_epi32(E0h, E1h); |
5536 | | O4h = _mm_add_epi32(O4h, E2h); |
5537 | | O4h = _mm_add_epi32(O4h, E3h); |
5538 | | O4h = _mm_add_epi32(O4h, E4h); |
5539 | | O4h = _mm_add_epi32(O4h, E5h); |
5540 | | O4h = _mm_add_epi32(O4h, E6h); |
5541 | | O4h = _mm_add_epi32(O4h, E7h); |
5542 | | |
5543 | | /* Compute O5*/ |
5544 | | E0l = _mm_madd_epi16(m128Tmp0, |
5545 | | _mm_load_si128((__m128i *) (transform32x32[0][5]))); |
5546 | | E0h = _mm_madd_epi16(m128Tmp1, |
5547 | | _mm_load_si128((__m128i *) (transform32x32[0][5]))); |
5548 | | E1l = _mm_madd_epi16(m128Tmp2, |
5549 | | _mm_load_si128((__m128i *) (transform32x32[1][5]))); |
5550 | | E1h = _mm_madd_epi16(m128Tmp3, |
5551 | | _mm_load_si128((__m128i *) (transform32x32[1][5]))); |
5552 | | E2l = _mm_madd_epi16(m128Tmp4, |
5553 | | _mm_load_si128((__m128i *) (transform32x32[2][5]))); |
5554 | | E2h = _mm_madd_epi16(m128Tmp5, |
5555 | | _mm_load_si128((__m128i *) (transform32x32[2][5]))); |
5556 | | E3l = _mm_madd_epi16(m128Tmp6, |
5557 | | _mm_load_si128((__m128i *) (transform32x32[3][5]))); |
5558 | | E3h = _mm_madd_epi16(m128Tmp7, |
5559 | | _mm_load_si128((__m128i *) (transform32x32[3][5]))); |
5560 | | |
5561 | | E4l = _mm_madd_epi16(m128Tmp8, |
5562 | | _mm_load_si128((__m128i *) (transform32x32[4][5]))); |
5563 | | E4h = _mm_madd_epi16(m128Tmp9, |
5564 | | _mm_load_si128((__m128i *) (transform32x32[4][5]))); |
5565 | | E5l = _mm_madd_epi16(m128Tmp10, |
5566 | | _mm_load_si128((__m128i *) (transform32x32[5][5]))); |
5567 | | E5h = _mm_madd_epi16(m128Tmp11, |
5568 | | _mm_load_si128((__m128i *) (transform32x32[5][5]))); |
5569 | | E6l = _mm_madd_epi16(m128Tmp12, |
5570 | | _mm_load_si128((__m128i *) (transform32x32[6][5]))); |
5571 | | E6h = _mm_madd_epi16(m128Tmp13, |
5572 | | _mm_load_si128((__m128i *) (transform32x32[6][5]))); |
5573 | | E7l = _mm_madd_epi16(m128Tmp14, |
5574 | | _mm_load_si128((__m128i *) (transform32x32[7][5]))); |
5575 | | E7h = _mm_madd_epi16(m128Tmp15, |
5576 | | _mm_load_si128((__m128i *) (transform32x32[7][5]))); |
5577 | | |
5578 | | O5l = _mm_add_epi32(E0l, E1l); |
5579 | | O5l = _mm_add_epi32(O5l, E2l); |
5580 | | O5l = _mm_add_epi32(O5l, E3l); |
5581 | | O5l = _mm_add_epi32(O5l, E4l); |
5582 | | O5l = _mm_add_epi32(O5l, E5l); |
5583 | | O5l = _mm_add_epi32(O5l, E6l); |
5584 | | O5l = _mm_add_epi32(O5l, E7l); |
5585 | | |
5586 | | O5h = _mm_add_epi32(E0h, E1h); |
5587 | | O5h = _mm_add_epi32(O5h, E2h); |
5588 | | O5h = _mm_add_epi32(O5h, E3h); |
5589 | | O5h = _mm_add_epi32(O5h, E4h); |
5590 | | O5h = _mm_add_epi32(O5h, E5h); |
5591 | | O5h = _mm_add_epi32(O5h, E6h); |
5592 | | O5h = _mm_add_epi32(O5h, E7h); |
5593 | | |
5594 | | /* Compute O6*/ |
5595 | | |
5596 | | E0l = _mm_madd_epi16(m128Tmp0, |
5597 | | _mm_load_si128((__m128i *) (transform32x32[0][6]))); |
5598 | | E0h = _mm_madd_epi16(m128Tmp1, |
5599 | | _mm_load_si128((__m128i *) (transform32x32[0][6]))); |
5600 | | E1l = _mm_madd_epi16(m128Tmp2, |
5601 | | _mm_load_si128((__m128i *) (transform32x32[1][6]))); |
5602 | | E1h = _mm_madd_epi16(m128Tmp3, |
5603 | | _mm_load_si128((__m128i *) (transform32x32[1][6]))); |
5604 | | E2l = _mm_madd_epi16(m128Tmp4, |
5605 | | _mm_load_si128((__m128i *) (transform32x32[2][6]))); |
5606 | | E2h = _mm_madd_epi16(m128Tmp5, |
5607 | | _mm_load_si128((__m128i *) (transform32x32[2][6]))); |
5608 | | E3l = _mm_madd_epi16(m128Tmp6, |
5609 | | _mm_load_si128((__m128i *) (transform32x32[3][6]))); |
5610 | | E3h = _mm_madd_epi16(m128Tmp7, |
5611 | | _mm_load_si128((__m128i *) (transform32x32[3][6]))); |
5612 | | |
5613 | | E4l = _mm_madd_epi16(m128Tmp8, |
5614 | | _mm_load_si128((__m128i *) (transform32x32[4][6]))); |
5615 | | E4h = _mm_madd_epi16(m128Tmp9, |
5616 | | _mm_load_si128((__m128i *) (transform32x32[4][6]))); |
5617 | | E5l = _mm_madd_epi16(m128Tmp10, |
5618 | | _mm_load_si128((__m128i *) (transform32x32[5][6]))); |
5619 | | E5h = _mm_madd_epi16(m128Tmp11, |
5620 | | _mm_load_si128((__m128i *) (transform32x32[5][6]))); |
5621 | | E6l = _mm_madd_epi16(m128Tmp12, |
5622 | | _mm_load_si128((__m128i *) (transform32x32[6][6]))); |
5623 | | E6h = _mm_madd_epi16(m128Tmp13, |
5624 | | _mm_load_si128((__m128i *) (transform32x32[6][6]))); |
5625 | | E7l = _mm_madd_epi16(m128Tmp14, |
5626 | | _mm_load_si128((__m128i *) (transform32x32[7][6]))); |
5627 | | E7h = _mm_madd_epi16(m128Tmp15, |
5628 | | _mm_load_si128((__m128i *) (transform32x32[7][6]))); |
5629 | | |
5630 | | O6l = _mm_add_epi32(E0l, E1l); |
5631 | | O6l = _mm_add_epi32(O6l, E2l); |
5632 | | O6l = _mm_add_epi32(O6l, E3l); |
5633 | | O6l = _mm_add_epi32(O6l, E4l); |
5634 | | O6l = _mm_add_epi32(O6l, E5l); |
5635 | | O6l = _mm_add_epi32(O6l, E6l); |
5636 | | O6l = _mm_add_epi32(O6l, E7l); |
5637 | | |
5638 | | O6h = _mm_add_epi32(E0h, E1h); |
5639 | | O6h = _mm_add_epi32(O6h, E2h); |
5640 | | O6h = _mm_add_epi32(O6h, E3h); |
5641 | | O6h = _mm_add_epi32(O6h, E4h); |
5642 | | O6h = _mm_add_epi32(O6h, E5h); |
5643 | | O6h = _mm_add_epi32(O6h, E6h); |
5644 | | O6h = _mm_add_epi32(O6h, E7h); |
5645 | | |
5646 | | /* Compute O7*/ |
5647 | | |
5648 | | E0l = _mm_madd_epi16(m128Tmp0, |
5649 | | _mm_load_si128((__m128i *) (transform32x32[0][7]))); |
5650 | | E0h = _mm_madd_epi16(m128Tmp1, |
5651 | | _mm_load_si128((__m128i *) (transform32x32[0][7]))); |
5652 | | E1l = _mm_madd_epi16(m128Tmp2, |
5653 | | _mm_load_si128((__m128i *) (transform32x32[1][7]))); |
5654 | | E1h = _mm_madd_epi16(m128Tmp3, |
5655 | | _mm_load_si128((__m128i *) (transform32x32[1][7]))); |
5656 | | E2l = _mm_madd_epi16(m128Tmp4, |
5657 | | _mm_load_si128((__m128i *) (transform32x32[2][7]))); |
5658 | | E2h = _mm_madd_epi16(m128Tmp5, |
5659 | | _mm_load_si128((__m128i *) (transform32x32[2][7]))); |
5660 | | E3l = _mm_madd_epi16(m128Tmp6, |
5661 | | _mm_load_si128((__m128i *) (transform32x32[3][7]))); |
5662 | | E3h = _mm_madd_epi16(m128Tmp7, |
5663 | | _mm_load_si128((__m128i *) (transform32x32[3][7]))); |
5664 | | |
5665 | | E4l = _mm_madd_epi16(m128Tmp8, |
5666 | | _mm_load_si128((__m128i *) (transform32x32[4][7]))); |
5667 | | E4h = _mm_madd_epi16(m128Tmp9, |
5668 | | _mm_load_si128((__m128i *) (transform32x32[4][7]))); |
5669 | | E5l = _mm_madd_epi16(m128Tmp10, |
5670 | | _mm_load_si128((__m128i *) (transform32x32[5][7]))); |
5671 | | E5h = _mm_madd_epi16(m128Tmp11, |
5672 | | _mm_load_si128((__m128i *) (transform32x32[5][7]))); |
5673 | | E6l = _mm_madd_epi16(m128Tmp12, |
5674 | | _mm_load_si128((__m128i *) (transform32x32[6][7]))); |
5675 | | E6h = _mm_madd_epi16(m128Tmp13, |
5676 | | _mm_load_si128((__m128i *) (transform32x32[6][7]))); |
5677 | | E7l = _mm_madd_epi16(m128Tmp14, |
5678 | | _mm_load_si128((__m128i *) (transform32x32[7][7]))); |
5679 | | E7h = _mm_madd_epi16(m128Tmp15, |
5680 | | _mm_load_si128((__m128i *) (transform32x32[7][7]))); |
5681 | | |
5682 | | O7l = _mm_add_epi32(E0l, E1l); |
5683 | | O7l = _mm_add_epi32(O7l, E2l); |
5684 | | O7l = _mm_add_epi32(O7l, E3l); |
5685 | | O7l = _mm_add_epi32(O7l, E4l); |
5686 | | O7l = _mm_add_epi32(O7l, E5l); |
5687 | | O7l = _mm_add_epi32(O7l, E6l); |
5688 | | O7l = _mm_add_epi32(O7l, E7l); |
5689 | | |
5690 | | O7h = _mm_add_epi32(E0h, E1h); |
5691 | | O7h = _mm_add_epi32(O7h, E2h); |
5692 | | O7h = _mm_add_epi32(O7h, E3h); |
5693 | | O7h = _mm_add_epi32(O7h, E4h); |
5694 | | O7h = _mm_add_epi32(O7h, E5h); |
5695 | | O7h = _mm_add_epi32(O7h, E6h); |
5696 | | O7h = _mm_add_epi32(O7h, E7h); |
5697 | | |
5698 | | /* Compute O8*/ |
5699 | | |
5700 | | E0l = _mm_madd_epi16(m128Tmp0, |
5701 | | _mm_load_si128((__m128i *) (transform32x32[0][8]))); |
5702 | | E0h = _mm_madd_epi16(m128Tmp1, |
5703 | | _mm_load_si128((__m128i *) (transform32x32[0][8]))); |
5704 | | E1l = _mm_madd_epi16(m128Tmp2, |
5705 | | _mm_load_si128((__m128i *) (transform32x32[1][8]))); |
5706 | | E1h = _mm_madd_epi16(m128Tmp3, |
5707 | | _mm_load_si128((__m128i *) (transform32x32[1][8]))); |
5708 | | E2l = _mm_madd_epi16(m128Tmp4, |
5709 | | _mm_load_si128((__m128i *) (transform32x32[2][8]))); |
5710 | | E2h = _mm_madd_epi16(m128Tmp5, |
5711 | | _mm_load_si128((__m128i *) (transform32x32[2][8]))); |
5712 | | E3l = _mm_madd_epi16(m128Tmp6, |
5713 | | _mm_load_si128((__m128i *) (transform32x32[3][8]))); |
5714 | | E3h = _mm_madd_epi16(m128Tmp7, |
5715 | | _mm_load_si128((__m128i *) (transform32x32[3][8]))); |
5716 | | |
5717 | | E4l = _mm_madd_epi16(m128Tmp8, |
5718 | | _mm_load_si128((__m128i *) (transform32x32[4][8]))); |
5719 | | E4h = _mm_madd_epi16(m128Tmp9, |
5720 | | _mm_load_si128((__m128i *) (transform32x32[4][8]))); |
5721 | | E5l = _mm_madd_epi16(m128Tmp10, |
5722 | | _mm_load_si128((__m128i *) (transform32x32[5][8]))); |
5723 | | E5h = _mm_madd_epi16(m128Tmp11, |
5724 | | _mm_load_si128((__m128i *) (transform32x32[5][8]))); |
5725 | | E6l = _mm_madd_epi16(m128Tmp12, |
5726 | | _mm_load_si128((__m128i *) (transform32x32[6][8]))); |
5727 | | E6h = _mm_madd_epi16(m128Tmp13, |
5728 | | _mm_load_si128((__m128i *) (transform32x32[6][8]))); |
5729 | | E7l = _mm_madd_epi16(m128Tmp14, |
5730 | | _mm_load_si128((__m128i *) (transform32x32[7][8]))); |
5731 | | E7h = _mm_madd_epi16(m128Tmp15, |
5732 | | _mm_load_si128((__m128i *) (transform32x32[7][8]))); |
5733 | | |
5734 | | O8l = _mm_add_epi32(E0l, E1l); |
5735 | | O8l = _mm_add_epi32(O8l, E2l); |
5736 | | O8l = _mm_add_epi32(O8l, E3l); |
5737 | | O8l = _mm_add_epi32(O8l, E4l); |
5738 | | O8l = _mm_add_epi32(O8l, E5l); |
5739 | | O8l = _mm_add_epi32(O8l, E6l); |
5740 | | O8l = _mm_add_epi32(O8l, E7l); |
5741 | | |
5742 | | O8h = _mm_add_epi32(E0h, E1h); |
5743 | | O8h = _mm_add_epi32(O8h, E2h); |
5744 | | O8h = _mm_add_epi32(O8h, E3h); |
5745 | | O8h = _mm_add_epi32(O8h, E4h); |
5746 | | O8h = _mm_add_epi32(O8h, E5h); |
5747 | | O8h = _mm_add_epi32(O8h, E6h); |
5748 | | O8h = _mm_add_epi32(O8h, E7h); |
5749 | | |
5750 | | /* Compute O9*/ |
5751 | | |
5752 | | E0l = _mm_madd_epi16(m128Tmp0, |
5753 | | _mm_load_si128((__m128i *) (transform32x32[0][9]))); |
5754 | | E0h = _mm_madd_epi16(m128Tmp1, |
5755 | | _mm_load_si128((__m128i *) (transform32x32[0][9]))); |
5756 | | E1l = _mm_madd_epi16(m128Tmp2, |
5757 | | _mm_load_si128((__m128i *) (transform32x32[1][9]))); |
5758 | | E1h = _mm_madd_epi16(m128Tmp3, |
5759 | | _mm_load_si128((__m128i *) (transform32x32[1][9]))); |
5760 | | E2l = _mm_madd_epi16(m128Tmp4, |
5761 | | _mm_load_si128((__m128i *) (transform32x32[2][9]))); |
5762 | | E2h = _mm_madd_epi16(m128Tmp5, |
5763 | | _mm_load_si128((__m128i *) (transform32x32[2][9]))); |
5764 | | E3l = _mm_madd_epi16(m128Tmp6, |
5765 | | _mm_load_si128((__m128i *) (transform32x32[3][9]))); |
5766 | | E3h = _mm_madd_epi16(m128Tmp7, |
5767 | | _mm_load_si128((__m128i *) (transform32x32[3][9]))); |
5768 | | |
5769 | | E4l = _mm_madd_epi16(m128Tmp8, |
5770 | | _mm_load_si128((__m128i *) (transform32x32[4][9]))); |
5771 | | E4h = _mm_madd_epi16(m128Tmp9, |
5772 | | _mm_load_si128((__m128i *) (transform32x32[4][9]))); |
5773 | | E5l = _mm_madd_epi16(m128Tmp10, |
5774 | | _mm_load_si128((__m128i *) (transform32x32[5][9]))); |
5775 | | E5h = _mm_madd_epi16(m128Tmp11, |
5776 | | _mm_load_si128((__m128i *) (transform32x32[5][9]))); |
5777 | | E6l = _mm_madd_epi16(m128Tmp12, |
5778 | | _mm_load_si128((__m128i *) (transform32x32[6][9]))); |
5779 | | E6h = _mm_madd_epi16(m128Tmp13, |
5780 | | _mm_load_si128((__m128i *) (transform32x32[6][9]))); |
5781 | | E7l = _mm_madd_epi16(m128Tmp14, |
5782 | | _mm_load_si128((__m128i *) (transform32x32[7][9]))); |
5783 | | E7h = _mm_madd_epi16(m128Tmp15, |
5784 | | _mm_load_si128((__m128i *) (transform32x32[7][9]))); |
5785 | | |
5786 | | O9l = _mm_add_epi32(E0l, E1l); |
5787 | | O9l = _mm_add_epi32(O9l, E2l); |
5788 | | O9l = _mm_add_epi32(O9l, E3l); |
5789 | | O9l = _mm_add_epi32(O9l, E4l); |
5790 | | O9l = _mm_add_epi32(O9l, E5l); |
5791 | | O9l = _mm_add_epi32(O9l, E6l); |
5792 | | O9l = _mm_add_epi32(O9l, E7l); |
5793 | | |
5794 | | O9h = _mm_add_epi32(E0h, E1h); |
5795 | | O9h = _mm_add_epi32(O9h, E2h); |
5796 | | O9h = _mm_add_epi32(O9h, E3h); |
5797 | | O9h = _mm_add_epi32(O9h, E4h); |
5798 | | O9h = _mm_add_epi32(O9h, E5h); |
5799 | | O9h = _mm_add_epi32(O9h, E6h); |
5800 | | O9h = _mm_add_epi32(O9h, E7h); |
5801 | | |
5802 | | /* Compute O10 */ |
5803 | | |
5804 | | E0l = _mm_madd_epi16(m128Tmp0, |
5805 | | _mm_load_si128((__m128i *) (transform32x32[0][10]))); |
5806 | | E0h = _mm_madd_epi16(m128Tmp1, |
5807 | | _mm_load_si128((__m128i *) (transform32x32[0][10]))); |
5808 | | E1l = _mm_madd_epi16(m128Tmp2, |
5809 | | _mm_load_si128((__m128i *) (transform32x32[1][10]))); |
5810 | | E1h = _mm_madd_epi16(m128Tmp3, |
5811 | | _mm_load_si128((__m128i *) (transform32x32[1][10]))); |
5812 | | E2l = _mm_madd_epi16(m128Tmp4, |
5813 | | _mm_load_si128((__m128i *) (transform32x32[2][10]))); |
5814 | | E2h = _mm_madd_epi16(m128Tmp5, |
5815 | | _mm_load_si128((__m128i *) (transform32x32[2][10]))); |
5816 | | E3l = _mm_madd_epi16(m128Tmp6, |
5817 | | _mm_load_si128((__m128i *) (transform32x32[3][10]))); |
5818 | | E3h = _mm_madd_epi16(m128Tmp7, |
5819 | | _mm_load_si128((__m128i *) (transform32x32[3][10]))); |
5820 | | |
5821 | | E4l = _mm_madd_epi16(m128Tmp8, |
5822 | | _mm_load_si128((__m128i *) (transform32x32[4][10]))); |
5823 | | E4h = _mm_madd_epi16(m128Tmp9, |
5824 | | _mm_load_si128((__m128i *) (transform32x32[4][10]))); |
5825 | | E5l = _mm_madd_epi16(m128Tmp10, |
5826 | | _mm_load_si128((__m128i *) (transform32x32[5][10]))); |
5827 | | E5h = _mm_madd_epi16(m128Tmp11, |
5828 | | _mm_load_si128((__m128i *) (transform32x32[5][10]))); |
5829 | | E6l = _mm_madd_epi16(m128Tmp12, |
5830 | | _mm_load_si128((__m128i *) (transform32x32[6][10]))); |
5831 | | E6h = _mm_madd_epi16(m128Tmp13, |
5832 | | _mm_load_si128((__m128i *) (transform32x32[6][10]))); |
5833 | | E7l = _mm_madd_epi16(m128Tmp14, |
5834 | | _mm_load_si128((__m128i *) (transform32x32[7][10]))); |
5835 | | E7h = _mm_madd_epi16(m128Tmp15, |
5836 | | _mm_load_si128((__m128i *) (transform32x32[7][10]))); |
5837 | | |
5838 | | O10l = _mm_add_epi32(E0l, E1l); |
5839 | | O10l = _mm_add_epi32(O10l, E2l); |
5840 | | O10l = _mm_add_epi32(O10l, E3l); |
5841 | | O10l = _mm_add_epi32(O10l, E4l); |
5842 | | O10l = _mm_add_epi32(O10l, E5l); |
5843 | | O10l = _mm_add_epi32(O10l, E6l); |
5844 | | O10l = _mm_add_epi32(O10l, E7l); |
5845 | | |
5846 | | O10h = _mm_add_epi32(E0h, E1h); |
5847 | | O10h = _mm_add_epi32(O10h, E2h); |
5848 | | O10h = _mm_add_epi32(O10h, E3h); |
5849 | | O10h = _mm_add_epi32(O10h, E4h); |
5850 | | O10h = _mm_add_epi32(O10h, E5h); |
5851 | | O10h = _mm_add_epi32(O10h, E6h); |
5852 | | O10h = _mm_add_epi32(O10h, E7h); |
5853 | | |
5854 | | /* Compute O11 */ |
5855 | | |
5856 | | E0l = _mm_madd_epi16(m128Tmp0, |
5857 | | _mm_load_si128((__m128i *) (transform32x32[0][11]))); |
5858 | | E0h = _mm_madd_epi16(m128Tmp1, |
5859 | | _mm_load_si128((__m128i *) (transform32x32[0][11]))); |
5860 | | E1l = _mm_madd_epi16(m128Tmp2, |
5861 | | _mm_load_si128((__m128i *) (transform32x32[1][11]))); |
5862 | | E1h = _mm_madd_epi16(m128Tmp3, |
5863 | | _mm_load_si128((__m128i *) (transform32x32[1][11]))); |
5864 | | E2l = _mm_madd_epi16(m128Tmp4, |
5865 | | _mm_load_si128((__m128i *) (transform32x32[2][11]))); |
5866 | | E2h = _mm_madd_epi16(m128Tmp5, |
5867 | | _mm_load_si128((__m128i *) (transform32x32[2][11]))); |
5868 | | E3l = _mm_madd_epi16(m128Tmp6, |
5869 | | _mm_load_si128((__m128i *) (transform32x32[3][11]))); |
5870 | | E3h = _mm_madd_epi16(m128Tmp7, |
5871 | | _mm_load_si128((__m128i *) (transform32x32[3][11]))); |
5872 | | |
5873 | | E4l = _mm_madd_epi16(m128Tmp8, |
5874 | | _mm_load_si128((__m128i *) (transform32x32[4][11]))); |
5875 | | E4h = _mm_madd_epi16(m128Tmp9, |
5876 | | _mm_load_si128((__m128i *) (transform32x32[4][11]))); |
5877 | | E5l = _mm_madd_epi16(m128Tmp10, |
5878 | | _mm_load_si128((__m128i *) (transform32x32[5][11]))); |
5879 | | E5h = _mm_madd_epi16(m128Tmp11, |
5880 | | _mm_load_si128((__m128i *) (transform32x32[5][11]))); |
5881 | | E6l = _mm_madd_epi16(m128Tmp12, |
5882 | | _mm_load_si128((__m128i *) (transform32x32[6][11]))); |
5883 | | E6h = _mm_madd_epi16(m128Tmp13, |
5884 | | _mm_load_si128((__m128i *) (transform32x32[6][11]))); |
5885 | | E7l = _mm_madd_epi16(m128Tmp14, |
5886 | | _mm_load_si128((__m128i *) (transform32x32[7][11]))); |
5887 | | E7h = _mm_madd_epi16(m128Tmp15, |
5888 | | _mm_load_si128((__m128i *) (transform32x32[7][11]))); |
5889 | | |
5890 | | O11l = _mm_add_epi32(E0l, E1l); |
5891 | | O11l = _mm_add_epi32(O11l, E2l); |
5892 | | O11l = _mm_add_epi32(O11l, E3l); |
5893 | | O11l = _mm_add_epi32(O11l, E4l); |
5894 | | O11l = _mm_add_epi32(O11l, E5l); |
5895 | | O11l = _mm_add_epi32(O11l, E6l); |
5896 | | O11l = _mm_add_epi32(O11l, E7l); |
5897 | | |
5898 | | O11h = _mm_add_epi32(E0h, E1h); |
5899 | | O11h = _mm_add_epi32(O11h, E2h); |
5900 | | O11h = _mm_add_epi32(O11h, E3h); |
5901 | | O11h = _mm_add_epi32(O11h, E4h); |
5902 | | O11h = _mm_add_epi32(O11h, E5h); |
5903 | | O11h = _mm_add_epi32(O11h, E6h); |
5904 | | O11h = _mm_add_epi32(O11h, E7h); |
5905 | | |
5906 | | /* Compute O12 */ |
5907 | | |
5908 | | E0l = _mm_madd_epi16(m128Tmp0, |
5909 | | _mm_load_si128((__m128i *) (transform32x32[0][12]))); |
5910 | | E0h = _mm_madd_epi16(m128Tmp1, |
5911 | | _mm_load_si128((__m128i *) (transform32x32[0][12]))); |
5912 | | E1l = _mm_madd_epi16(m128Tmp2, |
5913 | | _mm_load_si128((__m128i *) (transform32x32[1][12]))); |
5914 | | E1h = _mm_madd_epi16(m128Tmp3, |
5915 | | _mm_load_si128((__m128i *) (transform32x32[1][12]))); |
5916 | | E2l = _mm_madd_epi16(m128Tmp4, |
5917 | | _mm_load_si128((__m128i *) (transform32x32[2][12]))); |
5918 | | E2h = _mm_madd_epi16(m128Tmp5, |
5919 | | _mm_load_si128((__m128i *) (transform32x32[2][12]))); |
5920 | | E3l = _mm_madd_epi16(m128Tmp6, |
5921 | | _mm_load_si128((__m128i *) (transform32x32[3][12]))); |
5922 | | E3h = _mm_madd_epi16(m128Tmp7, |
5923 | | _mm_load_si128((__m128i *) (transform32x32[3][12]))); |
5924 | | |
5925 | | E4l = _mm_madd_epi16(m128Tmp8, |
5926 | | _mm_load_si128((__m128i *) (transform32x32[4][12]))); |
5927 | | E4h = _mm_madd_epi16(m128Tmp9, |
5928 | | _mm_load_si128((__m128i *) (transform32x32[4][12]))); |
5929 | | E5l = _mm_madd_epi16(m128Tmp10, |
5930 | | _mm_load_si128((__m128i *) (transform32x32[5][12]))); |
5931 | | E5h = _mm_madd_epi16(m128Tmp11, |
5932 | | _mm_load_si128((__m128i *) (transform32x32[5][12]))); |
5933 | | E6l = _mm_madd_epi16(m128Tmp12, |
5934 | | _mm_load_si128((__m128i *) (transform32x32[6][12]))); |
5935 | | E6h = _mm_madd_epi16(m128Tmp13, |
5936 | | _mm_load_si128((__m128i *) (transform32x32[6][12]))); |
5937 | | E7l = _mm_madd_epi16(m128Tmp14, |
5938 | | _mm_load_si128((__m128i *) (transform32x32[7][12]))); |
5939 | | E7h = _mm_madd_epi16(m128Tmp15, |
5940 | | _mm_load_si128((__m128i *) (transform32x32[7][12]))); |
5941 | | |
5942 | | O12l = _mm_add_epi32(E0l, E1l); |
5943 | | O12l = _mm_add_epi32(O12l, E2l); |
5944 | | O12l = _mm_add_epi32(O12l, E3l); |
5945 | | O12l = _mm_add_epi32(O12l, E4l); |
5946 | | O12l = _mm_add_epi32(O12l, E5l); |
5947 | | O12l = _mm_add_epi32(O12l, E6l); |
5948 | | O12l = _mm_add_epi32(O12l, E7l); |
5949 | | |
5950 | | O12h = _mm_add_epi32(E0h, E1h); |
5951 | | O12h = _mm_add_epi32(O12h, E2h); |
5952 | | O12h = _mm_add_epi32(O12h, E3h); |
5953 | | O12h = _mm_add_epi32(O12h, E4h); |
5954 | | O12h = _mm_add_epi32(O12h, E5h); |
5955 | | O12h = _mm_add_epi32(O12h, E6h); |
5956 | | O12h = _mm_add_epi32(O12h, E7h); |
5957 | | |
5958 | | /* Compute O13 */ |
5959 | | |
5960 | | E0l = _mm_madd_epi16(m128Tmp0, |
5961 | | _mm_load_si128((__m128i *) (transform32x32[0][13]))); |
5962 | | E0h = _mm_madd_epi16(m128Tmp1, |
5963 | | _mm_load_si128((__m128i *) (transform32x32[0][13]))); |
5964 | | E1l = _mm_madd_epi16(m128Tmp2, |
5965 | | _mm_load_si128((__m128i *) (transform32x32[1][13]))); |
5966 | | E1h = _mm_madd_epi16(m128Tmp3, |
5967 | | _mm_load_si128((__m128i *) (transform32x32[1][13]))); |
5968 | | E2l = _mm_madd_epi16(m128Tmp4, |
5969 | | _mm_load_si128((__m128i *) (transform32x32[2][13]))); |
5970 | | E2h = _mm_madd_epi16(m128Tmp5, |
5971 | | _mm_load_si128((__m128i *) (transform32x32[2][13]))); |
5972 | | E3l = _mm_madd_epi16(m128Tmp6, |
5973 | | _mm_load_si128((__m128i *) (transform32x32[3][13]))); |
5974 | | E3h = _mm_madd_epi16(m128Tmp7, |
5975 | | _mm_load_si128((__m128i *) (transform32x32[3][13]))); |
5976 | | |
5977 | | E4l = _mm_madd_epi16(m128Tmp8, |
5978 | | _mm_load_si128((__m128i *) (transform32x32[4][13]))); |
5979 | | E4h = _mm_madd_epi16(m128Tmp9, |
5980 | | _mm_load_si128((__m128i *) (transform32x32[4][13]))); |
5981 | | E5l = _mm_madd_epi16(m128Tmp10, |
5982 | | _mm_load_si128((__m128i *) (transform32x32[5][13]))); |
5983 | | E5h = _mm_madd_epi16(m128Tmp11, |
5984 | | _mm_load_si128((__m128i *) (transform32x32[5][13]))); |
5985 | | E6l = _mm_madd_epi16(m128Tmp12, |
5986 | | _mm_load_si128((__m128i *) (transform32x32[6][13]))); |
5987 | | E6h = _mm_madd_epi16(m128Tmp13, |
5988 | | _mm_load_si128((__m128i *) (transform32x32[6][13]))); |
5989 | | E7l = _mm_madd_epi16(m128Tmp14, |
5990 | | _mm_load_si128((__m128i *) (transform32x32[7][13]))); |
5991 | | E7h = _mm_madd_epi16(m128Tmp15, |
5992 | | _mm_load_si128((__m128i *) (transform32x32[7][13]))); |
5993 | | |
5994 | | O13l = _mm_add_epi32(E0l, E1l); |
5995 | | O13l = _mm_add_epi32(O13l, E2l); |
5996 | | O13l = _mm_add_epi32(O13l, E3l); |
5997 | | O13l = _mm_add_epi32(O13l, E4l); |
5998 | | O13l = _mm_add_epi32(O13l, E5l); |
5999 | | O13l = _mm_add_epi32(O13l, E6l); |
6000 | | O13l = _mm_add_epi32(O13l, E7l); |
6001 | | |
6002 | | O13h = _mm_add_epi32(E0h, E1h); |
6003 | | O13h = _mm_add_epi32(O13h, E2h); |
6004 | | O13h = _mm_add_epi32(O13h, E3h); |
6005 | | O13h = _mm_add_epi32(O13h, E4h); |
6006 | | O13h = _mm_add_epi32(O13h, E5h); |
6007 | | O13h = _mm_add_epi32(O13h, E6h); |
6008 | | O13h = _mm_add_epi32(O13h, E7h); |
6009 | | |
6010 | | /* Compute O14 */ |
6011 | | |
6012 | | E0l = _mm_madd_epi16(m128Tmp0, |
6013 | | _mm_load_si128((__m128i *) (transform32x32[0][14]))); |
6014 | | E0h = _mm_madd_epi16(m128Tmp1, |
6015 | | _mm_load_si128((__m128i *) (transform32x32[0][14]))); |
6016 | | E1l = _mm_madd_epi16(m128Tmp2, |
6017 | | _mm_load_si128((__m128i *) (transform32x32[1][14]))); |
6018 | | E1h = _mm_madd_epi16(m128Tmp3, |
6019 | | _mm_load_si128((__m128i *) (transform32x32[1][14]))); |
6020 | | E2l = _mm_madd_epi16(m128Tmp4, |
6021 | | _mm_load_si128((__m128i *) (transform32x32[2][14]))); |
6022 | | E2h = _mm_madd_epi16(m128Tmp5, |
6023 | | _mm_load_si128((__m128i *) (transform32x32[2][14]))); |
6024 | | E3l = _mm_madd_epi16(m128Tmp6, |
6025 | | _mm_load_si128((__m128i *) (transform32x32[3][14]))); |
6026 | | E3h = _mm_madd_epi16(m128Tmp7, |
6027 | | _mm_load_si128((__m128i *) (transform32x32[3][14]))); |
6028 | | |
6029 | | E4l = _mm_madd_epi16(m128Tmp8, |
6030 | | _mm_load_si128((__m128i *) (transform32x32[4][14]))); |
6031 | | E4h = _mm_madd_epi16(m128Tmp9, |
6032 | | _mm_load_si128((__m128i *) (transform32x32[4][14]))); |
6033 | | E5l = _mm_madd_epi16(m128Tmp10, |
6034 | | _mm_load_si128((__m128i *) (transform32x32[5][14]))); |
6035 | | E5h = _mm_madd_epi16(m128Tmp11, |
6036 | | _mm_load_si128((__m128i *) (transform32x32[5][14]))); |
6037 | | E6l = _mm_madd_epi16(m128Tmp12, |
6038 | | _mm_load_si128((__m128i *) (transform32x32[6][14]))); |
6039 | | E6h = _mm_madd_epi16(m128Tmp13, |
6040 | | _mm_load_si128((__m128i *) (transform32x32[6][14]))); |
6041 | | E7l = _mm_madd_epi16(m128Tmp14, |
6042 | | _mm_load_si128((__m128i *) (transform32x32[7][14]))); |
6043 | | E7h = _mm_madd_epi16(m128Tmp15, |
6044 | | _mm_load_si128((__m128i *) (transform32x32[7][14]))); |
6045 | | |
6046 | | O14l = _mm_add_epi32(E0l, E1l); |
6047 | | O14l = _mm_add_epi32(O14l, E2l); |
6048 | | O14l = _mm_add_epi32(O14l, E3l); |
6049 | | O14l = _mm_add_epi32(O14l, E4l); |
6050 | | O14l = _mm_add_epi32(O14l, E5l); |
6051 | | O14l = _mm_add_epi32(O14l, E6l); |
6052 | | O14l = _mm_add_epi32(O14l, E7l); |
6053 | | |
6054 | | O14h = _mm_add_epi32(E0h, E1h); |
6055 | | O14h = _mm_add_epi32(O14h, E2h); |
6056 | | O14h = _mm_add_epi32(O14h, E3h); |
6057 | | O14h = _mm_add_epi32(O14h, E4h); |
6058 | | O14h = _mm_add_epi32(O14h, E5h); |
6059 | | O14h = _mm_add_epi32(O14h, E6h); |
6060 | | O14h = _mm_add_epi32(O14h, E7h); |
6061 | | |
6062 | | /* Compute O15 */ |
6063 | | |
6064 | | E0l = _mm_madd_epi16(m128Tmp0, |
6065 | | _mm_load_si128((__m128i *) (transform32x32[0][15]))); |
6066 | | E0h = _mm_madd_epi16(m128Tmp1, |
6067 | | _mm_load_si128((__m128i *) (transform32x32[0][15]))); |
6068 | | E1l = _mm_madd_epi16(m128Tmp2, |
6069 | | _mm_load_si128((__m128i *) (transform32x32[1][15]))); |
6070 | | E1h = _mm_madd_epi16(m128Tmp3, |
6071 | | _mm_load_si128((__m128i *) (transform32x32[1][15]))); |
6072 | | E2l = _mm_madd_epi16(m128Tmp4, |
6073 | | _mm_load_si128((__m128i *) (transform32x32[2][15]))); |
6074 | | E2h = _mm_madd_epi16(m128Tmp5, |
6075 | | _mm_load_si128((__m128i *) (transform32x32[2][15]))); |
6076 | | E3l = _mm_madd_epi16(m128Tmp6, |
6077 | | _mm_load_si128((__m128i *) (transform32x32[3][15]))); |
6078 | | E3h = _mm_madd_epi16(m128Tmp7, |
6079 | | _mm_load_si128((__m128i *) (transform32x32[3][15]))); |
6080 | | |
6081 | | E4l = _mm_madd_epi16(m128Tmp8, |
6082 | | _mm_load_si128((__m128i *) (transform32x32[4][15]))); |
6083 | | E4h = _mm_madd_epi16(m128Tmp9, |
6084 | | _mm_load_si128((__m128i *) (transform32x32[4][15]))); |
6085 | | E5l = _mm_madd_epi16(m128Tmp10, |
6086 | | _mm_load_si128((__m128i *) (transform32x32[5][15]))); |
6087 | | E5h = _mm_madd_epi16(m128Tmp11, |
6088 | | _mm_load_si128((__m128i *) (transform32x32[5][15]))); |
6089 | | E6l = _mm_madd_epi16(m128Tmp12, |
6090 | | _mm_load_si128((__m128i *) (transform32x32[6][15]))); |
6091 | | E6h = _mm_madd_epi16(m128Tmp13, |
6092 | | _mm_load_si128((__m128i *) (transform32x32[6][15]))); |
6093 | | E7l = _mm_madd_epi16(m128Tmp14, |
6094 | | _mm_load_si128((__m128i *) (transform32x32[7][15]))); |
6095 | | E7h = _mm_madd_epi16(m128Tmp15, |
6096 | | _mm_load_si128((__m128i *) (transform32x32[7][15]))); |
6097 | | |
6098 | | O15l = _mm_add_epi32(E0l, E1l); |
6099 | | O15l = _mm_add_epi32(O15l, E2l); |
6100 | | O15l = _mm_add_epi32(O15l, E3l); |
6101 | | O15l = _mm_add_epi32(O15l, E4l); |
6102 | | O15l = _mm_add_epi32(O15l, E5l); |
6103 | | O15l = _mm_add_epi32(O15l, E6l); |
6104 | | O15l = _mm_add_epi32(O15l, E7l); |
6105 | | |
6106 | | O15h = _mm_add_epi32(E0h, E1h); |
6107 | | O15h = _mm_add_epi32(O15h, E2h); |
6108 | | O15h = _mm_add_epi32(O15h, E3h); |
6109 | | O15h = _mm_add_epi32(O15h, E4h); |
6110 | | O15h = _mm_add_epi32(O15h, E5h); |
6111 | | O15h = _mm_add_epi32(O15h, E6h); |
6112 | | O15h = _mm_add_epi32(O15h, E7h); |
6113 | | /* Compute E0 */ |
6114 | | |
6115 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6); |
6116 | | E0l = _mm_madd_epi16(m128Tmp0, |
6117 | | _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); |
6118 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6); |
6119 | | E0h = _mm_madd_epi16(m128Tmp1, |
6120 | | _mm_load_si128((__m128i *) (transform16x16_1[0][0]))); |
6121 | | |
6122 | | m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14); |
6123 | | E0l = _mm_add_epi32(E0l, |
6124 | | _mm_madd_epi16(m128Tmp2, |
6125 | | _mm_load_si128( |
6126 | | (__m128i *) (transform16x16_1[1][0])))); |
6127 | | m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14); |
6128 | | E0h = _mm_add_epi32(E0h, |
6129 | | _mm_madd_epi16(m128Tmp3, |
6130 | | _mm_load_si128( |
6131 | | (__m128i *) (transform16x16_1[1][0])))); |
6132 | | |
6133 | | m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22); |
6134 | | E0l = _mm_add_epi32(E0l, |
6135 | | _mm_madd_epi16(m128Tmp4, |
6136 | | _mm_load_si128( |
6137 | | (__m128i *) (transform16x16_1[2][0])))); |
6138 | | m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22); |
6139 | | E0h = _mm_add_epi32(E0h, |
6140 | | _mm_madd_epi16(m128Tmp5, |
6141 | | _mm_load_si128( |
6142 | | (__m128i *) (transform16x16_1[2][0])))); |
6143 | | |
6144 | | m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30); |
6145 | | E0l = _mm_add_epi32(E0l, |
6146 | | _mm_madd_epi16(m128Tmp6, |
6147 | | _mm_load_si128( |
6148 | | (__m128i *) (transform16x16_1[3][0])))); |
6149 | | m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30); |
6150 | | E0h = _mm_add_epi32(E0h, |
6151 | | _mm_madd_epi16(m128Tmp7, |
6152 | | _mm_load_si128( |
6153 | | (__m128i *) (transform16x16_1[3][0])))); |
6154 | | |
6155 | | /* Compute E1 */ |
6156 | | E1l = _mm_madd_epi16(m128Tmp0, |
6157 | | _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); |
6158 | | E1h = _mm_madd_epi16(m128Tmp1, |
6159 | | _mm_load_si128((__m128i *) (transform16x16_1[0][1]))); |
6160 | | E1l = _mm_add_epi32(E1l, |
6161 | | _mm_madd_epi16(m128Tmp2, |
6162 | | _mm_load_si128( |
6163 | | (__m128i *) (transform16x16_1[1][1])))); |
6164 | | E1h = _mm_add_epi32(E1h, |
6165 | | _mm_madd_epi16(m128Tmp3, |
6166 | | _mm_load_si128( |
6167 | | (__m128i *) (transform16x16_1[1][1])))); |
6168 | | E1l = _mm_add_epi32(E1l, |
6169 | | _mm_madd_epi16(m128Tmp4, |
6170 | | _mm_load_si128( |
6171 | | (__m128i *) (transform16x16_1[2][1])))); |
6172 | | E1h = _mm_add_epi32(E1h, |
6173 | | _mm_madd_epi16(m128Tmp5, |
6174 | | _mm_load_si128( |
6175 | | (__m128i *) (transform16x16_1[2][1])))); |
6176 | | E1l = _mm_add_epi32(E1l, |
6177 | | _mm_madd_epi16(m128Tmp6, |
6178 | | _mm_load_si128( |
6179 | | (__m128i *) (transform16x16_1[3][1])))); |
6180 | | E1h = _mm_add_epi32(E1h, |
6181 | | _mm_madd_epi16(m128Tmp7, |
6182 | | _mm_load_si128( |
6183 | | (__m128i *) (transform16x16_1[3][1])))); |
6184 | | |
6185 | | /* Compute E2 */ |
6186 | | E2l = _mm_madd_epi16(m128Tmp0, |
6187 | | _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); |
6188 | | E2h = _mm_madd_epi16(m128Tmp1, |
6189 | | _mm_load_si128((__m128i *) (transform16x16_1[0][2]))); |
6190 | | E2l = _mm_add_epi32(E2l, |
6191 | | _mm_madd_epi16(m128Tmp2, |
6192 | | _mm_load_si128( |
6193 | | (__m128i *) (transform16x16_1[1][2])))); |
6194 | | E2h = _mm_add_epi32(E2h, |
6195 | | _mm_madd_epi16(m128Tmp3, |
6196 | | _mm_load_si128( |
6197 | | (__m128i *) (transform16x16_1[1][2])))); |
6198 | | E2l = _mm_add_epi32(E2l, |
6199 | | _mm_madd_epi16(m128Tmp4, |
6200 | | _mm_load_si128( |
6201 | | (__m128i *) (transform16x16_1[2][2])))); |
6202 | | E2h = _mm_add_epi32(E2h, |
6203 | | _mm_madd_epi16(m128Tmp5, |
6204 | | _mm_load_si128( |
6205 | | (__m128i *) (transform16x16_1[2][2])))); |
6206 | | E2l = _mm_add_epi32(E2l, |
6207 | | _mm_madd_epi16(m128Tmp6, |
6208 | | _mm_load_si128( |
6209 | | (__m128i *) (transform16x16_1[3][2])))); |
6210 | | E2h = _mm_add_epi32(E2h, |
6211 | | _mm_madd_epi16(m128Tmp7, |
6212 | | _mm_load_si128( |
6213 | | (__m128i *) (transform16x16_1[3][2])))); |
6214 | | |
6215 | | /* Compute E3 */ |
6216 | | E3l = _mm_madd_epi16(m128Tmp0, |
6217 | | _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); |
6218 | | E3h = _mm_madd_epi16(m128Tmp1, |
6219 | | _mm_load_si128((__m128i *) (transform16x16_1[0][3]))); |
6220 | | E3l = _mm_add_epi32(E3l, |
6221 | | _mm_madd_epi16(m128Tmp2, |
6222 | | _mm_load_si128( |
6223 | | (__m128i *) (transform16x16_1[1][3])))); |
6224 | | E3h = _mm_add_epi32(E3h, |
6225 | | _mm_madd_epi16(m128Tmp3, |
6226 | | _mm_load_si128( |
6227 | | (__m128i *) (transform16x16_1[1][3])))); |
6228 | | E3l = _mm_add_epi32(E3l, |
6229 | | _mm_madd_epi16(m128Tmp4, |
6230 | | _mm_load_si128( |
6231 | | (__m128i *) (transform16x16_1[2][3])))); |
6232 | | E3h = _mm_add_epi32(E3h, |
6233 | | _mm_madd_epi16(m128Tmp5, |
6234 | | _mm_load_si128( |
6235 | | (__m128i *) (transform16x16_1[2][3])))); |
6236 | | E3l = _mm_add_epi32(E3l, |
6237 | | _mm_madd_epi16(m128Tmp6, |
6238 | | _mm_load_si128( |
6239 | | (__m128i *) (transform16x16_1[3][3])))); |
6240 | | E3h = _mm_add_epi32(E3h, |
6241 | | _mm_madd_epi16(m128Tmp7, |
6242 | | _mm_load_si128( |
6243 | | (__m128i *) (transform16x16_1[3][3])))); |
6244 | | |
6245 | | /* Compute E4 */ |
6246 | | E4l = _mm_madd_epi16(m128Tmp0, |
6247 | | _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); |
6248 | | E4h = _mm_madd_epi16(m128Tmp1, |
6249 | | _mm_load_si128((__m128i *) (transform16x16_1[0][4]))); |
6250 | | E4l = _mm_add_epi32(E4l, |
6251 | | _mm_madd_epi16(m128Tmp2, |
6252 | | _mm_load_si128( |
6253 | | (__m128i *) (transform16x16_1[1][4])))); |
6254 | | E4h = _mm_add_epi32(E4h, |
6255 | | _mm_madd_epi16(m128Tmp3, |
6256 | | _mm_load_si128( |
6257 | | (__m128i *) (transform16x16_1[1][4])))); |
6258 | | E4l = _mm_add_epi32(E4l, |
6259 | | _mm_madd_epi16(m128Tmp4, |
6260 | | _mm_load_si128( |
6261 | | (__m128i *) (transform16x16_1[2][4])))); |
6262 | | E4h = _mm_add_epi32(E4h, |
6263 | | _mm_madd_epi16(m128Tmp5, |
6264 | | _mm_load_si128( |
6265 | | (__m128i *) (transform16x16_1[2][4])))); |
6266 | | E4l = _mm_add_epi32(E4l, |
6267 | | _mm_madd_epi16(m128Tmp6, |
6268 | | _mm_load_si128( |
6269 | | (__m128i *) (transform16x16_1[3][4])))); |
6270 | | E4h = _mm_add_epi32(E4h, |
6271 | | _mm_madd_epi16(m128Tmp7, |
6272 | | _mm_load_si128( |
6273 | | (__m128i *) (transform16x16_1[3][4])))); |
6274 | | |
6275 | | /* Compute E5 */ |
6276 | | E5l = _mm_madd_epi16(m128Tmp0, |
6277 | | _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); |
6278 | | E5h = _mm_madd_epi16(m128Tmp1, |
6279 | | _mm_load_si128((__m128i *) (transform16x16_1[0][5]))); |
6280 | | E5l = _mm_add_epi32(E5l, |
6281 | | _mm_madd_epi16(m128Tmp2, |
6282 | | _mm_load_si128( |
6283 | | (__m128i *) (transform16x16_1[1][5])))); |
6284 | | E5h = _mm_add_epi32(E5h, |
6285 | | _mm_madd_epi16(m128Tmp3, |
6286 | | _mm_load_si128( |
6287 | | (__m128i *) (transform16x16_1[1][5])))); |
6288 | | E5l = _mm_add_epi32(E5l, |
6289 | | _mm_madd_epi16(m128Tmp4, |
6290 | | _mm_load_si128( |
6291 | | (__m128i *) (transform16x16_1[2][5])))); |
6292 | | E5h = _mm_add_epi32(E5h, |
6293 | | _mm_madd_epi16(m128Tmp5, |
6294 | | _mm_load_si128( |
6295 | | (__m128i *) (transform16x16_1[2][5])))); |
6296 | | E5l = _mm_add_epi32(E5l, |
6297 | | _mm_madd_epi16(m128Tmp6, |
6298 | | _mm_load_si128( |
6299 | | (__m128i *) (transform16x16_1[3][5])))); |
6300 | | E5h = _mm_add_epi32(E5h, |
6301 | | _mm_madd_epi16(m128Tmp7, |
6302 | | _mm_load_si128( |
6303 | | (__m128i *) (transform16x16_1[3][5])))); |
6304 | | |
6305 | | /* Compute E6 */ |
6306 | | E6l = _mm_madd_epi16(m128Tmp0, |
6307 | | _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); |
6308 | | E6h = _mm_madd_epi16(m128Tmp1, |
6309 | | _mm_load_si128((__m128i *) (transform16x16_1[0][6]))); |
6310 | | E6l = _mm_add_epi32(E6l, |
6311 | | _mm_madd_epi16(m128Tmp2, |
6312 | | _mm_load_si128( |
6313 | | (__m128i *) (transform16x16_1[1][6])))); |
6314 | | E6h = _mm_add_epi32(E6h, |
6315 | | _mm_madd_epi16(m128Tmp3, |
6316 | | _mm_load_si128( |
6317 | | (__m128i *) (transform16x16_1[1][6])))); |
6318 | | E6l = _mm_add_epi32(E6l, |
6319 | | _mm_madd_epi16(m128Tmp4, |
6320 | | _mm_load_si128( |
6321 | | (__m128i *) (transform16x16_1[2][6])))); |
6322 | | E6h = _mm_add_epi32(E6h, |
6323 | | _mm_madd_epi16(m128Tmp5, |
6324 | | _mm_load_si128( |
6325 | | (__m128i *) (transform16x16_1[2][6])))); |
6326 | | E6l = _mm_add_epi32(E6l, |
6327 | | _mm_madd_epi16(m128Tmp6, |
6328 | | _mm_load_si128( |
6329 | | (__m128i *) (transform16x16_1[3][6])))); |
6330 | | E6h = _mm_add_epi32(E6h, |
6331 | | _mm_madd_epi16(m128Tmp7, |
6332 | | _mm_load_si128( |
6333 | | (__m128i *) (transform16x16_1[3][6])))); |
6334 | | |
6335 | | /* Compute E7 */ |
6336 | | E7l = _mm_madd_epi16(m128Tmp0, |
6337 | | _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); |
6338 | | E7h = _mm_madd_epi16(m128Tmp1, |
6339 | | _mm_load_si128((__m128i *) (transform16x16_1[0][7]))); |
6340 | | E7l = _mm_add_epi32(E7l, |
6341 | | _mm_madd_epi16(m128Tmp2, |
6342 | | _mm_load_si128( |
6343 | | (__m128i *) (transform16x16_1[1][7])))); |
6344 | | E7h = _mm_add_epi32(E7h, |
6345 | | _mm_madd_epi16(m128Tmp3, |
6346 | | _mm_load_si128( |
6347 | | (__m128i *) (transform16x16_1[1][7])))); |
6348 | | E7l = _mm_add_epi32(E7l, |
6349 | | _mm_madd_epi16(m128Tmp4, |
6350 | | _mm_load_si128( |
6351 | | (__m128i *) (transform16x16_1[2][7])))); |
6352 | | E7h = _mm_add_epi32(E7h, |
6353 | | _mm_madd_epi16(m128Tmp5, |
6354 | | _mm_load_si128( |
6355 | | (__m128i *) (transform16x16_1[2][7])))); |
6356 | | E7l = _mm_add_epi32(E7l, |
6357 | | _mm_madd_epi16(m128Tmp6, |
6358 | | _mm_load_si128( |
6359 | | (__m128i *) (transform16x16_1[3][7])))); |
6360 | | E7h = _mm_add_epi32(E7h, |
6361 | | _mm_madd_epi16(m128Tmp7, |
6362 | | _mm_load_si128( |
6363 | | (__m128i *) (transform16x16_1[3][7])))); |
6364 | | |
6365 | | /* Compute E00 to E03 */ |
6366 | | |
6367 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12); |
6368 | | E00l = _mm_madd_epi16(m128Tmp0, |
6369 | | _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); |
6370 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12); |
6371 | | E00h = _mm_madd_epi16(m128Tmp1, |
6372 | | _mm_load_si128((__m128i *) (transform16x16_2[0][0]))); |
6373 | | |
6374 | | m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28); |
6375 | | E00l = _mm_add_epi32(E00l, |
6376 | | _mm_madd_epi16(m128Tmp2, |
6377 | | _mm_load_si128( |
6378 | | (__m128i *) (transform16x16_2[1][0])))); |
6379 | | m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28); |
6380 | | E00h = _mm_add_epi32(E00h, |
6381 | | _mm_madd_epi16(m128Tmp3, |
6382 | | _mm_load_si128( |
6383 | | (__m128i *) (transform16x16_2[1][0])))); |
6384 | | |
6385 | | E01l = _mm_madd_epi16(m128Tmp0, |
6386 | | _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); |
6387 | | E01h = _mm_madd_epi16(m128Tmp1, |
6388 | | _mm_load_si128((__m128i *) (transform16x16_2[0][1]))); |
6389 | | E01l = _mm_add_epi32(E01l, |
6390 | | _mm_madd_epi16(m128Tmp2, |
6391 | | _mm_load_si128( |
6392 | | (__m128i *) (transform16x16_2[1][1])))); |
6393 | | E01h = _mm_add_epi32(E01h, |
6394 | | _mm_madd_epi16(m128Tmp3, |
6395 | | _mm_load_si128( |
6396 | | (__m128i *) (transform16x16_2[1][1])))); |
6397 | | |
6398 | | E02l = _mm_madd_epi16(m128Tmp0, |
6399 | | _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); |
6400 | | E02h = _mm_madd_epi16(m128Tmp1, |
6401 | | _mm_load_si128((__m128i *) (transform16x16_2[0][2]))); |
6402 | | E02l = _mm_add_epi32(E02l, |
6403 | | _mm_madd_epi16(m128Tmp2, |
6404 | | _mm_load_si128( |
6405 | | (__m128i *) (transform16x16_2[1][2])))); |
6406 | | E02h = _mm_add_epi32(E02h, |
6407 | | _mm_madd_epi16(m128Tmp3, |
6408 | | _mm_load_si128( |
6409 | | (__m128i *) (transform16x16_2[1][2])))); |
6410 | | |
6411 | | E03l = _mm_madd_epi16(m128Tmp0, |
6412 | | _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); |
6413 | | E03h = _mm_madd_epi16(m128Tmp1, |
6414 | | _mm_load_si128((__m128i *) (transform16x16_2[0][3]))); |
6415 | | E03l = _mm_add_epi32(E03l, |
6416 | | _mm_madd_epi16(m128Tmp2, |
6417 | | _mm_load_si128( |
6418 | | (__m128i *) (transform16x16_2[1][3])))); |
6419 | | E03h = _mm_add_epi32(E03h, |
6420 | | _mm_madd_epi16(m128Tmp3, |
6421 | | _mm_load_si128( |
6422 | | (__m128i *) (transform16x16_2[1][3])))); |
6423 | | |
6424 | | /* Compute EE0 and EEE */ |
6425 | | |
6426 | | m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24); |
6427 | | EE0l = _mm_madd_epi16(m128Tmp0, |
6428 | | _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); |
6429 | | m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24); |
6430 | | EE0h = _mm_madd_epi16(m128Tmp1, |
6431 | | _mm_load_si128((__m128i *) (transform16x16_3[0][0]))); |
6432 | | |
6433 | | m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16); |
6434 | | EEE0l = _mm_madd_epi16(m128Tmp2, |
6435 | | _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); |
6436 | | m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16); |
6437 | | EEE0h = _mm_madd_epi16(m128Tmp3, |
6438 | | _mm_load_si128((__m128i *) (transform16x16_3[1][0]))); |
6439 | | |
6440 | | EE1l = _mm_madd_epi16(m128Tmp0, |
6441 | | _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); |
6442 | | EE1h = _mm_madd_epi16(m128Tmp1, |
6443 | | _mm_load_si128((__m128i *) (transform16x16_3[0][1]))); |
6444 | | |
6445 | | EEE1l = _mm_madd_epi16(m128Tmp2, |
6446 | | _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); |
6447 | | EEE1h = _mm_madd_epi16(m128Tmp3, |
6448 | | _mm_load_si128((__m128i *) (transform16x16_3[1][1]))); |
6449 | | |
6450 | | /* Compute EE */ |
6451 | | |
6452 | | EE2l = _mm_sub_epi32(EEE1l, EE1l); |
6453 | | EE3l = _mm_sub_epi32(EEE0l, EE0l); |
6454 | | EE2h = _mm_sub_epi32(EEE1h, EE1h); |
6455 | | EE3h = _mm_sub_epi32(EEE0h, EE0h); |
6456 | | |
6457 | | EE0l = _mm_add_epi32(EEE0l, EE0l); |
6458 | | EE1l = _mm_add_epi32(EEE1l, EE1l); |
6459 | | EE0h = _mm_add_epi32(EEE0h, EE0h); |
6460 | | EE1h = _mm_add_epi32(EEE1h, EE1h); |
6461 | | /**/ |
6462 | | |
6463 | | EE7l = _mm_sub_epi32(EE0l, E00l); |
6464 | | EE6l = _mm_sub_epi32(EE1l, E01l); |
6465 | | EE5l = _mm_sub_epi32(EE2l, E02l); |
6466 | | EE4l = _mm_sub_epi32(EE3l, E03l); |
6467 | | |
6468 | | EE7h = _mm_sub_epi32(EE0h, E00h); |
6469 | | EE6h = _mm_sub_epi32(EE1h, E01h); |
6470 | | EE5h = _mm_sub_epi32(EE2h, E02h); |
6471 | | EE4h = _mm_sub_epi32(EE3h, E03h); |
6472 | | |
6473 | | EE0l = _mm_add_epi32(EE0l, E00l); |
6474 | | EE1l = _mm_add_epi32(EE1l, E01l); |
6475 | | EE2l = _mm_add_epi32(EE2l, E02l); |
6476 | | EE3l = _mm_add_epi32(EE3l, E03l); |
6477 | | |
6478 | | EE0h = _mm_add_epi32(EE0h, E00h); |
6479 | | EE1h = _mm_add_epi32(EE1h, E01h); |
6480 | | EE2h = _mm_add_epi32(EE2h, E02h); |
6481 | | EE3h = _mm_add_epi32(EE3h, E03h); |
6482 | | /* Compute E */ |
6483 | | |
6484 | | E15l = _mm_sub_epi32(EE0l, E0l); |
6485 | | E15l = _mm_add_epi32(E15l, m128iAdd); |
6486 | | E14l = _mm_sub_epi32(EE1l, E1l); |
6487 | | E14l = _mm_add_epi32(E14l, m128iAdd); |
6488 | | E13l = _mm_sub_epi32(EE2l, E2l); |
6489 | | E13l = _mm_add_epi32(E13l, m128iAdd); |
6490 | | E12l = _mm_sub_epi32(EE3l, E3l); |
6491 | | E12l = _mm_add_epi32(E12l, m128iAdd); |
6492 | | E11l = _mm_sub_epi32(EE4l, E4l); |
6493 | | E11l = _mm_add_epi32(E11l, m128iAdd); |
6494 | | E10l = _mm_sub_epi32(EE5l, E5l); |
6495 | | E10l = _mm_add_epi32(E10l, m128iAdd); |
6496 | | E9l = _mm_sub_epi32(EE6l, E6l); |
6497 | | E9l = _mm_add_epi32(E9l, m128iAdd); |
6498 | | E8l = _mm_sub_epi32(EE7l, E7l); |
6499 | | E8l = _mm_add_epi32(E8l, m128iAdd); |
6500 | | |
6501 | | E0l = _mm_add_epi32(EE0l, E0l); |
6502 | | E0l = _mm_add_epi32(E0l, m128iAdd); |
6503 | | E1l = _mm_add_epi32(EE1l, E1l); |
6504 | | E1l = _mm_add_epi32(E1l, m128iAdd); |
6505 | | E2l = _mm_add_epi32(EE2l, E2l); |
6506 | | E2l = _mm_add_epi32(E2l, m128iAdd); |
6507 | | E3l = _mm_add_epi32(EE3l, E3l); |
6508 | | E3l = _mm_add_epi32(E3l, m128iAdd); |
6509 | | E4l = _mm_add_epi32(EE4l, E4l); |
6510 | | E4l = _mm_add_epi32(E4l, m128iAdd); |
6511 | | E5l = _mm_add_epi32(EE5l, E5l); |
6512 | | E5l = _mm_add_epi32(E5l, m128iAdd); |
6513 | | E6l = _mm_add_epi32(EE6l, E6l); |
6514 | | E6l = _mm_add_epi32(E6l, m128iAdd); |
6515 | | E7l = _mm_add_epi32(EE7l, E7l); |
6516 | | E7l = _mm_add_epi32(E7l, m128iAdd); |
6517 | | |
6518 | | E15h = _mm_sub_epi32(EE0h, E0h); |
6519 | | E15h = _mm_add_epi32(E15h, m128iAdd); |
6520 | | E14h = _mm_sub_epi32(EE1h, E1h); |
6521 | | E14h = _mm_add_epi32(E14h, m128iAdd); |
6522 | | E13h = _mm_sub_epi32(EE2h, E2h); |
6523 | | E13h = _mm_add_epi32(E13h, m128iAdd); |
6524 | | E12h = _mm_sub_epi32(EE3h, E3h); |
6525 | | E12h = _mm_add_epi32(E12h, m128iAdd); |
6526 | | E11h = _mm_sub_epi32(EE4h, E4h); |
6527 | | E11h = _mm_add_epi32(E11h, m128iAdd); |
6528 | | E10h = _mm_sub_epi32(EE5h, E5h); |
6529 | | E10h = _mm_add_epi32(E10h, m128iAdd); |
6530 | | E9h = _mm_sub_epi32(EE6h, E6h); |
6531 | | E9h = _mm_add_epi32(E9h, m128iAdd); |
6532 | | E8h = _mm_sub_epi32(EE7h, E7h); |
6533 | | E8h = _mm_add_epi32(E8h, m128iAdd); |
6534 | | |
6535 | | E0h = _mm_add_epi32(EE0h, E0h); |
6536 | | E0h = _mm_add_epi32(E0h, m128iAdd); |
6537 | | E1h = _mm_add_epi32(EE1h, E1h); |
6538 | | E1h = _mm_add_epi32(E1h, m128iAdd); |
6539 | | E2h = _mm_add_epi32(EE2h, E2h); |
6540 | | E2h = _mm_add_epi32(E2h, m128iAdd); |
6541 | | E3h = _mm_add_epi32(EE3h, E3h); |
6542 | | E3h = _mm_add_epi32(E3h, m128iAdd); |
6543 | | E4h = _mm_add_epi32(EE4h, E4h); |
6544 | | E4h = _mm_add_epi32(E4h, m128iAdd); |
6545 | | E5h = _mm_add_epi32(EE5h, E5h); |
6546 | | E5h = _mm_add_epi32(E5h, m128iAdd); |
6547 | | E6h = _mm_add_epi32(EE6h, E6h); |
6548 | | E6h = _mm_add_epi32(E6h, m128iAdd); |
6549 | | E7h = _mm_add_epi32(EE7h, E7h); |
6550 | | E7h = _mm_add_epi32(E7h, m128iAdd); |
6551 | | |
6552 | | m128iS0 = _mm_packs_epi32( |
6553 | | _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift), |
6554 | | _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift)); |
6555 | | m128iS1 = _mm_packs_epi32( |
6556 | | _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift), |
6557 | | _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift)); |
6558 | | m128iS2 = _mm_packs_epi32( |
6559 | | _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift), |
6560 | | _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift)); |
6561 | | m128iS3 = _mm_packs_epi32( |
6562 | | _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift), |
6563 | | _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift)); |
6564 | | m128iS4 = _mm_packs_epi32( |
6565 | | _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift), |
6566 | | _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift)); |
6567 | | m128iS5 = _mm_packs_epi32( |
6568 | | _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift), |
6569 | | _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift)); |
6570 | | m128iS6 = _mm_packs_epi32( |
6571 | | _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift), |
6572 | | _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift)); |
6573 | | m128iS7 = _mm_packs_epi32( |
6574 | | _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift), |
6575 | | _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift)); |
6576 | | m128iS8 = _mm_packs_epi32( |
6577 | | _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift), |
6578 | | _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift)); |
6579 | | m128iS9 = _mm_packs_epi32( |
6580 | | _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift), |
6581 | | _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift)); |
6582 | | m128iS10 = _mm_packs_epi32( |
6583 | | _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift), |
6584 | | _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift)); |
6585 | | m128iS11 = _mm_packs_epi32( |
6586 | | _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift), |
6587 | | _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift)); |
6588 | | m128iS12 = _mm_packs_epi32( |
6589 | | _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift), |
6590 | | _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift)); |
6591 | | m128iS13 = _mm_packs_epi32( |
6592 | | _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift), |
6593 | | _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift)); |
6594 | | m128iS14 = _mm_packs_epi32( |
6595 | | _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift), |
6596 | | _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift)); |
6597 | | m128iS15 = _mm_packs_epi32( |
6598 | | _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift), |
6599 | | _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift)); |
6600 | | |
6601 | | m128iS31 = _mm_packs_epi32( |
6602 | | _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift), |
6603 | | _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift)); |
6604 | | m128iS30 = _mm_packs_epi32( |
6605 | | _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift), |
6606 | | _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift)); |
6607 | | m128iS29 = _mm_packs_epi32( |
6608 | | _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift), |
6609 | | _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift)); |
6610 | | m128iS28 = _mm_packs_epi32( |
6611 | | _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift), |
6612 | | _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift)); |
6613 | | m128iS27 = _mm_packs_epi32( |
6614 | | _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift), |
6615 | | _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift)); |
6616 | | m128iS26 = _mm_packs_epi32( |
6617 | | _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift), |
6618 | | _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift)); |
6619 | | m128iS25 = _mm_packs_epi32( |
6620 | | _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift), |
6621 | | _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift)); |
6622 | | m128iS24 = _mm_packs_epi32( |
6623 | | _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift), |
6624 | | _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift)); |
6625 | | m128iS23 = _mm_packs_epi32( |
6626 | | _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift), |
6627 | | _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift)); |
6628 | | m128iS22 = _mm_packs_epi32( |
6629 | | _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift), |
6630 | | _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift)); |
6631 | | m128iS21 = _mm_packs_epi32( |
6632 | | _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift), |
6633 | | _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift)); |
6634 | | m128iS20 = _mm_packs_epi32( |
6635 | | _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift), |
6636 | | _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift)); |
6637 | | m128iS19 = _mm_packs_epi32( |
6638 | | _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift), |
6639 | | _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift)); |
6640 | | m128iS18 = _mm_packs_epi32( |
6641 | | _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift), |
6642 | | _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift)); |
6643 | | m128iS17 = _mm_packs_epi32( |
6644 | | _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift), |
6645 | | _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift)); |
6646 | | m128iS16 = _mm_packs_epi32( |
6647 | | _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift), |
6648 | | _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift)); |
6649 | | |
6650 | | if (!j) { |
6651 | | /* Inverse the matrix */ |
6652 | | E0l = _mm_unpacklo_epi16(m128iS0, m128iS16); |
6653 | | E1l = _mm_unpacklo_epi16(m128iS1, m128iS17); |
6654 | | E2l = _mm_unpacklo_epi16(m128iS2, m128iS18); |
6655 | | E3l = _mm_unpacklo_epi16(m128iS3, m128iS19); |
6656 | | E4l = _mm_unpacklo_epi16(m128iS4, m128iS20); |
6657 | | E5l = _mm_unpacklo_epi16(m128iS5, m128iS21); |
6658 | | E6l = _mm_unpacklo_epi16(m128iS6, m128iS22); |
6659 | | E7l = _mm_unpacklo_epi16(m128iS7, m128iS23); |
6660 | | E8l = _mm_unpacklo_epi16(m128iS8, m128iS24); |
6661 | | E9l = _mm_unpacklo_epi16(m128iS9, m128iS25); |
6662 | | E10l = _mm_unpacklo_epi16(m128iS10, m128iS26); |
6663 | | E11l = _mm_unpacklo_epi16(m128iS11, m128iS27); |
6664 | | E12l = _mm_unpacklo_epi16(m128iS12, m128iS28); |
6665 | | E13l = _mm_unpacklo_epi16(m128iS13, m128iS29); |
6666 | | E14l = _mm_unpacklo_epi16(m128iS14, m128iS30); |
6667 | | E15l = _mm_unpacklo_epi16(m128iS15, m128iS31); |
6668 | | |
6669 | | O0l = _mm_unpackhi_epi16(m128iS0, m128iS16); |
6670 | | O1l = _mm_unpackhi_epi16(m128iS1, m128iS17); |
6671 | | O2l = _mm_unpackhi_epi16(m128iS2, m128iS18); |
6672 | | O3l = _mm_unpackhi_epi16(m128iS3, m128iS19); |
6673 | | O4l = _mm_unpackhi_epi16(m128iS4, m128iS20); |
6674 | | O5l = _mm_unpackhi_epi16(m128iS5, m128iS21); |
6675 | | O6l = _mm_unpackhi_epi16(m128iS6, m128iS22); |
6676 | | O7l = _mm_unpackhi_epi16(m128iS7, m128iS23); |
6677 | | O8l = _mm_unpackhi_epi16(m128iS8, m128iS24); |
6678 | | O9l = _mm_unpackhi_epi16(m128iS9, m128iS25); |
6679 | | O10l = _mm_unpackhi_epi16(m128iS10, m128iS26); |
6680 | | O11l = _mm_unpackhi_epi16(m128iS11, m128iS27); |
6681 | | O12l = _mm_unpackhi_epi16(m128iS12, m128iS28); |
6682 | | O13l = _mm_unpackhi_epi16(m128iS13, m128iS29); |
6683 | | O14l = _mm_unpackhi_epi16(m128iS14, m128iS30); |
6684 | | O15l = _mm_unpackhi_epi16(m128iS15, m128iS31); |
6685 | | |
6686 | | E0h = _mm_unpacklo_epi16(E0l, E8l); |
6687 | | E1h = _mm_unpacklo_epi16(E1l, E9l); |
6688 | | E2h = _mm_unpacklo_epi16(E2l, E10l); |
6689 | | E3h = _mm_unpacklo_epi16(E3l, E11l); |
6690 | | E4h = _mm_unpacklo_epi16(E4l, E12l); |
6691 | | E5h = _mm_unpacklo_epi16(E5l, E13l); |
6692 | | E6h = _mm_unpacklo_epi16(E6l, E14l); |
6693 | | E7h = _mm_unpacklo_epi16(E7l, E15l); |
6694 | | |
6695 | | E8h = _mm_unpackhi_epi16(E0l, E8l); |
6696 | | E9h = _mm_unpackhi_epi16(E1l, E9l); |
6697 | | E10h = _mm_unpackhi_epi16(E2l, E10l); |
6698 | | E11h = _mm_unpackhi_epi16(E3l, E11l); |
6699 | | E12h = _mm_unpackhi_epi16(E4l, E12l); |
6700 | | E13h = _mm_unpackhi_epi16(E5l, E13l); |
6701 | | E14h = _mm_unpackhi_epi16(E6l, E14l); |
6702 | | E15h = _mm_unpackhi_epi16(E7l, E15l); |
6703 | | |
6704 | | m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); |
6705 | | m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); |
6706 | | m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); |
6707 | | m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); |
6708 | | |
6709 | | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
6710 | | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
6711 | | m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6712 | | m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6713 | | |
6714 | | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
6715 | | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
6716 | | m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6717 | | m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6718 | | |
6719 | | m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); |
6720 | | m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); |
6721 | | m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); |
6722 | | m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); |
6723 | | |
6724 | | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
6725 | | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
6726 | | m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6727 | | m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6728 | | |
6729 | | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
6730 | | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
6731 | | m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6732 | | m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6733 | | |
6734 | | m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); |
6735 | | m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); |
6736 | | m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); |
6737 | | m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); |
6738 | | |
6739 | | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
6740 | | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
6741 | | m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6742 | | m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6743 | | |
6744 | | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
6745 | | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
6746 | | m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6747 | | m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6748 | | |
6749 | | m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); |
6750 | | m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); |
6751 | | m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); |
6752 | | m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); |
6753 | | |
6754 | | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
6755 | | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
6756 | | m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6757 | | m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6758 | | |
6759 | | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
6760 | | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
6761 | | m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6762 | | m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6763 | | |
6764 | | /* */ |
6765 | | E0h = _mm_unpacklo_epi16(O0l, O8l); |
6766 | | E1h = _mm_unpacklo_epi16(O1l, O9l); |
6767 | | E2h = _mm_unpacklo_epi16(O2l, O10l); |
6768 | | E3h = _mm_unpacklo_epi16(O3l, O11l); |
6769 | | E4h = _mm_unpacklo_epi16(O4l, O12l); |
6770 | | E5h = _mm_unpacklo_epi16(O5l, O13l); |
6771 | | E6h = _mm_unpacklo_epi16(O6l, O14l); |
6772 | | E7h = _mm_unpacklo_epi16(O7l, O15l); |
6773 | | |
6774 | | E8h = _mm_unpackhi_epi16(O0l, O8l); |
6775 | | E9h = _mm_unpackhi_epi16(O1l, O9l); |
6776 | | E10h = _mm_unpackhi_epi16(O2l, O10l); |
6777 | | E11h = _mm_unpackhi_epi16(O3l, O11l); |
6778 | | E12h = _mm_unpackhi_epi16(O4l, O12l); |
6779 | | E13h = _mm_unpackhi_epi16(O5l, O13l); |
6780 | | E14h = _mm_unpackhi_epi16(O6l, O14l); |
6781 | | E15h = _mm_unpackhi_epi16(O7l, O15l); |
6782 | | |
6783 | | m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h); |
6784 | | m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h); |
6785 | | m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h); |
6786 | | m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h); |
6787 | | |
6788 | | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
6789 | | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
6790 | | m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6791 | | m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6792 | | |
6793 | | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
6794 | | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
6795 | | m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6796 | | m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6797 | | |
6798 | | m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h); |
6799 | | m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h); |
6800 | | m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h); |
6801 | | m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h); |
6802 | | |
6803 | | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
6804 | | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
6805 | | m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6806 | | m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6807 | | |
6808 | | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
6809 | | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
6810 | | m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6811 | | m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6812 | | |
6813 | | m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h); |
6814 | | m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h); |
6815 | | m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h); |
6816 | | m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h); |
6817 | | |
6818 | | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
6819 | | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
6820 | | m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6821 | | m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6822 | | |
6823 | | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
6824 | | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
6825 | | m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6826 | | m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6827 | | |
6828 | | m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h); |
6829 | | m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h); |
6830 | | m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h); |
6831 | | m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h); |
6832 | | |
6833 | | m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2); |
6834 | | m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3); |
6835 | | m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6836 | | m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6837 | | |
6838 | | m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2); |
6839 | | m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3); |
6840 | | m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5); |
6841 | | m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5); |
6842 | | /* */ |
6843 | | _mm_store_si128((__m128i *) (src + i), m128iS0); |
6844 | | _mm_store_si128((__m128i *) (src + 32 + i), m128iS1); |
6845 | | _mm_store_si128((__m128i *) (src + 64 + i), m128iS2); |
6846 | | _mm_store_si128((__m128i *) (src + 96 + i), m128iS3); |
6847 | | _mm_store_si128((__m128i *) (src + 128 + i), m128iS4); |
6848 | | _mm_store_si128((__m128i *) (src + 160 + i), m128iS5); |
6849 | | _mm_store_si128((__m128i *) (src + 192 + i), m128iS6); |
6850 | | _mm_store_si128((__m128i *) (src + 224 + i), m128iS7); |
6851 | | _mm_store_si128((__m128i *) (src + 256 + i), m128iS8); |
6852 | | _mm_store_si128((__m128i *) (src + 288 + i), m128iS9); |
6853 | | _mm_store_si128((__m128i *) (src + 320 + i), m128iS10); |
6854 | | _mm_store_si128((__m128i *) (src + 352 + i), m128iS11); |
6855 | | _mm_store_si128((__m128i *) (src + 384 + i), m128iS12); |
6856 | | _mm_store_si128((__m128i *) (src + 416 + i), m128iS13); |
6857 | | _mm_store_si128((__m128i *) (src + 448 + i), m128iS14); |
6858 | | _mm_store_si128((__m128i *) (src + 480 + i), m128iS15); |
6859 | | _mm_store_si128((__m128i *) (src + 512 + i), m128iS16); |
6860 | | _mm_store_si128((__m128i *) (src + 544 + i), m128iS17); |
6861 | | _mm_store_si128((__m128i *) (src + 576 + i), m128iS18); |
6862 | | _mm_store_si128((__m128i *) (src + 608 + i), m128iS19); |
6863 | | _mm_store_si128((__m128i *) (src + 640 + i), m128iS20); |
6864 | | _mm_store_si128((__m128i *) (src + 672 + i), m128iS21); |
6865 | | _mm_store_si128((__m128i *) (src + 704 + i), m128iS22); |
6866 | | _mm_store_si128((__m128i *) (src + 736 + i), m128iS23); |
6867 | | _mm_store_si128((__m128i *) (src + 768 + i), m128iS24); |
6868 | | _mm_store_si128((__m128i *) (src + 800 + i), m128iS25); |
6869 | | _mm_store_si128((__m128i *) (src + 832 + i), m128iS26); |
6870 | | _mm_store_si128((__m128i *) (src + 864 + i), m128iS27); |
6871 | | _mm_store_si128((__m128i *) (src + 896 + i), m128iS28); |
6872 | | _mm_store_si128((__m128i *) (src + 928 + i), m128iS29); |
6873 | | _mm_store_si128((__m128i *) (src + 960 + i), m128iS30); |
6874 | | _mm_store_si128((__m128i *) (src + 992 + i), m128iS31); |
6875 | | |
6876 | | if (i <= 16) { |
6877 | | int k = i + 8; |
6878 | | m128iS0 = _mm_load_si128((__m128i *) (src + k)); |
6879 | | m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k)); |
6880 | | m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k)); |
6881 | | m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k)); |
6882 | | m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k)); |
6883 | | m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k)); |
6884 | | m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k)); |
6885 | | m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k)); |
6886 | | m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k)); |
6887 | | m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k)); |
6888 | | m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k)); |
6889 | | m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k)); |
6890 | | m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k)); |
6891 | | m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k)); |
6892 | | m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k)); |
6893 | | m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k)); |
6894 | | |
6895 | | m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k)); |
6896 | | m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k)); |
6897 | | m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k)); |
6898 | | m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k)); |
6899 | | m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k)); |
6900 | | m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k)); |
6901 | | m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k)); |
6902 | | m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k)); |
6903 | | m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k)); |
6904 | | m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k)); |
6905 | | m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k)); |
6906 | | m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k)); |
6907 | | m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k)); |
6908 | | m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k)); |
6909 | | m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k)); |
6910 | | m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k)); |
6911 | | } else { |
6912 | | m128iS0 = _mm_load_si128((__m128i *) (src)); |
6913 | | m128iS1 = _mm_load_si128((__m128i *) (src + 128)); |
6914 | | m128iS2 = _mm_load_si128((__m128i *) (src + 256)); |
6915 | | m128iS3 = _mm_load_si128((__m128i *) (src + 384)); |
6916 | | m128iS4 = _mm_loadu_si128((__m128i *) (src + 512)); |
6917 | | m128iS5 = _mm_load_si128((__m128i *) (src + 640)); |
6918 | | m128iS6 = _mm_load_si128((__m128i *) (src + 768)); |
6919 | | m128iS7 = _mm_load_si128((__m128i *) (src + 896)); |
6920 | | m128iS8 = _mm_load_si128((__m128i *) (src + 8)); |
6921 | | m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8)); |
6922 | | m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8)); |
6923 | | m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8)); |
6924 | | m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8)); |
6925 | | m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8)); |
6926 | | m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8)); |
6927 | | m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8)); |
6928 | | m128iS16 = _mm_load_si128((__m128i *) (src + 16)); |
6929 | | m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16)); |
6930 | | m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16)); |
6931 | | m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16)); |
6932 | | m128iS20 = _mm_loadu_si128((__m128i *) (src + 512 + 16)); |
6933 | | m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16)); |
6934 | | m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16)); |
6935 | | m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16)); |
6936 | | m128iS24 = _mm_load_si128((__m128i *) (src + 24)); |
6937 | | m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24)); |
6938 | | m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24)); |
6939 | | m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24)); |
6940 | | m128iS28 = _mm_loadu_si128((__m128i *) (src + 512 + 24)); |
6941 | | m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24)); |
6942 | | m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24)); |
6943 | | m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24)); |
6944 | | shift = shift_2nd; |
6945 | | m128iAdd = _mm_set1_epi32(add_2nd); |
6946 | | } |
6947 | | |
6948 | | } else { |
6949 | | int k, m = 0; |
6950 | | _mm_storeu_si128((__m128i *) (src), m128iS0); |
6951 | | _mm_storeu_si128((__m128i *) (src + 8), m128iS1); |
6952 | | _mm_storeu_si128((__m128i *) (src + 16), m128iS2); |
6953 | | _mm_storeu_si128((__m128i *) (src + 24), m128iS3); |
6954 | | _mm_storeu_si128((__m128i *) (src + 128), m128iS4); |
6955 | | _mm_storeu_si128((__m128i *) (src + 128 + 8), m128iS5); |
6956 | | _mm_storeu_si128((__m128i *) (src + 128 + 16), m128iS6); |
6957 | | _mm_storeu_si128((__m128i *) (src + 128 + 24), m128iS7); |
6958 | | _mm_storeu_si128((__m128i *) (src + 256), m128iS8); |
6959 | | _mm_storeu_si128((__m128i *) (src + 256 + 8), m128iS9); |
6960 | | _mm_storeu_si128((__m128i *) (src + 256 + 16), m128iS10); |
6961 | | _mm_storeu_si128((__m128i *) (src + 256 + 24), m128iS11); |
6962 | | _mm_storeu_si128((__m128i *) (src + 384), m128iS12); |
6963 | | _mm_storeu_si128((__m128i *) (src + 384 + 8), m128iS13); |
6964 | | _mm_storeu_si128((__m128i *) (src + 384 + 16), m128iS14); |
6965 | | _mm_storeu_si128((__m128i *) (src + 384 + 24), m128iS15); |
6966 | | |
6967 | | _mm_storeu_si128((__m128i *) (src + 512), m128iS16); |
6968 | | _mm_storeu_si128((__m128i *) (src + 512 + 8), m128iS17); |
6969 | | _mm_storeu_si128((__m128i *) (src + 512 + 16), m128iS18); |
6970 | | _mm_storeu_si128((__m128i *) (src + 512 + 24), m128iS19); |
6971 | | _mm_storeu_si128((__m128i *) (src + 640), m128iS20); |
6972 | | _mm_storeu_si128((__m128i *) (src + 640 + 8), m128iS21); |
6973 | | _mm_storeu_si128((__m128i *) (src + 640 + 16), m128iS22); |
6974 | | _mm_storeu_si128((__m128i *) (src + 640 + 24), m128iS23); |
6975 | | _mm_storeu_si128((__m128i *) (src + 768), m128iS24); |
6976 | | _mm_storeu_si128((__m128i *) (src + 768 + 8), m128iS25); |
6977 | | _mm_storeu_si128((__m128i *) (src + 768 + 16), m128iS26); |
6978 | | _mm_storeu_si128((__m128i *) (src + 768 + 24), m128iS27); |
6979 | | _mm_storeu_si128((__m128i *) (src + 896), m128iS28); |
6980 | | _mm_storeu_si128((__m128i *) (src + 896 + 8), m128iS29); |
6981 | | _mm_storeu_si128((__m128i *) (src + 896 + 16), m128iS30); |
6982 | | _mm_storeu_si128((__m128i *) (src + 896 + 24), m128iS31); |
6983 | | dst = (uint16_t*) _dst + (i * stride); |
6984 | | for (k = 0; k < 8; k++) { |
6985 | | dst[0] = av_clip_uintp2(dst[0] + src[m],10); |
6986 | | dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10); |
6987 | | dst[2] = av_clip_uintp2(dst[2] + src[m + 16],10); |
6988 | | dst[3] = av_clip_uintp2(dst[3] + src[m + 24],10); |
6989 | | dst[4] = av_clip_uintp2( |
6990 | | dst[4] + src[m + 128],10); |
6991 | | dst[5] = av_clip_uintp2( |
6992 | | dst[5] + src[m + 128 + 8],10); |
6993 | | dst[6] = av_clip_uintp2( |
6994 | | dst[6] + src[m + 128 + 16],10); |
6995 | | dst[7] = av_clip_uintp2( |
6996 | | dst[7] + src[m + 128 + 24],10); |
6997 | | |
6998 | | dst[8] = av_clip_uintp2( |
6999 | | dst[8] + src[m + 256],10); |
7000 | | dst[9] = av_clip_uintp2( |
7001 | | dst[9] + src[m + 256 + 8],10); |
7002 | | dst[10] = av_clip_uintp2( |
7003 | | dst[10] + src[m + 256 + 16],10); |
7004 | | dst[11] = av_clip_uintp2( |
7005 | | dst[11] + src[m + 256 + 24],10); |
7006 | | dst[12] = av_clip_uintp2( |
7007 | | dst[12] + src[m + 384],10); |
7008 | | dst[13] = av_clip_uintp2( |
7009 | | dst[13] + src[m + 384 + 8],10); |
7010 | | dst[14] = av_clip_uintp2( |
7011 | | dst[14] + src[m + 384 + 16],10); |
7012 | | dst[15] = av_clip_uintp2( |
7013 | | dst[15] + src[m + 384 + 24],10); |
7014 | | |
7015 | | dst[16] = av_clip_uintp2( |
7016 | | dst[16] + src[m + 512],10); |
7017 | | dst[17] = av_clip_uintp2( |
7018 | | dst[17] + src[m + 512 + 8],10); |
7019 | | dst[18] = av_clip_uintp2( |
7020 | | dst[18] + src[m + 512 + 16],10); |
7021 | | dst[19] = av_clip_uintp2( |
7022 | | dst[19] + src[m + 512 + 24],10); |
7023 | | dst[20] = av_clip_uintp2( |
7024 | | dst[20] + src[m + 640],10); |
7025 | | dst[21] = av_clip_uintp2( |
7026 | | dst[21] + src[m + 640 + 8],10); |
7027 | | dst[22] = av_clip_uintp2( |
7028 | | dst[22] + src[m + 640 + 16],10); |
7029 | | dst[23] = av_clip_uintp2( |
7030 | | dst[23] + src[m + 640 + 24],10); |
7031 | | |
7032 | | dst[24] = av_clip_uintp2( |
7033 | | dst[24] + src[m + 768],10); |
7034 | | dst[25] = av_clip_uintp2( |
7035 | | dst[25] + src[m + 768 + 8],10); |
7036 | | dst[26] = av_clip_uintp2( |
7037 | | dst[26] + src[m + 768 + 16],10); |
7038 | | dst[27] = av_clip_uintp2( |
7039 | | dst[27] + src[m + 768 + 24],10); |
7040 | | dst[28] = av_clip_uintp2( |
7041 | | dst[28] + src[m + 896],10); |
7042 | | dst[29] = av_clip_uintp2( |
7043 | | dst[29] + src[m + 896 + 8],10); |
7044 | | dst[30] = av_clip_uintp2( |
7045 | | dst[30] + src[m + 896 + 16],10); |
7046 | | dst[31] = av_clip_uintp2( |
7047 | | dst[31] + src[m + 896 + 24],10); |
7048 | | |
7049 | | m += 1; |
7050 | | dst += stride; |
7051 | | } |
7052 | | if (i <= 16) { |
7053 | | int k = (i + 8) * 4; |
7054 | | m128iS0 = _mm_load_si128((__m128i *) (src + k)); |
7055 | | m128iS1 = _mm_load_si128((__m128i *) (src + 128 + k)); |
7056 | | m128iS2 = _mm_load_si128((__m128i *) (src + 256 + k)); |
7057 | | m128iS3 = _mm_load_si128((__m128i *) (src + 384 + k)); |
7058 | | m128iS4 = _mm_loadu_si128((__m128i *) (src + 512 + k)); |
7059 | | m128iS5 = _mm_load_si128((__m128i *) (src + 640 + k)); |
7060 | | m128iS6 = _mm_load_si128((__m128i *) (src + 768 + k)); |
7061 | | m128iS7 = _mm_load_si128((__m128i *) (src + 896 + k)); |
7062 | | m128iS8 = _mm_load_si128((__m128i *) (src + 8 + k)); |
7063 | | m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8 + k)); |
7064 | | m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8 + k)); |
7065 | | m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8 + k)); |
7066 | | m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8 + k)); |
7067 | | m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8 + k)); |
7068 | | m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8 + k)); |
7069 | | m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8 + k)); |
7070 | | m128iS16 = _mm_load_si128((__m128i *) (src + 16 + k)); |
7071 | | m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16 + k)); |
7072 | | m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16 + k)); |
7073 | | m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16 + k)); |
7074 | | m128iS20 = _mm_loadu_si128( |
7075 | | (__m128i *) (src + 512 + 16 + k)); |
7076 | | m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16 + k)); |
7077 | | m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16 + k)); |
7078 | | m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16 + k)); |
7079 | | m128iS24 = _mm_load_si128((__m128i *) (src + 24 + k)); |
7080 | | m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24 + k)); |
7081 | | m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24 + k)); |
7082 | | m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24 + k)); |
7083 | | m128iS28 = _mm_loadu_si128( |
7084 | | (__m128i *) (src + 512 + 24 + k)); |
7085 | | m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24 + k)); |
7086 | | m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24 + k)); |
7087 | | m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24 + k)); |
7088 | | } |
7089 | | } |
7090 | | } |
7091 | | } |
7092 | | } |
7093 | | #endif |
7094 | | |