Coverage Report

Created: 2025-11-14 07:32

/src/libde265/libde265/x86/sse-dct.cc
Line | Count | Source
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2013 openHEVC contributors
4
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
5
 *
6
 * This file is part of libde265.
7
 *
8
 * libde265 is free software: you can redistribute it and/or modify
9
 * it under the terms of the GNU Lesser General Public License as
10
 * published by the Free Software Foundation, either version 3 of
11
 * the License, or (at your option) any later version.
12
 *
13
 * libde265 is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public License
19
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
20
 */
21
22
#include "x86/sse-dct.h"
23
#include "libde265/util.h"
24
25
#ifdef HAVE_CONFIG_H
26
#include "config.h"
27
#endif
28
29
#include <emmintrin.h> // SSE2
30
#include <tmmintrin.h> // SSSE3
31
32
#if HAVE_SSE4_1
33
#include <smmintrin.h> // SSE4.1
34
#endif
35
36
37
ALIGNED_16(static const int16_t) transform4x4_luma[8][8] =
38
{
39
    {   29, +84, 29,  +84,  29, +84,  29, +84 },
40
    {  +74, +55, +74, +55, +74, +55, +74, +55 },
41
    {   55, -29,  55, -29,  55, -29,  55, -29 },
42
    {  +74, -84, +74, -84, +74, -84, +74, -84 },
43
    {   74, -74,  74, -74,  74, -74,  74, -74 },
44
    {    0, +74,   0, +74,   0, +74,   0, +74 },
45
    {   84, +55,  84, +55,  84, +55,  84, +55 },
46
    {  -74, -29, -74, -29, -74, -29, -74, -29 }
47
};
48
49
ALIGNED_16(static const int16_t) transform4x4[4][8] = {
50
    { 64,  64, 64,  64, 64,  64, 64,  64 },
51
    { 64, -64, 64, -64, 64, -64, 64, -64 },
52
    { 83,  36, 83,  36, 83,  36, 83,  36 },
53
    { 36, -83, 36, -83, 36, -83, 36, -83 }
54
};
55
56
ALIGNED_16(static const int16_t) transform8x8[12][8] =
57
{
58
    {  89,  75,  89,  75, 89,  75, 89,  75 },
59
    {  50,  18,  50,  18, 50,  18, 50,  18 },
60
    {  75, -18,  75, -18, 75, -18, 75, -18 },
61
    { -89, -50, -89, -50,-89, -50,-89, -50 },
62
    {  50, -89,  50, -89, 50, -89, 50, -89 },
63
    {  18,  75,  18,  75, 18,  75, 18,  75 },
64
    {  18, -50,  18, -50, 18, -50, 18, -50 },
65
    {  75, -89,  75, -89, 75, -89, 75, -89 },
66
    {  64,  64,  64,  64, 64,  64, 64,  64 },
67
    {  64, -64,  64, -64, 64, -64, 64, -64 },
68
    {  83,  36,  83,  36, 83,  36, 83,  36 },
69
    {  36, -83,  36, -83, 36, -83, 36, -83 }
70
};
71
72
ALIGNED_16(static const int16_t) transform16x16_1[4][8][8] =
73
{
74
    {/*1-3*/ /*2-6*/
75
        { 90,  87,  90,  87,  90,  87,  90,  87 },
76
        { 87,  57,  87,  57,  87,  57,  87,  57 },
77
        { 80,   9,  80,   9,  80,   9,  80,   9 },
78
        { 70, -43,  70, -43,  70, -43,  70, -43 },
79
        { 57, -80,  57, -80,  57, -80,  57, -80 },
80
        { 43, -90,  43, -90,  43, -90,  43, -90 },
81
        { 25, -70,  25, -70,  25, -70,  25, -70 },
82
        { 9,  -25,   9, -25,   9, -25,   9, -25 },
83
    },{ /*5-7*/ /*10-14*/
84
        {  80,  70,  80,  70,  80,  70,  80,  70 },
85
        {   9, -43,   9, -43,   9, -43,   9, -43 },
86
        { -70, -87, -70, -87, -70, -87, -70, -87 },
87
        { -87,   9, -87,   9, -87,   9, -87,   9 },
88
        { -25,  90, -25,  90, -25,  90, -25,  90 },
89
        {  57,  25,  57,  25,  57,  25,  57,  25 },
90
        {  90, -80,  90, -80,  90, -80,  90, -80 },
91
        {  43, -57,  43, -57,  43, -57,  43, -57 },
92
    },{ /*9-11*/ /*18-22*/
93
        {  57,  43,  57,  43,  57,  43,  57,  43 },
94
        { -80, -90, -80, -90, -80, -90, -80, -90 },
95
        { -25,  57, -25,  57, -25,  57, -25,  57 },
96
        {  90,  25,  90,  25,  90,  25,  90,  25 },
97
        {  -9,  -87, -9,  -87, -9,  -87, -9, -87 },
98
        { -87,  70, -87,  70, -87,  70, -87,  70 },
99
        {  43,   9,  43,   9,  43,   9,  43,   9 },
100
        {  70, -80,  70, -80,  70, -80,  70, -80 },
101
    },{/*13-15*/ /*  26-30   */
102
        {  25,   9,  25,   9,  25,   9,  25,   9 },
103
        { -70, -25, -70, -25, -70, -25, -70, -25 },
104
        {  90,  43,  90,  43,  90,  43,  90,  43 },
105
        { -80, -57, -80, -57, -80, -57, -80, -57 },
106
        {  43,  70,  43,  70,  43,  70,  43,  70 },
107
        {  9,  -80,   9, -80,   9, -80,   9, -80 },
108
        { -57,  87, -57,  87, -57,  87, -57,  87 },
109
        {  87, -90,  87, -90,  87, -90,  87, -90 },
110
    }
111
};
112
113
ALIGNED_16(static const int16_t) transform16x16_2[2][4][8] =
114
{
115
    { /*2-6*/ /*4-12*/
116
        { 89,  75,  89,  75, 89,  75, 89,  75 },
117
        { 75, -18,  75, -18, 75, -18, 75, -18 },
118
        { 50, -89,  50, -89, 50, -89, 50, -89 },
119
        { 18, -50,  18, -50, 18, -50, 18, -50 },
120
    },{ /*10-14*/  /*20-28*/
121
        {  50,  18,  50,  18,  50,  18,  50,  18 },
122
        { -89, -50, -89, -50, -89, -50, -89, -50 },
123
        {  18,  75,  18,  75,  18,  75,  18,  75 },
124
        {  75, -89,  75, -89,  75, -89,  75, -89 },
125
    }
126
};
127
128
ALIGNED_16(static const int16_t) transform16x16_3[2][2][8] =
129
{
130
    {/*4-12*/ /*8-24*/
131
        {  83,  36,  83,  36,  83,  36,  83,  36 },
132
        {  36, -83,  36, -83,  36, -83,  36, -83 },
133
    },{ /*0-8*/  /*0-16*/
134
        { 64,  64, 64,  64, 64,  64, 64,  64 },
135
        { 64, -64, 64, -64, 64, -64, 64, -64 },
136
    }
137
};
138
139
140
ALIGNED_16(static const int16_t) transform32x32[8][16][8] =
141
{
142
    { /*   1-3     */
143
        { 90,  90, 90,  90, 90,  90, 90,  90 },
144
        { 90,  82, 90,  82, 90,  82, 90,  82 },
145
        { 88,  67, 88,  67, 88,  67, 88,  67 },
146
        { 85,  46, 85,  46, 85,  46, 85,  46 },
147
        { 82,  22, 82,  22, 82,  22, 82,  22 },
148
        { 78,  -4, 78,  -4, 78,  -4, 78,  -4 },
149
        { 73, -31, 73, -31, 73, -31, 73, -31 },
150
        { 67, -54, 67, -54, 67, -54, 67, -54 },
151
        { 61, -73, 61, -73, 61, -73, 61, -73 },
152
        { 54, -85, 54, -85, 54, -85, 54, -85 },
153
        { 46, -90, 46, -90, 46, -90, 46, -90 },
154
        { 38, -88, 38, -88, 38, -88, 38, -88 },
155
        { 31, -78, 31, -78, 31, -78, 31, -78 },
156
        { 22, -61, 22, -61, 22, -61, 22, -61 },
157
        { 13, -38, 13, -38, 13, -38, 13, -38 },
158
        { 4,  -13,  4, -13,  4, -13,  4, -13 },
159
    },{/*  5-7 */
160
        {  88,  85,  88,  85,  88,  85,  88,  85 },
161
        {  67,  46,  67,  46,  67,  46,  67,  46 },
162
        {  31, -13,  31, -13,  31, -13,  31, -13 },
163
        { -13, -67, -13, -67, -13, -67, -13, -67 },
164
        { -54, -90, -54, -90, -54, -90, -54, -90 },
165
        { -82, -73, -82, -73, -82, -73, -82, -73 },
166
        { -90, -22, -90, -22, -90, -22, -90, -22 },
167
        { -78,  38, -78,  38, -78,  38, -78,  38 },
168
        { -46,  82, -46,  82, -46,  82, -46,  82 },
169
        {  -4,  88,  -4,  88,  -4,  88,  -4,  88 },
170
        {  38,  54,  38,  54,  38,  54,  38,  54 },
171
        {  73,  -4,  73,  -4,  73,  -4,  73,  -4 },
172
        {  90, -61,  90, -61,  90, -61,  90, -61 },
173
        {  85, -90,  85, -90,  85, -90,  85, -90 },
174
        {  61, -78,  61, -78,  61, -78,  61, -78 },
175
        {  22, -31,  22, -31,  22, -31,  22, -31 },
176
    },{/*  9-11   */
177
        {  82,  78,  82,  78,  82,  78,  82,  78 },
178
        {  22,  -4,  22,  -4,  22,  -4,  22,  -4 },
179
        { -54, -82, -54, -82, -54, -82, -54, -82 },
180
        { -90, -73, -90, -73, -90, -73, -90, -73 },
181
        { -61,  13, -61,  13, -61,  13, -61,  13 },
182
        {  13,  85,  13,  85,  13,  85,  13,  85 },
183
        {  78,  67,  78,  67,  78,  67,  78,  67 },
184
        {  85, -22,  85, -22,  85, -22,  85, -22 },
185
        {  31, -88,  31, -88,  31, -88,  31, -88 },
186
        { -46, -61, -46, -61, -46, -61, -46, -61 },
187
        { -90,  31, -90,  31, -90,  31, -90,  31 },
188
        { -67,  90, -67,  90, -67,  90, -67,  90 },
189
        {   4,  54,   4,  54,   4,  54,   4,  54 },
190
        {  73, -38,  73, -38,  73, -38,  73, -38 },
191
        {  88, -90,  88, -90,  88, -90,  88, -90 },
192
        {  38, -46,  38, -46,  38, -46,  38, -46 },
193
    },{/*  13-15   */
194
        {  73,  67,  73,  67,  73,  67,  73,  67 },
195
        { -31, -54, -31, -54, -31, -54, -31, -54 },
196
        { -90, -78, -90, -78, -90, -78, -90, -78 },
197
        { -22,  38, -22,  38, -22,  38, -22,  38 },
198
        {  78,  85,  78,  85,  78,  85,  78,  85 },
199
        {  67, -22,  67, -22,  67, -22,  67, -22 },
200
        { -38, -90, -38, -90, -38, -90, -38, -90 },
201
        { -90,   4, -90,   4, -90,   4, -90,   4 },
202
        { -13,  90, -13,  90, -13,  90, -13,  90 },
203
        {  82,  13,  82,  13,  82,  13,  82,  13 },
204
        {  61, -88,  61, -88,  61, -88,  61, -88 },
205
        { -46, -31, -46, -31, -46, -31, -46, -31 },
206
        { -88,  82, -88,  82, -88,  82, -88,  82 },
207
        { -4,   46, -4,   46, -4,   46, -4,   46 },
208
        {  85, -73,  85, -73,  85, -73,  85, -73 },
209
        {  54, -61,  54, -61,  54, -61,  54, -61 },
210
    },{/*  17-19   */
211
        {  61,  54,  61,  54,  61,  54,  61,  54 },
212
        { -73, -85, -73, -85, -73, -85, -73, -85 },
213
        { -46,  -4, -46,  -4, -46,  -4, -46,  -4 },
214
        {  82,  88,  82,  88,  82,  88,  82,  88 },
215
        {  31, -46,  31, -46,  31, -46,  31, -46 },
216
        { -88, -61, -88, -61, -88, -61, -88, -61 },
217
        { -13,  82, -13,  82, -13,  82, -13,  82 },
218
        {  90,  13,  90,  13,  90,  13,  90,  13 },
219
        { -4, -90,  -4, -90,  -4, -90,  -4, -90 },
220
        { -90,  38, -90,  38, -90,  38, -90,  38 },
221
        {  22,  67,  22,  67,  22,  67,  22,  67 },
222
        {  85, -78,  85, -78,  85, -78,  85, -78 },
223
        { -38, -22, -38, -22, -38, -22, -38, -22 },
224
        { -78,  90, -78,  90, -78,  90, -78,  90 },
225
        {  54, -31,  54, -31,  54, -31,  54, -31 },
226
        {  67, -73,  67, -73,  67, -73,  67, -73 },
227
    },{ /*  21-23   */
228
        {  46,  38,  46,  38,  46,  38,  46,  38 },
229
        { -90, -88, -90, -88, -90, -88, -90, -88 },
230
        {  38,  73,  38,  73,  38,  73,  38,  73 },
231
        {  54,  -4,  54,  -4,  54,  -4,  54,  -4 },
232
        { -90, -67, -90, -67, -90, -67, -90, -67 },
233
        {  31,  90,  31,  90,  31,  90,  31,  90 },
234
        {  61, -46,  61, -46,  61, -46,  61, -46 },
235
        { -88, -31, -88, -31, -88, -31, -88, -31 },
236
        {  22,  85,  22,  85,  22,  85,  22,  85 },
237
        {  67, -78,  67, -78,  67, -78,  67, -78 },
238
        { -85,  13, -85,  13, -85,  13, -85,  13 },
239
        {  13,  61,  13,  61,  13,  61,  13,  61 },
240
        {  73, -90,  73, -90,  73, -90,  73, -90 },
241
        { -82,  54, -82,  54, -82,  54, -82,  54 },
242
        {   4,  22,   4,  22,   4,  22,   4,  22 },
243
        {  78, -82,  78, -82,  78, -82,  78, -82 },
244
    },{ /*  25-27   */
245
        {  31,  22,  31,  22,  31,  22,  31,  22 },
246
        { -78, -61, -78, -61, -78, -61, -78, -61 },
247
        {  90,  85,  90,  85,  90,  85,  90,  85 },
248
        { -61, -90, -61, -90, -61, -90, -61, -90 },
249
        {   4,  73,   4,  73,   4,  73,   4,  73 },
250
        {  54, -38,  54, -38,  54, -38,  54, -38 },
251
        { -88,  -4, -88,  -4, -88,  -4, -88,  -4 },
252
        {  82,  46,  82,  46,  82,  46,  82,  46 },
253
        { -38, -78, -38, -78, -38, -78, -38, -78 },
254
        { -22,  90, -22,  90, -22,  90, -22,  90 },
255
        {  73, -82,  73, -82,  73, -82,  73, -82 },
256
        { -90,  54, -90,  54, -90,  54, -90,  54 },
257
        {  67, -13,  67, -13,  67, -13,  67, -13 },
258
        { -13, -31, -13, -31, -13, -31, -13, -31 },
259
        { -46,  67, -46,  67, -46,  67, -46,  67 },
260
        {  85, -88,  85, -88,  85, -88,  85, -88 },
261
    },{/*  29-31   */
262
        {  13,   4,  13,   4,  13,   4,  13,   4 },
263
        { -38, -13, -38, -13, -38, -13, -38, -13 },
264
        {  61,  22,  61,  22,  61,  22,  61,  22 },
265
        { -78, -31, -78, -31, -78, -31, -78, -31 },
266
        {  88,  38,  88,  38,  88,  38,  88,  38 },
267
        { -90, -46, -90, -46, -90, -46, -90, -46 },
268
        {  85,  54,  85,  54,  85,  54,  85,  54 },
269
        { -73, -61, -73, -61, -73, -61, -73, -61 },
270
        {  54,  67,  54,  67,  54,  67,  54,  67 },
271
        { -31, -73, -31, -73, -31, -73, -31, -73 },
272
        {   4,  78,   4,  78,   4,  78,   4,  78 },
273
        {  22, -82,  22, -82,  22, -82,  22, -82 },
274
        { -46,  85, -46,  85, -46,  85, -46,  85 },
275
        {  67, -88,  67, -88,  67, -88,  67, -88 },
276
        { -82,  90, -82,  90, -82,  90, -82,  90 },
277
        {  90, -90,  90, -90,  90, -90,  90, -90 },
278
    }
279
};
280
281
4.67M
#define shift_1st 7
282
289k
#define add_1st (1 << (shift_1st - 1))
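
Both constants drive the rounding step of the first (horizontal) transform pass: each _mm_add_epi32(..., m128iAdd) followed by _mm_srai_epi32(..., shift) in the functions below is the lane-wise form of a rounded arithmetic right shift. A minimal scalar sketch, not part of this file (round_shift is a hypothetical helper name):

static inline int round_shift(int x, int s)
{
    return (x + (1 << (s - 1))) >> s;   /* add half of the divisor, then shift */
}
/* First pass: s = shift_1st = 7. Second pass for 8-bit output: s = shift_2nd = 12. */
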
283
284
285
void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride)
286
0
{
287
0
    uint8_t *dst = (uint8_t*)_dst;
288
0
    ptrdiff_t stride = _stride;
289
0
    int shift = 5;
290
0
    int offset = 16;
291
0
    __m128i r0,r1,r2,r3,r4,r5,r6,r9;
292
293
0
    r9= _mm_setzero_si128();
294
    //r8= _mm_set_epi32(0,0,0,-1);
295
0
    r2= _mm_set1_epi16(offset);
296
297
0
    r0= _mm_load_si128((__m128i*)(coeffs));
298
0
    r1= _mm_load_si128((__m128i*)(coeffs+8));
299
300
301
0
    r0= _mm_adds_epi16(r0,r2);
302
0
    r1= _mm_adds_epi16(r1,r2);
303
304
0
    r0= _mm_srai_epi16(r0,shift);
305
0
    r1= _mm_srai_epi16(r1,shift);
306
307
0
    r3= _mm_loadl_epi64((__m128i*)(dst));
308
0
    r4= _mm_loadl_epi64((__m128i*)(dst + stride));
309
0
    r5= _mm_loadl_epi64((__m128i*)(dst + 2*stride));
310
0
    r6= _mm_loadl_epi64((__m128i*)(dst + 3*stride));
311
312
0
    r3= _mm_unpacklo_epi8(r3,r9);
313
0
    r4= _mm_unpacklo_epi8(r4,r9);
314
0
    r5= _mm_unpacklo_epi8(r5,r9);
315
0
    r6= _mm_unpacklo_epi8(r6,r9);
316
0
    r3= _mm_unpacklo_epi64(r3,r4);
317
0
    r4= _mm_unpacklo_epi64(r5,r6);
318
319
320
0
    r3= _mm_adds_epi16(r3,r0);
321
0
    r4= _mm_adds_epi16(r4,r1);
322
323
0
    r3= _mm_packus_epi16(r3,r4);
324
    //r8= _mm_set_epi32(0,0,0,-1);
325
326
    //_mm_maskmoveu_si128(r3,r8,(char *) (dst));
327
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(r3);
328
329
0
    r3= _mm_srli_si128(r3,4);
330
    //_mm_maskmoveu_si128(r3,r8,(char *) (dst+stride));
331
0
    *((uint32_t*)(dst+stride)) = _mm_cvtsi128_si32(r3);
332
333
0
    r3= _mm_srli_si128(r3,4);
334
    //_mm_maskmoveu_si128(r3,r8,(char *) (dst+2*stride));
335
0
    *((uint32_t*)(dst+2*stride)) = _mm_cvtsi128_si32(r3);
336
337
0
    r3= _mm_srli_si128(r3,4);
338
    //_mm_maskmoveu_si128(r3,r8,(char *) (dst+3*stride));
339
0
    *((uint32_t*)(dst+3*stride)) = _mm_cvtsi128_si32(r3);
340
0
}
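
For reference, a plain-C sketch of what ff_hevc_transform_skip_8_sse computes for a 4x4 block: each coefficient is rounded with offset 16, shifted right by 5, added to the prediction and clipped to 8 bit. Illustrative only; clip_u8 and transform_skip_8_scalar are hypothetical names, and the SSE version additionally saturates the 16-bit intermediate addition.

#include <stdint.h>
#include <stddef.h>

static inline uint8_t clip_u8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

static void transform_skip_8_scalar(uint8_t *dst, const int16_t *coeffs, ptrdiff_t stride)
{
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            dst[y * stride + x] =
                clip_u8(dst[y * stride + x] + ((coeffs[4 * y + x] + 16) >> 5));
}
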
341
342
343
344
#if HAVE_SSE4_1
345
void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
346
0
                                           ptrdiff_t _stride) {
347
348
0
    uint8_t shift_2nd = 12; // 20 - Bit depth
349
0
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
350
351
0
    uint8_t *dst = (uint8_t*) _dst;
352
0
    ptrdiff_t stride = _stride;
353
0
    const int16_t *src = coeffs;
354
0
    __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
355
0
            m128iD;
356
0
    m128iAdd = _mm_set1_epi32(64);
357
358
0
    S0 = _mm_load_si128((__m128i *) (src));
359
0
    S8 = _mm_load_si128((__m128i *) (src + 8));
360
361
0
    m128iAC = _mm_unpacklo_epi16(S0, S8);
362
0
    m128iBD = _mm_unpackhi_epi16(S0, S8);
363
364
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
365
0
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
366
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
367
0
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
368
0
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
369
0
    S0 = _mm_add_epi32(S0, m128iAdd);
370
0
    S0 = _mm_srai_epi32(S0, shift_1st);
371
372
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
373
0
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
374
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
375
0
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
376
0
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
377
0
    S8 = _mm_add_epi32(S8, m128iAdd);
378
0
    S8 = _mm_srai_epi32(S8, shift_1st);
379
380
0
    m128iA = _mm_packs_epi32(S0, S8);
381
382
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
383
0
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
384
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
385
0
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
386
0
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
387
0
    S0 = _mm_add_epi32(S0, m128iAdd);
388
0
    S0 = _mm_srai_epi32(S0, shift_1st);
389
390
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
391
0
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
392
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
393
0
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
394
0
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
395
0
    S8 = _mm_add_epi32(S8, m128iAdd);
396
0
    S8 = _mm_srai_epi32(S8, shift_1st);
397
398
0
    m128iD = _mm_packs_epi32(S0, S8);
399
400
0
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
401
0
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);
402
403
0
    m128iA = _mm_unpacklo_epi16(S0, S8);
404
0
    m128iD = _mm_unpackhi_epi16(S0, S8);
405
406
    /*   ###################    */
407
0
    m128iAdd = _mm_set1_epi32(add_2nd);
408
409
0
    m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
410
0
    m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
411
412
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
413
0
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
414
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
415
0
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
416
0
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
417
0
    S0 = _mm_add_epi32(S0, m128iAdd);
418
0
    S0 = _mm_srai_epi32(S0, shift_2nd);
419
420
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
421
0
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
422
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
423
0
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
424
0
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
425
0
    S8 = _mm_add_epi32(S8, m128iAdd);
426
0
    S8 = _mm_srai_epi32(S8, shift_2nd);
427
428
0
    m128iA = _mm_packs_epi32(S0, S8);
429
430
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
431
0
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
432
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
433
0
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
434
0
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
435
0
    S0 = _mm_add_epi32(S0, m128iAdd);
436
0
    S0 = _mm_srai_epi32(S0, shift_2nd);
437
438
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
439
0
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
440
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
441
0
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
442
0
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
443
0
    S8 = _mm_add_epi32(S8, m128iAdd);
444
0
    S8 = _mm_srai_epi32(S8, shift_2nd);
445
446
0
    m128iD = _mm_packs_epi32(S0, S8);
447
448
//    _mm_storeu_si128((__m128i *) (src), m128iA);
449
//    _mm_storeu_si128((__m128i *) (src + 8), m128iD);
450
451
0
    S0 = _mm_move_epi64(m128iA); //contains row 0
452
0
    S8 = _mm_move_epi64(m128iD); //row 2
453
0
    m128iA = _mm_srli_si128(m128iA, 8); // row 1
454
0
    m128iD = _mm_srli_si128(m128iD, 8); // row 3
455
0
    m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
456
0
    m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
457
0
    S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
458
0
    S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
459
460
    //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1);   //mask to store 4 * 8bit data
461
462
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
463
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
464
0
    m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
465
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
466
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
467
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
468
469
0
    dst += stride;
470
471
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
472
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
473
0
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
474
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
475
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
476
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
477
478
0
    dst += stride;
479
480
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
481
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
482
0
    m128iTmp1 = _mm_adds_epi16(S8, m128iA);
483
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
484
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
485
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
486
487
0
    dst += stride;
488
489
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
490
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
491
0
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
492
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
493
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
494
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
495
0
}
496
#endif // SSE4.1
497
498
#if 0
499
void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
500
        ptrdiff_t _stride) {
501
    int i,j;
502
    uint8_t shift_2nd = 10; // 20 - Bit depth
503
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
504
505
    uint16_t *dst = (uint16_t*) _dst;
506
    ptrdiff_t stride = _stride/(sizeof(uint16_t));
507
    int16_t *src = (int16_t *) coeffs; // cast away const: this disabled path stores intermediate rows back into the coefficient buffer
508
    __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
509
            m128iD;
510
511
    m128iAdd = _mm_set1_epi32(64);
512
513
    S0 = _mm_loadu_si128((__m128i *) (src));
514
    S8 = _mm_loadu_si128((__m128i *) (src + 8));
515
516
    m128iAC = _mm_unpacklo_epi16(S0, S8);
517
    m128iBD = _mm_unpackhi_epi16(S0, S8);
518
519
    m128iTmp1 = _mm_madd_epi16(m128iAC,
520
            _mm_loadu_si128((__m128i *) (transform4x4_luma[0])));
521
    m128iTmp2 = _mm_madd_epi16(m128iBD,
522
            _mm_loadu_si128((__m128i *) (transform4x4_luma[1])));
523
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
524
    S0 = _mm_add_epi32(S0, m128iAdd);
525
    S0 = _mm_srai_epi32(S0, shift_1st);
526
527
    m128iTmp1 = _mm_madd_epi16(m128iAC,
528
            _mm_loadu_si128((__m128i *) (transform4x4_luma[2])));
529
    m128iTmp2 = _mm_madd_epi16(m128iBD,
530
            _mm_loadu_si128((__m128i *) (transform4x4_luma[3])));
531
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
532
    S8 = _mm_add_epi32(S8, m128iAdd);
533
    S8 = _mm_srai_epi32(S8, shift_1st);
534
535
    m128iA = _mm_packs_epi32(S0, S8);
536
537
    m128iTmp1 = _mm_madd_epi16(m128iAC,
538
            _mm_loadu_si128((__m128i *) (transform4x4_luma[4])));
539
    m128iTmp2 = _mm_madd_epi16(m128iBD,
540
            _mm_loadu_si128((__m128i *) (transform4x4_luma[5])));
541
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
542
    S0 = _mm_add_epi32(S0, m128iAdd);
543
    S0 = _mm_srai_epi32(S0, shift_1st);
544
545
    m128iTmp1 = _mm_madd_epi16(m128iAC,
546
            _mm_loadu_si128((__m128i *) (transform4x4_luma[6])));
547
    m128iTmp2 = _mm_madd_epi16(m128iBD,
548
            _mm_loadu_si128((__m128i *) (transform4x4_luma[7])));
549
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
550
    S8 = _mm_add_epi32(S8, m128iAdd);
551
    S8 = _mm_srai_epi32(S8, shift_1st);
552
553
    m128iD = _mm_packs_epi32(S0, S8);
554
555
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
556
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);
557
558
    m128iA = _mm_unpacklo_epi16(S0, S8);
559
    m128iD = _mm_unpackhi_epi16(S0, S8);
560
561
    /*   ###################    */
562
    m128iAdd = _mm_set1_epi32(add_2nd);
563
564
    m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
565
    m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
566
567
    m128iTmp1 = _mm_madd_epi16(m128iAC,
568
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
569
    m128iTmp2 = _mm_madd_epi16(m128iBD,
570
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
571
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
572
    S0 = _mm_add_epi32(S0, m128iAdd);
573
    S0 = _mm_srai_epi32(S0, shift_2nd);
574
575
    m128iTmp1 = _mm_madd_epi16(m128iAC,
576
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
577
    m128iTmp2 = _mm_madd_epi16(m128iBD,
578
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
579
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
580
    S8 = _mm_add_epi32(S8, m128iAdd);
581
    S8 = _mm_srai_epi32(S8, shift_2nd);
582
583
    m128iA = _mm_packs_epi32(S0, S8);
584
585
    m128iTmp1 = _mm_madd_epi16(m128iAC,
586
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
587
    m128iTmp2 = _mm_madd_epi16(m128iBD,
588
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
589
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
590
    S0 = _mm_add_epi32(S0, m128iAdd);
591
    S0 = _mm_srai_epi32(S0, shift_2nd);
592
593
    m128iTmp1 = _mm_madd_epi16(m128iAC,
594
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
595
    m128iTmp2 = _mm_madd_epi16(m128iBD,
596
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
597
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
598
    S8 = _mm_add_epi32(S8, m128iAdd);
599
    S8 = _mm_srai_epi32(S8, shift_2nd);
600
601
    m128iD = _mm_packs_epi32(S0, S8);
602
603
    _mm_storeu_si128((__m128i *) (src), m128iA);
604
    _mm_storeu_si128((__m128i *) (src + 8), m128iD);
605
    j = 0;
606
    for (i = 0; i < 2; i++) {
607
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
608
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
609
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
610
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
611
        j += 1;
612
        dst += stride;
613
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
614
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
615
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
616
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
617
        j += 1;
618
        dst += stride;
619
    }
620
621
}
622
#endif
623
624
625
#if HAVE_SSE4_1
626
void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
627
0
        ptrdiff_t _stride) {
628
0
    uint8_t shift_2nd = 12; // 20 - Bit depth
629
0
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
630
631
0
    uint8_t *dst = (uint8_t*) _dst;
632
0
    ptrdiff_t stride = _stride;
633
0
    const int16_t *src = coeffs;
634
635
0
    __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD, m128iTmp1,m128iTmp2;
636
0
    S0 = _mm_load_si128((__m128i *) (src));
637
0
    S8 = _mm_load_si128((__m128i *) (src + 8));
638
0
    m128iAdd = _mm_set1_epi32(add_1st);
639
640
0
    m128Tmp = _mm_unpacklo_epi16(S0, S8);
641
0
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
642
0
    E1 = _mm_add_epi32(E1, m128iAdd);
643
644
0
    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
645
0
    E2 = _mm_add_epi32(E2, m128iAdd);
646
647
0
    m128Tmp = _mm_unpackhi_epi16(S0, S8);
648
0
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
649
0
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
650
651
0
    m128iA = _mm_add_epi32(E1, O1);
652
0
    m128iA = _mm_srai_epi32(m128iA, shift_1st);        // Sum = Sum >> iShiftNum
653
0
    m128Tmp = _mm_add_epi32(E2, O2);
654
0
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
655
0
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);
656
657
0
    m128iD = _mm_sub_epi32(E2, O2);
658
0
    m128iD = _mm_srai_epi32(m128iD, shift_1st);        // Sum = Sum >> iShiftNum
659
660
0
    m128Tmp = _mm_sub_epi32(E1, O1);
661
0
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
662
663
0
    m128iD = _mm_packs_epi32(m128iD, m128Tmp);
664
665
0
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
666
0
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);
667
668
0
    m128iA = _mm_unpacklo_epi16(S0, S8);
669
0
    m128iD = _mm_unpackhi_epi16(S0, S8);
670
671
    /*  ##########################  */
672
673
0
    m128iAdd = _mm_set1_epi32(add_2nd);
674
0
    m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
675
0
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
676
0
    E1 = _mm_add_epi32(E1, m128iAdd);
677
678
0
    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
679
0
    E2 = _mm_add_epi32(E2, m128iAdd);
680
681
0
    m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
682
0
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
683
0
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
684
685
0
    m128iA = _mm_add_epi32(E1, O1);
686
0
    m128iA = _mm_srai_epi32(m128iA, shift_2nd);
687
0
    m128Tmp = _mm_add_epi32(E2, O2);
688
0
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
689
0
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);
690
691
0
    m128iD = _mm_sub_epi32(E2, O2);
692
0
    m128iD = _mm_srai_epi32(m128iD, shift_2nd);
693
694
0
    m128Tmp = _mm_sub_epi32(E1, O1);
695
0
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
696
697
0
    m128iD = _mm_packs_epi32(m128iD, m128Tmp);
698
699
0
    S0 = _mm_move_epi64(m128iA); //contains row 0
700
0
    S8 = _mm_move_epi64(m128iD); //row 2
701
0
    m128iA = _mm_srli_si128(m128iA, 8); // row 1
702
0
    m128iD = _mm_srli_si128(m128iD, 8); // row 3
703
0
    m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
704
0
    m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
705
0
    S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
706
0
    S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
707
708
    //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1);   //mask to store 4 * 8bit data
709
710
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
711
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
712
0
    m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
713
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
714
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
715
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
716
717
0
    dst += stride;
718
719
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
720
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
721
0
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
722
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
723
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
724
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
725
726
0
    dst += stride;
727
728
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
729
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
730
0
    m128iTmp1 = _mm_adds_epi16(S8, m128iA);
731
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
732
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
733
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
734
735
0
    dst += stride;
736
737
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
738
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
739
0
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
740
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
741
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
742
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
743
0
}
744
#endif
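
The 4x4 routine above uses the usual HEVC even/odd butterfly: even-index inputs are combined with the {64, 64} and {64, -64} rows of transform4x4, odd-index inputs with {83, 36} and {36, -83}, and the outputs are formed as E+O and E-O before the rounded shift, once per pass. A scalar sketch of one 4-point pass (illustrative; idct4_pass is a hypothetical name and the 16-bit clamping between passes is omitted):

#include <stdint.h>

static void idct4_pass(const int16_t src[4], int dst[4], int shift)
{
    const int add = 1 << (shift - 1);
    int e0 = 64 * src[0] + 64 * src[2];   /* transform4x4[0] */
    int e1 = 64 * src[0] - 64 * src[2];   /* transform4x4[1] */
    int o0 = 83 * src[1] + 36 * src[3];   /* transform4x4[2] */
    int o1 = 36 * src[1] - 83 * src[3];   /* transform4x4[3] */
    dst[0] = (e0 + o0 + add) >> shift;
    dst[1] = (e1 + o1 + add) >> shift;
    dst[2] = (e1 - o1 + add) >> shift;
    dst[3] = (e0 - o0 + add) >> shift;
}
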
745
746
#if 0
747
void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
748
        ptrdiff_t _stride) {
749
    int i;
750
    uint8_t shift_2nd = 10; // 20 - Bit depth
751
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
752
753
    uint16_t *dst = (uint16_t*) _dst;
754
    ptrdiff_t stride = _stride/2;
755
    int16_t *src = (int16_t *) coeffs; // cast away const: this disabled path stores intermediate rows back into the coefficient buffer
756
757
    int j;
758
        __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD;
759
        S0 = _mm_load_si128((__m128i *) (src));
760
        S8 = _mm_load_si128((__m128i *) (src + 8));
761
        m128iAdd = _mm_set1_epi32(add_1st);
762
763
        m128Tmp = _mm_unpacklo_epi16(S0, S8);
764
        E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
765
        E1 = _mm_add_epi32(E1, m128iAdd);
766
767
        E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
768
        E2 = _mm_add_epi32(E2, m128iAdd);
769
770
        m128Tmp = _mm_unpackhi_epi16(S0, S8);
771
        O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
772
        O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
773
774
        m128iA = _mm_add_epi32(E1, O1);
775
        m128iA = _mm_srai_epi32(m128iA, shift_1st);        // Sum = Sum >> iShiftNum
776
        m128Tmp = _mm_add_epi32(E2, O2);
777
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
778
        m128iA = _mm_packs_epi32(m128iA, m128Tmp);
779
780
        m128iD = _mm_sub_epi32(E2, O2);
781
        m128iD = _mm_srai_epi32(m128iD, shift_1st);        // Sum = Sum >> iShiftNum
782
783
        m128Tmp = _mm_sub_epi32(E1, O1);
784
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
785
786
        m128iD = _mm_packs_epi32(m128iD, m128Tmp);
787
788
        S0 = _mm_unpacklo_epi16(m128iA, m128iD);
789
        S8 = _mm_unpackhi_epi16(m128iA, m128iD);
790
791
        m128iA = _mm_unpacklo_epi16(S0, S8);
792
        m128iD = _mm_unpackhi_epi16(S0, S8);
793
794
        /*  ##########################  */
795
796
        m128iAdd = _mm_set1_epi32(add_2nd);
797
        m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
798
        E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
799
        E1 = _mm_add_epi32(E1, m128iAdd);
800
801
        E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
802
        E2 = _mm_add_epi32(E2, m128iAdd);
803
804
        m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
805
        O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
806
        O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
807
808
        m128iA = _mm_add_epi32(E1, O1);
809
        m128iA = _mm_srai_epi32(m128iA, shift_2nd);
810
        m128Tmp = _mm_add_epi32(E2, O2);
811
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
812
        m128iA = _mm_packs_epi32(m128iA, m128Tmp);
813
814
        m128iD = _mm_sub_epi32(E2, O2);
815
        m128iD = _mm_srai_epi32(m128iD, shift_2nd);
816
817
        m128Tmp = _mm_sub_epi32(E1, O1);
818
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
819
820
        m128iD = _mm_packs_epi32(m128iD, m128Tmp);
821
        _mm_storeu_si128((__m128i *) (src), m128iA);
822
        _mm_storeu_si128((__m128i *) (src + 8), m128iD);
823
        j = 0;
824
        for (i = 0; i < 2; i++) {
825
            dst[0] = av_clip_uintp2(dst[0] + src[j],10);
826
            dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
827
            dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
828
            dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
829
            j += 1;
830
            dst += stride;
831
            dst[0] = av_clip_uintp2(dst[0] + src[j],10);
832
            dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
833
            dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
834
            dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
835
            j += 1;
836
            dst += stride;
837
        }
838
}
839
#endif
840
841
#if HAVE_SSE4_1
842
void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
843
272k
        ptrdiff_t _stride) {
844
272k
    uint8_t shift_2nd = 12; // 20 - Bit depth
845
272k
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
846
847
272k
    uint8_t *dst = (uint8_t*) _dst;
848
272k
    ptrdiff_t stride = _stride / sizeof(uint8_t);
849
272k
    const int16_t *src = coeffs;
850
272k
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
851
272k
            m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
852
272k
            E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
853
854
272k
            O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h,
855
272k
            T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11;
856
272k
    T0= _mm_load_si128((__m128i *) (transform8x8[0]));
857
272k
    T1= _mm_load_si128((__m128i *) (transform8x8[1]));
858
272k
    T2= _mm_load_si128((__m128i *) (transform8x8[2]));
859
272k
    T3= _mm_load_si128((__m128i *) (transform8x8[3]));
860
272k
    T4= _mm_load_si128((__m128i *) (transform8x8[4]));
861
272k
    T5= _mm_load_si128((__m128i *) (transform8x8[5]));
862
272k
    T6= _mm_load_si128((__m128i *) (transform8x8[6]));
863
272k
    T7= _mm_load_si128((__m128i *) (transform8x8[7]));
864
272k
    T8= _mm_load_si128((__m128i *) (transform8x8[8]));
865
272k
    T9= _mm_load_si128((__m128i *) (transform8x8[9]));
866
272k
    T10= _mm_load_si128((__m128i *) (transform8x8[10]));
867
272k
    T11= _mm_load_si128((__m128i *) (transform8x8[11]));
868
869
272k
    m128iAdd = _mm_set1_epi32(add_1st);
870
871
272k
    m128iS1 = _mm_load_si128((__m128i *) (src + 8));
872
272k
    m128iS3 = _mm_load_si128((__m128i *) (src + 24));
873
272k
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
874
272k
    E1l = _mm_madd_epi16(m128Tmp0, T0);
875
272k
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
876
272k
    E1h = _mm_madd_epi16(m128Tmp1, T0);
877
272k
    m128iS5 = _mm_load_si128((__m128i *) (src + 40));
878
272k
    m128iS7 = _mm_load_si128((__m128i *) (src + 56));
879
272k
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
880
272k
    E2l = _mm_madd_epi16(m128Tmp2, T1);
881
272k
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
882
272k
    E2h = _mm_madd_epi16(m128Tmp3, T1);
883
272k
    O0l = _mm_add_epi32(E1l, E2l);
884
272k
    O0h = _mm_add_epi32(E1h, E2h);
885
886
272k
    E1l = _mm_madd_epi16(m128Tmp0, T2);
887
272k
    E1h = _mm_madd_epi16(m128Tmp1, T2);
888
272k
    E2l = _mm_madd_epi16(m128Tmp2, T3);
889
272k
    E2h = _mm_madd_epi16(m128Tmp3, T3);
890
891
272k
    O1l = _mm_add_epi32(E1l, E2l);
892
272k
    O1h = _mm_add_epi32(E1h, E2h);
893
894
272k
    E1l = _mm_madd_epi16(m128Tmp0, T4);
895
272k
    E1h = _mm_madd_epi16(m128Tmp1, T4);
896
272k
    E2l = _mm_madd_epi16(m128Tmp2, T5);
897
272k
    E2h = _mm_madd_epi16(m128Tmp3, T5);
898
272k
    O2l = _mm_add_epi32(E1l, E2l);
899
272k
    O2h = _mm_add_epi32(E1h, E2h);
900
901
272k
    E1l = _mm_madd_epi16(m128Tmp0, T6);
902
272k
    E1h = _mm_madd_epi16(m128Tmp1, T6);
903
272k
    E2l = _mm_madd_epi16(m128Tmp2, T7);
904
272k
    E2h = _mm_madd_epi16(m128Tmp3, T7);
905
272k
    O3h = _mm_add_epi32(E1h, E2h);
906
272k
    O3l = _mm_add_epi32(E1l, E2l);
907
908
    /*    -------     */
909
910
272k
    m128iS0 = _mm_load_si128((__m128i *) (src + 0));
911
272k
    m128iS4 = _mm_load_si128((__m128i *) (src + 32));
912
272k
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
913
272k
    EE0l = _mm_madd_epi16(m128Tmp0, T8);
914
272k
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
915
272k
    EE0h = _mm_madd_epi16(m128Tmp1, T8);
916
917
272k
    EE1l = _mm_madd_epi16(m128Tmp0, T9);
918
272k
    EE1h = _mm_madd_epi16(m128Tmp1, T9);
919
920
    /*    -------     */
921
922
272k
    m128iS2 = _mm_load_si128((__m128i *) (src + 16));
923
272k
    m128iS6 = _mm_load_si128((__m128i *) (src + 48));
924
272k
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
925
272k
    E00l = _mm_madd_epi16(m128Tmp0, T10);
926
272k
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
927
272k
    E00h = _mm_madd_epi16(m128Tmp1, T10);
928
272k
    E01l = _mm_madd_epi16(m128Tmp0, T11);
929
272k
    E01h = _mm_madd_epi16(m128Tmp1, T11);
930
272k
    E0l = _mm_add_epi32(EE0l, E00l);
931
272k
    E0l = _mm_add_epi32(E0l, m128iAdd);
932
272k
    E0h = _mm_add_epi32(EE0h, E00h);
933
272k
    E0h = _mm_add_epi32(E0h, m128iAdd);
934
272k
    E3l = _mm_sub_epi32(EE0l, E00l);
935
272k
    E3l = _mm_add_epi32(E3l, m128iAdd);
936
272k
    E3h = _mm_sub_epi32(EE0h, E00h);
937
272k
    E3h = _mm_add_epi32(E3h, m128iAdd);
938
939
272k
    E1l = _mm_add_epi32(EE1l, E01l);
940
272k
    E1l = _mm_add_epi32(E1l, m128iAdd);
941
272k
    E1h = _mm_add_epi32(EE1h, E01h);
942
272k
    E1h = _mm_add_epi32(E1h, m128iAdd);
943
272k
    E2l = _mm_sub_epi32(EE1l, E01l);
944
272k
    E2l = _mm_add_epi32(E2l, m128iAdd);
945
272k
    E2h = _mm_sub_epi32(EE1h, E01h);
946
272k
    E2h = _mm_add_epi32(E2h, m128iAdd);
947
272k
    m128iS0 = _mm_packs_epi32(
948
272k
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
949
272k
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
950
272k
    m128iS1 = _mm_packs_epi32(
951
272k
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
952
272k
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
953
272k
    m128iS2 = _mm_packs_epi32(
954
272k
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
955
272k
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
956
272k
    m128iS3 = _mm_packs_epi32(
957
272k
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
958
272k
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
959
272k
    m128iS4 = _mm_packs_epi32(
960
272k
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
961
272k
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
962
272k
    m128iS5 = _mm_packs_epi32(
963
272k
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
964
272k
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
965
272k
    m128iS6 = _mm_packs_epi32(
966
272k
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
967
272k
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
968
272k
    m128iS7 = _mm_packs_epi32(
969
272k
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
970
272k
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
971
    /*  Inverse matrix  */
972
973
272k
    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
974
272k
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
975
272k
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
976
272k
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
977
272k
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
978
272k
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
979
272k
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
980
272k
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
981
272k
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
982
272k
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
983
272k
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
984
272k
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
985
272k
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
986
272k
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
987
272k
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
988
272k
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
989
272k
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
990
272k
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
991
272k
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
992
272k
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
993
272k
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
994
272k
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
995
272k
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
996
272k
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
997
998
272k
    m128iAdd = _mm_set1_epi32(add_2nd);
999
1000
272k
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1001
272k
    E1l = _mm_madd_epi16(m128Tmp0, T0);
1002
272k
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1003
272k
    E1h = _mm_madd_epi16(m128Tmp1, T0);
1004
272k
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1005
272k
    E2l = _mm_madd_epi16(m128Tmp2, T1);
1006
272k
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1007
272k
    E2h = _mm_madd_epi16(m128Tmp3, T1);
1008
272k
    O0l = _mm_add_epi32(E1l, E2l);
1009
272k
    O0h = _mm_add_epi32(E1h, E2h);
1010
272k
    E1l = _mm_madd_epi16(m128Tmp0, T2);
1011
272k
    E1h = _mm_madd_epi16(m128Tmp1, T2);
1012
272k
    E2l = _mm_madd_epi16(m128Tmp2, T3);
1013
272k
    E2h = _mm_madd_epi16(m128Tmp3, T3);
1014
272k
    O1l = _mm_add_epi32(E1l, E2l);
1015
272k
    O1h = _mm_add_epi32(E1h, E2h);
1016
272k
    E1l = _mm_madd_epi16(m128Tmp0, T4);
1017
272k
    E1h = _mm_madd_epi16(m128Tmp1, T4);
1018
272k
    E2l = _mm_madd_epi16(m128Tmp2, T5);
1019
272k
    E2h = _mm_madd_epi16(m128Tmp3, T5);
1020
272k
    O2l = _mm_add_epi32(E1l, E2l);
1021
272k
    O2h = _mm_add_epi32(E1h, E2h);
1022
272k
    E1l = _mm_madd_epi16(m128Tmp0, T6);
1023
272k
    E1h = _mm_madd_epi16(m128Tmp1, T6);
1024
272k
    E2l = _mm_madd_epi16(m128Tmp2, T7);
1025
272k
    E2h = _mm_madd_epi16(m128Tmp3, T7);
1026
272k
    O3h = _mm_add_epi32(E1h, E2h);
1027
272k
    O3l = _mm_add_epi32(E1l, E2l);
1028
1029
272k
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1030
272k
    EE0l = _mm_madd_epi16(m128Tmp0, T8);
1031
272k
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1032
272k
    EE0h = _mm_madd_epi16(m128Tmp1, T8);
1033
272k
    EE1l = _mm_madd_epi16(m128Tmp0, T9);
1034
272k
    EE1h = _mm_madd_epi16(m128Tmp1, T9);
1035
1036
272k
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1037
272k
    E00l = _mm_madd_epi16(m128Tmp0, T10);
1038
272k
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1039
272k
    E00h = _mm_madd_epi16(m128Tmp1, T10);
1040
272k
    E01l = _mm_madd_epi16(m128Tmp0, T11);
1041
272k
    E01h = _mm_madd_epi16(m128Tmp1, T11);
1042
272k
    E0l = _mm_add_epi32(EE0l, E00l);
1043
272k
    E0l = _mm_add_epi32(E0l, m128iAdd);
1044
272k
    E0h = _mm_add_epi32(EE0h, E00h);
1045
272k
    E0h = _mm_add_epi32(E0h, m128iAdd);
1046
272k
    E3l = _mm_sub_epi32(EE0l, E00l);
1047
272k
    E3l = _mm_add_epi32(E3l, m128iAdd);
1048
272k
    E3h = _mm_sub_epi32(EE0h, E00h);
1049
272k
    E3h = _mm_add_epi32(E3h, m128iAdd);
1050
272k
    E1l = _mm_add_epi32(EE1l, E01l);
1051
272k
    E1l = _mm_add_epi32(E1l, m128iAdd);
1052
272k
    E1h = _mm_add_epi32(EE1h, E01h);
1053
272k
    E1h = _mm_add_epi32(E1h, m128iAdd);
1054
272k
    E2l = _mm_sub_epi32(EE1l, E01l);
1055
272k
    E2l = _mm_add_epi32(E2l, m128iAdd);
1056
272k
    E2h = _mm_sub_epi32(EE1h, E01h);
1057
272k
    E2h = _mm_add_epi32(E2h, m128iAdd);
1058
1059
272k
    m128iS0 = _mm_packs_epi32(
1060
272k
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
1061
272k
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
1062
272k
    m128iS1 = _mm_packs_epi32(
1063
272k
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
1064
272k
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
1065
272k
    m128iS2 = _mm_packs_epi32(
1066
272k
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
1067
272k
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
1068
272k
    m128iS3 = _mm_packs_epi32(
1069
272k
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
1070
272k
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
1071
272k
    m128iS4 = _mm_packs_epi32(
1072
272k
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
1073
272k
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
1074
272k
    m128iS5 = _mm_packs_epi32(
1075
272k
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
1076
272k
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
1077
272k
    m128iS6 = _mm_packs_epi32(
1078
272k
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
1079
272k
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
1080
272k
    m128iS7 = _mm_packs_epi32(
1081
272k
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
1082
272k
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));
1083
1084
272k
    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
1085
272k
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
1086
272k
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
1087
272k
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
1088
272k
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
1089
272k
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
1090
272k
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
1091
272k
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
1092
272k
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
1093
272k
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
1094
272k
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1095
272k
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1096
272k
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
1097
272k
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
1098
272k
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1099
272k
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1100
272k
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
1101
272k
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
1102
272k
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1103
272k
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1104
272k
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
1105
272k
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
1106
272k
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1107
272k
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1108
1109
272k
    E0l = _mm_loadl_epi64((__m128i *) dst);
1110
272k
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1111
1112
272k
    E0l = _mm_adds_epi16(E0l, m128iS0);
1113
272k
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1114
272k
    _mm_storel_epi64((__m128i *) dst, E0l);
1115
272k
    dst += stride;
1116
1117
272k
    E0l = _mm_loadl_epi64((__m128i *) dst);
1118
272k
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1119
1120
272k
    E0l = _mm_adds_epi16(E0l, m128iS1);
1121
272k
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1122
272k
    _mm_storel_epi64((__m128i *) dst, E0l);
1123
272k
    dst += stride;
1124
1125
272k
    E0l = _mm_loadl_epi64((__m128i *) dst);
1126
272k
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1127
1128
272k
    E0l = _mm_adds_epi16(E0l, m128iS2);
1129
272k
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1130
272k
    _mm_storel_epi64((__m128i *) dst, E0l);
1131
272k
    dst += stride;
1132
1133
272k
    E0l = _mm_loadl_epi64((__m128i *) dst);
1134
272k
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1135
1136
272k
    E0l = _mm_adds_epi16(E0l, m128iS3);
1137
272k
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1138
272k
    _mm_storel_epi64((__m128i *) dst, E0l);
1139
272k
    dst += stride;
1140
1141
272k
    E0l = _mm_loadl_epi64((__m128i *) dst);
1142
272k
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1143
1144
272k
    E0l = _mm_adds_epi16(E0l, m128iS4);
1145
272k
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1146
272k
    _mm_storel_epi64((__m128i *) dst, E0l);
1147
272k
    dst += stride;
1148
1149
272k
    E0l = _mm_loadl_epi64((__m128i *) dst);
1150
272k
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1151
1152
272k
    E0l = _mm_adds_epi16(E0l, m128iS5);
1153
272k
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1154
272k
    _mm_storel_epi64((__m128i *) dst, E0l);
1155
272k
    dst += stride;
1156
1157
272k
    E0l = _mm_loadl_epi64((__m128i *) dst);
1158
272k
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1159
1160
272k
    E0l = _mm_adds_epi16(E0l, m128iS6);
1161
272k
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1162
272k
    _mm_storel_epi64((__m128i *) dst, E0l);
1163
272k
    dst += stride;
1164
1165
272k
    E0l = _mm_loadl_epi64((__m128i *) dst);
1166
272k
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1167
1168
272k
    E0l = _mm_adds_epi16(E0l, m128iS7);
1169
272k
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1170
272k
    _mm_storel_epi64((__m128i *) dst, E0l);
1171
272k
    dst += stride;
1172
1173
272k
}
1174
#endif
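
The 8x8 path applies the same even/odd split one level up: the odd inputs (rows 1, 3, 5, 7) are combined with transform8x8[0..7], the even inputs reuse the 4-point structure through transform8x8[8..11], and each output pair is E+O / E-O. A scalar sketch of one 8-point pass (illustrative; idct8_pass is a hypothetical name, 16-bit clamping between passes omitted):

#include <stdint.h>

static void idct8_pass(const int16_t src[8], int dst[8], int shift)
{
    const int add = 1 << (shift - 1);
    int o[4], e[4];
    o[0] = 89*src[1] + 75*src[3] + 50*src[5] + 18*src[7];   /* transform8x8[0..1] */
    o[1] = 75*src[1] - 18*src[3] - 89*src[5] - 50*src[7];   /* transform8x8[2..3] */
    o[2] = 50*src[1] - 89*src[3] + 18*src[5] + 75*src[7];   /* transform8x8[4..5] */
    o[3] = 18*src[1] - 50*src[3] + 75*src[5] - 89*src[7];   /* transform8x8[6..7] */
    int ee0 = 64*src[0] + 64*src[4], ee1 = 64*src[0] - 64*src[4];   /* [8..9]   */
    int e00 = 83*src[2] + 36*src[6], e01 = 36*src[2] - 83*src[6];   /* [10..11] */
    e[0] = ee0 + e00; e[1] = ee1 + e01; e[2] = ee1 - e01; e[3] = ee0 - e00;
    for (int k = 0; k < 4; k++) {
        dst[k]     = (e[k] + o[k] + add) >> shift;
        dst[7 - k] = (e[k] - o[k] + add) >> shift;
    }
}
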
1175
1176
#if 0
1177
void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
1178
        ptrdiff_t _stride) {
1179
    int i;
1180
    uint16_t *dst = (uint16_t*) _dst;
1181
    ptrdiff_t stride = _stride / sizeof(uint16_t);
1182
    int16_t *src = (int16_t *) coeffs; // cast away const: this disabled path reuses the coefficient buffer as scratch
1183
    uint8_t shift_2nd = 10; // 20 - Bit depth
1184
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
1185
1186
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1187
            m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
1188
            E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
1189
            O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
1190
    int j;
1191
    m128iAdd = _mm_set1_epi32(add_1st);
1192
1193
    m128iS1 = _mm_load_si128((__m128i *) (src + 8));
1194
    m128iS3 = _mm_load_si128((__m128i *) (src + 24));
1195
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1196
    E1l = _mm_madd_epi16(m128Tmp0,
1197
            _mm_load_si128((__m128i *) (transform8x8[0])));
1198
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1199
    E1h = _mm_madd_epi16(m128Tmp1,
1200
            _mm_load_si128((__m128i *) (transform8x8[0])));
1201
    m128iS5 = _mm_load_si128((__m128i *) (src + 40));
1202
    m128iS7 = _mm_load_si128((__m128i *) (src + 56));
1203
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1204
    E2l = _mm_madd_epi16(m128Tmp2,
1205
            _mm_load_si128((__m128i *) (transform8x8[1])));
1206
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1207
    E2h = _mm_madd_epi16(m128Tmp3,
1208
            _mm_load_si128((__m128i *) (transform8x8[1])));
1209
    O0l = _mm_add_epi32(E1l, E2l);
1210
    O0h = _mm_add_epi32(E1h, E2h);
1211
1212
    E1l = _mm_madd_epi16(m128Tmp0,
1213
            _mm_load_si128((__m128i *) (transform8x8[2])));
1214
    E1h = _mm_madd_epi16(m128Tmp1,
1215
            _mm_load_si128((__m128i *) (transform8x8[2])));
1216
    E2l = _mm_madd_epi16(m128Tmp2,
1217
            _mm_load_si128((__m128i *) (transform8x8[3])));
1218
    E2h = _mm_madd_epi16(m128Tmp3,
1219
            _mm_load_si128((__m128i *) (transform8x8[3])));
1220
1221
    O1l = _mm_add_epi32(E1l, E2l);
1222
    O1h = _mm_add_epi32(E1h, E2h);
1223
1224
    E1l = _mm_madd_epi16(m128Tmp0,
1225
            _mm_load_si128((__m128i *) (transform8x8[4])));
1226
    E1h = _mm_madd_epi16(m128Tmp1,
1227
            _mm_load_si128((__m128i *) (transform8x8[4])));
1228
    E2l = _mm_madd_epi16(m128Tmp2,
1229
            _mm_load_si128((__m128i *) (transform8x8[5])));
1230
    E2h = _mm_madd_epi16(m128Tmp3,
1231
            _mm_load_si128((__m128i *) (transform8x8[5])));
1232
    O2l = _mm_add_epi32(E1l, E2l);
1233
    O2h = _mm_add_epi32(E1h, E2h);
1234
1235
    E1l = _mm_madd_epi16(m128Tmp0,
1236
            _mm_load_si128((__m128i *) (transform8x8[6])));
1237
    E1h = _mm_madd_epi16(m128Tmp1,
1238
            _mm_load_si128((__m128i *) (transform8x8[6])));
1239
    E2l = _mm_madd_epi16(m128Tmp2,
1240
            _mm_load_si128((__m128i *) (transform8x8[7])));
1241
    E2h = _mm_madd_epi16(m128Tmp3,
1242
            _mm_load_si128((__m128i *) (transform8x8[7])));
1243
    O3h = _mm_add_epi32(E1h, E2h);
1244
    O3l = _mm_add_epi32(E1l, E2l);
1245
1246
    /*    -------     */
1247
1248
    m128iS0 = _mm_load_si128((__m128i *) (src + 0));
1249
    m128iS4 = _mm_load_si128((__m128i *) (src + 32));
1250
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1251
    EE0l = _mm_madd_epi16(m128Tmp0,
1252
            _mm_load_si128((__m128i *) (transform8x8[8])));
1253
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1254
    EE0h = _mm_madd_epi16(m128Tmp1,
1255
            _mm_load_si128((__m128i *) (transform8x8[8])));
1256
1257
    EE1l = _mm_madd_epi16(m128Tmp0,
1258
            _mm_load_si128((__m128i *) (transform8x8[9])));
1259
    EE1h = _mm_madd_epi16(m128Tmp1,
1260
            _mm_load_si128((__m128i *) (transform8x8[9])));
1261
1262
    /*    -------     */
1263
1264
    m128iS2 = _mm_load_si128((__m128i *) (src + 16));
1265
    m128iS6 = _mm_load_si128((__m128i *) (src + 48));
1266
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1267
    E00l = _mm_madd_epi16(m128Tmp0,
1268
            _mm_load_si128((__m128i *) (transform8x8[10])));
1269
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1270
    E00h = _mm_madd_epi16(m128Tmp1,
1271
            _mm_load_si128((__m128i *) (transform8x8[10])));
1272
    E01l = _mm_madd_epi16(m128Tmp0,
1273
            _mm_load_si128((__m128i *) (transform8x8[11])));
1274
    E01h = _mm_madd_epi16(m128Tmp1,
1275
            _mm_load_si128((__m128i *) (transform8x8[11])));
1276
    E0l = _mm_add_epi32(EE0l, E00l);
1277
    E0l = _mm_add_epi32(E0l, m128iAdd);
1278
    E0h = _mm_add_epi32(EE0h, E00h);
1279
    E0h = _mm_add_epi32(E0h, m128iAdd);
1280
    E3l = _mm_sub_epi32(EE0l, E00l);
1281
    E3l = _mm_add_epi32(E3l, m128iAdd);
1282
    E3h = _mm_sub_epi32(EE0h, E00h);
1283
    E3h = _mm_add_epi32(E3h, m128iAdd);
1284
1285
    E1l = _mm_add_epi32(EE1l, E01l);
1286
    E1l = _mm_add_epi32(E1l, m128iAdd);
1287
    E1h = _mm_add_epi32(EE1h, E01h);
1288
    E1h = _mm_add_epi32(E1h, m128iAdd);
1289
    E2l = _mm_sub_epi32(EE1l, E01l);
1290
    E2l = _mm_add_epi32(E2l, m128iAdd);
1291
    E2h = _mm_sub_epi32(EE1h, E01h);
1292
    E2h = _mm_add_epi32(E2h, m128iAdd);
1293
    m128iS0 = _mm_packs_epi32(
1294
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
1295
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
1296
    m128iS1 = _mm_packs_epi32(
1297
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
1298
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
1299
    m128iS2 = _mm_packs_epi32(
1300
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
1301
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
1302
    m128iS3 = _mm_packs_epi32(
1303
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
1304
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
1305
    m128iS4 = _mm_packs_epi32(
1306
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
1307
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
1308
    m128iS5 = _mm_packs_epi32(
1309
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
1310
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
1311
    m128iS6 = _mm_packs_epi32(
1312
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
1313
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
1314
    m128iS7 = _mm_packs_epi32(
1315
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
1316
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
1317
    /*  Transpose the matrix for the second pass   */
1318
1319
    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
1320
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
1321
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
1322
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
1323
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
1324
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
1325
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
1326
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
1327
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
1328
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
1329
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1330
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1331
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
1332
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
1333
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1334
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1335
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
1336
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
1337
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1338
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1339
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
1340
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
1341
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1342
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1343
1344
    m128iAdd = _mm_set1_epi32(add_2nd);
1345
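    /* Second pass: repeat the same even/odd butterfly on the transposed data,
       now rounding with add_2nd and shifting by shift_2nd. */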
1346
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1347
    E1l = _mm_madd_epi16(m128Tmp0,
1348
            _mm_load_si128((__m128i *) (transform8x8[0])));
1349
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1350
    E1h = _mm_madd_epi16(m128Tmp1,
1351
            _mm_load_si128((__m128i *) (transform8x8[0])));
1352
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1353
    E2l = _mm_madd_epi16(m128Tmp2,
1354
            _mm_load_si128((__m128i *) (transform8x8[1])));
1355
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1356
    E2h = _mm_madd_epi16(m128Tmp3,
1357
            _mm_load_si128((__m128i *) (transform8x8[1])));
1358
    O0l = _mm_add_epi32(E1l, E2l);
1359
    O0h = _mm_add_epi32(E1h, E2h);
1360
    E1l = _mm_madd_epi16(m128Tmp0,
1361
            _mm_load_si128((__m128i *) (transform8x8[2])));
1362
    E1h = _mm_madd_epi16(m128Tmp1,
1363
            _mm_load_si128((__m128i *) (transform8x8[2])));
1364
    E2l = _mm_madd_epi16(m128Tmp2,
1365
            _mm_load_si128((__m128i *) (transform8x8[3])));
1366
    E2h = _mm_madd_epi16(m128Tmp3,
1367
            _mm_load_si128((__m128i *) (transform8x8[3])));
1368
    O1l = _mm_add_epi32(E1l, E2l);
1369
    O1h = _mm_add_epi32(E1h, E2h);
1370
    E1l = _mm_madd_epi16(m128Tmp0,
1371
            _mm_load_si128((__m128i *) (transform8x8[4])));
1372
    E1h = _mm_madd_epi16(m128Tmp1,
1373
            _mm_load_si128((__m128i *) (transform8x8[4])));
1374
    E2l = _mm_madd_epi16(m128Tmp2,
1375
            _mm_load_si128((__m128i *) (transform8x8[5])));
1376
    E2h = _mm_madd_epi16(m128Tmp3,
1377
            _mm_load_si128((__m128i *) (transform8x8[5])));
1378
    O2l = _mm_add_epi32(E1l, E2l);
1379
    O2h = _mm_add_epi32(E1h, E2h);
1380
    E1l = _mm_madd_epi16(m128Tmp0,
1381
            _mm_load_si128((__m128i *) (transform8x8[6])));
1382
    E1h = _mm_madd_epi16(m128Tmp1,
1383
            _mm_load_si128((__m128i *) (transform8x8[6])));
1384
    E2l = _mm_madd_epi16(m128Tmp2,
1385
            _mm_load_si128((__m128i *) (transform8x8[7])));
1386
    E2h = _mm_madd_epi16(m128Tmp3,
1387
            _mm_load_si128((__m128i *) (transform8x8[7])));
1388
    O3h = _mm_add_epi32(E1h, E2h);
1389
    O3l = _mm_add_epi32(E1l, E2l);
1390
1391
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1392
    EE0l = _mm_madd_epi16(m128Tmp0,
1393
            _mm_load_si128((__m128i *) (transform8x8[8])));
1394
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1395
    EE0h = _mm_madd_epi16(m128Tmp1,
1396
            _mm_load_si128((__m128i *) (transform8x8[8])));
1397
    EE1l = _mm_madd_epi16(m128Tmp0,
1398
            _mm_load_si128((__m128i *) (transform8x8[9])));
1399
    EE1h = _mm_madd_epi16(m128Tmp1,
1400
            _mm_load_si128((__m128i *) (transform8x8[9])));
1401
1402
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1403
    E00l = _mm_madd_epi16(m128Tmp0,
1404
            _mm_load_si128((__m128i *) (transform8x8[10])));
1405
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1406
    E00h = _mm_madd_epi16(m128Tmp1,
1407
            _mm_load_si128((__m128i *) (transform8x8[10])));
1408
    E01l = _mm_madd_epi16(m128Tmp0,
1409
            _mm_load_si128((__m128i *) (transform8x8[11])));
1410
    E01h = _mm_madd_epi16(m128Tmp1,
1411
            _mm_load_si128((__m128i *) (transform8x8[11])));
1412
    E0l = _mm_add_epi32(EE0l, E00l);
1413
    E0l = _mm_add_epi32(E0l, m128iAdd);
1414
    E0h = _mm_add_epi32(EE0h, E00h);
1415
    E0h = _mm_add_epi32(E0h, m128iAdd);
1416
    E3l = _mm_sub_epi32(EE0l, E00l);
1417
    E3l = _mm_add_epi32(E3l, m128iAdd);
1418
    E3h = _mm_sub_epi32(EE0h, E00h);
1419
    E3h = _mm_add_epi32(E3h, m128iAdd);
1420
    E1l = _mm_add_epi32(EE1l, E01l);
1421
    E1l = _mm_add_epi32(E1l, m128iAdd);
1422
    E1h = _mm_add_epi32(EE1h, E01h);
1423
    E1h = _mm_add_epi32(E1h, m128iAdd);
1424
    E2l = _mm_sub_epi32(EE1l, E01l);
1425
    E2l = _mm_add_epi32(E2l, m128iAdd);
1426
    E2h = _mm_sub_epi32(EE1h, E01h);
1427
    E2h = _mm_add_epi32(E2h, m128iAdd);
1428
1429
    m128iS0 = _mm_packs_epi32(
1430
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
1431
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
1432
    m128iS1 = _mm_packs_epi32(
1433
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
1434
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
1435
    m128iS2 = _mm_packs_epi32(
1436
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
1437
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
1438
    m128iS3 = _mm_packs_epi32(
1439
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
1440
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
1441
    m128iS4 = _mm_packs_epi32(
1442
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
1443
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
1444
    m128iS5 = _mm_packs_epi32(
1445
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
1446
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
1447
    m128iS6 = _mm_packs_epi32(
1448
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
1449
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
1450
    m128iS7 = _mm_packs_epi32(
1451
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
1452
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));
1453
1454
    _mm_store_si128((__m128i *) (src), m128iS0);
1455
    _mm_store_si128((__m128i *) (src + 8), m128iS1);
1456
    _mm_store_si128((__m128i *) (src + 16), m128iS2);
1457
    _mm_store_si128((__m128i *) (src + 24), m128iS3);
1458
    _mm_store_si128((__m128i *) (src + 32), m128iS4);
1459
    _mm_store_si128((__m128i *) (src + 40), m128iS5);
1460
    _mm_store_si128((__m128i *) (src + 48), m128iS6);
1461
    _mm_store_si128((__m128i *) (src + 56), m128iS7);
1462
1463
    j = 0;
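    /* Add the reconstructed residual (left in src) to the prediction and clip
       each sample to the 10-bit range [0, 1023]; each loop iteration handles
       two output rows. */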
1464
    for (i = 0; i < 4; i++) {
1465
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
1466
        dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
1467
        dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
1468
        dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
1469
        dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
1470
        dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
1471
        dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
1472
        dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
1473
        j += 1;
1474
        dst += stride;
1475
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
1476
        dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
1477
        dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
1478
        dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
1479
        dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
1480
        dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
1481
        dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
1482
        dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
1483
        j += 1;
1484
        dst += stride;
1485
    }
1486
1487
}
1488
#endif
1489
1490
1491
#if HAVE_SSE4_1
1492
void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
1493
10.6k
        ptrdiff_t _stride) {
1494
10.6k
    uint8_t shift_2nd = 12; // 20 - Bit depth
1495
10.6k
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
1496
10.6k
    int i;
1497
10.6k
    uint8_t *dst = (uint8_t*) _dst;
1498
10.6k
    ptrdiff_t stride = _stride / sizeof(uint8_t);
1499
10.6k
    const int16_t *src = coeffs;
1500
10.6k
    int32_t shift;
1501
10.6k
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1502
10.6k
            m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
1503
10.6k
            m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
1504
10.6k
            m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
1505
10.6k
            E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
1506
10.6k
            O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
1507
10.6k
            E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
1508
10.6k
    __m128i E4l, E5l, E6l, E7l;
1509
10.6k
    __m128i E4h, E5h, E6h, E7h;
1510
10.6k
    __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;
1511
10.6k
    __m128i r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
1512
1513
1514
    /*__m128i T00,T01, T02, T03, T04, T05, T06, T07;
1515
    __m128i T10,T11, T12, T13, T14, T15, T16, T17;
1516
    __m128i T20,T21, T22, T23, T24, T25, T26, T27;
1517
    __m128i T30,T31, T32, T33, T34, T35, T36, T37;
1518
1519
    __m128i U00,U01, U02, U03, U10, U11, U12, U13;
1520
1521
    __m128i V00,V01, V10, V11;*/
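    /* Load the transform16x16_1/_2/_3 coefficient rows into registers once,
       so both passes below reuse them without reloading from memory. */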
1522
1523
1524
10.6k
    const __m128i T00 = _mm_load_si128((__m128i *) (transform16x16_1[0][0]));
1525
10.6k
    const __m128i T01 = _mm_load_si128((__m128i *) (transform16x16_1[0][1]));
1526
10.6k
    const __m128i T02 = _mm_load_si128((__m128i *) (transform16x16_1[0][2]));
1527
10.6k
    const __m128i T03 = _mm_load_si128((__m128i *) (transform16x16_1[0][3]));
1528
10.6k
    const __m128i T04 = _mm_load_si128((__m128i *) (transform16x16_1[0][4]));
1529
10.6k
    const __m128i T05 = _mm_load_si128((__m128i *) (transform16x16_1[0][5]));
1530
10.6k
    const __m128i T06 = _mm_load_si128((__m128i *) (transform16x16_1[0][6]));
1531
10.6k
    const __m128i T07 = _mm_load_si128((__m128i *) (transform16x16_1[0][7]));
1532
10.6k
    const __m128i T10 = _mm_load_si128((__m128i *) (transform16x16_1[1][0]));
1533
10.6k
    const __m128i T11 = _mm_load_si128((__m128i *) (transform16x16_1[1][1]));
1534
10.6k
    const __m128i T12 = _mm_load_si128((__m128i *) (transform16x16_1[1][2]));
1535
10.6k
    const __m128i T13 = _mm_load_si128((__m128i *) (transform16x16_1[1][3]));
1536
10.6k
    const __m128i T14 = _mm_load_si128((__m128i *) (transform16x16_1[1][4]));
1537
10.6k
    const __m128i T15 = _mm_load_si128((__m128i *) (transform16x16_1[1][5]));
1538
10.6k
    const __m128i T16 = _mm_load_si128((__m128i *) (transform16x16_1[1][6]));
1539
10.6k
    const __m128i T17 = _mm_load_si128((__m128i *) (transform16x16_1[1][7]));
1540
10.6k
    const __m128i T20 = _mm_load_si128((__m128i *) (transform16x16_1[2][0]));
1541
10.6k
    const __m128i T21 = _mm_load_si128((__m128i *) (transform16x16_1[2][1]));
1542
10.6k
    const __m128i T22 = _mm_load_si128((__m128i *) (transform16x16_1[2][2]));
1543
10.6k
    const __m128i T23 = _mm_load_si128((__m128i *) (transform16x16_1[2][3]));
1544
10.6k
    const __m128i T24 = _mm_load_si128((__m128i *) (transform16x16_1[2][4]));
1545
10.6k
    const __m128i T25 = _mm_load_si128((__m128i *) (transform16x16_1[2][5]));
1546
10.6k
    const __m128i T26 = _mm_load_si128((__m128i *) (transform16x16_1[2][6]));
1547
10.6k
    const __m128i T27 = _mm_load_si128((__m128i *) (transform16x16_1[2][7]));
1548
10.6k
    const __m128i T30 = _mm_load_si128((__m128i *) (transform16x16_1[3][0]));
1549
10.6k
    const __m128i T31 = _mm_load_si128((__m128i *) (transform16x16_1[3][1]));
1550
10.6k
    const __m128i T32 = _mm_load_si128((__m128i *) (transform16x16_1[3][2]));
1551
10.6k
    const __m128i T33 = _mm_load_si128((__m128i *) (transform16x16_1[3][3]));
1552
10.6k
    const __m128i T34 = _mm_load_si128((__m128i *) (transform16x16_1[3][4]));
1553
10.6k
    const __m128i T35 = _mm_load_si128((__m128i *) (transform16x16_1[3][5]));
1554
10.6k
    const __m128i T36 = _mm_load_si128((__m128i *) (transform16x16_1[3][6]));
1555
10.6k
    const __m128i T37 = _mm_load_si128((__m128i *) (transform16x16_1[3][7]));
1556
1557
10.6k
    const __m128i U00 = _mm_load_si128((__m128i *) (transform16x16_2[0][0]));
1558
10.6k
    const __m128i U01 = _mm_load_si128((__m128i *) (transform16x16_2[0][1]));
1559
10.6k
    const __m128i U02 = _mm_load_si128((__m128i *) (transform16x16_2[0][2]));
1560
10.6k
    const __m128i U03 = _mm_load_si128((__m128i *) (transform16x16_2[0][3]));
1561
10.6k
    const __m128i U10 = _mm_load_si128((__m128i *) (transform16x16_2[1][0]));
1562
10.6k
    const __m128i U11 = _mm_load_si128((__m128i *) (transform16x16_2[1][1]));
1563
10.6k
    const __m128i U12 = _mm_load_si128((__m128i *) (transform16x16_2[1][2]));
1564
10.6k
    const __m128i U13 = _mm_load_si128((__m128i *) (transform16x16_2[1][3]));
1565
1566
10.6k
    const __m128i V00 = _mm_load_si128((__m128i *) (transform16x16_3[0][0]));
1567
10.6k
    const __m128i V01 = _mm_load_si128((__m128i *) (transform16x16_3[0][1]));
1568
10.6k
    const __m128i V10 = _mm_load_si128((__m128i *) (transform16x16_3[1][0]));
1569
10.6k
    const __m128i V11 = _mm_load_si128((__m128i *) (transform16x16_3[1][1]));
1570
1571
1572
1573
10.6k
    int j;
1574
10.6k
    m128iS0 = _mm_load_si128((__m128i *) (src));
1575
10.6k
    m128iS1 = _mm_load_si128((__m128i *) (src + 16));
1576
10.6k
    m128iS2 = _mm_load_si128((__m128i *) (src + 32));
1577
10.6k
    m128iS3 = _mm_load_si128((__m128i *) (src + 48));
1578
10.6k
    m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
1579
10.6k
    m128iS5 = _mm_load_si128((__m128i *) (src + 80));
1580
10.6k
    m128iS6 = _mm_load_si128((__m128i *) (src + 96));
1581
10.6k
    m128iS7 = _mm_load_si128((__m128i *) (src + 112));
1582
10.6k
    m128iS8 = _mm_load_si128((__m128i *) (src + 128));
1583
10.6k
    m128iS9 = _mm_load_si128((__m128i *) (src + 144));
1584
10.6k
    m128iS10 = _mm_load_si128((__m128i *) (src + 160));
1585
10.6k
    m128iS11 = _mm_load_si128((__m128i *) (src + 176));
1586
10.6k
    m128iS12 = _mm_load_si128((__m128i *) (src + 192));
1587
10.6k
    m128iS13 = _mm_load_si128((__m128i *) (src + 208));
1588
10.6k
    m128iS14 = _mm_load_si128((__m128i *) (src + 224));
1589
10.6k
    m128iS15 = _mm_load_si128((__m128i *) (src + 240));
1590
10.6k
    shift = shift_1st;
1591
10.6k
    m128iAdd = _mm_set1_epi32(add_1st);
1592
1593
31.9k
    for (j = 0; j < 2; j++) {
1594
63.9k
        for (i = 0; i < 16; i += 8) {
1595
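            /* Compute O0: m128Tmp0..m128Tmp7 interleave the odd input rows
               (S1,S3), (S5,S7), (S9,S11), (S13,S15) and are reused for O1..O7. */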
1596
42.6k
            m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1597
42.6k
            E0l = _mm_madd_epi16(m128Tmp0,T00);
1598
42.6k
            m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1599
42.6k
            E0h = _mm_madd_epi16(m128Tmp1,T00);
1600
1601
42.6k
            m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1602
42.6k
            E1l = _mm_madd_epi16(m128Tmp2,T10);
1603
42.6k
            m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1604
42.6k
            E1h = _mm_madd_epi16(m128Tmp3,T10);
1605
1606
42.6k
            m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
1607
42.6k
            E2l = _mm_madd_epi16(m128Tmp4,T20);
1608
42.6k
            m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
1609
42.6k
            E2h = _mm_madd_epi16(m128Tmp5,T20);
1610
1611
42.6k
            m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
1612
42.6k
            E3l = _mm_madd_epi16(m128Tmp6,T30);
1613
42.6k
            m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
1614
42.6k
            E3h = _mm_madd_epi16(m128Tmp7,T30);
1615
1616
42.6k
            O0l = _mm_add_epi32(E0l, E1l);
1617
42.6k
            O0l = _mm_add_epi32(O0l, E2l);
1618
42.6k
            O0l = _mm_add_epi32(O0l, E3l);
1619
1620
42.6k
            O0h = _mm_add_epi32(E0h, E1h);
1621
42.6k
            O0h = _mm_add_epi32(O0h, E2h);
1622
42.6k
            O0h = _mm_add_epi32(O0h, E3h);
1623
1624
            /* Compute O1*/
1625
42.6k
            E0l = _mm_madd_epi16(m128Tmp0,T01);
1626
42.6k
            E0h = _mm_madd_epi16(m128Tmp1,T01);
1627
42.6k
            E1l = _mm_madd_epi16(m128Tmp2,T11);
1628
42.6k
            E1h = _mm_madd_epi16(m128Tmp3,T11);
1629
42.6k
            E2l = _mm_madd_epi16(m128Tmp4,T21);
1630
42.6k
            E2h = _mm_madd_epi16(m128Tmp5,T21);
1631
42.6k
            E3l = _mm_madd_epi16(m128Tmp6,T31);
1632
42.6k
            E3h = _mm_madd_epi16(m128Tmp7,T31);
1633
42.6k
            O1l = _mm_add_epi32(E0l, E1l);
1634
42.6k
            O1l = _mm_add_epi32(O1l, E2l);
1635
42.6k
            O1l = _mm_add_epi32(O1l, E3l);
1636
42.6k
            O1h = _mm_add_epi32(E0h, E1h);
1637
42.6k
            O1h = _mm_add_epi32(O1h, E2h);
1638
42.6k
            O1h = _mm_add_epi32(O1h, E3h);
1639
1640
            /* Compute O2*/
1641
42.6k
            E0l = _mm_madd_epi16(m128Tmp0,T02);
1642
42.6k
            E0h = _mm_madd_epi16(m128Tmp1,T02);
1643
42.6k
            E1l = _mm_madd_epi16(m128Tmp2,T12);
1644
42.6k
            E1h = _mm_madd_epi16(m128Tmp3,T12);
1645
42.6k
            E2l = _mm_madd_epi16(m128Tmp4,T22);
1646
42.6k
            E2h = _mm_madd_epi16(m128Tmp5,T22);
1647
42.6k
            E3l = _mm_madd_epi16(m128Tmp6,T32);
1648
42.6k
            E3h = _mm_madd_epi16(m128Tmp7,T32);
1649
42.6k
            O2l = _mm_add_epi32(E0l, E1l);
1650
42.6k
            O2l = _mm_add_epi32(O2l, E2l);
1651
42.6k
            O2l = _mm_add_epi32(O2l, E3l);
1652
1653
42.6k
            O2h = _mm_add_epi32(E0h, E1h);
1654
42.6k
            O2h = _mm_add_epi32(O2h, E2h);
1655
42.6k
            O2h = _mm_add_epi32(O2h, E3h);
1656
1657
            /* Compute O3*/
1658
42.6k
            E0l = _mm_madd_epi16(m128Tmp0,T03);
1659
42.6k
            E0h = _mm_madd_epi16(m128Tmp1,T03);
1660
42.6k
            E1l = _mm_madd_epi16(m128Tmp2,T13);
1661
42.6k
            E1h = _mm_madd_epi16(m128Tmp3,T13);
1662
42.6k
            E2l = _mm_madd_epi16(m128Tmp4,T23);
1663
42.6k
            E2h = _mm_madd_epi16(m128Tmp5,T23);
1664
42.6k
            E3l = _mm_madd_epi16(m128Tmp6,T33);
1665
42.6k
            E3h = _mm_madd_epi16(m128Tmp7,T33);
1666
1667
42.6k
            O3l = _mm_add_epi32(E0l, E1l);
1668
42.6k
            O3l = _mm_add_epi32(O3l, E2l);
1669
42.6k
            O3l = _mm_add_epi32(O3l, E3l);
1670
1671
42.6k
            O3h = _mm_add_epi32(E0h, E1h);
1672
42.6k
            O3h = _mm_add_epi32(O3h, E2h);
1673
42.6k
            O3h = _mm_add_epi32(O3h, E3h);
1674
1675
            /* Compute O4*/
1676
1677
42.6k
            E0l = _mm_madd_epi16(m128Tmp0,T04);
1678
42.6k
            E0h = _mm_madd_epi16(m128Tmp1,T04);
1679
42.6k
            E1l = _mm_madd_epi16(m128Tmp2,T14);
1680
42.6k
            E1h = _mm_madd_epi16(m128Tmp3,T14);
1681
42.6k
            E2l = _mm_madd_epi16(m128Tmp4,T24);
1682
42.6k
            E2h = _mm_madd_epi16(m128Tmp5,T24);
1683
42.6k
            E3l = _mm_madd_epi16(m128Tmp6,T34);
1684
42.6k
            E3h = _mm_madd_epi16(m128Tmp7,T34);
1685
1686
42.6k
            O4l = _mm_add_epi32(E0l, E1l);
1687
42.6k
            O4l = _mm_add_epi32(O4l, E2l);
1688
42.6k
            O4l = _mm_add_epi32(O4l, E3l);
1689
1690
42.6k
            O4h = _mm_add_epi32(E0h, E1h);
1691
42.6k
            O4h = _mm_add_epi32(O4h, E2h);
1692
42.6k
            O4h = _mm_add_epi32(O4h, E3h);
1693
1694
            /* Compute O5*/
1695
42.6k
            E0l = _mm_madd_epi16(m128Tmp0,T05);
1696
42.6k
            E0h = _mm_madd_epi16(m128Tmp1,T05);
1697
42.6k
            E1l = _mm_madd_epi16(m128Tmp2,T15);
1698
42.6k
            E1h = _mm_madd_epi16(m128Tmp3,T15);
1699
42.6k
            E2l = _mm_madd_epi16(m128Tmp4,T25);
1700
42.6k
            E2h = _mm_madd_epi16(m128Tmp5,T25);
1701
42.6k
            E3l = _mm_madd_epi16(m128Tmp6,T35);
1702
42.6k
            E3h = _mm_madd_epi16(m128Tmp7,T35);
1703
1704
42.6k
            O5l = _mm_add_epi32(E0l, E1l);
1705
42.6k
            O5l = _mm_add_epi32(O5l, E2l);
1706
42.6k
            O5l = _mm_add_epi32(O5l, E3l);
1707
1708
42.6k
            O5h = _mm_add_epi32(E0h, E1h);
1709
42.6k
            O5h = _mm_add_epi32(O5h, E2h);
1710
42.6k
            O5h = _mm_add_epi32(O5h, E3h);
1711
1712
            /* Compute O6*/
1713
1714
42.6k
            E0l = _mm_madd_epi16(m128Tmp0,T06);
1715
42.6k
            E0h = _mm_madd_epi16(m128Tmp1,T06);
1716
42.6k
            E1l = _mm_madd_epi16(m128Tmp2,T16);
1717
42.6k
            E1h = _mm_madd_epi16(m128Tmp3,T16);
1718
42.6k
            E2l = _mm_madd_epi16(m128Tmp4,T26);
1719
42.6k
            E2h = _mm_madd_epi16(m128Tmp5,T26);
1720
42.6k
            E3l = _mm_madd_epi16(m128Tmp6,T36);
1721
42.6k
            E3h = _mm_madd_epi16(m128Tmp7,T36);
1722
1723
42.6k
            O6l = _mm_add_epi32(E0l, E1l);
1724
42.6k
            O6l = _mm_add_epi32(O6l, E2l);
1725
42.6k
            O6l = _mm_add_epi32(O6l, E3l);
1726
1727
42.6k
            O6h = _mm_add_epi32(E0h, E1h);
1728
42.6k
            O6h = _mm_add_epi32(O6h, E2h);
1729
42.6k
            O6h = _mm_add_epi32(O6h, E3h);
1730
1731
            /* Compute O7*/
1732
1733
42.6k
            E0l = _mm_madd_epi16(m128Tmp0,T07);
1734
42.6k
            E0h = _mm_madd_epi16(m128Tmp1,T07);
1735
42.6k
            E1l = _mm_madd_epi16(m128Tmp2,T17);
1736
42.6k
            E1h = _mm_madd_epi16(m128Tmp3,T17);
1737
42.6k
            E2l = _mm_madd_epi16(m128Tmp4,T27);
1738
42.6k
            E2h = _mm_madd_epi16(m128Tmp5,T27);
1739
42.6k
            E3l = _mm_madd_epi16(m128Tmp6,T37);
1740
42.6k
            E3h = _mm_madd_epi16(m128Tmp7,T37);
1741
1742
42.6k
            O7l = _mm_add_epi32(E0l, E1l);
1743
42.6k
            O7l = _mm_add_epi32(O7l, E2l);
1744
42.6k
            O7l = _mm_add_epi32(O7l, E3l);
1745
1746
42.6k
            O7h = _mm_add_epi32(E0h, E1h);
1747
42.6k
            O7h = _mm_add_epi32(O7h, E2h);
1748
42.6k
            O7h = _mm_add_epi32(O7h, E3h);
1749
1750
            /*  Compute E0  */
1751
1752
1753
1754
42.6k
            m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1755
42.6k
            E0l = _mm_madd_epi16(m128Tmp0,U00);
1756
42.6k
            m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1757
42.6k
            E0h = _mm_madd_epi16(m128Tmp1,U00);
1758
1759
42.6k
            m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
1760
42.6k
            E0l = _mm_add_epi32(E0l,
1761
42.6k
                    _mm_madd_epi16(m128Tmp2,U10));
1762
42.6k
            m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
1763
42.6k
            E0h = _mm_add_epi32(E0h,
1764
42.6k
                    _mm_madd_epi16(m128Tmp3,U10));
1765
1766
            /*  Compute E1  */
1767
42.6k
            E1l = _mm_madd_epi16(m128Tmp0,U01);
1768
42.6k
            E1h = _mm_madd_epi16(m128Tmp1,U01);
1769
42.6k
            E1l = _mm_add_epi32(E1l,
1770
42.6k
                    _mm_madd_epi16(m128Tmp2,U11));
1771
42.6k
            E1h = _mm_add_epi32(E1h,
1772
42.6k
                    _mm_madd_epi16(m128Tmp3,U11));
1773
1774
            /*  Compute E2  */
1775
42.6k
            E2l = _mm_madd_epi16(m128Tmp0,U02);
1776
42.6k
            E2h = _mm_madd_epi16(m128Tmp1,U02);
1777
42.6k
            E2l = _mm_add_epi32(E2l,
1778
42.6k
                    _mm_madd_epi16(m128Tmp2,U12));
1779
42.6k
            E2h = _mm_add_epi32(E2h,
1780
42.6k
                    _mm_madd_epi16(m128Tmp3,U12));
1781
            /*  Compute E3  */
1782
42.6k
            E3l = _mm_madd_epi16(m128Tmp0,U03);
1783
42.6k
            E3h = _mm_madd_epi16(m128Tmp1,U03);
1784
42.6k
            E3l = _mm_add_epi32(E3l,
1785
42.6k
                    _mm_madd_epi16(m128Tmp2,U13));
1786
42.6k
            E3h = _mm_add_epi32(E3h,
1787
42.6k
                    _mm_madd_epi16(m128Tmp3,U13));
1788
1789
            /*  Compute EE0 and EEE */
1790
1791
42.6k
            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
1792
42.6k
            E00l = _mm_madd_epi16(m128Tmp0,V00);
1793
42.6k
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
1794
42.6k
            E00h = _mm_madd_epi16(m128Tmp1,V00);
1795
1796
42.6k
            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
1797
42.6k
            EE0l = _mm_madd_epi16(m128Tmp2,V10);
1798
42.6k
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
1799
42.6k
            EE0h = _mm_madd_epi16(m128Tmp3,V10);
1800
1801
42.6k
            E01l = _mm_madd_epi16(m128Tmp0,V01);
1802
42.6k
            E01h = _mm_madd_epi16(m128Tmp1,V01);
1803
1804
42.6k
            EE1l = _mm_madd_epi16(m128Tmp2,V11);
1805
42.6k
            EE1h = _mm_madd_epi16(m128Tmp3,V11);
1806
1807
            /*  Compute EE    */
1808
42.6k
            EE2l = _mm_sub_epi32(EE1l, E01l);
1809
42.6k
            EE3l = _mm_sub_epi32(EE0l, E00l);
1810
42.6k
            EE2h = _mm_sub_epi32(EE1h, E01h);
1811
42.6k
            EE3h = _mm_sub_epi32(EE0h, E00h);
1812
1813
42.6k
            EE0l = _mm_add_epi32(EE0l, E00l);
1814
42.6k
            EE1l = _mm_add_epi32(EE1l, E01l);
1815
42.6k
            EE0h = _mm_add_epi32(EE0h, E00h);
1816
42.6k
            EE1h = _mm_add_epi32(EE1h, E01h);
1817
1818
            /*      Compute E       */
1819
1820
42.6k
            E4l = _mm_sub_epi32(EE3l, E3l);
1821
42.6k
            E4l = _mm_add_epi32(E4l, m128iAdd);
1822
1823
42.6k
            E5l = _mm_sub_epi32(EE2l, E2l);
1824
42.6k
            E5l = _mm_add_epi32(E5l, m128iAdd);
1825
1826
42.6k
            E6l = _mm_sub_epi32(EE1l, E1l);
1827
42.6k
            E6l = _mm_add_epi32(E6l, m128iAdd);
1828
1829
42.6k
            E7l = _mm_sub_epi32(EE0l, E0l);
1830
42.6k
            E7l = _mm_add_epi32(E7l, m128iAdd);
1831
1832
42.6k
            E4h = _mm_sub_epi32(EE3h, E3h);
1833
42.6k
            E4h = _mm_add_epi32(E4h, m128iAdd);
1834
1835
42.6k
            E5h = _mm_sub_epi32(EE2h, E2h);
1836
42.6k
            E5h = _mm_add_epi32(E5h, m128iAdd);
1837
1838
42.6k
            E6h = _mm_sub_epi32(EE1h, E1h);
1839
42.6k
            E6h = _mm_add_epi32(E6h, m128iAdd);
1840
1841
42.6k
            E7h = _mm_sub_epi32(EE0h, E0h);
1842
42.6k
            E7h = _mm_add_epi32(E7h, m128iAdd);
1843
1844
42.6k
            E0l = _mm_add_epi32(EE0l, E0l);
1845
42.6k
            E0l = _mm_add_epi32(E0l, m128iAdd);
1846
1847
42.6k
            E1l = _mm_add_epi32(EE1l, E1l);
1848
42.6k
            E1l = _mm_add_epi32(E1l, m128iAdd);
1849
1850
42.6k
            E2l = _mm_add_epi32(EE2l, E2l);
1851
42.6k
            E2l = _mm_add_epi32(E2l, m128iAdd);
1852
1853
42.6k
            E3l = _mm_add_epi32(EE3l, E3l);
1854
42.6k
            E3l = _mm_add_epi32(E3l, m128iAdd);
1855
1856
42.6k
            E0h = _mm_add_epi32(EE0h, E0h);
1857
42.6k
            E0h = _mm_add_epi32(E0h, m128iAdd);
1858
1859
42.6k
            E1h = _mm_add_epi32(EE1h, E1h);
1860
42.6k
            E1h = _mm_add_epi32(E1h, m128iAdd);
1861
1862
42.6k
            E2h = _mm_add_epi32(EE2h, E2h);
1863
42.6k
            E2h = _mm_add_epi32(E2h, m128iAdd);
1864
1865
42.6k
            E3h = _mm_add_epi32(EE3h, E3h);
1866
42.6k
            E3h = _mm_add_epi32(E3h, m128iAdd);
1867
1868
42.6k
            m128iS0 = _mm_packs_epi32(
1869
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
1870
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
1871
42.6k
            m128iS1 = _mm_packs_epi32(
1872
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
1873
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
1874
42.6k
            m128iS2 = _mm_packs_epi32(
1875
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
1876
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
1877
42.6k
            m128iS3 = _mm_packs_epi32(
1878
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
1879
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
1880
1881
42.6k
            m128iS4 = _mm_packs_epi32(
1882
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
1883
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
1884
42.6k
            m128iS5 = _mm_packs_epi32(
1885
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
1886
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
1887
42.6k
            m128iS6 = _mm_packs_epi32(
1888
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
1889
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
1890
42.6k
            m128iS7 = _mm_packs_epi32(
1891
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
1892
42.6k
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
1893
1894
42.6k
            m128iS15 = _mm_packs_epi32(
1895
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
1896
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
1897
42.6k
            m128iS14 = _mm_packs_epi32(
1898
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
1899
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
1900
42.6k
            m128iS13 = _mm_packs_epi32(
1901
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
1902
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
1903
42.6k
            m128iS12 = _mm_packs_epi32(
1904
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
1905
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
1906
1907
42.6k
            m128iS11 = _mm_packs_epi32(
1908
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
1909
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
1910
42.6k
            m128iS10 = _mm_packs_epi32(
1911
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
1912
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
1913
42.6k
            m128iS9 = _mm_packs_epi32(
1914
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
1915
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
1916
42.6k
            m128iS8 = _mm_packs_epi32(
1917
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
1918
42.6k
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
1919
1920
1921
1922
42.6k
            if (!j) { //first pass
1923
1924
                /*      Transpose the matrix      */
1925
21.3k
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
1926
21.3k
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
1927
21.3k
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
1928
21.3k
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
1929
21.3k
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
1930
21.3k
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
1931
21.3k
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
1932
21.3k
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
1933
1934
21.3k
                E0h = _mm_unpackhi_epi16(m128iS0, m128iS8);
1935
21.3k
                E1h = _mm_unpackhi_epi16(m128iS1, m128iS9);
1936
21.3k
                E2h = _mm_unpackhi_epi16(m128iS2, m128iS10);
1937
21.3k
                E3h = _mm_unpackhi_epi16(m128iS3, m128iS11);
1938
21.3k
                E4h = _mm_unpackhi_epi16(m128iS4, m128iS12);
1939
21.3k
                E5h = _mm_unpackhi_epi16(m128iS5, m128iS13);
1940
21.3k
                E6h = _mm_unpackhi_epi16(m128iS6, m128iS14);
1941
21.3k
                E7h = _mm_unpackhi_epi16(m128iS7, m128iS15);
1942
1943
21.3k
                m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
1944
21.3k
                m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
1945
21.3k
                m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
1946
21.3k
                m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
1947
1948
21.3k
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1949
21.3k
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1950
21.3k
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1951
21.3k
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1952
1953
21.3k
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1954
21.3k
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1955
21.3k
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1956
21.3k
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1957
1958
21.3k
                m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
1959
21.3k
                m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
1960
21.3k
                m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
1961
21.3k
                m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
1962
1963
21.3k
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1964
21.3k
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1965
21.3k
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1966
21.3k
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1967
1968
21.3k
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1969
21.3k
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1970
21.3k
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1971
21.3k
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1972
1973
21.3k
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
1974
21.3k
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
1975
21.3k
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
1976
21.3k
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
1977
1978
21.3k
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1979
21.3k
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1980
21.3k
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1981
21.3k
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1982
1983
21.3k
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1984
21.3k
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1985
21.3k
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1986
21.3k
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1987
1988
21.3k
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
1989
21.3k
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
1990
21.3k
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
1991
21.3k
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
1992
1993
21.3k
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1994
21.3k
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1995
21.3k
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1996
21.3k
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1997
1998
21.3k
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1999
21.3k
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2000
21.3k
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2001
21.3k
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2002
2003
21.3k
                if (!i) {
2004
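                    /* First half done: keep the transposed results in r0..r15
                       and load the remaining 8 coefficients of each row for
                       the second half of this pass. */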
2005
10.6k
                    r0= m128iS0;    //0
2006
10.6k
                    r1= m128iS1;    //16
2007
10.6k
                    r2= m128iS2;    //32
2008
10.6k
                    r3= m128iS3;    //48
2009
10.6k
                    r4= m128iS4;    //64
2010
10.6k
                    r5= m128iS5;    //80
2011
10.6k
                    r6= m128iS6;    //96
2012
10.6k
                    r7= m128iS7;    //112
2013
10.6k
                    r8= m128iS8;    //128
2014
10.6k
                    r9= m128iS9;    //144
2015
10.6k
                    r10= m128iS10;  //160
2016
10.6k
                    r11= m128iS11;  //176
2017
10.6k
                    r12= m128iS12;  //192
2018
10.6k
                    r13= m128iS13;  //208
2019
10.6k
                    r14= m128iS14;  //224
2020
10.6k
                    r15= m128iS15;  //240
2021
2022
2023
2024
10.6k
                    m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2025
10.6k
                    m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2026
10.6k
                    m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2027
10.6k
                    m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2028
10.6k
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2029
10.6k
                    m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2030
10.6k
                    m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2031
10.6k
                    m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2032
10.6k
                    m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2033
10.6k
                    m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2034
10.6k
                    m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2035
10.6k
                    m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2036
10.6k
                    m128iS12 = _mm_load_si128((__m128i *) (src + 200));
2037
10.6k
                    m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2038
10.6k
                    m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2039
10.6k
                    m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2040
10.6k
                } else {
2041
2042
10.6k
                    r16= m128iS0;    //8
2043
10.6k
                    r17= m128iS1;    //24
2044
10.6k
                    r18= m128iS2;    //40
2045
10.6k
                    r19= m128iS3;    //56
2046
10.6k
                    r20= m128iS4;    //72
2047
10.6k
                    r21= m128iS5;    //88
2048
10.6k
                    r22= m128iS6;    //104
2049
10.6k
                    r23= m128iS7;    //120
2050
10.6k
                    r24= m128iS8;    //136
2051
10.6k
                    r25= m128iS9;    //152
2052
10.6k
                    r26= m128iS10;  //168
2053
10.6k
                    r27= m128iS11;  //184
2054
10.6k
                    r28= m128iS12;  //200
2055
10.6k
                    r29= m128iS13;  //216
2056
10.6k
                    r30= m128iS14;  //232
2057
10.6k
                    r31= m128iS15;  //248
2058
2059
                    //prepare next iteration :
2060
2061
10.6k
                    m128iS0= r0;
2062
10.6k
                    m128iS1= r2;
2063
10.6k
                    m128iS2= r4;
2064
10.6k
                    m128iS3= r6;
2065
10.6k
                    m128iS4= r8;
2066
10.6k
                    m128iS5= r10;
2067
10.6k
                    m128iS6= r12;
2068
10.6k
                    m128iS7= r14;
2069
10.6k
                    m128iS8= r16;
2070
10.6k
                    m128iS9= r18;
2071
10.6k
                    m128iS10=r20;
2072
10.6k
                    m128iS11=r22;
2073
10.6k
                    m128iS12=r24;
2074
10.6k
                    m128iS13=r26;
2075
10.6k
                    m128iS14=r28;
2076
10.6k
                    m128iS15=r30;
2077
2078
10.6k
                    shift = shift_2nd;
2079
10.6k
                    m128iAdd = _mm_set1_epi32(add_2nd);
2080
10.6k
                }
2081
2082
21.3k
            } else {
2083
2084
                //transpose half matrix :
2085
                //instead of having 1 register = 1 half-column,
2086
                //1 register = 1 half-row.
2087
21.3k
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS1);
2088
21.3k
                E1l = _mm_unpacklo_epi16(m128iS2, m128iS3);
2089
21.3k
                E2l = _mm_unpacklo_epi16(m128iS4, m128iS5);
2090
21.3k
                E3l = _mm_unpacklo_epi16(m128iS6, m128iS7);
2091
21.3k
                E4l = _mm_unpacklo_epi16(m128iS8, m128iS9);
2092
21.3k
                E5l = _mm_unpacklo_epi16(m128iS10, m128iS11);
2093
21.3k
                E6l = _mm_unpacklo_epi16(m128iS12, m128iS13);
2094
21.3k
                E7l = _mm_unpacklo_epi16(m128iS14, m128iS15);
2095
2096
21.3k
                O0l = _mm_unpackhi_epi16(m128iS0, m128iS1);
2097
21.3k
                O1l = _mm_unpackhi_epi16(m128iS2, m128iS3);
2098
21.3k
                O2l = _mm_unpackhi_epi16(m128iS4, m128iS5);
2099
21.3k
                O3l = _mm_unpackhi_epi16(m128iS6, m128iS7);
2100
21.3k
                O4l = _mm_unpackhi_epi16(m128iS8, m128iS9);
2101
21.3k
                O5l = _mm_unpackhi_epi16(m128iS10, m128iS11);
2102
21.3k
                O6l = _mm_unpackhi_epi16(m128iS12, m128iS13);
2103
21.3k
                O7l = _mm_unpackhi_epi16(m128iS14, m128iS15);
2104
2105
2106
21.3k
                m128Tmp0 = _mm_unpacklo_epi32(E0l, E1l);
2107
21.3k
                m128Tmp1 = _mm_unpacklo_epi32(E2l, E3l);
2108
2109
21.3k
                m128Tmp2 = _mm_unpacklo_epi32(E4l, E5l);
2110
21.3k
                m128Tmp3 = _mm_unpacklo_epi32(E6l, E7l);
2111
2112
21.3k
                r0 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);    //1st half 1st row
2113
21.3k
                r2 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);    //2nd half 1st row
2114
2115
2116
21.3k
                r4 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);    //1st half 2nd row
2117
21.3k
                r6 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);    //2nd half 2nd row
2118
2119
21.3k
                m128Tmp0 = _mm_unpackhi_epi32(E0l, E1l);
2120
21.3k
                m128Tmp1 = _mm_unpackhi_epi32(E2l, E3l);
2121
21.3k
                m128Tmp2 = _mm_unpackhi_epi32(E4l, E5l);
2122
21.3k
                m128Tmp3 = _mm_unpackhi_epi32(E6l, E7l);
2123
2124
2125
21.3k
                r8 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2126
21.3k
                r10 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2127
2128
21.3k
                r12 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2129
21.3k
                r14 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2130
2131
21.3k
                m128Tmp0 = _mm_unpacklo_epi32(O0l, O1l);
2132
21.3k
                m128Tmp1 = _mm_unpacklo_epi32(O2l, O3l);
2133
21.3k
                m128Tmp2 = _mm_unpacklo_epi32(O4l, O5l);
2134
21.3k
                m128Tmp3 = _mm_unpacklo_epi32(O6l, O7l);
2135
2136
21.3k
                r16 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2137
21.3k
                r18 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2138
2139
2140
21.3k
                r20 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2141
21.3k
                r22 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2142
2143
21.3k
                m128Tmp0 = _mm_unpackhi_epi32(O0l, O1l);
2144
21.3k
                m128Tmp1 = _mm_unpackhi_epi32(O2l, O3l);
2145
21.3k
                m128Tmp2 = _mm_unpackhi_epi32(O4l, O5l);
2146
21.3k
                m128Tmp3 = _mm_unpackhi_epi32(O6l, O7l);
2147
2148
21.3k
                r24 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2149
21.3k
                r26 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2150
2151
2152
21.3k
                r28 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2153
21.3k
                r30 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2154
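                /* Widen the 8 prediction rows from 8 to 16 bits (unpack with
                   zero), add the residual with signed saturation, and pack
                   back to 8 bits with unsigned saturation before storing. */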
2155
21.3k
                dst = (uint8_t*) (_dst + (i*stride));
2156
21.3k
                m128Tmp0= _mm_setzero_si128();
2157
21.3k
                m128Tmp1= _mm_load_si128((__m128i*)dst);
2158
21.3k
                m128Tmp2= _mm_load_si128((__m128i*)(dst+stride));
2159
21.3k
                m128Tmp3= _mm_load_si128((__m128i*)(dst+2*stride));
2160
21.3k
                m128Tmp4= _mm_load_si128((__m128i*)(dst+3*stride));
2161
21.3k
                m128Tmp5= _mm_load_si128((__m128i*)(dst+4*stride));
2162
21.3k
                m128Tmp6= _mm_load_si128((__m128i*)(dst+5*stride));
2163
21.3k
                m128Tmp7= _mm_load_si128((__m128i*)(dst+6*stride));
2164
21.3k
                E0l= _mm_load_si128((__m128i*)(dst+7*stride));
2165
2166
2167
21.3k
                r0= _mm_adds_epi16(r0,_mm_unpacklo_epi8(m128Tmp1,m128Tmp0));
2168
21.3k
                r2= _mm_adds_epi16(r2,_mm_unpackhi_epi8(m128Tmp1,m128Tmp0));
2169
21.3k
                r0= _mm_packus_epi16(r0,r2);
2170
2171
2172
2173
2174
21.3k
                r4= _mm_adds_epi16(r4,_mm_unpacklo_epi8(m128Tmp2,m128Tmp0));
2175
21.3k
                r6= _mm_adds_epi16(r6,_mm_unpackhi_epi8(m128Tmp2,m128Tmp0));
2176
21.3k
                r4= _mm_packus_epi16(r4,r6);
2177
2178
2179
21.3k
                r8= _mm_adds_epi16(r8,_mm_unpacklo_epi8(m128Tmp3,m128Tmp0));
2180
21.3k
                r10= _mm_adds_epi16(r10,_mm_unpackhi_epi8(m128Tmp3,m128Tmp0));
2181
21.3k
                r8= _mm_packus_epi16(r8,r10);
2182
2183
2184
21.3k
                r12= _mm_adds_epi16(r12,_mm_unpacklo_epi8(m128Tmp4,m128Tmp0));
2185
21.3k
                r14= _mm_adds_epi16(r14,_mm_unpackhi_epi8(m128Tmp4,m128Tmp0));
2186
21.3k
                r12= _mm_packus_epi16(r12,r14);
2187
2188
2189
21.3k
                r16= _mm_adds_epi16(r16,_mm_unpacklo_epi8(m128Tmp5,m128Tmp0));
2190
21.3k
                r18= _mm_adds_epi16(r18,_mm_unpackhi_epi8(m128Tmp5,m128Tmp0));
2191
21.3k
                r16= _mm_packus_epi16(r16,r18);
2192
2193
2194
21.3k
                r20= _mm_adds_epi16(r20,_mm_unpacklo_epi8(m128Tmp6,m128Tmp0));
2195
21.3k
                r22= _mm_adds_epi16(r22,_mm_unpackhi_epi8(m128Tmp6,m128Tmp0));
2196
21.3k
                r20= _mm_packus_epi16(r20,r22);
2197
2198
2199
21.3k
                r24= _mm_adds_epi16(r24,_mm_unpacklo_epi8(m128Tmp7,m128Tmp0));
2200
21.3k
                r26= _mm_adds_epi16(r26,_mm_unpackhi_epi8(m128Tmp7,m128Tmp0));
2201
21.3k
                r24= _mm_packus_epi16(r24,r26);
2202
2203
2204
2205
21.3k
                r28= _mm_adds_epi16(r28,_mm_unpacklo_epi8(E0l,m128Tmp0));
2206
21.3k
                r30= _mm_adds_epi16(r30,_mm_unpackhi_epi8(E0l,m128Tmp0));
2207
21.3k
                r28= _mm_packus_epi16(r28,r30);
2208
2209
21.3k
                _mm_store_si128((__m128i*)dst,r0);
2210
21.3k
                _mm_store_si128((__m128i*)(dst+stride),r4);
2211
21.3k
                _mm_store_si128((__m128i*)(dst+2*stride),r8);
2212
21.3k
                _mm_store_si128((__m128i*)(dst+3*stride),r12);
2213
21.3k
                _mm_store_si128((__m128i*)(dst+4*stride),r16);
2214
21.3k
                _mm_store_si128((__m128i*)(dst+5*stride),r20);
2215
21.3k
                _mm_store_si128((__m128i*)(dst+6*stride),r24);
2216
21.3k
                _mm_store_si128((__m128i*)(dst+7*stride),r28);
2217
2218
2219
2220
21.3k
                if (!i) {
2221
                    //first half done, can store !
2222
2223
2224
10.6k
                    m128iS0= r1;
2225
10.6k
                    m128iS1= r3;
2226
10.6k
                    m128iS2= r5;
2227
10.6k
                    m128iS3= r7;
2228
10.6k
                    m128iS4= r9;
2229
10.6k
                    m128iS5= r11;
2230
10.6k
                    m128iS6= r13;
2231
10.6k
                    m128iS7= r15;
2232
10.6k
                    m128iS8= r17;
2233
10.6k
                    m128iS9= r19;
2234
10.6k
                    m128iS10=r21;
2235
10.6k
                    m128iS11=r23;
2236
10.6k
                    m128iS12=r25;
2237
10.6k
                    m128iS13=r27;
2238
10.6k
                    m128iS14=r29;
2239
10.6k
                    m128iS15=r31;
2240
10.6k
                }
2241
21.3k
            }
2242
42.6k
        }
2243
21.3k
    }
2244
10.6k
}
2245
#endif
2246
2247
2248
#if 0
2249
void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
2250
        ptrdiff_t _stride) {
2251
    int i;
2252
    uint16_t *dst = (uint16_t*) _dst;
2253
    ptrdiff_t stride = _stride / 2;
2254
    int16_t *src = coeffs;
2255
    int32_t shift;
2256
    uint8_t shift_2nd = 10; //20 - bit depth
2257
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
2258
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2259
            m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2260
            m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2261
            m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2262
            E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2263
            O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2264
            E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2265
    __m128i E4l, E5l, E6l, E7l;
2266
    __m128i E4h, E5h, E6h, E7h;
2267
    int j;
2268
    m128iS0 = _mm_load_si128((__m128i *) (src));
2269
    m128iS1 = _mm_load_si128((__m128i *) (src + 16));
2270
    m128iS2 = _mm_load_si128((__m128i *) (src + 32));
2271
    m128iS3 = _mm_load_si128((__m128i *) (src + 48));
2272
    m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
2273
    m128iS5 = _mm_load_si128((__m128i *) (src + 80));
2274
    m128iS6 = _mm_load_si128((__m128i *) (src + 96));
2275
    m128iS7 = _mm_load_si128((__m128i *) (src + 112));
2276
    m128iS8 = _mm_load_si128((__m128i *) (src + 128));
2277
    m128iS9 = _mm_load_si128((__m128i *) (src + 144));
2278
    m128iS10 = _mm_load_si128((__m128i *) (src + 160));
2279
    m128iS11 = _mm_load_si128((__m128i *) (src + 176));
2280
    m128iS12 = _mm_loadu_si128((__m128i *) (src + 192));
2281
    m128iS13 = _mm_load_si128((__m128i *) (src + 208));
2282
    m128iS14 = _mm_load_si128((__m128i *) (src + 224));
2283
    m128iS15 = _mm_load_si128((__m128i *) (src + 240));
2284
    shift = shift_1st;
2285
    m128iAdd = _mm_set1_epi32(add_1st);
2286
2287
    for (j = 0; j < 2; j++) {
2288
        for (i = 0; i < 16; i += 8) {
2289
2290
            m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2291
            E0l = _mm_madd_epi16(m128Tmp0,
2292
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2293
            m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2294
            E0h = _mm_madd_epi16(m128Tmp1,
2295
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2296
2297
            m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2298
            E1l = _mm_madd_epi16(m128Tmp2,
2299
                    _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2300
            m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2301
            E1h = _mm_madd_epi16(m128Tmp3,
2302
                    _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2303
2304
            m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2305
            E2l = _mm_madd_epi16(m128Tmp4,
2306
                    _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2307
            m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
2308
            E2h = _mm_madd_epi16(m128Tmp5,
2309
                    _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2310
2311
            m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
2312
            E3l = _mm_madd_epi16(m128Tmp6,
2313
                    _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2314
            m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
2315
            E3h = _mm_madd_epi16(m128Tmp7,
2316
                    _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2317
2318
            O0l = _mm_add_epi32(E0l, E1l);
2319
            O0l = _mm_add_epi32(O0l, E2l);
2320
            O0l = _mm_add_epi32(O0l, E3l);
2321
2322
            O0h = _mm_add_epi32(E0h, E1h);
2323
            O0h = _mm_add_epi32(O0h, E2h);
2324
            O0h = _mm_add_epi32(O0h, E3h);
2325
2326
            /* Compute O1*/
2327
            E0l = _mm_madd_epi16(m128Tmp0,
2328
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2329
            E0h = _mm_madd_epi16(m128Tmp1,
2330
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2331
            E1l = _mm_madd_epi16(m128Tmp2,
2332
                    _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2333
            E1h = _mm_madd_epi16(m128Tmp3,
2334
                    _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2335
            E2l = _mm_madd_epi16(m128Tmp4,
2336
                    _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2337
            E2h = _mm_madd_epi16(m128Tmp5,
2338
                    _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2339
            E3l = _mm_madd_epi16(m128Tmp6,
2340
                    _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2341
            E3h = _mm_madd_epi16(m128Tmp7,
2342
                    _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2343
            O1l = _mm_add_epi32(E0l, E1l);
2344
            O1l = _mm_add_epi32(O1l, E2l);
2345
            O1l = _mm_add_epi32(O1l, E3l);
2346
            O1h = _mm_add_epi32(E0h, E1h);
2347
            O1h = _mm_add_epi32(O1h, E2h);
2348
            O1h = _mm_add_epi32(O1h, E3h);
2349
2350
            /* Compute O2*/
2351
            E0l = _mm_madd_epi16(m128Tmp0,
2352
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2353
            E0h = _mm_madd_epi16(m128Tmp1,
2354
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2355
            E1l = _mm_madd_epi16(m128Tmp2,
2356
                    _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2357
            E1h = _mm_madd_epi16(m128Tmp3,
2358
                    _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2359
            E2l = _mm_madd_epi16(m128Tmp4,
2360
                    _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2361
            E2h = _mm_madd_epi16(m128Tmp5,
2362
                    _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2363
            E3l = _mm_madd_epi16(m128Tmp6,
2364
                    _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2365
            E3h = _mm_madd_epi16(m128Tmp7,
2366
                    _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2367
            O2l = _mm_add_epi32(E0l, E1l);
2368
            O2l = _mm_add_epi32(O2l, E2l);
2369
            O2l = _mm_add_epi32(O2l, E3l);
2370
2371
            O2h = _mm_add_epi32(E0h, E1h);
2372
            O2h = _mm_add_epi32(O2h, E2h);
2373
            O2h = _mm_add_epi32(O2h, E3h);
2374
2375
            /* Compute O3*/
2376
            E0l = _mm_madd_epi16(m128Tmp0,
2377
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2378
            E0h = _mm_madd_epi16(m128Tmp1,
2379
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2380
            E1l = _mm_madd_epi16(m128Tmp2,
2381
                    _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2382
            E1h = _mm_madd_epi16(m128Tmp3,
2383
                    _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2384
            E2l = _mm_madd_epi16(m128Tmp4,
2385
                    _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2386
            E2h = _mm_madd_epi16(m128Tmp5,
2387
                    _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2388
            E3l = _mm_madd_epi16(m128Tmp6,
2389
                    _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2390
            E3h = _mm_madd_epi16(m128Tmp7,
2391
                    _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2392
2393
            O3l = _mm_add_epi32(E0l, E1l);
2394
            O3l = _mm_add_epi32(O3l, E2l);
2395
            O3l = _mm_add_epi32(O3l, E3l);
2396
2397
            O3h = _mm_add_epi32(E0h, E1h);
2398
            O3h = _mm_add_epi32(O3h, E2h);
2399
            O3h = _mm_add_epi32(O3h, E3h);
2400
2401
            /* Compute O4*/
2402
2403
            E0l = _mm_madd_epi16(m128Tmp0,
2404
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2405
            E0h = _mm_madd_epi16(m128Tmp1,
2406
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2407
            E1l = _mm_madd_epi16(m128Tmp2,
2408
                    _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2409
            E1h = _mm_madd_epi16(m128Tmp3,
2410
                    _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2411
            E2l = _mm_madd_epi16(m128Tmp4,
2412
                    _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2413
            E2h = _mm_madd_epi16(m128Tmp5,
2414
                    _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2415
            E3l = _mm_madd_epi16(m128Tmp6,
2416
                    _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2417
            E3h = _mm_madd_epi16(m128Tmp7,
2418
                    _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2419
2420
            O4l = _mm_add_epi32(E0l, E1l);
2421
            O4l = _mm_add_epi32(O4l, E2l);
2422
            O4l = _mm_add_epi32(O4l, E3l);
2423
2424
            O4h = _mm_add_epi32(E0h, E1h);
2425
            O4h = _mm_add_epi32(O4h, E2h);
2426
            O4h = _mm_add_epi32(O4h, E3h);
2427
2428
            /* Compute O5*/
2429
            E0l = _mm_madd_epi16(m128Tmp0,
2430
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2431
            E0h = _mm_madd_epi16(m128Tmp1,
2432
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2433
            E1l = _mm_madd_epi16(m128Tmp2,
2434
                    _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2435
            E1h = _mm_madd_epi16(m128Tmp3,
2436
                    _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2437
            E2l = _mm_madd_epi16(m128Tmp4,
2438
                    _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2439
            E2h = _mm_madd_epi16(m128Tmp5,
2440
                    _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2441
            E3l = _mm_madd_epi16(m128Tmp6,
2442
                    _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2443
            E3h = _mm_madd_epi16(m128Tmp7,
2444
                    _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2445
2446
            O5l = _mm_add_epi32(E0l, E1l);
2447
            O5l = _mm_add_epi32(O5l, E2l);
2448
            O5l = _mm_add_epi32(O5l, E3l);
2449
2450
            O5h = _mm_add_epi32(E0h, E1h);
2451
            O5h = _mm_add_epi32(O5h, E2h);
2452
            O5h = _mm_add_epi32(O5h, E3h);
2453
2454
            /* Compute O6*/
2455
2456
            E0l = _mm_madd_epi16(m128Tmp0,
2457
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2458
            E0h = _mm_madd_epi16(m128Tmp1,
2459
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2460
            E1l = _mm_madd_epi16(m128Tmp2,
2461
                    _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2462
            E1h = _mm_madd_epi16(m128Tmp3,
2463
                    _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2464
            E2l = _mm_madd_epi16(m128Tmp4,
2465
                    _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2466
            E2h = _mm_madd_epi16(m128Tmp5,
2467
                    _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2468
            E3l = _mm_madd_epi16(m128Tmp6,
2469
                    _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2470
            E3h = _mm_madd_epi16(m128Tmp7,
2471
                    _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2472
2473
            O6l = _mm_add_epi32(E0l, E1l);
2474
            O6l = _mm_add_epi32(O6l, E2l);
2475
            O6l = _mm_add_epi32(O6l, E3l);
2476
2477
            O6h = _mm_add_epi32(E0h, E1h);
2478
            O6h = _mm_add_epi32(O6h, E2h);
2479
            O6h = _mm_add_epi32(O6h, E3h);
2480
2481
            /* Compute O7*/
2482
2483
            E0l = _mm_madd_epi16(m128Tmp0,
2484
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2485
            E0h = _mm_madd_epi16(m128Tmp1,
2486
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2487
            E1l = _mm_madd_epi16(m128Tmp2,
2488
                    _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2489
            E1h = _mm_madd_epi16(m128Tmp3,
2490
                    _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2491
            E2l = _mm_madd_epi16(m128Tmp4,
2492
                    _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2493
            E2h = _mm_madd_epi16(m128Tmp5,
2494
                    _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2495
            E3l = _mm_madd_epi16(m128Tmp6,
2496
                    _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2497
            E3h = _mm_madd_epi16(m128Tmp7,
2498
                    _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2499
2500
            O7l = _mm_add_epi32(E0l, E1l);
2501
            O7l = _mm_add_epi32(O7l, E2l);
2502
            O7l = _mm_add_epi32(O7l, E3l);
2503
2504
            O7h = _mm_add_epi32(E0h, E1h);
2505
            O7h = _mm_add_epi32(O7h, E2h);
2506
            O7h = _mm_add_epi32(O7h, E3h);
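            /* O0..O7 (low/high halves) form the odd part of the 16-point
             * inverse transform: each is the sum of four _mm_madd_epi16
             * products of the interleaved odd input rows (1,3), (5,7),
             * (9,11), (13,15) with column k of transform16x16_1[0..3]. */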
2507
2508
            /*  Compute E0  */
2509
2510
            m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
2511
            E0l = _mm_madd_epi16(m128Tmp0,
2512
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2513
            m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
2514
            E0h = _mm_madd_epi16(m128Tmp1,
2515
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2516
2517
            m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
2518
            E0l = _mm_add_epi32(E0l,
2519
                    _mm_madd_epi16(m128Tmp2,
2520
                            _mm_load_si128(
2521
                                    (__m128i *) (transform16x16_2[1][0]))));
2522
            m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
2523
            E0h = _mm_add_epi32(E0h,
2524
                    _mm_madd_epi16(m128Tmp3,
2525
                            _mm_load_si128(
2526
                                    (__m128i *) (transform16x16_2[1][0]))));
2527
2528
            /*  Compute E1  */
2529
            E1l = _mm_madd_epi16(m128Tmp0,
2530
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2531
            E1h = _mm_madd_epi16(m128Tmp1,
2532
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2533
            E1l = _mm_add_epi32(E1l,
2534
                    _mm_madd_epi16(m128Tmp2,
2535
                            _mm_load_si128(
2536
                                    (__m128i *) (transform16x16_2[1][1]))));
2537
            E1h = _mm_add_epi32(E1h,
2538
                    _mm_madd_epi16(m128Tmp3,
2539
                            _mm_load_si128(
2540
                                    (__m128i *) (transform16x16_2[1][1]))));
2541
2542
            /*  Compute E2  */
2543
            E2l = _mm_madd_epi16(m128Tmp0,
2544
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2545
            E2h = _mm_madd_epi16(m128Tmp1,
2546
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2547
            E2l = _mm_add_epi32(E2l,
2548
                    _mm_madd_epi16(m128Tmp2,
2549
                            _mm_load_si128(
2550
                                    (__m128i *) (transform16x16_2[1][2]))));
2551
            E2h = _mm_add_epi32(E2h,
2552
                    _mm_madd_epi16(m128Tmp3,
2553
                            _mm_load_si128(
2554
                                    (__m128i *) (transform16x16_2[1][2]))));
2555
            /*  Compute E3  */
2556
            E3l = _mm_madd_epi16(m128Tmp0,
2557
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2558
            E3h = _mm_madd_epi16(m128Tmp1,
2559
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2560
            E3l = _mm_add_epi32(E3l,
2561
                    _mm_madd_epi16(m128Tmp2,
2562
                            _mm_load_si128(
2563
                                    (__m128i *) (transform16x16_2[1][3]))));
2564
            E3h = _mm_add_epi32(E3h,
2565
                    _mm_madd_epi16(m128Tmp3,
2566
                            _mm_load_si128(
2567
                                    (__m128i *) (transform16x16_2[1][3]))));
2568
2569
            /*  Compute EE0 and EEE */
2570
2571
            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
2572
            E00l = _mm_madd_epi16(m128Tmp0,
2573
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2574
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
2575
            E00h = _mm_madd_epi16(m128Tmp1,
2576
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2577
2578
            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
2579
            EE0l = _mm_madd_epi16(m128Tmp2,
2580
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2581
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
2582
            EE0h = _mm_madd_epi16(m128Tmp3,
2583
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2584
2585
            E01l = _mm_madd_epi16(m128Tmp0,
2586
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2587
            E01h = _mm_madd_epi16(m128Tmp1,
2588
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2589
2590
            EE1l = _mm_madd_epi16(m128Tmp2,
2591
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2592
            EE1h = _mm_madd_epi16(m128Tmp3,
2593
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2594
2595
            /*  Compute EE    */
2596
            EE2l = _mm_sub_epi32(EE1l, E01l);
2597
            EE3l = _mm_sub_epi32(EE0l, E00l);
2598
            EE2h = _mm_sub_epi32(EE1h, E01h);
2599
            EE3h = _mm_sub_epi32(EE0h, E00h);
2600
2601
            EE0l = _mm_add_epi32(EE0l, E00l);
2602
            EE1l = _mm_add_epi32(EE1l, E01l);
2603
            EE0h = _mm_add_epi32(EE0h, E00h);
2604
            EE1h = _mm_add_epi32(EE1h, E01h);
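            /* EE0..EE3 now hold the 4-point even part built from rows 0, 8
             * (via transform16x16_3[1]) and rows 4, 12 (via
             * transform16x16_3[0]): the sums give EE0/EE1, the differences
             * give EE2/EE3. */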
2605
2606
            /*      Compute E       */
2607
2608
            E4l = _mm_sub_epi32(EE3l, E3l);
2609
            E4l = _mm_add_epi32(E4l, m128iAdd);
2610
2611
            E5l = _mm_sub_epi32(EE2l, E2l);
2612
            E5l = _mm_add_epi32(E5l, m128iAdd);
2613
2614
            E6l = _mm_sub_epi32(EE1l, E1l);
2615
            E6l = _mm_add_epi32(E6l, m128iAdd);
2616
2617
            E7l = _mm_sub_epi32(EE0l, E0l);
2618
            E7l = _mm_add_epi32(E7l, m128iAdd);
2619
2620
            E4h = _mm_sub_epi32(EE3h, E3h);
2621
            E4h = _mm_add_epi32(E4h, m128iAdd);
2622
2623
            E5h = _mm_sub_epi32(EE2h, E2h);
2624
            E5h = _mm_add_epi32(E5h, m128iAdd);
2625
2626
            E6h = _mm_sub_epi32(EE1h, E1h);
2627
            E6h = _mm_add_epi32(E6h, m128iAdd);
2628
2629
            E7h = _mm_sub_epi32(EE0h, E0h);
2630
            E7h = _mm_add_epi32(E7h, m128iAdd);
2631
2632
            E0l = _mm_add_epi32(EE0l, E0l);
2633
            E0l = _mm_add_epi32(E0l, m128iAdd);
2634
2635
            E1l = _mm_add_epi32(EE1l, E1l);
2636
            E1l = _mm_add_epi32(E1l, m128iAdd);
2637
2638
            E2l = _mm_add_epi32(EE2l, E2l);
2639
            E2l = _mm_add_epi32(E2l, m128iAdd);
2640
2641
            E3l = _mm_add_epi32(EE3l, E3l);
2642
            E3l = _mm_add_epi32(E3l, m128iAdd);
2643
2644
            E0h = _mm_add_epi32(EE0h, E0h);
2645
            E0h = _mm_add_epi32(E0h, m128iAdd);
2646
2647
            E1h = _mm_add_epi32(EE1h, E1h);
2648
            E1h = _mm_add_epi32(E1h, m128iAdd);
2649
2650
            E2h = _mm_add_epi32(EE2h, E2h);
2651
            E2h = _mm_add_epi32(E2h, m128iAdd);
2652
2653
            E3h = _mm_add_epi32(EE3h, E3h);
2654
            E3h = _mm_add_epi32(E3h, m128iAdd);
2655
2656
            m128iS0 = _mm_packs_epi32(
2657
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
2658
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
2659
            m128iS1 = _mm_packs_epi32(
2660
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
2661
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
2662
            m128iS2 = _mm_packs_epi32(
2663
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
2664
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
2665
            m128iS3 = _mm_packs_epi32(
2666
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
2667
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
2668
2669
            m128iS4 = _mm_packs_epi32(
2670
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
2671
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
2672
            m128iS5 = _mm_packs_epi32(
2673
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
2674
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
2675
            m128iS6 = _mm_packs_epi32(
2676
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
2677
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
2678
            m128iS7 = _mm_packs_epi32(
2679
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
2680
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
2681
2682
            m128iS15 = _mm_packs_epi32(
2683
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
2684
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
2685
            m128iS14 = _mm_packs_epi32(
2686
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
2687
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
2688
            m128iS13 = _mm_packs_epi32(
2689
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
2690
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
2691
            m128iS12 = _mm_packs_epi32(
2692
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
2693
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
2694
2695
            m128iS11 = _mm_packs_epi32(
2696
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
2697
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
2698
            m128iS10 = _mm_packs_epi32(
2699
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
2700
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
2701
            m128iS9 = _mm_packs_epi32(
2702
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
2703
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
2704
            m128iS8 = _mm_packs_epi32(
2705
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
2706
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
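            /* Combine even and odd parts: output row k is (E_k + O_k) >> shift
             * and row 15-k is (E_k - O_k) >> shift; the rounding offset
             * m128iAdd was already folded into E_k above, and
             * _mm_packs_epi32 saturates the 32-bit sums back to 16 bit. */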
2707
2708
            if (!j) {
2709
                /*      Inverse the matrix      */
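                /* Despite the wording above, this block transposes the 16x8
                 * strip of 16-bit results via unpacklo/unpackhi interleaving,
                 * so the second pass can again read its inputs row-wise. */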
2710
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
2711
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
2712
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
2713
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
2714
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
2715
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
2716
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
2717
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
2718
2719
                O0l = _mm_unpackhi_epi16(m128iS0, m128iS8);
2720
                O1l = _mm_unpackhi_epi16(m128iS1, m128iS9);
2721
                O2l = _mm_unpackhi_epi16(m128iS2, m128iS10);
2722
                O3l = _mm_unpackhi_epi16(m128iS3, m128iS11);
2723
                O4l = _mm_unpackhi_epi16(m128iS4, m128iS12);
2724
                O5l = _mm_unpackhi_epi16(m128iS5, m128iS13);
2725
                O6l = _mm_unpackhi_epi16(m128iS6, m128iS14);
2726
                O7l = _mm_unpackhi_epi16(m128iS7, m128iS15);
2727
2728
                m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
2729
                m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
2730
                m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
2731
                m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
2732
2733
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2734
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2735
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2736
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2737
2738
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2739
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2740
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2741
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2742
2743
                m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
2744
                m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
2745
                m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
2746
                m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
2747
2748
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2749
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2750
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2751
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2752
2753
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2754
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2755
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2756
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2757
2758
                m128Tmp0 = _mm_unpacklo_epi16(O0l, O4l);
2759
                m128Tmp1 = _mm_unpacklo_epi16(O1l, O5l);
2760
                m128Tmp2 = _mm_unpacklo_epi16(O2l, O6l);
2761
                m128Tmp3 = _mm_unpacklo_epi16(O3l, O7l);
2762
2763
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2764
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2765
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2766
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2767
2768
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2769
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2770
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2771
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2772
2773
                m128Tmp0 = _mm_unpackhi_epi16(O0l, O4l);
2774
                m128Tmp1 = _mm_unpackhi_epi16(O1l, O5l);
2775
                m128Tmp2 = _mm_unpackhi_epi16(O2l, O6l);
2776
                m128Tmp3 = _mm_unpackhi_epi16(O3l, O7l);
2777
2778
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2779
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2780
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2781
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2782
2783
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2784
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2785
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2786
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2787
2788
                /*  */
2789
                _mm_store_si128((__m128i *) (src + i), m128iS0);
2790
                _mm_store_si128((__m128i *) (src + 16 + i), m128iS1);
2791
                _mm_store_si128((__m128i *) (src + 32 + i), m128iS2);
2792
                _mm_store_si128((__m128i *) (src + 48 + i), m128iS3);
2793
                _mm_store_si128((__m128i *) (src + 64 + i), m128iS4);
2794
                _mm_store_si128((__m128i *) (src + 80 + i), m128iS5);
2795
                _mm_store_si128((__m128i *) (src + 96 + i), m128iS6);
2796
                _mm_store_si128((__m128i *) (src + 112 + i), m128iS7);
2797
                _mm_store_si128((__m128i *) (src + 128 + i), m128iS8);
2798
                _mm_store_si128((__m128i *) (src + 144 + i), m128iS9);
2799
                _mm_store_si128((__m128i *) (src + 160 + i), m128iS10);
2800
                _mm_store_si128((__m128i *) (src + 176 + i), m128iS11);
2801
                _mm_store_si128((__m128i *) (src + 192 + i), m128iS12);
2802
                _mm_store_si128((__m128i *) (src + 208 + i), m128iS13);
2803
                _mm_store_si128((__m128i *) (src + 224 + i), m128iS14);
2804
                _mm_store_si128((__m128i *) (src + 240 + i), m128iS15);
2805
2806
                if (!i) {
2807
                    m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2808
                    m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2809
                    m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2810
                    m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2811
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2812
                    m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2813
                    m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2814
                    m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2815
                    m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2816
                    m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2817
                    m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2818
                    m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2819
                    m128iS12 = _mm_loadu_si128((__m128i *) (src + 200));
2820
                    m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2821
                    m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2822
                    m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2823
                } else {
2824
                    m128iS0 = _mm_load_si128((__m128i *) (src));
2825
                    m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2826
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2827
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2828
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2829
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2830
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2831
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2832
                    m128iS8 = _mm_load_si128((__m128i *) (src + 8));
2833
                    m128iS9 = _mm_load_si128((__m128i *) (src + 32 + 8));
2834
                    m128iS10 = _mm_load_si128((__m128i *) (src + 64 + 8));
2835
                    m128iS11 = _mm_load_si128((__m128i *) (src + 96 + 8));
2836
                    m128iS12 = _mm_loadu_si128((__m128i *) (src + 128 + 8));
2837
                    m128iS13 = _mm_load_si128((__m128i *) (src + 160 + 8));
2838
                    m128iS14 = _mm_load_si128((__m128i *) (src + 192 + 8));
2839
                    m128iS15 = _mm_load_si128((__m128i *) (src + 224 + 8));
2840
                    shift = shift_2nd;
2841
                    m128iAdd = _mm_set1_epi32(add_2nd);
2842
                }
2843
2844
            } else {
2845
                int k, m = 0;
2846
                _mm_storeu_si128((__m128i *) (src), m128iS0);
2847
                _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
2848
                _mm_storeu_si128((__m128i *) (src + 32), m128iS2);
2849
                _mm_storeu_si128((__m128i *) (src + 40), m128iS3);
2850
                _mm_storeu_si128((__m128i *) (src + 64), m128iS4);
2851
                _mm_storeu_si128((__m128i *) (src + 72), m128iS5);
2852
                _mm_storeu_si128((__m128i *) (src + 96), m128iS6);
2853
                _mm_storeu_si128((__m128i *) (src + 104), m128iS7);
2854
                _mm_storeu_si128((__m128i *) (src + 128), m128iS8);
2855
                _mm_storeu_si128((__m128i *) (src + 136), m128iS9);
2856
                _mm_storeu_si128((__m128i *) (src + 160), m128iS10);
2857
                _mm_storeu_si128((__m128i *) (src + 168), m128iS11);
2858
                _mm_storeu_si128((__m128i *) (src + 192), m128iS12);
2859
                _mm_storeu_si128((__m128i *) (src + 200), m128iS13);
2860
                _mm_storeu_si128((__m128i *) (src + 224), m128iS14);
2861
                _mm_storeu_si128((__m128i *) (src + 232), m128iS15);
2862
                dst = (uint16_t*) _dst + (i * stride);
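                /* Second pass: add the inverse-transform result in src to the
                 * prediction samples in dst and clip each sample to the
                 * 10-bit range with av_clip_uintp2. */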
2863
2864
                for (k = 0; k < 8; k++) {
2865
                    dst[0] = av_clip_uintp2(dst[0] + src[m],10);
2866
                    dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
2867
                    dst[2] = av_clip_uintp2(dst[2] + src[m + 32],10);
2868
                    dst[3] = av_clip_uintp2(dst[3] + src[m + 40],10);
2869
                    dst[4] = av_clip_uintp2(dst[4] + src[m + 64],10);
2870
                    dst[5] = av_clip_uintp2(dst[5] + src[m + 72],10);
2871
                    dst[6] = av_clip_uintp2(dst[6] + src[m + 96],10);
2872
                    dst[7] = av_clip_uintp2(dst[7] + src[m + 104],10);
2873
2874
                    dst[8] = av_clip_uintp2(dst[8] + src[m + 128],10);
2875
                    dst[9] = av_clip_uintp2(dst[9] + src[m + 136],10);
2876
                    dst[10] = av_clip_uintp2(dst[10] + src[m + 160],10);
2877
                    dst[11] = av_clip_uintp2(dst[11] + src[m + 168],10);
2878
                    dst[12] = av_clip_uintp2(dst[12] + src[m + 192],10);
2879
                    dst[13] = av_clip_uintp2(dst[13] + src[m + 200],10);
2880
                    dst[14] = av_clip_uintp2(dst[14] + src[m + 224],10);
2881
                    dst[15] = av_clip_uintp2(dst[15] + src[m + 232],10);
2882
                    m += 1;
2883
                    dst += stride;
2884
                }
2885
                if (!i) {
2886
                    m128iS0 = _mm_load_si128((__m128i *) (src + 16));
2887
                    m128iS1 = _mm_load_si128((__m128i *) (src + 48));
2888
                    m128iS2 = _mm_load_si128((__m128i *) (src + 80));
2889
                    m128iS3 = _mm_loadu_si128((__m128i *) (src + 112));
2890
                    m128iS4 = _mm_load_si128((__m128i *) (src + 144));
2891
                    m128iS5 = _mm_load_si128((__m128i *) (src + 176));
2892
                    m128iS6 = _mm_load_si128((__m128i *) (src + 208));
2893
                    m128iS7 = _mm_load_si128((__m128i *) (src + 240));
2894
                    m128iS8 = _mm_load_si128((__m128i *) (src + 24));
2895
                    m128iS9 = _mm_load_si128((__m128i *) (src + 56));
2896
                    m128iS10 = _mm_load_si128((__m128i *) (src + 88));
2897
                    m128iS11 = _mm_loadu_si128((__m128i *) (src + 120));
2898
                    m128iS12 = _mm_load_si128((__m128i *) (src + 152));
2899
                    m128iS13 = _mm_load_si128((__m128i *) (src + 184));
2900
                    m128iS14 = _mm_load_si128((__m128i *) (src + 216));
2901
                    m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2902
                }
2903
            }
2904
        }
2905
    }
2906
2907
}
2908
#endif
2909
2910
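/* A scalar reference for the butterfly combine that the packs/srai/add/sub
 * sequences in the function above implement. This is a minimal sketch added
 * for illustration only; it is not part of the covered source, and the names
 * sat16() and combine_butterfly_16() are made up. E[] and O[] are the even
 * and odd 32-bit partial sums of one output group, 'add' the rounding offset
 * and 'shift' the pass-dependent shift. */
#include <stdint.h>

static int16_t sat16(int32_t v)
{
    /* saturate a 32-bit intermediate to the int16_t range, as packs does */
    if (v > INT16_MAX) return INT16_MAX;
    if (v < INT16_MIN) return INT16_MIN;
    return (int16_t) v;
}

static void combine_butterfly_16(int16_t out[16],
                                 const int32_t E[8], const int32_t O[8],
                                 int add, int shift)
{
    for (int k = 0; k < 8; k++) {
        out[k]      = sat16((E[k] + O[k] + add) >> shift);  /* rows 0..7  */
        out[15 - k] = sat16((E[k] - O[k] + add) >> shift);  /* rows 15..8 */
    }
}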
2911
#if HAVE_SSE4_1
2912
void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
2913
6.39k
        ptrdiff_t _stride) {
2914
6.39k
    uint8_t shift_2nd = 12; // 20 - Bit depth
2915
6.39k
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
2916
6.39k
    int i, j;
2917
6.39k
    uint8_t *dst = (uint8_t*) _dst;
2918
6.39k
    ptrdiff_t stride = _stride / sizeof(uint8_t);
2919
6.39k
    int shift;
2920
6.39k
    const int16_t *src = coeffs;
2921
2922
6.39k
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2923
6.39k
            m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2924
6.39k
            m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2925
6.39k
            m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2926
6.39k
            E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2927
6.39k
            O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2928
6.39k
            E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2929
6.39k
    __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
2930
6.39k
    __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
2931
6.39k
            EEE0l, EEE1l, EEE0h, EEE1h;
2932
6.39k
    __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
2933
6.39k
            m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
2934
6.39k
            m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
2935
6.39k
            m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
2936
6.39k
            O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
2937
6.39k
            O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
2938
6.39k
            EE4l, EE7h, EE6h, EE5h, EE4h;
2939
2940
6.39k
    __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
2941
6.39k
    __m128i r32,r33,r34,r35,r36,r37,r38,r39,r40,r41,r42,r43,r44,r45,r46,r47,r48,r49,r50,r51,r52,r53,r54,r55,r56,r57,r58,r59,r60,r61,r62,r63;
2942
6.39k
    __m128i r64,r65,r66,r67,r68,r69,r70,r71,r72,r73,r74,r75,r76,r77,r78,r79,r80,r81,r82,r83,r84,r85,r86,r87,r88,r89,r90,r91,r92,r93,r94,r95;
2943
6.39k
    __m128i r96,r97,r98,r99,r100,r101,r102,r103,r104,r105,r106,r107,r108,r109,r110,r111,r112,r113,r114,r115,r116,r117,r118,r119,r120,r121,r122,r123,r124,r125,r126,r127;
2944
2945
2946
6.39k
    m128iS0 = _mm_load_si128((__m128i *) (src));
2947
6.39k
    m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2948
6.39k
    m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2949
6.39k
    m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2950
6.39k
    m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2951
6.39k
    m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2952
6.39k
    m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2953
6.39k
    m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2954
6.39k
    m128iS8 = _mm_load_si128((__m128i *) (src + 256));
2955
6.39k
    m128iS9 = _mm_load_si128((__m128i *) (src + 288));
2956
6.39k
    m128iS10 = _mm_load_si128((__m128i *) (src + 320));
2957
6.39k
    m128iS11 = _mm_load_si128((__m128i *) (src + 352));
2958
6.39k
    m128iS12 = _mm_load_si128((__m128i *) (src + 384));
2959
6.39k
    m128iS13 = _mm_load_si128((__m128i *) (src + 416));
2960
6.39k
    m128iS14 = _mm_load_si128((__m128i *) (src + 448));
2961
6.39k
    m128iS15 = _mm_load_si128((__m128i *) (src + 480));
2962
6.39k
    m128iS16 = _mm_load_si128((__m128i *) (src + 512));
2963
6.39k
    m128iS17 = _mm_load_si128((__m128i *) (src + 544));
2964
6.39k
    m128iS18 = _mm_load_si128((__m128i *) (src + 576));
2965
6.39k
    m128iS19 = _mm_load_si128((__m128i *) (src + 608));
2966
6.39k
    m128iS20 = _mm_load_si128((__m128i *) (src + 640));
2967
6.39k
    m128iS21 = _mm_load_si128((__m128i *) (src + 672));
2968
6.39k
    m128iS22 = _mm_load_si128((__m128i *) (src + 704));
2969
6.39k
    m128iS23 = _mm_load_si128((__m128i *) (src + 736));
2970
6.39k
    m128iS24 = _mm_load_si128((__m128i *) (src + 768));
2971
6.39k
    m128iS25 = _mm_load_si128((__m128i *) (src + 800));
2972
6.39k
    m128iS26 = _mm_load_si128((__m128i *) (src + 832));
2973
6.39k
    m128iS27 = _mm_load_si128((__m128i *) (src + 864));
2974
6.39k
    m128iS28 = _mm_load_si128((__m128i *) (src + 896));
2975
6.39k
    m128iS29 = _mm_load_si128((__m128i *) (src + 928));
2976
6.39k
    m128iS30 = _mm_load_si128((__m128i *) (src + 960));
2977
6.39k
    m128iS31 = _mm_load_si128((__m128i *) (src + 992));
2978
2979
6.39k
    shift = shift_1st;
2980
6.39k
    m128iAdd = _mm_set1_epi32(add_1st);
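    /* Two passes (the j loop), each walking the 32x32 block in strips of
     * eight columns (the i loop); the structure mirrors the 16x16 path above,
     * but the odd part now accumulates eight madd products per output, using
     * transform32x32[0..7][k]. */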
2981
2982
19.1k
    for (j = 0; j < 2; j++) {
2983
63.9k
        for (i = 0; i < 32; i += 8) {
2984
51.1k
            m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2985
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
2986
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][0])));
2987
51.1k
            m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2988
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
2989
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][0])));
2990
2991
51.1k
            m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2992
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
2993
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][0])));
2994
51.1k
            m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2995
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
2996
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][0])));
2997
2998
51.1k
            m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2999
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3000
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][0])));
3001
51.1k
            m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
3002
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3003
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][0])));
3004
3005
51.1k
            m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
3006
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3007
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][0])));
3008
51.1k
            m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
3009
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3010
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][0])));
3011
3012
51.1k
            m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
3013
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3014
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][0])));
3015
51.1k
            m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
3016
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3017
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][0])));
3018
3019
51.1k
            m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
3020
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3021
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][0])));
3022
51.1k
            m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
3023
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3024
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][0])));
3025
3026
51.1k
            m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
3027
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3028
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][0])));
3029
51.1k
            m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
3030
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3031
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][0])));
3032
3033
51.1k
            m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
3034
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3035
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][0])));
3036
51.1k
            m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
3037
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3038
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][0])));
3039
3040
51.1k
            O0l = _mm_add_epi32(E0l, E1l);
3041
51.1k
            O0l = _mm_add_epi32(O0l, E2l);
3042
51.1k
            O0l = _mm_add_epi32(O0l, E3l);
3043
51.1k
            O0l = _mm_add_epi32(O0l, E4l);
3044
51.1k
            O0l = _mm_add_epi32(O0l, E5l);
3045
51.1k
            O0l = _mm_add_epi32(O0l, E6l);
3046
51.1k
            O0l = _mm_add_epi32(O0l, E7l);
3047
3048
51.1k
            O0h = _mm_add_epi32(E0h, E1h);
3049
51.1k
            O0h = _mm_add_epi32(O0h, E2h);
3050
51.1k
            O0h = _mm_add_epi32(O0h, E3h);
3051
51.1k
            O0h = _mm_add_epi32(O0h, E4h);
3052
51.1k
            O0h = _mm_add_epi32(O0h, E5h);
3053
51.1k
            O0h = _mm_add_epi32(O0h, E6h);
3054
51.1k
            O0h = _mm_add_epi32(O0h, E7h);
3055
3056
            /* Compute O1*/
3057
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3058
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][1])));
3059
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3060
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][1])));
3061
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3062
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][1])));
3063
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3064
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][1])));
3065
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3066
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][1])));
3067
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3068
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][1])));
3069
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3070
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][1])));
3071
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3072
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][1])));
3073
3074
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3075
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][1])));
3076
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3077
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][1])));
3078
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3079
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][1])));
3080
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3081
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][1])));
3082
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3083
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][1])));
3084
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3085
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][1])));
3086
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3087
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][1])));
3088
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3089
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][1])));
3090
3091
51.1k
            O1l = _mm_add_epi32(E0l, E1l);
3092
51.1k
            O1l = _mm_add_epi32(O1l, E2l);
3093
51.1k
            O1l = _mm_add_epi32(O1l, E3l);
3094
51.1k
            O1l = _mm_add_epi32(O1l, E4l);
3095
51.1k
            O1l = _mm_add_epi32(O1l, E5l);
3096
51.1k
            O1l = _mm_add_epi32(O1l, E6l);
3097
51.1k
            O1l = _mm_add_epi32(O1l, E7l);
3098
3099
51.1k
            O1h = _mm_add_epi32(E0h, E1h);
3100
51.1k
            O1h = _mm_add_epi32(O1h, E2h);
3101
51.1k
            O1h = _mm_add_epi32(O1h, E3h);
3102
51.1k
            O1h = _mm_add_epi32(O1h, E4h);
3103
51.1k
            O1h = _mm_add_epi32(O1h, E5h);
3104
51.1k
            O1h = _mm_add_epi32(O1h, E6h);
3105
51.1k
            O1h = _mm_add_epi32(O1h, E7h);
3106
            /* Compute O2*/
3107
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3108
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][2])));
3109
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3110
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][2])));
3111
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3112
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][2])));
3113
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3114
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][2])));
3115
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3116
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][2])));
3117
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3118
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][2])));
3119
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3120
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][2])));
3121
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3122
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][2])));
3123
3124
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3125
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][2])));
3126
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3127
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][2])));
3128
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3129
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][2])));
3130
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3131
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][2])));
3132
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3133
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][2])));
3134
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3135
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][2])));
3136
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3137
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][2])));
3138
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3139
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][2])));
3140
3141
51.1k
            O2l = _mm_add_epi32(E0l, E1l);
3142
51.1k
            O2l = _mm_add_epi32(O2l, E2l);
3143
51.1k
            O2l = _mm_add_epi32(O2l, E3l);
3144
51.1k
            O2l = _mm_add_epi32(O2l, E4l);
3145
51.1k
            O2l = _mm_add_epi32(O2l, E5l);
3146
51.1k
            O2l = _mm_add_epi32(O2l, E6l);
3147
51.1k
            O2l = _mm_add_epi32(O2l, E7l);
3148
3149
51.1k
            O2h = _mm_add_epi32(E0h, E1h);
3150
51.1k
            O2h = _mm_add_epi32(O2h, E2h);
3151
51.1k
            O2h = _mm_add_epi32(O2h, E3h);
3152
51.1k
            O2h = _mm_add_epi32(O2h, E4h);
3153
51.1k
            O2h = _mm_add_epi32(O2h, E5h);
3154
51.1k
            O2h = _mm_add_epi32(O2h, E6h);
3155
51.1k
            O2h = _mm_add_epi32(O2h, E7h);
3156
            /* Compute O3*/
3157
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3158
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][3])));
3159
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3160
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][3])));
3161
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3162
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][3])));
3163
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3164
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][3])));
3165
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3166
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][3])));
3167
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3168
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][3])));
3169
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3170
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][3])));
3171
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3172
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][3])));
3173
3174
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3175
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][3])));
3176
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3177
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][3])));
3178
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3179
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][3])));
3180
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3181
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][3])));
3182
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3183
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][3])));
3184
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3185
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][3])));
3186
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3187
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][3])));
3188
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3189
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][3])));
3190
3191
51.1k
            O3l = _mm_add_epi32(E0l, E1l);
3192
51.1k
            O3l = _mm_add_epi32(O3l, E2l);
3193
51.1k
            O3l = _mm_add_epi32(O3l, E3l);
3194
51.1k
            O3l = _mm_add_epi32(O3l, E4l);
3195
51.1k
            O3l = _mm_add_epi32(O3l, E5l);
3196
51.1k
            O3l = _mm_add_epi32(O3l, E6l);
3197
51.1k
            O3l = _mm_add_epi32(O3l, E7l);
3198
3199
51.1k
            O3h = _mm_add_epi32(E0h, E1h);
3200
51.1k
            O3h = _mm_add_epi32(O3h, E2h);
3201
51.1k
            O3h = _mm_add_epi32(O3h, E3h);
3202
51.1k
            O3h = _mm_add_epi32(O3h, E4h);
3203
51.1k
            O3h = _mm_add_epi32(O3h, E5h);
3204
51.1k
            O3h = _mm_add_epi32(O3h, E6h);
3205
51.1k
            O3h = _mm_add_epi32(O3h, E7h);
3206
            /* Compute O4*/
3207
3208
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3209
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][4])));
3210
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3211
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][4])));
3212
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3213
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][4])));
3214
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3215
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][4])));
3216
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3217
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][4])));
3218
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3219
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][4])));
3220
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3221
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][4])));
3222
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3223
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][4])));
3224
3225
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3226
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][4])));
3227
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3228
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][4])));
3229
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3230
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][4])));
3231
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3232
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][4])));
3233
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3234
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][4])));
3235
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3236
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][4])));
3237
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3238
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][4])));
3239
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3240
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][4])));
3241
3242
51.1k
            O4l = _mm_add_epi32(E0l, E1l);
3243
51.1k
            O4l = _mm_add_epi32(O4l, E2l);
3244
51.1k
            O4l = _mm_add_epi32(O4l, E3l);
3245
51.1k
            O4l = _mm_add_epi32(O4l, E4l);
3246
51.1k
            O4l = _mm_add_epi32(O4l, E5l);
3247
51.1k
            O4l = _mm_add_epi32(O4l, E6l);
3248
51.1k
            O4l = _mm_add_epi32(O4l, E7l);
3249
3250
51.1k
            O4h = _mm_add_epi32(E0h, E1h);
3251
51.1k
            O4h = _mm_add_epi32(O4h, E2h);
3252
51.1k
            O4h = _mm_add_epi32(O4h, E3h);
3253
51.1k
            O4h = _mm_add_epi32(O4h, E4h);
3254
51.1k
            O4h = _mm_add_epi32(O4h, E5h);
3255
51.1k
            O4h = _mm_add_epi32(O4h, E6h);
3256
51.1k
            O4h = _mm_add_epi32(O4h, E7h);
3257
3258
            /* Compute O5*/
3259
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3260
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][5])));
3261
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3262
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][5])));
3263
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3264
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][5])));
3265
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3266
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][5])));
3267
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3268
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][5])));
3269
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3270
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][5])));
3271
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3272
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][5])));
3273
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3274
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][5])));
3275
3276
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3277
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][5])));
3278
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3279
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][5])));
3280
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3281
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][5])));
3282
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3283
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][5])));
3284
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3285
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][5])));
3286
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3287
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][5])));
3288
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3289
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][5])));
3290
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3291
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][5])));
3292
3293
51.1k
            O5l = _mm_add_epi32(E0l, E1l);
3294
51.1k
            O5l = _mm_add_epi32(O5l, E2l);
3295
51.1k
            O5l = _mm_add_epi32(O5l, E3l);
3296
51.1k
            O5l = _mm_add_epi32(O5l, E4l);
3297
51.1k
            O5l = _mm_add_epi32(O5l, E5l);
3298
51.1k
            O5l = _mm_add_epi32(O5l, E6l);
3299
51.1k
            O5l = _mm_add_epi32(O5l, E7l);
3300
3301
51.1k
            O5h = _mm_add_epi32(E0h, E1h);
3302
51.1k
            O5h = _mm_add_epi32(O5h, E2h);
3303
51.1k
            O5h = _mm_add_epi32(O5h, E3h);
3304
51.1k
            O5h = _mm_add_epi32(O5h, E4h);
3305
51.1k
            O5h = _mm_add_epi32(O5h, E5h);
3306
51.1k
            O5h = _mm_add_epi32(O5h, E6h);
3307
51.1k
            O5h = _mm_add_epi32(O5h, E7h);
3308
3309
            /* Compute O6*/
3310
3311
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3312
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][6])));
3313
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3314
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][6])));
3315
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3316
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][6])));
3317
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3318
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][6])));
3319
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3320
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][6])));
3321
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3322
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][6])));
3323
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3324
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][6])));
3325
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3326
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][6])));
3327
3328
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3329
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][6])));
3330
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3331
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][6])));
3332
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3333
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][6])));
3334
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3335
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][6])));
3336
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3337
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][6])));
3338
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3339
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][6])));
3340
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3341
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][6])));
3342
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3343
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][6])));
3344
3345
51.1k
            O6l = _mm_add_epi32(E0l, E1l);
3346
51.1k
            O6l = _mm_add_epi32(O6l, E2l);
3347
51.1k
            O6l = _mm_add_epi32(O6l, E3l);
3348
51.1k
            O6l = _mm_add_epi32(O6l, E4l);
3349
51.1k
            O6l = _mm_add_epi32(O6l, E5l);
3350
51.1k
            O6l = _mm_add_epi32(O6l, E6l);
3351
51.1k
            O6l = _mm_add_epi32(O6l, E7l);
3352
3353
51.1k
            O6h = _mm_add_epi32(E0h, E1h);
3354
51.1k
            O6h = _mm_add_epi32(O6h, E2h);
3355
51.1k
            O6h = _mm_add_epi32(O6h, E3h);
3356
51.1k
            O6h = _mm_add_epi32(O6h, E4h);
3357
51.1k
            O6h = _mm_add_epi32(O6h, E5h);
3358
51.1k
            O6h = _mm_add_epi32(O6h, E6h);
3359
51.1k
            O6h = _mm_add_epi32(O6h, E7h);
3360
3361
            /* Compute O7*/
3362
3363
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3364
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][7])));
3365
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3366
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][7])));
3367
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3368
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][7])));
3369
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3370
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][7])));
3371
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3372
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][7])));
3373
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3374
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][7])));
3375
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3376
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][7])));
3377
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3378
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][7])));
3379
3380
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3381
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][7])));
3382
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3383
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][7])));
3384
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3385
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][7])));
3386
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3387
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][7])));
3388
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3389
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][7])));
3390
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3391
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][7])));
3392
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3393
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][7])));
3394
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3395
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][7])));
3396
3397
51.1k
            O7l = _mm_add_epi32(E0l, E1l);
3398
51.1k
            O7l = _mm_add_epi32(O7l, E2l);
3399
51.1k
            O7l = _mm_add_epi32(O7l, E3l);
3400
51.1k
            O7l = _mm_add_epi32(O7l, E4l);
3401
51.1k
            O7l = _mm_add_epi32(O7l, E5l);
3402
51.1k
            O7l = _mm_add_epi32(O7l, E6l);
3403
51.1k
            O7l = _mm_add_epi32(O7l, E7l);
3404
3405
51.1k
            O7h = _mm_add_epi32(E0h, E1h);
3406
51.1k
            O7h = _mm_add_epi32(O7h, E2h);
3407
51.1k
            O7h = _mm_add_epi32(O7h, E3h);
3408
51.1k
            O7h = _mm_add_epi32(O7h, E4h);
3409
51.1k
            O7h = _mm_add_epi32(O7h, E5h);
3410
51.1k
            O7h = _mm_add_epi32(O7h, E6h);
3411
51.1k
            O7h = _mm_add_epi32(O7h, E7h);
3412
3413
            /* Compute O8*/
3414
3415
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3416
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][8])));
3417
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3418
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][8])));
3419
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3420
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][8])));
3421
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3422
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][8])));
3423
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3424
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][8])));
3425
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3426
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][8])));
3427
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3428
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][8])));
3429
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3430
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][8])));
3431
3432
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3433
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][8])));
3434
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3435
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][8])));
3436
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3437
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][8])));
3438
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3439
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][8])));
3440
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3441
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][8])));
3442
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3443
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][8])));
3444
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3445
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][8])));
3446
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3447
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][8])));
3448
3449
51.1k
            O8l = _mm_add_epi32(E0l, E1l);
3450
51.1k
            O8l = _mm_add_epi32(O8l, E2l);
3451
51.1k
            O8l = _mm_add_epi32(O8l, E3l);
3452
51.1k
            O8l = _mm_add_epi32(O8l, E4l);
3453
51.1k
            O8l = _mm_add_epi32(O8l, E5l);
3454
51.1k
            O8l = _mm_add_epi32(O8l, E6l);
3455
51.1k
            O8l = _mm_add_epi32(O8l, E7l);
3456
3457
51.1k
            O8h = _mm_add_epi32(E0h, E1h);
3458
51.1k
            O8h = _mm_add_epi32(O8h, E2h);
3459
51.1k
            O8h = _mm_add_epi32(O8h, E3h);
3460
51.1k
            O8h = _mm_add_epi32(O8h, E4h);
3461
51.1k
            O8h = _mm_add_epi32(O8h, E5h);
3462
51.1k
            O8h = _mm_add_epi32(O8h, E6h);
3463
51.1k
            O8h = _mm_add_epi32(O8h, E7h);
3464
3465
            /* Compute O9*/
3466
3467
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3468
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][9])));
3469
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3470
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][9])));
3471
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3472
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][9])));
3473
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3474
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][9])));
3475
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3476
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][9])));
3477
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3478
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][9])));
3479
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3480
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][9])));
3481
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3482
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][9])));
3483
3484
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3485
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][9])));
3486
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3487
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][9])));
3488
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3489
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][9])));
3490
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3491
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][9])));
3492
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3493
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][9])));
3494
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3495
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][9])));
3496
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3497
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][9])));
3498
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3499
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][9])));
3500
3501
51.1k
            O9l = _mm_add_epi32(E0l, E1l);
3502
51.1k
            O9l = _mm_add_epi32(O9l, E2l);
3503
51.1k
            O9l = _mm_add_epi32(O9l, E3l);
3504
51.1k
            O9l = _mm_add_epi32(O9l, E4l);
3505
51.1k
            O9l = _mm_add_epi32(O9l, E5l);
3506
51.1k
            O9l = _mm_add_epi32(O9l, E6l);
3507
51.1k
            O9l = _mm_add_epi32(O9l, E7l);
3508
3509
51.1k
            O9h = _mm_add_epi32(E0h, E1h);
3510
51.1k
            O9h = _mm_add_epi32(O9h, E2h);
3511
51.1k
            O9h = _mm_add_epi32(O9h, E3h);
3512
51.1k
            O9h = _mm_add_epi32(O9h, E4h);
3513
51.1k
            O9h = _mm_add_epi32(O9h, E5h);
3514
51.1k
            O9h = _mm_add_epi32(O9h, E6h);
3515
51.1k
            O9h = _mm_add_epi32(O9h, E7h);
3516
3517
            /* Compute O10 */
3518
3519
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3520
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][10])));
3521
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3522
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][10])));
3523
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3524
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][10])));
3525
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3526
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][10])));
3527
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3528
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][10])));
3529
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3530
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][10])));
3531
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3532
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][10])));
3533
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3534
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][10])));
3535
3536
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3537
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][10])));
3538
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3539
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][10])));
3540
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3541
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][10])));
3542
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3543
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][10])));
3544
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3545
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][10])));
3546
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3547
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][10])));
3548
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3549
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][10])));
3550
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3551
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][10])));
3552
3553
51.1k
            O10l = _mm_add_epi32(E0l, E1l);
3554
51.1k
            O10l = _mm_add_epi32(O10l, E2l);
3555
51.1k
            O10l = _mm_add_epi32(O10l, E3l);
3556
51.1k
            O10l = _mm_add_epi32(O10l, E4l);
3557
51.1k
            O10l = _mm_add_epi32(O10l, E5l);
3558
51.1k
            O10l = _mm_add_epi32(O10l, E6l);
3559
51.1k
            O10l = _mm_add_epi32(O10l, E7l);
3560
3561
51.1k
            O10h = _mm_add_epi32(E0h, E1h);
3562
51.1k
            O10h = _mm_add_epi32(O10h, E2h);
3563
51.1k
            O10h = _mm_add_epi32(O10h, E3h);
3564
51.1k
            O10h = _mm_add_epi32(O10h, E4h);
3565
51.1k
            O10h = _mm_add_epi32(O10h, E5h);
3566
51.1k
            O10h = _mm_add_epi32(O10h, E6h);
3567
51.1k
            O10h = _mm_add_epi32(O10h, E7h);
3568
3569
            /* Compute O11 */
3570
3571
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3572
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][11])));
3573
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3574
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][11])));
3575
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3576
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][11])));
3577
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3578
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][11])));
3579
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3580
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][11])));
3581
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3582
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][11])));
3583
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3584
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][11])));
3585
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3586
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][11])));
3587
3588
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3589
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][11])));
3590
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3591
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][11])));
3592
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3593
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][11])));
3594
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3595
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][11])));
3596
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3597
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][11])));
3598
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3599
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][11])));
3600
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3601
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][11])));
3602
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3603
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][11])));
3604
3605
51.1k
            O11l = _mm_add_epi32(E0l, E1l);
3606
51.1k
            O11l = _mm_add_epi32(O11l, E2l);
3607
51.1k
            O11l = _mm_add_epi32(O11l, E3l);
3608
51.1k
            O11l = _mm_add_epi32(O11l, E4l);
3609
51.1k
            O11l = _mm_add_epi32(O11l, E5l);
3610
51.1k
            O11l = _mm_add_epi32(O11l, E6l);
3611
51.1k
            O11l = _mm_add_epi32(O11l, E7l);
3612
3613
51.1k
            O11h = _mm_add_epi32(E0h, E1h);
3614
51.1k
            O11h = _mm_add_epi32(O11h, E2h);
3615
51.1k
            O11h = _mm_add_epi32(O11h, E3h);
3616
51.1k
            O11h = _mm_add_epi32(O11h, E4h);
3617
51.1k
            O11h = _mm_add_epi32(O11h, E5h);
3618
51.1k
            O11h = _mm_add_epi32(O11h, E6h);
3619
51.1k
            O11h = _mm_add_epi32(O11h, E7h);
3620
3621
            /* Compute O12 */
3622
3623
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3624
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][12])));
3625
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3626
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][12])));
3627
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3628
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][12])));
3629
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3630
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][12])));
3631
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3632
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][12])));
3633
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3634
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][12])));
3635
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3636
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][12])));
3637
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3638
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][12])));
3639
3640
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3641
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][12])));
3642
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3643
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][12])));
3644
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3645
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][12])));
3646
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3647
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][12])));
3648
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3649
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][12])));
3650
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3651
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][12])));
3652
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3653
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][12])));
3654
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3655
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][12])));
3656
3657
51.1k
            O12l = _mm_add_epi32(E0l, E1l);
3658
51.1k
            O12l = _mm_add_epi32(O12l, E2l);
3659
51.1k
            O12l = _mm_add_epi32(O12l, E3l);
3660
51.1k
            O12l = _mm_add_epi32(O12l, E4l);
3661
51.1k
            O12l = _mm_add_epi32(O12l, E5l);
3662
51.1k
            O12l = _mm_add_epi32(O12l, E6l);
3663
51.1k
            O12l = _mm_add_epi32(O12l, E7l);
3664
3665
51.1k
            O12h = _mm_add_epi32(E0h, E1h);
3666
51.1k
            O12h = _mm_add_epi32(O12h, E2h);
3667
51.1k
            O12h = _mm_add_epi32(O12h, E3h);
3668
51.1k
            O12h = _mm_add_epi32(O12h, E4h);
3669
51.1k
            O12h = _mm_add_epi32(O12h, E5h);
3670
51.1k
            O12h = _mm_add_epi32(O12h, E6h);
3671
51.1k
            O12h = _mm_add_epi32(O12h, E7h);
3672
3673
            /* Compute O13 */
3674
3675
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3676
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][13])));
3677
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3678
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][13])));
3679
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3680
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][13])));
3681
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3682
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][13])));
3683
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3684
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][13])));
3685
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3686
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][13])));
3687
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3688
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][13])));
3689
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3690
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][13])));
3691
3692
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3693
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][13])));
3694
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3695
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][13])));
3696
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3697
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][13])));
3698
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3699
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][13])));
3700
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3701
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][13])));
3702
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3703
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][13])));
3704
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3705
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][13])));
3706
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3707
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][13])));
3708
3709
51.1k
            O13l = _mm_add_epi32(E0l, E1l);
3710
51.1k
            O13l = _mm_add_epi32(O13l, E2l);
3711
51.1k
            O13l = _mm_add_epi32(O13l, E3l);
3712
51.1k
            O13l = _mm_add_epi32(O13l, E4l);
3713
51.1k
            O13l = _mm_add_epi32(O13l, E5l);
3714
51.1k
            O13l = _mm_add_epi32(O13l, E6l);
3715
51.1k
            O13l = _mm_add_epi32(O13l, E7l);
3716
3717
51.1k
            O13h = _mm_add_epi32(E0h, E1h);
3718
51.1k
            O13h = _mm_add_epi32(O13h, E2h);
3719
51.1k
            O13h = _mm_add_epi32(O13h, E3h);
3720
51.1k
            O13h = _mm_add_epi32(O13h, E4h);
3721
51.1k
            O13h = _mm_add_epi32(O13h, E5h);
3722
51.1k
            O13h = _mm_add_epi32(O13h, E6h);
3723
51.1k
            O13h = _mm_add_epi32(O13h, E7h);
3724
3725
            /* Compute O14  */
3726
3727
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3728
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][14])));
3729
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3730
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][14])));
3731
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3732
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][14])));
3733
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3734
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][14])));
3735
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3736
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][14])));
3737
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3738
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][14])));
3739
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3740
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][14])));
3741
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3742
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][14])));
3743
3744
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3745
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][14])));
3746
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3747
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][14])));
3748
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3749
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][14])));
3750
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3751
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][14])));
3752
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3753
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][14])));
3754
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3755
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][14])));
3756
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3757
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][14])));
3758
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3759
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][14])));
3760
3761
51.1k
            O14l = _mm_add_epi32(E0l, E1l);
3762
51.1k
            O14l = _mm_add_epi32(O14l, E2l);
3763
51.1k
            O14l = _mm_add_epi32(O14l, E3l);
3764
51.1k
            O14l = _mm_add_epi32(O14l, E4l);
3765
51.1k
            O14l = _mm_add_epi32(O14l, E5l);
3766
51.1k
            O14l = _mm_add_epi32(O14l, E6l);
3767
51.1k
            O14l = _mm_add_epi32(O14l, E7l);
3768
3769
51.1k
            O14h = _mm_add_epi32(E0h, E1h);
3770
51.1k
            O14h = _mm_add_epi32(O14h, E2h);
3771
51.1k
            O14h = _mm_add_epi32(O14h, E3h);
3772
51.1k
            O14h = _mm_add_epi32(O14h, E4h);
3773
51.1k
            O14h = _mm_add_epi32(O14h, E5h);
3774
51.1k
            O14h = _mm_add_epi32(O14h, E6h);
3775
51.1k
            O14h = _mm_add_epi32(O14h, E7h);
3776
3777
            /* Compute O15*/
3778
3779
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3780
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][15])));
3781
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3782
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[0][15])));
3783
51.1k
            E1l = _mm_madd_epi16(m128Tmp2,
3784
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][15])));
3785
51.1k
            E1h = _mm_madd_epi16(m128Tmp3,
3786
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[1][15])));
3787
51.1k
            E2l = _mm_madd_epi16(m128Tmp4,
3788
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][15])));
3789
51.1k
            E2h = _mm_madd_epi16(m128Tmp5,
3790
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[2][15])));
3791
51.1k
            E3l = _mm_madd_epi16(m128Tmp6,
3792
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][15])));
3793
51.1k
            E3h = _mm_madd_epi16(m128Tmp7,
3794
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[3][15])));
3795
3796
51.1k
            E4l = _mm_madd_epi16(m128Tmp8,
3797
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][15])));
3798
51.1k
            E4h = _mm_madd_epi16(m128Tmp9,
3799
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[4][15])));
3800
51.1k
            E5l = _mm_madd_epi16(m128Tmp10,
3801
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][15])));
3802
51.1k
            E5h = _mm_madd_epi16(m128Tmp11,
3803
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[5][15])));
3804
51.1k
            E6l = _mm_madd_epi16(m128Tmp12,
3805
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][15])));
3806
51.1k
            E6h = _mm_madd_epi16(m128Tmp13,
3807
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[6][15])));
3808
51.1k
            E7l = _mm_madd_epi16(m128Tmp14,
3809
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][15])));
3810
51.1k
            E7h = _mm_madd_epi16(m128Tmp15,
3811
51.1k
                    _mm_load_si128((__m128i *) (transform32x32[7][15])));
3812
3813
51.1k
            O15l = _mm_add_epi32(E0l, E1l);
3814
51.1k
            O15l = _mm_add_epi32(O15l, E2l);
3815
51.1k
            O15l = _mm_add_epi32(O15l, E3l);
3816
51.1k
            O15l = _mm_add_epi32(O15l, E4l);
3817
51.1k
            O15l = _mm_add_epi32(O15l, E5l);
3818
51.1k
            O15l = _mm_add_epi32(O15l, E6l);
3819
51.1k
            O15l = _mm_add_epi32(O15l, E7l);
3820
3821
51.1k
            O15h = _mm_add_epi32(E0h, E1h);
3822
51.1k
            O15h = _mm_add_epi32(O15h, E2h);
3823
51.1k
            O15h = _mm_add_epi32(O15h, E3h);
3824
51.1k
            O15h = _mm_add_epi32(O15h, E4h);
3825
51.1k
            O15h = _mm_add_epi32(O15h, E5h);
3826
51.1k
            O15h = _mm_add_epi32(O15h, E6h);
3827
51.1k
            O15h = _mm_add_epi32(O15h, E7h);
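[Editor's note] The O5 .. O15 blocks above all follow one pattern: every odd output row is a 16-term dot product of the odd-indexed input rows with one column of constants, accumulated two terms at a time by _mm_madd_epi16 and summed with _mm_add_epi32. A minimal scalar sketch of that accumulation for a single column, using the hypothetical names src and odd_mat for data the kernel actually keeps pre-interleaved in registers and in transform32x32:

#include <stdint.h>

/* Editor's sketch, not part of the original file. */
static void idct32_odd_part_sketch(const int16_t src[32],         /* one column of coefficients */
                                   const int16_t odd_mat[16][16], /* hypothetical flat layout of the odd constants */
                                   int32_t O[16])
{
    for (int k = 0; k < 16; k++) {
        int32_t acc = 0;
        for (int n = 0; n < 16; n++)
            acc += src[2 * n + 1] * odd_mat[n][k]; /* one half of a _mm_madd_epi16 pair */
        O[k] = acc;                                /* corresponds to Okl / Okh above */
    }
}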
3828
            /*  Compute E0  */
3829
3830
51.1k
            m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
3831
51.1k
            E0l = _mm_madd_epi16(m128Tmp0,
3832
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3833
51.1k
            m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
3834
51.1k
            E0h = _mm_madd_epi16(m128Tmp1,
3835
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3836
3837
51.1k
            m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
3838
51.1k
            E0l = _mm_add_epi32(E0l,
3839
51.1k
                    _mm_madd_epi16(m128Tmp2,
3840
51.1k
                            _mm_load_si128(
3841
51.1k
                                    (__m128i *) (transform16x16_1[1][0]))));
3842
51.1k
            m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
3843
51.1k
            E0h = _mm_add_epi32(E0h,
3844
51.1k
                    _mm_madd_epi16(m128Tmp3,
3845
51.1k
                            _mm_load_si128(
3846
51.1k
                                    (__m128i *) (transform16x16_1[1][0]))));
3847
3848
51.1k
            m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
3849
51.1k
            E0l = _mm_add_epi32(E0l,
3850
51.1k
                    _mm_madd_epi16(m128Tmp4,
3851
51.1k
                            _mm_load_si128(
3852
51.1k
                                    (__m128i *) (transform16x16_1[2][0]))));
3853
51.1k
            m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
3854
51.1k
            E0h = _mm_add_epi32(E0h,
3855
51.1k
                    _mm_madd_epi16(m128Tmp5,
3856
51.1k
                            _mm_load_si128(
3857
51.1k
                                    (__m128i *) (transform16x16_1[2][0]))));
3858
3859
51.1k
            m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
3860
51.1k
            E0l = _mm_add_epi32(E0l,
3861
51.1k
                    _mm_madd_epi16(m128Tmp6,
3862
51.1k
                            _mm_load_si128(
3863
51.1k
                                    (__m128i *) (transform16x16_1[3][0]))));
3864
51.1k
            m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
3865
51.1k
            E0h = _mm_add_epi32(E0h,
3866
51.1k
                    _mm_madd_epi16(m128Tmp7,
3867
51.1k
                            _mm_load_si128(
3868
51.1k
                                    (__m128i *) (transform16x16_1[3][0]))));
3869
3870
            /*  Compute E1  */
3871
51.1k
            E1l = _mm_madd_epi16(m128Tmp0,
3872
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3873
51.1k
            E1h = _mm_madd_epi16(m128Tmp1,
3874
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3875
51.1k
            E1l = _mm_add_epi32(E1l,
3876
51.1k
                    _mm_madd_epi16(m128Tmp2,
3877
51.1k
                            _mm_load_si128(
3878
51.1k
                                    (__m128i *) (transform16x16_1[1][1]))));
3879
51.1k
            E1h = _mm_add_epi32(E1h,
3880
51.1k
                    _mm_madd_epi16(m128Tmp3,
3881
51.1k
                            _mm_load_si128(
3882
51.1k
                                    (__m128i *) (transform16x16_1[1][1]))));
3883
51.1k
            E1l = _mm_add_epi32(E1l,
3884
51.1k
                    _mm_madd_epi16(m128Tmp4,
3885
51.1k
                            _mm_load_si128(
3886
51.1k
                                    (__m128i *) (transform16x16_1[2][1]))));
3887
51.1k
            E1h = _mm_add_epi32(E1h,
3888
51.1k
                    _mm_madd_epi16(m128Tmp5,
3889
51.1k
                            _mm_load_si128(
3890
51.1k
                                    (__m128i *) (transform16x16_1[2][1]))));
3891
51.1k
            E1l = _mm_add_epi32(E1l,
3892
51.1k
                    _mm_madd_epi16(m128Tmp6,
3893
51.1k
                            _mm_load_si128(
3894
51.1k
                                    (__m128i *) (transform16x16_1[3][1]))));
3895
51.1k
            E1h = _mm_add_epi32(E1h,
3896
51.1k
                    _mm_madd_epi16(m128Tmp7,
3897
51.1k
                            _mm_load_si128(
3898
51.1k
                                    (__m128i *) (transform16x16_1[3][1]))));
3899
3900
            /*  Compute E2  */
3901
51.1k
            E2l = _mm_madd_epi16(m128Tmp0,
3902
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3903
51.1k
            E2h = _mm_madd_epi16(m128Tmp1,
3904
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3905
51.1k
            E2l = _mm_add_epi32(E2l,
3906
51.1k
                    _mm_madd_epi16(m128Tmp2,
3907
51.1k
                            _mm_load_si128(
3908
51.1k
                                    (__m128i *) (transform16x16_1[1][2]))));
3909
51.1k
            E2h = _mm_add_epi32(E2h,
3910
51.1k
                    _mm_madd_epi16(m128Tmp3,
3911
51.1k
                            _mm_load_si128(
3912
51.1k
                                    (__m128i *) (transform16x16_1[1][2]))));
3913
51.1k
            E2l = _mm_add_epi32(E2l,
3914
51.1k
                    _mm_madd_epi16(m128Tmp4,
3915
51.1k
                            _mm_load_si128(
3916
51.1k
                                    (__m128i *) (transform16x16_1[2][2]))));
3917
51.1k
            E2h = _mm_add_epi32(E2h,
3918
51.1k
                    _mm_madd_epi16(m128Tmp5,
3919
51.1k
                            _mm_load_si128(
3920
51.1k
                                    (__m128i *) (transform16x16_1[2][2]))));
3921
51.1k
            E2l = _mm_add_epi32(E2l,
3922
51.1k
                    _mm_madd_epi16(m128Tmp6,
3923
51.1k
                            _mm_load_si128(
3924
51.1k
                                    (__m128i *) (transform16x16_1[3][2]))));
3925
51.1k
            E2h = _mm_add_epi32(E2h,
3926
51.1k
                    _mm_madd_epi16(m128Tmp7,
3927
51.1k
                            _mm_load_si128(
3928
51.1k
                                    (__m128i *) (transform16x16_1[3][2]))));
3929
3930
            /*  Compute E3  */
3931
51.1k
            E3l = _mm_madd_epi16(m128Tmp0,
3932
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3933
51.1k
            E3h = _mm_madd_epi16(m128Tmp1,
3934
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3935
51.1k
            E3l = _mm_add_epi32(E3l,
3936
51.1k
                    _mm_madd_epi16(m128Tmp2,
3937
51.1k
                            _mm_load_si128(
3938
51.1k
                                    (__m128i *) (transform16x16_1[1][3]))));
3939
51.1k
            E3h = _mm_add_epi32(E3h,
3940
51.1k
                    _mm_madd_epi16(m128Tmp3,
3941
51.1k
                            _mm_load_si128(
3942
51.1k
                                    (__m128i *) (transform16x16_1[1][3]))));
3943
51.1k
            E3l = _mm_add_epi32(E3l,
3944
51.1k
                    _mm_madd_epi16(m128Tmp4,
3945
51.1k
                            _mm_load_si128(
3946
51.1k
                                    (__m128i *) (transform16x16_1[2][3]))));
3947
51.1k
            E3h = _mm_add_epi32(E3h,
3948
51.1k
                    _mm_madd_epi16(m128Tmp5,
3949
51.1k
                            _mm_load_si128(
3950
51.1k
                                    (__m128i *) (transform16x16_1[2][3]))));
3951
51.1k
            E3l = _mm_add_epi32(E3l,
3952
51.1k
                    _mm_madd_epi16(m128Tmp6,
3953
51.1k
                            _mm_load_si128(
3954
51.1k
                                    (__m128i *) (transform16x16_1[3][3]))));
3955
51.1k
            E3h = _mm_add_epi32(E3h,
3956
51.1k
                    _mm_madd_epi16(m128Tmp7,
3957
51.1k
                            _mm_load_si128(
3958
51.1k
                                    (__m128i *) (transform16x16_1[3][3]))));
3959
3960
            /*  Compute E4  */
3961
51.1k
            E4l = _mm_madd_epi16(m128Tmp0,
3962
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3963
51.1k
            E4h = _mm_madd_epi16(m128Tmp1,
3964
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3965
51.1k
            E4l = _mm_add_epi32(E4l,
3966
51.1k
                    _mm_madd_epi16(m128Tmp2,
3967
51.1k
                            _mm_load_si128(
3968
51.1k
                                    (__m128i *) (transform16x16_1[1][4]))));
3969
51.1k
            E4h = _mm_add_epi32(E4h,
3970
51.1k
                    _mm_madd_epi16(m128Tmp3,
3971
51.1k
                            _mm_load_si128(
3972
51.1k
                                    (__m128i *) (transform16x16_1[1][4]))));
3973
51.1k
            E4l = _mm_add_epi32(E4l,
3974
51.1k
                    _mm_madd_epi16(m128Tmp4,
3975
51.1k
                            _mm_load_si128(
3976
51.1k
                                    (__m128i *) (transform16x16_1[2][4]))));
3977
51.1k
            E4h = _mm_add_epi32(E4h,
3978
51.1k
                    _mm_madd_epi16(m128Tmp5,
3979
51.1k
                            _mm_load_si128(
3980
51.1k
                                    (__m128i *) (transform16x16_1[2][4]))));
3981
51.1k
            E4l = _mm_add_epi32(E4l,
3982
51.1k
                    _mm_madd_epi16(m128Tmp6,
3983
51.1k
                            _mm_load_si128(
3984
51.1k
                                    (__m128i *) (transform16x16_1[3][4]))));
3985
51.1k
            E4h = _mm_add_epi32(E4h,
3986
51.1k
                    _mm_madd_epi16(m128Tmp7,
3987
51.1k
                            _mm_load_si128(
3988
51.1k
                                    (__m128i *) (transform16x16_1[3][4]))));
3989
3990
            /*  Compute E5  */
3991
51.1k
            E5l = _mm_madd_epi16(m128Tmp0,
3992
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
3993
51.1k
            E5h = _mm_madd_epi16(m128Tmp1,
3994
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
3995
51.1k
            E5l = _mm_add_epi32(E5l,
3996
51.1k
                    _mm_madd_epi16(m128Tmp2,
3997
51.1k
                            _mm_load_si128(
3998
51.1k
                                    (__m128i *) (transform16x16_1[1][5]))));
3999
51.1k
            E5h = _mm_add_epi32(E5h,
4000
51.1k
                    _mm_madd_epi16(m128Tmp3,
4001
51.1k
                            _mm_load_si128(
4002
51.1k
                                    (__m128i *) (transform16x16_1[1][5]))));
4003
51.1k
            E5l = _mm_add_epi32(E5l,
4004
51.1k
                    _mm_madd_epi16(m128Tmp4,
4005
51.1k
                            _mm_load_si128(
4006
51.1k
                                    (__m128i *) (transform16x16_1[2][5]))));
4007
51.1k
            E5h = _mm_add_epi32(E5h,
4008
51.1k
                    _mm_madd_epi16(m128Tmp5,
4009
51.1k
                            _mm_load_si128(
4010
51.1k
                                    (__m128i *) (transform16x16_1[2][5]))));
4011
51.1k
            E5l = _mm_add_epi32(E5l,
4012
51.1k
                    _mm_madd_epi16(m128Tmp6,
4013
51.1k
                            _mm_load_si128(
4014
51.1k
                                    (__m128i *) (transform16x16_1[3][5]))));
4015
51.1k
            E5h = _mm_add_epi32(E5h,
4016
51.1k
                    _mm_madd_epi16(m128Tmp7,
4017
51.1k
                            _mm_load_si128(
4018
51.1k
                                    (__m128i *) (transform16x16_1[3][5]))));
4019
4020
            /*  Compute E6  */
4021
51.1k
            E6l = _mm_madd_epi16(m128Tmp0,
4022
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4023
51.1k
            E6h = _mm_madd_epi16(m128Tmp1,
4024
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4025
51.1k
            E6l = _mm_add_epi32(E6l,
4026
51.1k
                    _mm_madd_epi16(m128Tmp2,
4027
51.1k
                            _mm_load_si128(
4028
51.1k
                                    (__m128i *) (transform16x16_1[1][6]))));
4029
51.1k
            E6h = _mm_add_epi32(E6h,
4030
51.1k
                    _mm_madd_epi16(m128Tmp3,
4031
51.1k
                            _mm_load_si128(
4032
51.1k
                                    (__m128i *) (transform16x16_1[1][6]))));
4033
51.1k
            E6l = _mm_add_epi32(E6l,
4034
51.1k
                    _mm_madd_epi16(m128Tmp4,
4035
51.1k
                            _mm_load_si128(
4036
51.1k
                                    (__m128i *) (transform16x16_1[2][6]))));
4037
51.1k
            E6h = _mm_add_epi32(E6h,
4038
51.1k
                    _mm_madd_epi16(m128Tmp5,
4039
51.1k
                            _mm_load_si128(
4040
51.1k
                                    (__m128i *) (transform16x16_1[2][6]))));
4041
51.1k
            E6l = _mm_add_epi32(E6l,
4042
51.1k
                    _mm_madd_epi16(m128Tmp6,
4043
51.1k
                            _mm_load_si128(
4044
51.1k
                                    (__m128i *) (transform16x16_1[3][6]))));
4045
51.1k
            E6h = _mm_add_epi32(E6h,
4046
51.1k
                    _mm_madd_epi16(m128Tmp7,
4047
51.1k
                            _mm_load_si128(
4048
51.1k
                                    (__m128i *) (transform16x16_1[3][6]))));
4049
4050
            /*  Compute E7  */
4051
51.1k
            E7l = _mm_madd_epi16(m128Tmp0,
4052
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4053
51.1k
            E7h = _mm_madd_epi16(m128Tmp1,
4054
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4055
51.1k
            E7l = _mm_add_epi32(E7l,
4056
51.1k
                    _mm_madd_epi16(m128Tmp2,
4057
51.1k
                            _mm_load_si128(
4058
51.1k
                                    (__m128i *) (transform16x16_1[1][7]))));
4059
51.1k
            E7h = _mm_add_epi32(E7h,
4060
51.1k
                    _mm_madd_epi16(m128Tmp3,
4061
51.1k
                            _mm_load_si128(
4062
51.1k
                                    (__m128i *) (transform16x16_1[1][7]))));
4063
51.1k
            E7l = _mm_add_epi32(E7l,
4064
51.1k
                    _mm_madd_epi16(m128Tmp4,
4065
51.1k
                            _mm_load_si128(
4066
51.1k
                                    (__m128i *) (transform16x16_1[2][7]))));
4067
51.1k
            E7h = _mm_add_epi32(E7h,
4068
51.1k
                    _mm_madd_epi16(m128Tmp5,
4069
51.1k
                            _mm_load_si128(
4070
51.1k
                                    (__m128i *) (transform16x16_1[2][7]))));
4071
51.1k
            E7l = _mm_add_epi32(E7l,
4072
51.1k
                    _mm_madd_epi16(m128Tmp6,
4073
51.1k
                            _mm_load_si128(
4074
51.1k
                                    (__m128i *) (transform16x16_1[3][7]))));
4075
51.1k
            E7h = _mm_add_epi32(E7h,
4076
51.1k
                    _mm_madd_epi16(m128Tmp7,
4077
51.1k
                            _mm_load_si128(
4078
51.1k
                                    (__m128i *) (transform16x16_1[3][7]))));
4079
4080
            /*  Compute E00 .. E03  */
4081
4082
51.1k
            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
4083
51.1k
            E00l = _mm_madd_epi16(m128Tmp0,
4084
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4085
51.1k
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
4086
51.1k
            E00h = _mm_madd_epi16(m128Tmp1,
4087
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4088
4089
51.1k
            m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
4090
51.1k
            E00l = _mm_add_epi32(E00l,
4091
51.1k
                    _mm_madd_epi16(m128Tmp2,
4092
51.1k
                            _mm_load_si128(
4093
51.1k
                                    (__m128i *) (transform16x16_2[1][0]))));
4094
51.1k
            m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
4095
51.1k
            E00h = _mm_add_epi32(E00h,
4096
51.1k
                    _mm_madd_epi16(m128Tmp3,
4097
51.1k
                            _mm_load_si128(
4098
51.1k
                                    (__m128i *) (transform16x16_2[1][0]))));
4099
4100
51.1k
            E01l = _mm_madd_epi16(m128Tmp0,
4101
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4102
51.1k
            E01h = _mm_madd_epi16(m128Tmp1,
4103
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4104
51.1k
            E01l = _mm_add_epi32(E01l,
4105
51.1k
                    _mm_madd_epi16(m128Tmp2,
4106
51.1k
                            _mm_load_si128(
4107
51.1k
                                    (__m128i *) (transform16x16_2[1][1]))));
4108
51.1k
            E01h = _mm_add_epi32(E01h,
4109
51.1k
                    _mm_madd_epi16(m128Tmp3,
4110
51.1k
                            _mm_load_si128(
4111
51.1k
                                    (__m128i *) (transform16x16_2[1][1]))));
4112
4113
51.1k
            E02l = _mm_madd_epi16(m128Tmp0,
4114
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4115
51.1k
            E02h = _mm_madd_epi16(m128Tmp1,
4116
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4117
51.1k
            E02l = _mm_add_epi32(E02l,
4118
51.1k
                    _mm_madd_epi16(m128Tmp2,
4119
51.1k
                            _mm_load_si128(
4120
51.1k
                                    (__m128i *) (transform16x16_2[1][2]))));
4121
51.1k
            E02h = _mm_add_epi32(E02h,
4122
51.1k
                    _mm_madd_epi16(m128Tmp3,
4123
51.1k
                            _mm_load_si128(
4124
51.1k
                                    (__m128i *) (transform16x16_2[1][2]))));
4125
4126
51.1k
            E03l = _mm_madd_epi16(m128Tmp0,
4127
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4128
51.1k
            E03h = _mm_madd_epi16(m128Tmp1,
4129
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4130
51.1k
            E03l = _mm_add_epi32(E03l,
4131
51.1k
                    _mm_madd_epi16(m128Tmp2,
4132
51.1k
                            _mm_load_si128(
4133
51.1k
                                    (__m128i *) (transform16x16_2[1][3]))));
4134
51.1k
            E03h = _mm_add_epi32(E03h,
4135
51.1k
                    _mm_madd_epi16(m128Tmp3,
4136
51.1k
                            _mm_load_si128(
4137
51.1k
                                    (__m128i *) (transform16x16_2[1][3]))));
4138
4139
            /*  Compute EE0, EE1 and EEE0, EEE1  */
4140
4141
51.1k
            m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
4142
51.1k
            EE0l = _mm_madd_epi16(m128Tmp0,
4143
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4144
51.1k
            m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
4145
51.1k
            EE0h = _mm_madd_epi16(m128Tmp1,
4146
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4147
4148
51.1k
            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
4149
51.1k
            EEE0l = _mm_madd_epi16(m128Tmp2,
4150
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4151
51.1k
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
4152
51.1k
            EEE0h = _mm_madd_epi16(m128Tmp3,
4153
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4154
4155
51.1k
            EE1l = _mm_madd_epi16(m128Tmp0,
4156
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4157
51.1k
            EE1h = _mm_madd_epi16(m128Tmp1,
4158
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4159
4160
51.1k
            EEE1l = _mm_madd_epi16(m128Tmp2,
4161
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4162
51.1k
            EEE1h = _mm_madd_epi16(m128Tmp3,
4163
51.1k
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4164
4165
            /*  Compute EE    */
4166
4167
51.1k
            EE2l = _mm_sub_epi32(EEE1l, EE1l);
4168
51.1k
            EE3l = _mm_sub_epi32(EEE0l, EE0l);
4169
51.1k
            EE2h = _mm_sub_epi32(EEE1h, EE1h);
4170
51.1k
            EE3h = _mm_sub_epi32(EEE0h, EE0h);
4171
4172
51.1k
            EE0l = _mm_add_epi32(EEE0l, EE0l);
4173
51.1k
            EE1l = _mm_add_epi32(EEE1l, EE1l);
4174
51.1k
            EE0h = _mm_add_epi32(EEE0h, EE0h);
4175
51.1k
            EE1h = _mm_add_epi32(EEE1h, EE1h);
4176
            /*  Fold E00 .. E03 into EE0 .. EE7  */
4177
4178
51.1k
            EE7l = _mm_sub_epi32(EE0l, E00l);
4179
51.1k
            EE6l = _mm_sub_epi32(EE1l, E01l);
4180
51.1k
            EE5l = _mm_sub_epi32(EE2l, E02l);
4181
51.1k
            EE4l = _mm_sub_epi32(EE3l, E03l);
4182
4183
51.1k
            EE7h = _mm_sub_epi32(EE0h, E00h);
4184
51.1k
            EE6h = _mm_sub_epi32(EE1h, E01h);
4185
51.1k
            EE5h = _mm_sub_epi32(EE2h, E02h);
4186
51.1k
            EE4h = _mm_sub_epi32(EE3h, E03h);
4187
4188
51.1k
            EE0l = _mm_add_epi32(EE0l, E00l);
4189
51.1k
            EE1l = _mm_add_epi32(EE1l, E01l);
4190
51.1k
            EE2l = _mm_add_epi32(EE2l, E02l);
4191
51.1k
            EE3l = _mm_add_epi32(EE3l, E03l);
4192
4193
51.1k
            EE0h = _mm_add_epi32(EE0h, E00h);
4194
51.1k
            EE1h = _mm_add_epi32(EE1h, E01h);
4195
51.1k
            EE2h = _mm_add_epi32(EE2h, E02h);
4196
51.1k
            EE3h = _mm_add_epi32(EE3h, E03h);
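[Editor's note] At this point the 8-point even part is complete. It is built recursively: EEE0/EEE1 come from input rows 0 and 16, EE0/EE1 from rows 8 and 24, and E00 .. E03 from rows 4, 12, 20 and 28; the two butterflies above fold them into EE0 .. EE7. A scalar sketch of that combination, with hypothetical array names for the per-column values:

#include <stdint.h>

/* Editor's sketch, not part of the original file. */
static void even_part_combine_sketch(const int32_t eee[2],  /* EEE0, EEE1             */
                                     const int32_t eeh[2],  /* EE0, EE1 (rows 8, 24)  */
                                     const int32_t e0x[4],  /* E00 .. E03             */
                                     int32_t ee[8])         /* EE0 .. EE7             */
{
    int32_t ee4[4];
    ee4[0] = eee[0] + eeh[0];
    ee4[1] = eee[1] + eeh[1];
    ee4[2] = eee[1] - eeh[1];
    ee4[3] = eee[0] - eeh[0];
    for (int k = 0; k < 4; k++) {
        ee[k]     = ee4[k] + e0x[k];
        ee[7 - k] = ee4[k] - e0x[k];
    }
}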
4197
            /*      Compute E       */
4198
4199
51.1k
            E15l = _mm_sub_epi32(EE0l, E0l);
4200
51.1k
            E15l = _mm_add_epi32(E15l, m128iAdd);
4201
51.1k
            E14l = _mm_sub_epi32(EE1l, E1l);
4202
51.1k
            E14l = _mm_add_epi32(E14l, m128iAdd);
4203
51.1k
            E13l = _mm_sub_epi32(EE2l, E2l);
4204
51.1k
            E13l = _mm_add_epi32(E13l, m128iAdd);
4205
51.1k
            E12l = _mm_sub_epi32(EE3l, E3l);
4206
51.1k
            E12l = _mm_add_epi32(E12l, m128iAdd);
4207
51.1k
            E11l = _mm_sub_epi32(EE4l, E4l);
4208
51.1k
            E11l = _mm_add_epi32(E11l, m128iAdd);
4209
51.1k
            E10l = _mm_sub_epi32(EE5l, E5l);
4210
51.1k
            E10l = _mm_add_epi32(E10l, m128iAdd);
4211
51.1k
            E9l = _mm_sub_epi32(EE6l, E6l);
4212
51.1k
            E9l = _mm_add_epi32(E9l, m128iAdd);
4213
51.1k
            E8l = _mm_sub_epi32(EE7l, E7l);
4214
51.1k
            E8l = _mm_add_epi32(E8l, m128iAdd);
4215
4216
51.1k
            E0l = _mm_add_epi32(EE0l, E0l);
4217
51.1k
            E0l = _mm_add_epi32(E0l, m128iAdd);
4218
51.1k
            E1l = _mm_add_epi32(EE1l, E1l);
4219
51.1k
            E1l = _mm_add_epi32(E1l, m128iAdd);
4220
51.1k
            E2l = _mm_add_epi32(EE2l, E2l);
4221
51.1k
            E2l = _mm_add_epi32(E2l, m128iAdd);
4222
51.1k
            E3l = _mm_add_epi32(EE3l, E3l);
4223
51.1k
            E3l = _mm_add_epi32(E3l, m128iAdd);
4224
51.1k
            E4l = _mm_add_epi32(EE4l, E4l);
4225
51.1k
            E4l = _mm_add_epi32(E4l, m128iAdd);
4226
51.1k
            E5l = _mm_add_epi32(EE5l, E5l);
4227
51.1k
            E5l = _mm_add_epi32(E5l, m128iAdd);
4228
51.1k
            E6l = _mm_add_epi32(EE6l, E6l);
4229
51.1k
            E6l = _mm_add_epi32(E6l, m128iAdd);
4230
51.1k
            E7l = _mm_add_epi32(EE7l, E7l);
4231
51.1k
            E7l = _mm_add_epi32(E7l, m128iAdd);
4232
4233
51.1k
            E15h = _mm_sub_epi32(EE0h, E0h);
4234
51.1k
            E15h = _mm_add_epi32(E15h, m128iAdd);
4235
51.1k
            E14h = _mm_sub_epi32(EE1h, E1h);
4236
51.1k
            E14h = _mm_add_epi32(E14h, m128iAdd);
4237
51.1k
            E13h = _mm_sub_epi32(EE2h, E2h);
4238
51.1k
            E13h = _mm_add_epi32(E13h, m128iAdd);
4239
51.1k
            E12h = _mm_sub_epi32(EE3h, E3h);
4240
51.1k
            E12h = _mm_add_epi32(E12h, m128iAdd);
4241
51.1k
            E11h = _mm_sub_epi32(EE4h, E4h);
4242
51.1k
            E11h = _mm_add_epi32(E11h, m128iAdd);
4243
51.1k
            E10h = _mm_sub_epi32(EE5h, E5h);
4244
51.1k
            E10h = _mm_add_epi32(E10h, m128iAdd);
4245
51.1k
            E9h = _mm_sub_epi32(EE6h, E6h);
4246
51.1k
            E9h = _mm_add_epi32(E9h, m128iAdd);
4247
51.1k
            E8h = _mm_sub_epi32(EE7h, E7h);
4248
51.1k
            E8h = _mm_add_epi32(E8h, m128iAdd);
4249
4250
51.1k
            E0h = _mm_add_epi32(EE0h, E0h);
4251
51.1k
            E0h = _mm_add_epi32(E0h, m128iAdd);
4252
51.1k
            E1h = _mm_add_epi32(EE1h, E1h);
4253
51.1k
            E1h = _mm_add_epi32(E1h, m128iAdd);
4254
51.1k
            E2h = _mm_add_epi32(EE2h, E2h);
4255
51.1k
            E2h = _mm_add_epi32(E2h, m128iAdd);
4256
51.1k
            E3h = _mm_add_epi32(EE3h, E3h);
4257
51.1k
            E3h = _mm_add_epi32(E3h, m128iAdd);
4258
51.1k
            E4h = _mm_add_epi32(EE4h, E4h);
4259
51.1k
            E4h = _mm_add_epi32(E4h, m128iAdd);
4260
51.1k
            E5h = _mm_add_epi32(EE5h, E5h);
4261
51.1k
            E5h = _mm_add_epi32(E5h, m128iAdd);
4262
51.1k
            E6h = _mm_add_epi32(EE6h, E6h);
4263
51.1k
            E6h = _mm_add_epi32(E6h, m128iAdd);
4264
51.1k
            E7h = _mm_add_epi32(EE7h, E7h);
4265
51.1k
            E7h = _mm_add_epi32(E7h, m128iAdd);
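[Editor's note] The final even-part butterfly merges EE0 .. EE7 with the E0 .. E7 accumulators computed from rows 2, 6, ..., 30, and adds the rounding offset held in m128iAdd (typically 1 << (shift - 1)) at the same time. In scalar form, per column and with hypothetical names:

#include <stdint.h>

/* Editor's sketch, not part of the original file. */
static void even_part_finish_sketch(const int32_t ee[8],  /* EE0 .. EE7                 */
                                    const int32_t eo[8],  /* E0 .. E7 (rows 2, 6, ..30) */
                                    int32_t add,          /* rounding offset            */
                                    int32_t E[16])
{
    for (int k = 0; k < 8; k++) {
        E[k]      = ee[k] + eo[k] + add;
        E[15 - k] = ee[k] - eo[k] + add;
    }
}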
4266
4267
51.1k
            m128iS0 = _mm_packs_epi32(
4268
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
4269
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
4270
51.1k
            m128iS1 = _mm_packs_epi32(
4271
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
4272
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
4273
51.1k
            m128iS2 = _mm_packs_epi32(
4274
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
4275
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
4276
51.1k
            m128iS3 = _mm_packs_epi32(
4277
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
4278
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
4279
51.1k
            m128iS4 = _mm_packs_epi32(
4280
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
4281
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
4282
51.1k
            m128iS5 = _mm_packs_epi32(
4283
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
4284
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
4285
51.1k
            m128iS6 = _mm_packs_epi32(
4286
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
4287
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
4288
51.1k
            m128iS7 = _mm_packs_epi32(
4289
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
4290
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
4291
51.1k
            m128iS8 = _mm_packs_epi32(
4292
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
4293
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
4294
51.1k
            m128iS9 = _mm_packs_epi32(
4295
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
4296
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
4297
51.1k
            m128iS10 = _mm_packs_epi32(
4298
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
4299
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
4300
51.1k
            m128iS11 = _mm_packs_epi32(
4301
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
4302
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
4303
51.1k
            m128iS12 = _mm_packs_epi32(
4304
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
4305
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
4306
51.1k
            m128iS13 = _mm_packs_epi32(
4307
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
4308
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
4309
51.1k
            m128iS14 = _mm_packs_epi32(
4310
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
4311
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
4312
51.1k
            m128iS15 = _mm_packs_epi32(
4313
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
4314
51.1k
                    _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
4315
4316
51.1k
            m128iS31 = _mm_packs_epi32(
4317
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
4318
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
4319
51.1k
            m128iS30 = _mm_packs_epi32(
4320
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
4321
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
4322
51.1k
            m128iS29 = _mm_packs_epi32(
4323
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
4324
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
4325
51.1k
            m128iS28 = _mm_packs_epi32(
4326
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
4327
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
4328
51.1k
            m128iS27 = _mm_packs_epi32(
4329
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
4330
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
4331
51.1k
            m128iS26 = _mm_packs_epi32(
4332
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
4333
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
4334
51.1k
            m128iS25 = _mm_packs_epi32(
4335
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
4336
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
4337
51.1k
            m128iS24 = _mm_packs_epi32(
4338
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
4339
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
4340
51.1k
            m128iS23 = _mm_packs_epi32(
4341
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
4342
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
4343
51.1k
            m128iS22 = _mm_packs_epi32(
4344
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
4345
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
4346
51.1k
            m128iS21 = _mm_packs_epi32(
4347
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
4348
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
4349
51.1k
            m128iS20 = _mm_packs_epi32(
4350
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
4351
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
4352
51.1k
            m128iS19 = _mm_packs_epi32(
4353
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
4354
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
4355
51.1k
            m128iS18 = _mm_packs_epi32(
4356
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
4357
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
4358
51.1k
            m128iS17 = _mm_packs_epi32(
4359
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
4360
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
4361
51.1k
            m128iS16 = _mm_packs_epi32(
4362
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
4363
51.1k
                    _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
4364
4365
51.1k
            if (!j) {
4366
                /*      Inverse the matrix      */
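                /* Note: "inverse" here means transpose. The 32x8 block of
                 * 16-bit first-pass results held in m128iS0..m128iS31 is
                 * re-interleaved with unpacklo/unpackhi_epi16 so that the
                 * second pass can run along the other dimension. */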
4367
25.5k
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
4368
25.5k
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
4369
25.5k
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
4370
25.5k
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
4371
25.5k
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
4372
25.5k
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
4373
25.5k
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
4374
25.5k
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
4375
25.5k
                E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
4376
25.5k
                E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
4377
25.5k
                E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
4378
25.5k
                E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
4379
25.5k
                E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
4380
25.5k
                E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
4381
25.5k
                E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
4382
25.5k
                E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
4383
4384
25.5k
                O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
4385
25.5k
                O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
4386
25.5k
                O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
4387
25.5k
                O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
4388
25.5k
                O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
4389
25.5k
                O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
4390
25.5k
                O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
4391
25.5k
                O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
4392
25.5k
                O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
4393
25.5k
                O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
4394
25.5k
                O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
4395
25.5k
                O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
4396
25.5k
                O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
4397
25.5k
                O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
4398
25.5k
                O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
4399
25.5k
                O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
4400
4401
25.5k
                E0h = _mm_unpacklo_epi16(E0l, E8l);
4402
25.5k
                E1h = _mm_unpacklo_epi16(E1l, E9l);
4403
25.5k
                E2h = _mm_unpacklo_epi16(E2l, E10l);
4404
25.5k
                E3h = _mm_unpacklo_epi16(E3l, E11l);
4405
25.5k
                E4h = _mm_unpacklo_epi16(E4l, E12l);
4406
25.5k
                E5h = _mm_unpacklo_epi16(E5l, E13l);
4407
25.5k
                E6h = _mm_unpacklo_epi16(E6l, E14l);
4408
25.5k
                E7h = _mm_unpacklo_epi16(E7l, E15l);
4409
4410
25.5k
                E8h = _mm_unpackhi_epi16(E0l, E8l);
4411
25.5k
                E9h = _mm_unpackhi_epi16(E1l, E9l);
4412
25.5k
                E10h = _mm_unpackhi_epi16(E2l, E10l);
4413
25.5k
                E11h = _mm_unpackhi_epi16(E3l, E11l);
4414
25.5k
                E12h = _mm_unpackhi_epi16(E4l, E12l);
4415
25.5k
                E13h = _mm_unpackhi_epi16(E5l, E13l);
4416
25.5k
                E14h = _mm_unpackhi_epi16(E6l, E14l);
4417
25.5k
                E15h = _mm_unpackhi_epi16(E7l, E15l);
4418
4419
25.5k
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4420
25.5k
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4421
25.5k
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4422
25.5k
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4423
4424
25.5k
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4425
25.5k
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4426
25.5k
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4427
25.5k
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4428
4429
25.5k
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4430
25.5k
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4431
25.5k
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4432
25.5k
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4433
4434
25.5k
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4435
25.5k
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4436
25.5k
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4437
25.5k
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4438
4439
25.5k
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4440
25.5k
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4441
25.5k
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4442
25.5k
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4443
4444
25.5k
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4445
25.5k
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4446
25.5k
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4447
25.5k
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4448
4449
25.5k
                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
4450
25.5k
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
4451
25.5k
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
4452
25.5k
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
4453
4454
25.5k
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4455
25.5k
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4456
25.5k
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4457
25.5k
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4458
4459
25.5k
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4460
25.5k
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4461
25.5k
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4462
25.5k
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4463
4464
25.5k
                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
4465
25.5k
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
4466
25.5k
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
4467
25.5k
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
4468
4469
25.5k
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4470
25.5k
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4471
25.5k
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4472
25.5k
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4473
4474
25.5k
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4475
25.5k
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4476
25.5k
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4477
25.5k
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4478
4479
                /* repeat the interleave steps for the high halves
                   (O0l..O15l), filling m128iS16..m128iS31 */
4480
25.5k
                E0h = _mm_unpacklo_epi16(O0l, O8l);
4481
25.5k
                E1h = _mm_unpacklo_epi16(O1l, O9l);
4482
25.5k
                E2h = _mm_unpacklo_epi16(O2l, O10l);
4483
25.5k
                E3h = _mm_unpacklo_epi16(O3l, O11l);
4484
25.5k
                E4h = _mm_unpacklo_epi16(O4l, O12l);
4485
25.5k
                E5h = _mm_unpacklo_epi16(O5l, O13l);
4486
25.5k
                E6h = _mm_unpacklo_epi16(O6l, O14l);
4487
25.5k
                E7h = _mm_unpacklo_epi16(O7l, O15l);
4488
4489
25.5k
                E8h = _mm_unpackhi_epi16(O0l, O8l);
4490
25.5k
                E9h = _mm_unpackhi_epi16(O1l, O9l);
4491
25.5k
                E10h = _mm_unpackhi_epi16(O2l, O10l);
4492
25.5k
                E11h = _mm_unpackhi_epi16(O3l, O11l);
4493
25.5k
                E12h = _mm_unpackhi_epi16(O4l, O12l);
4494
25.5k
                E13h = _mm_unpackhi_epi16(O5l, O13l);
4495
25.5k
                E14h = _mm_unpackhi_epi16(O6l, O14l);
4496
25.5k
                E15h = _mm_unpackhi_epi16(O7l, O15l);
4497
4498
25.5k
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4499
25.5k
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4500
25.5k
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4501
25.5k
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4502
4503
25.5k
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4504
25.5k
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4505
25.5k
                m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4506
25.5k
                m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4507
4508
25.5k
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4509
25.5k
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4510
25.5k
                m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4511
25.5k
                m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4512
4513
25.5k
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4514
25.5k
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4515
25.5k
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4516
25.5k
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4517
4518
25.5k
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4519
25.5k
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4520
25.5k
                m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4521
25.5k
                m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4522
4523
25.5k
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4524
25.5k
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4525
25.5k
                m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4526
25.5k
                m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4527
4528
25.5k
                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
4529
25.5k
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
4530
25.5k
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
4531
25.5k
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
4532
4533
25.5k
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4534
25.5k
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4535
25.5k
                m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4536
25.5k
                m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4537
4538
25.5k
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4539
25.5k
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4540
25.5k
                m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4541
25.5k
                m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4542
4543
25.5k
                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
4544
25.5k
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
4545
25.5k
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
4546
25.5k
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
4547
4548
25.5k
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4549
25.5k
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4550
25.5k
                m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4551
25.5k
                m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4552
4553
25.5k
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4554
25.5k
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4555
25.5k
                m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4556
25.5k
                m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4557
4558
25.5k
                if(i==0){
4559
6.39k
                    int k = 8;
4560
6.39k
                    r0=m128iS0;
4561
6.39k
                    r1=m128iS1;
4562
6.39k
                    r2=m128iS2;
4563
6.39k
                    r3=m128iS3;
4564
6.39k
                    r4=m128iS4;
4565
6.39k
                    r5=m128iS5;
4566
6.39k
                    r6=m128iS6;
4567
6.39k
                    r7=m128iS7;
4568
6.39k
                    r8=m128iS8;
4569
6.39k
                    r9=m128iS9;
4570
6.39k
                    r10=m128iS10;
4571
6.39k
                    r11=m128iS11;
4572
6.39k
                    r12=m128iS12;
4573
6.39k
                    r13=m128iS13;
4574
6.39k
                    r14=m128iS14;
4575
6.39k
                    r15=m128iS15;
4576
6.39k
                    r16=m128iS16;
4577
6.39k
                    r17=m128iS17;
4578
6.39k
                    r18=m128iS18;
4579
6.39k
                    r19=m128iS19;
4580
6.39k
                    r20=m128iS20;
4581
6.39k
                    r21=m128iS21;
4582
6.39k
                    r22=m128iS22;
4583
6.39k
                    r23=m128iS23;
4584
6.39k
                    r24=m128iS24;
4585
6.39k
                    r25=m128iS25;
4586
6.39k
                    r26=m128iS26;
4587
6.39k
                    r27=m128iS27;
4588
6.39k
                    r28=m128iS28;
4589
6.39k
                    r29=m128iS29;
4590
6.39k
                    r30=m128iS30;
4591
6.39k
                    r31=m128iS31;
4592
6.39k
                    m128iS0 = _mm_load_si128((__m128i *) (src + k));
4593
6.39k
                    m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
4594
6.39k
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
4595
6.39k
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
4596
6.39k
                    m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
4597
6.39k
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
4598
6.39k
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
4599
6.39k
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
4600
6.39k
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
4601
6.39k
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
4602
6.39k
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
4603
6.39k
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
4604
6.39k
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
4605
6.39k
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
4606
6.39k
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
4607
6.39k
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));
4608
4609
6.39k
                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
4610
6.39k
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
4611
6.39k
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
4612
6.39k
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
4613
6.39k
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
4614
6.39k
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
4615
6.39k
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
4616
6.39k
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
4617
6.39k
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
4618
6.39k
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
4619
6.39k
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
4620
6.39k
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
4621
6.39k
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
4622
6.39k
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
4623
6.39k
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
4624
6.39k
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
4625
4626
19.1k
                }else if(i ==8){
4627
4628
6.39k
                    r32=m128iS0;
4629
6.39k
                    r33=m128iS1;
4630
6.39k
                    r34=m128iS2;
4631
6.39k
                    r35=m128iS3;
4632
6.39k
                    r36=m128iS4;
4633
6.39k
                    r37=m128iS5;
4634
6.39k
                    r38=m128iS6;
4635
6.39k
                    r39=m128iS7;
4636
6.39k
                    r40=m128iS8;
4637
6.39k
                    r41=m128iS9;
4638
6.39k
                    r42=m128iS10;
4639
6.39k
                    r43=m128iS11;
4640
6.39k
                    r44=m128iS12;
4641
6.39k
                    r45=m128iS13;
4642
6.39k
                    r46=m128iS14;
4643
6.39k
                    r47=m128iS15;
4644
6.39k
                    r48=m128iS16;
4645
6.39k
                    r49=m128iS17;
4646
6.39k
                    r50=m128iS18;
4647
6.39k
                    r51=m128iS19;
4648
6.39k
                    r52=m128iS20;
4649
6.39k
                    r53=m128iS21;
4650
6.39k
                    r54=m128iS22;
4651
6.39k
                    r55=m128iS23;
4652
6.39k
                    r56=m128iS24;
4653
6.39k
                    r57=m128iS25;
4654
6.39k
                    r58=m128iS26;
4655
6.39k
                    r59=m128iS27;
4656
6.39k
                    r60=m128iS28;
4657
6.39k
                    r61=m128iS29;
4658
6.39k
                    r62=m128iS30;
4659
6.39k
                    r63=m128iS31;
4660
4661
6.39k
                    m128iS0 = _mm_load_si128((__m128i *) (src + 16));
4662
6.39k
                    m128iS1 = _mm_load_si128((__m128i *) (src + 48));
4663
6.39k
                    m128iS2 = _mm_load_si128((__m128i *) (src + 80));
4664
6.39k
                    m128iS3 = _mm_load_si128((__m128i *) (src + 112));
4665
6.39k
                    m128iS4 = _mm_load_si128((__m128i *) (src + 144));
4666
6.39k
                    m128iS5 = _mm_load_si128((__m128i *) (src + 176));
4667
6.39k
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 16));
4668
6.39k
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 16));
4669
6.39k
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 16));
4670
6.39k
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 16));
4671
6.39k
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 16));
4672
6.39k
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 16));
4673
6.39k
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 16));
4674
6.39k
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 16));
4675
6.39k
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 16));
4676
6.39k
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 16));
4677
4678
6.39k
                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 16));
4679
6.39k
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 16));
4680
6.39k
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 16));
4681
6.39k
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 16));
4682
6.39k
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 16));
4683
6.39k
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 16));
4684
6.39k
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 16));
4685
6.39k
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 16));
4686
6.39k
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 16));
4687
6.39k
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 16));
4688
6.39k
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 16));
4689
6.39k
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 16));
4690
6.39k
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 16));
4691
6.39k
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 16));
4692
6.39k
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 16));
4693
6.39k
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 16));
4694
4695
4696
12.7k
                }else if(i ==16){
4697
4698
6.39k
                    r64=m128iS0;
4699
6.39k
                    r65=m128iS1;
4700
6.39k
                    r66=m128iS2;
4701
6.39k
                    r67=m128iS3;
4702
6.39k
                    r68=m128iS4;
4703
6.39k
                    r69=m128iS5;
4704
6.39k
                    r70=m128iS6;
4705
6.39k
                    r71=m128iS7;
4706
6.39k
                    r72=m128iS8;
4707
6.39k
                    r73=m128iS9;
4708
6.39k
                    r74=m128iS10;
4709
6.39k
                    r75=m128iS11;
4710
6.39k
                    r76=m128iS12;
4711
6.39k
                    r77=m128iS13;
4712
6.39k
                    r78=m128iS14;
4713
6.39k
                    r79=m128iS15;
4714
6.39k
                    r80=m128iS16;
4715
6.39k
                    r81=m128iS17;
4716
6.39k
                    r82=m128iS18;
4717
6.39k
                    r83=m128iS19;
4718
6.39k
                    r84=m128iS20;
4719
6.39k
                    r85=m128iS21;
4720
6.39k
                    r86=m128iS22;
4721
6.39k
                    r87=m128iS23;
4722
6.39k
                    r88=m128iS24;
4723
6.39k
                    r89=m128iS25;
4724
6.39k
                    r90=m128iS26;
4725
6.39k
                    r91=m128iS27;
4726
6.39k
                    r92=m128iS28;
4727
6.39k
                    r93=m128iS29;
4728
6.39k
                    r94=m128iS30;
4729
6.39k
                    r95=m128iS31;
4730
4731
6.39k
                    m128iS0 = _mm_load_si128((__m128i *) (src + 24));
4732
6.39k
                    m128iS1 = _mm_load_si128((__m128i *) (src + 56));
4733
6.39k
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64 + 24));
4734
6.39k
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96 + 24));
4735
6.39k
                    m128iS4 = _mm_load_si128((__m128i *) (src + 128 + 24));
4736
6.39k
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160 + 24));
4737
6.39k
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 24));
4738
6.39k
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 24));
4739
6.39k
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 24));
4740
6.39k
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 24));
4741
6.39k
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 24));
4742
6.39k
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 24));
4743
6.39k
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 24));
4744
6.39k
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 24));
4745
6.39k
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 24));
4746
6.39k
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 24));
4747
4748
6.39k
                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 24));
4749
6.39k
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 24));
4750
6.39k
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 24));
4751
6.39k
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 24));
4752
6.39k
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 24));
4753
6.39k
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 24));
4754
6.39k
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 24));
4755
6.39k
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 24));
4756
6.39k
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 24));
4757
6.39k
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 24));
4758
6.39k
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 24));
4759
6.39k
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 24));
4760
6.39k
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 24));
4761
6.39k
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 24));
4762
6.39k
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 24));
4763
6.39k
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 24));
4764
4765
6.39k
                }else{
4766
6.39k
                    r96=m128iS0;
4767
6.39k
                    r97=m128iS1;
4768
6.39k
                    r98=m128iS2;
4769
6.39k
                    r99=m128iS3;
4770
6.39k
                    r100=m128iS4;
4771
6.39k
                    r101=m128iS5;
4772
6.39k
                    r102=m128iS6;
4773
6.39k
                    r103=m128iS7;
4774
6.39k
                    r104=m128iS8;
4775
6.39k
                    r105=m128iS9;
4776
6.39k
                    r106=m128iS10;
4777
6.39k
                    r107=m128iS11;
4778
6.39k
                    r108=m128iS12;
4779
6.39k
                    r109=m128iS13;
4780
6.39k
                    r110=m128iS14;
4781
6.39k
                    r111=m128iS15;
4782
6.39k
                    r112=m128iS16;
4783
6.39k
                    r113=m128iS17;
4784
6.39k
                    r114=m128iS18;
4785
6.39k
                    r115=m128iS19;
4786
6.39k
                    r116=m128iS20;
4787
6.39k
                    r117=m128iS21;
4788
6.39k
                    r118=m128iS22;
4789
6.39k
                    r119=m128iS23;
4790
6.39k
                    r120=m128iS24;
4791
6.39k
                    r121=m128iS25;
4792
6.39k
                    r122=m128iS26;
4793
6.39k
                    r123=m128iS27;
4794
6.39k
                    r124=m128iS28;
4795
6.39k
                    r125=m128iS29;
4796
6.39k
                    r126=m128iS30;
4797
6.39k
                    r127=m128iS31;
4798
4799
                    //load data for next j :
4800
6.39k
                    m128iS0 =  r0;
4801
6.39k
                    m128iS1 =  r4;
4802
6.39k
                    m128iS2 =  r8;
4803
6.39k
                    m128iS3 =  r12;
4804
6.39k
                    m128iS4 =  r16;
4805
6.39k
                    m128iS5 =  r20;
4806
6.39k
                    m128iS6 =  r24;
4807
6.39k
                    m128iS7 =  r28;
4808
6.39k
                    m128iS8 =  r32;
4809
6.39k
                    m128iS9 =  r36;
4810
6.39k
                    m128iS10 = r40;
4811
6.39k
                    m128iS11 = r44;
4812
6.39k
                    m128iS12 = r48;
4813
6.39k
                    m128iS13 = r52;
4814
6.39k
                    m128iS14 = r56;
4815
6.39k
                    m128iS15 = r60;
4816
6.39k
                    m128iS16 = r64;
4817
6.39k
                    m128iS17 = r68;
4818
6.39k
                    m128iS18 = r72;
4819
6.39k
                    m128iS19 = r76;
4820
6.39k
                    m128iS20 = r80;
4821
6.39k
                    m128iS21 = r84;
4822
6.39k
                    m128iS22 = r88;
4823
6.39k
                    m128iS23 = r92;
4824
6.39k
                    m128iS24 = r96;
4825
6.39k
                    m128iS25 = r100;
4826
6.39k
                    m128iS26 = r104;
4827
6.39k
                    m128iS27 = r108;
4828
6.39k
                    m128iS28 = r112;
4829
6.39k
                    m128iS29 = r116;
4830
6.39k
                    m128iS30 = r120;
4831
6.39k
                    m128iS31 = r124;
4832
6.39k
                    shift = shift_2nd;
4833
6.39k
                    m128iAdd = _mm_set1_epi32(add_2nd);
4834
4835
4836
6.39k
                }
4837
4838
25.5k
            } else {
4839
4840
                //Transpose Matrix
4841
4842
25.5k
                E0l= _mm_unpacklo_epi16(m128iS0,m128iS1);
4843
25.5k
                E1l= _mm_unpacklo_epi16(m128iS2,m128iS3);
4844
25.5k
                E2l= _mm_unpacklo_epi16(m128iS4,m128iS5);
4845
25.5k
                E3l= _mm_unpacklo_epi16(m128iS6,m128iS7);
4846
25.5k
                E4l= _mm_unpacklo_epi16(m128iS8,m128iS9);
4847
25.5k
                E5l= _mm_unpacklo_epi16(m128iS10,m128iS11);
4848
25.5k
                E6l= _mm_unpacklo_epi16(m128iS12,m128iS13);
4849
25.5k
                E7l= _mm_unpacklo_epi16(m128iS14,m128iS15);
4850
25.5k
                E8l= _mm_unpacklo_epi16(m128iS16,m128iS17);
4851
25.5k
                E9l= _mm_unpacklo_epi16(m128iS18,m128iS19);
4852
25.5k
                E10l= _mm_unpacklo_epi16(m128iS20,m128iS21);
4853
25.5k
                E11l= _mm_unpacklo_epi16(m128iS22,m128iS23);
4854
25.5k
                E12l= _mm_unpacklo_epi16(m128iS24,m128iS25);
4855
25.5k
                E13l= _mm_unpacklo_epi16(m128iS26,m128iS27);
4856
25.5k
                E14l= _mm_unpacklo_epi16(m128iS28,m128iS29);
4857
25.5k
                E15l= _mm_unpacklo_epi16(m128iS30,m128iS31);
4858
4859
4860
25.5k
                E0h= _mm_unpackhi_epi16(m128iS0,m128iS1);
4861
25.5k
                E1h= _mm_unpackhi_epi16(m128iS2,m128iS3);
4862
25.5k
                E2h= _mm_unpackhi_epi16(m128iS4,m128iS5);
4863
25.5k
                E3h= _mm_unpackhi_epi16(m128iS6,m128iS7);
4864
25.5k
                E4h= _mm_unpackhi_epi16(m128iS8,m128iS9);
4865
25.5k
                E5h= _mm_unpackhi_epi16(m128iS10,m128iS11);
4866
25.5k
                E6h= _mm_unpackhi_epi16(m128iS12,m128iS13);
4867
25.5k
                E7h= _mm_unpackhi_epi16(m128iS14,m128iS15);
4868
25.5k
                E8h= _mm_unpackhi_epi16(m128iS16,m128iS17);
4869
25.5k
                E9h= _mm_unpackhi_epi16(m128iS18,m128iS19);
4870
25.5k
                E10h= _mm_unpackhi_epi16(m128iS20,m128iS21);
4871
25.5k
                E11h= _mm_unpackhi_epi16(m128iS22,m128iS23);
4872
25.5k
                E12h= _mm_unpackhi_epi16(m128iS24,m128iS25);
4873
25.5k
                E13h= _mm_unpackhi_epi16(m128iS26,m128iS27);
4874
25.5k
                E14h= _mm_unpackhi_epi16(m128iS28,m128iS29);
4875
25.5k
                E15h= _mm_unpackhi_epi16(m128iS30,m128iS31);
4876
4877
25.5k
                m128Tmp0= _mm_unpacklo_epi32(E0l,E1l);
4878
25.5k
                m128Tmp1= _mm_unpacklo_epi32(E2l,E3l);
4879
25.5k
                m128Tmp2= _mm_unpacklo_epi32(E4l,E5l);
4880
25.5k
                m128Tmp3= _mm_unpacklo_epi32(E6l,E7l);
4881
25.5k
                m128Tmp4= _mm_unpacklo_epi32(E8l,E9l);
4882
25.5k
                m128Tmp5= _mm_unpacklo_epi32(E10l,E11l);
4883
25.5k
                m128Tmp6= _mm_unpacklo_epi32(E12l,E13l);
4884
25.5k
                m128Tmp7= _mm_unpacklo_epi32(E14l,E15l);
4885
4886
25.5k
                m128iS0= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter 1st row
4887
25.5k
                m128iS1= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter 1st row
4888
4889
4890
25.5k
                m128iS2= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter 1st row
4891
25.5k
                m128iS3= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter 1st row
4892
4893
                //second row
4894
4895
25.5k
                m128iS4= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4896
25.5k
                m128iS5= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4897
4898
25.5k
                m128iS6= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4899
25.5k
                m128iS7= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4900
4901
               //third row
4902
4903
25.5k
                m128Tmp0= _mm_unpackhi_epi32(E0l,E1l);
4904
25.5k
                m128Tmp1= _mm_unpackhi_epi32(E2l,E3l);
4905
25.5k
                m128Tmp2= _mm_unpackhi_epi32(E4l,E5l);
4906
25.5k
                m128Tmp3= _mm_unpackhi_epi32(E6l,E7l);
4907
25.5k
                m128Tmp4= _mm_unpackhi_epi32(E8l,E9l);
4908
25.5k
                m128Tmp5= _mm_unpackhi_epi32(E10l,E11l);
4909
25.5k
                m128Tmp6= _mm_unpackhi_epi32(E12l,E13l);
4910
25.5k
                m128Tmp7= _mm_unpackhi_epi32(E14l,E15l);
4911
4912
4913
25.5k
                m128iS8= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4914
25.5k
                m128iS9= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4915
4916
25.5k
                m128iS10= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4917
25.5k
                m128iS11= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter
4918
4919
                //fourth row
4920
4921
25.5k
                m128iS12= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4922
25.5k
                m128iS13= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4923
4924
25.5k
                m128iS14= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4925
25.5k
                m128iS15= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4926
4927
                //fifth row
4928
4929
25.5k
                m128Tmp0= _mm_unpacklo_epi32(E0h,E1h);
4930
25.5k
                m128Tmp1= _mm_unpacklo_epi32(E2h,E3h);
4931
25.5k
                m128Tmp2= _mm_unpacklo_epi32(E4h,E5h);
4932
25.5k
                m128Tmp3= _mm_unpacklo_epi32(E6h,E7h);
4933
25.5k
                m128Tmp4= _mm_unpacklo_epi32(E8h,E9h);
4934
25.5k
                m128Tmp5= _mm_unpacklo_epi32(E10h,E11h);
4935
25.5k
                m128Tmp6= _mm_unpacklo_epi32(E12h,E13h);
4936
25.5k
                m128Tmp7= _mm_unpacklo_epi32(E14h,E15h);
4937
4938
25.5k
                m128iS16= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4939
25.5k
                m128iS17= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4940
4941
4942
25.5k
                m128iS18= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4943
25.5k
                m128iS19= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7);
4944
4945
                //sixth row
4946
4947
25.5k
                m128iS20= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4948
25.5k
                m128iS21= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4949
4950
4951
25.5k
                m128iS22= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4952
25.5k
                m128iS23= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4953
4954
               //seventh row
4955
4956
25.5k
                m128Tmp0= _mm_unpackhi_epi32(E0h,E1h);
4957
25.5k
                m128Tmp1= _mm_unpackhi_epi32(E2h,E3h);
4958
25.5k
                m128Tmp2= _mm_unpackhi_epi32(E4h,E5h);
4959
25.5k
                m128Tmp3= _mm_unpackhi_epi32(E6h,E7h);
4960
25.5k
                m128Tmp4= _mm_unpackhi_epi32(E8h,E9h);
4961
25.5k
                m128Tmp5= _mm_unpackhi_epi32(E10h,E11h);
4962
25.5k
                m128Tmp6= _mm_unpackhi_epi32(E12h,E13h);
4963
25.5k
                m128Tmp7= _mm_unpackhi_epi32(E14h,E15h);
4964
4965
4966
25.5k
                m128iS24= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4967
25.5k
                m128iS25= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4968
4969
4970
25.5k
                m128iS26= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4971
25.5k
                m128iS27= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter
4972
4973
                //last row
4974
4975
4976
25.5k
                m128iS28= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4977
25.5k
                m128iS29= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4978
4979
25.5k
                m128iS30= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4980
25.5k
                m128iS31= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4981
4982
4983
25.5k
                m128Tmp0=_mm_setzero_si128();
4984
4985
4986
                //store
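                // Reconstruction: the 8-bit prediction is loaded from dst,
                // widened to 16 bits by interleaving with a zero register
                // (m128Tmp0), the residual is added with signed saturation
                // (_mm_adds_epi16), and _mm_packus_epi16 clips the sums back
                // to [0,255] before the 16-byte stores.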
4987
25.5k
                dst = (uint8_t*) _dst + i*stride;
4988
4989
4990
25.5k
                E0l= _mm_load_si128((__m128i*)dst); //16 values
4991
25.5k
                E1l= _mm_load_si128((__m128i*)(dst+16));
4992
25.5k
                E2l= _mm_load_si128((__m128i*)(dst+stride));
4993
25.5k
                E3l= _mm_load_si128((__m128i*)(dst+stride+16));
4994
25.5k
                E4l= _mm_load_si128((__m128i*)(dst+2*stride));
4995
25.5k
                E5l= _mm_load_si128((__m128i*)(dst+2*stride+16));
4996
25.5k
                E6l= _mm_load_si128((__m128i*)(dst+3*stride));
4997
25.5k
                E7l= _mm_load_si128((__m128i*)(dst+3*stride+16));
4998
25.5k
                E8l= _mm_load_si128((__m128i*)(dst+4*stride));
4999
25.5k
                E9l= _mm_load_si128((__m128i*)(dst+4*stride+16));
5000
25.5k
                E10l= _mm_load_si128((__m128i*)(dst+5*stride));
5001
25.5k
                E11l= _mm_load_si128((__m128i*)(dst+5*stride+16));
5002
25.5k
                E12l= _mm_load_si128((__m128i*)(dst+6*stride));
5003
25.5k
                E13l= _mm_load_si128((__m128i*)(dst+6*stride+16));
5004
25.5k
                E14l= _mm_load_si128((__m128i*)(dst+7*stride));
5005
25.5k
                E15l= _mm_load_si128((__m128i*)(dst+7*stride+16));
5006
5007
25.5k
                m128iS0= _mm_adds_epi16(m128iS0,_mm_unpacklo_epi8(E0l,m128Tmp0));
5008
25.5k
                m128iS1= _mm_adds_epi16(m128iS1,_mm_unpackhi_epi8(E0l,m128Tmp0));
5009
25.5k
                m128iS0= _mm_packus_epi16(m128iS0,m128iS1);
5010
5011
25.5k
                m128iS2= _mm_adds_epi16(m128iS2,_mm_unpacklo_epi8(E1l,m128Tmp0));
5012
25.5k
                m128iS3= _mm_adds_epi16(m128iS3,_mm_unpackhi_epi8(E1l,m128Tmp0));
5013
25.5k
                m128iS2= _mm_packus_epi16(m128iS2,m128iS3);
5014
5015
25.5k
                m128iS4= _mm_adds_epi16(m128iS4,_mm_unpacklo_epi8(E2l,m128Tmp0));
5016
25.5k
                m128iS5= _mm_adds_epi16(m128iS5,_mm_unpackhi_epi8(E2l,m128Tmp0));
5017
25.5k
                m128iS4= _mm_packus_epi16(m128iS4,m128iS5);
5018
5019
25.5k
                m128iS6= _mm_adds_epi16(m128iS6,_mm_unpacklo_epi8(E3l,m128Tmp0));
5020
25.5k
                m128iS7= _mm_adds_epi16(m128iS7,_mm_unpackhi_epi8(E3l,m128Tmp0));
5021
25.5k
                m128iS6= _mm_packus_epi16(m128iS6,m128iS7);
5022
5023
25.5k
                m128iS8= _mm_adds_epi16(m128iS8,_mm_unpacklo_epi8(E4l,m128Tmp0));
5024
25.5k
                m128iS9= _mm_adds_epi16(m128iS9,_mm_unpackhi_epi8(E4l,m128Tmp0));
5025
25.5k
                m128iS8= _mm_packus_epi16(m128iS8,m128iS9);
5026
5027
25.5k
                m128iS10= _mm_adds_epi16(m128iS10,_mm_unpacklo_epi8(E5l,m128Tmp0));
5028
25.5k
                m128iS11= _mm_adds_epi16(m128iS11,_mm_unpackhi_epi8(E5l,m128Tmp0));
5029
25.5k
                m128iS10= _mm_packus_epi16(m128iS10,m128iS11);
5030
5031
25.5k
                m128iS12= _mm_adds_epi16(m128iS12,_mm_unpacklo_epi8(E6l,m128Tmp0));
5032
25.5k
                m128iS13= _mm_adds_epi16(m128iS13,_mm_unpackhi_epi8(E6l,m128Tmp0));
5033
25.5k
                m128iS12= _mm_packus_epi16(m128iS12,m128iS13);
5034
5035
25.5k
                m128iS14= _mm_adds_epi16(m128iS14,_mm_unpacklo_epi8(E7l,m128Tmp0));
5036
25.5k
                m128iS15= _mm_adds_epi16(m128iS15,_mm_unpackhi_epi8(E7l,m128Tmp0));
5037
25.5k
                m128iS14= _mm_packus_epi16(m128iS14,m128iS15);
5038
5039
25.5k
                m128iS16= _mm_adds_epi16(m128iS16,_mm_unpacklo_epi8(E8l,m128Tmp0));
5040
25.5k
                m128iS17= _mm_adds_epi16(m128iS17,_mm_unpackhi_epi8(E8l,m128Tmp0));
5041
25.5k
                m128iS16= _mm_packus_epi16(m128iS16,m128iS17);
5042
5043
25.5k
                m128iS18= _mm_adds_epi16(m128iS18,_mm_unpacklo_epi8(E9l,m128Tmp0));
5044
25.5k
                m128iS19= _mm_adds_epi16(m128iS19,_mm_unpackhi_epi8(E9l,m128Tmp0));
5045
25.5k
                m128iS18= _mm_packus_epi16(m128iS18,m128iS19);
5046
5047
25.5k
                m128iS20= _mm_adds_epi16(m128iS20,_mm_unpacklo_epi8(E10l,m128Tmp0));
5048
25.5k
                m128iS21= _mm_adds_epi16(m128iS21,_mm_unpackhi_epi8(E10l,m128Tmp0));
5049
25.5k
                m128iS20= _mm_packus_epi16(m128iS20,m128iS21);
5050
5051
25.5k
                m128iS22= _mm_adds_epi16(m128iS22,_mm_unpacklo_epi8(E11l,m128Tmp0));
5052
25.5k
                m128iS23= _mm_adds_epi16(m128iS23,_mm_unpackhi_epi8(E11l,m128Tmp0));
5053
25.5k
                m128iS22= _mm_packus_epi16(m128iS22,m128iS23);
5054
5055
25.5k
                m128iS24= _mm_adds_epi16(m128iS24,_mm_unpacklo_epi8(E12l,m128Tmp0));
5056
25.5k
                m128iS25= _mm_adds_epi16(m128iS25,_mm_unpackhi_epi8(E12l,m128Tmp0));
5057
25.5k
                m128iS24= _mm_packus_epi16(m128iS24,m128iS25);
5058
5059
25.5k
                m128iS26= _mm_adds_epi16(m128iS26,_mm_unpacklo_epi8(E13l,m128Tmp0));
5060
25.5k
                m128iS27= _mm_adds_epi16(m128iS27,_mm_unpackhi_epi8(E13l,m128Tmp0));
5061
25.5k
                m128iS26= _mm_packus_epi16(m128iS26,m128iS27);
5062
5063
25.5k
                m128iS28= _mm_adds_epi16(m128iS28,_mm_unpacklo_epi8(E14l,m128Tmp0));
5064
25.5k
                m128iS29= _mm_adds_epi16(m128iS29,_mm_unpackhi_epi8(E14l,m128Tmp0));
5065
25.5k
                m128iS28= _mm_packus_epi16(m128iS28,m128iS29);
5066
5067
25.5k
                m128iS30= _mm_adds_epi16(m128iS30,_mm_unpacklo_epi8(E15l,m128Tmp0));
5068
25.5k
                m128iS31= _mm_adds_epi16(m128iS31,_mm_unpackhi_epi8(E15l,m128Tmp0));
5069
25.5k
                m128iS30= _mm_packus_epi16(m128iS30,m128iS31);
5070
5071
5072
25.5k
                _mm_store_si128((__m128i*)dst,m128iS0);
5073
25.5k
                _mm_store_si128((__m128i*)(dst+16),m128iS2);
5074
25.5k
                _mm_store_si128((__m128i*)(dst+stride),m128iS4);
5075
25.5k
                _mm_store_si128((__m128i*)(dst+stride+16),m128iS6);
5076
25.5k
                _mm_store_si128((__m128i*)(dst+2*stride),m128iS8);
5077
25.5k
                _mm_store_si128((__m128i*)(dst+2*stride+16),m128iS10);
5078
25.5k
                _mm_store_si128((__m128i*)(dst+3*stride),m128iS12);
5079
25.5k
                _mm_store_si128((__m128i*)(dst+3*stride+16),m128iS14);
5080
25.5k
                _mm_store_si128((__m128i*)(dst+4*stride),m128iS16);
5081
25.5k
                _mm_store_si128((__m128i*)(dst+4*stride+16),m128iS18);
5082
25.5k
                _mm_store_si128((__m128i*)(dst+5*stride),m128iS20);
5083
25.5k
                _mm_store_si128((__m128i*)(dst+5*stride+16),m128iS22);
5084
25.5k
                _mm_store_si128((__m128i*)(dst+6*stride),m128iS24);
5085
25.5k
                _mm_store_si128((__m128i*)(dst+6*stride+16),m128iS26);
5086
25.5k
                _mm_store_si128((__m128i*)(dst+7*stride),m128iS28);
5087
25.5k
                _mm_store_si128((__m128i*)(dst+7*stride+16),m128iS30);
5088
5089
5090
25.5k
                if(i==0){
5091
                    //load next values :
5092
6.39k
                    m128iS0 =  r1;
5093
6.39k
                    m128iS1 =  r5;
5094
6.39k
                    m128iS2 =  r9;
5095
6.39k
                    m128iS3 =  r13;
5096
6.39k
                    m128iS4 =  r17;
5097
6.39k
                    m128iS5 =  r21;
5098
6.39k
                    m128iS6 =  r25;
5099
6.39k
                    m128iS7 =  r29;
5100
6.39k
                    m128iS8 =  r33;
5101
6.39k
                    m128iS9 =  r37;
5102
6.39k
                    m128iS10 = r41;
5103
6.39k
                    m128iS11 = r45;
5104
6.39k
                    m128iS12 = r49;
5105
6.39k
                    m128iS13 = r53;
5106
6.39k
                    m128iS14 = r57;
5107
6.39k
                    m128iS15 = r61;
5108
6.39k
                    m128iS16 = r65;
5109
6.39k
                    m128iS17 = r69;
5110
6.39k
                    m128iS18 = r73;
5111
6.39k
                    m128iS19 = r77;
5112
6.39k
                    m128iS20 = r81;
5113
6.39k
                    m128iS21 = r85;
5114
6.39k
                    m128iS22 = r89;
5115
6.39k
                    m128iS23 = r93;
5116
6.39k
                    m128iS24 = r97;
5117
6.39k
                    m128iS25 = r101;
5118
6.39k
                    m128iS26 = r105;
5119
6.39k
                    m128iS27 = r109;
5120
6.39k
                    m128iS28 = r113;
5121
6.39k
                    m128iS29 = r117;
5122
6.39k
                    m128iS30 = r121;
5123
6.39k
                    m128iS31 = r125;
5124
5125
19.1k
                }else if(i ==8){
5126
                    //load next values :
5127
6.39k
                    m128iS0 =  r2;
5128
6.39k
                    m128iS1 =  r6;
5129
6.39k
                    m128iS2 =  r10;
5130
6.39k
                    m128iS3 =  r14;
5131
6.39k
                    m128iS4 =  r18;
5132
6.39k
                    m128iS5 =  r22;
5133
6.39k
                    m128iS6 =  r26;
5134
6.39k
                    m128iS7 =  r30;
5135
6.39k
                    m128iS8 =  r34;
5136
6.39k
                    m128iS9 =  r38;
5137
6.39k
                    m128iS10 = r42;
5138
6.39k
                    m128iS11 = r46;
5139
6.39k
                    m128iS12 = r50;
5140
6.39k
                    m128iS13 = r54;
5141
6.39k
                    m128iS14 = r58;
5142
6.39k
                    m128iS15 = r62;
5143
6.39k
                    m128iS16 = r66;
5144
6.39k
                    m128iS17 = r70;
5145
6.39k
                    m128iS18 = r74;
5146
6.39k
                    m128iS19 = r78;
5147
6.39k
                    m128iS20 = r82;
5148
6.39k
                    m128iS21 = r86;
5149
6.39k
                    m128iS22 = r90;
5150
6.39k
                    m128iS23 = r94;
5151
6.39k
                    m128iS24 = r98;
5152
6.39k
                    m128iS25 = r102;
5153
6.39k
                    m128iS26 = r106;
5154
6.39k
                    m128iS27 = r110;
5155
6.39k
                    m128iS28 = r114;
5156
6.39k
                    m128iS29 = r118;
5157
6.39k
                    m128iS30 = r122;
5158
6.39k
                    m128iS31 = r126;
5159
5160
12.7k
                }else if(i==16)
5161
6.39k
                {
5162
                    //load next values :
5163
6.39k
                    m128iS0 =  r3;
5164
6.39k
                    m128iS1 =  r7;
5165
6.39k
                    m128iS2 =  r11;
5166
6.39k
                    m128iS3 =  r15;
5167
6.39k
                    m128iS4 =  r19;
5168
6.39k
                    m128iS5 =  r23;
5169
6.39k
                    m128iS6 =  r27;
5170
6.39k
                    m128iS7 =  r31;
5171
6.39k
                    m128iS8 =  r35;
5172
6.39k
                    m128iS9 =  r39;
5173
6.39k
                    m128iS10 = r43;
5174
6.39k
                    m128iS11 = r47;
5175
6.39k
                    m128iS12 = r51;
5176
6.39k
                    m128iS13 = r55;
5177
6.39k
                    m128iS14 = r59;
5178
6.39k
                    m128iS15 = r63;
5179
6.39k
                    m128iS16 = r67;
5180
6.39k
                    m128iS17 = r71;
5181
6.39k
                    m128iS18 = r75;
5182
6.39k
                    m128iS19 = r79;
5183
6.39k
                    m128iS20 = r83;
5184
6.39k
                    m128iS21 = r87;
5185
6.39k
                    m128iS22 = r91;
5186
6.39k
                    m128iS23 = r95;
5187
6.39k
                    m128iS24 = r99;
5188
6.39k
                    m128iS25 = r103;
5189
6.39k
                    m128iS26 = r107;
5190
6.39k
                    m128iS27 = r111;
5191
6.39k
                    m128iS28 = r115;
5192
6.39k
                    m128iS29 = r119;
5193
6.39k
                    m128iS30 = r123;
5194
6.39k
                    m128iS31 = r127;
5195
6.39k
                }
5196
25.5k
            }
5197
51.1k
        }
5198
12.7k
    }
5199
6.39k
}
5200
#endif
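
For reference, the pack stage at the top of this excerpt implements the final butterfly
of the 32-point inverse transform: for k = 0..15 it produces
m128iS[k] = sat16((E_k + O_k) >> shift) and m128iS[31-k] = sat16((E_k - O_k) >> shift),
with the rounding constant m128iAdd already folded into the even part earlier in the
loop (not shown in this excerpt). A minimal scalar sketch of that step, using
hypothetical helper names that are not part of libde265:

    #include <stdint.h>

    /* Saturate a 32-bit value to int16_t, as _mm_packs_epi32 does per lane. */
    static inline int16_t sat16(int32_t v)
    {
        if (v < INT16_MIN) return INT16_MIN;
        if (v > INT16_MAX) return INT16_MAX;
        return (int16_t)v;
    }

    /* One output row of the 32-point inverse transform. E[] is the even part
     * (rounding offset already added), O[] the odd part; shift is 7 in the
     * first pass and 20 - bit depth in the second (cf. shift_2nd below). */
    static void idct32_butterfly_row(int16_t out[32], const int32_t E[16],
                                     const int32_t O[16], int shift)
    {
        for (int k = 0; k < 16; k++) {
            out[k]      = sat16((E[k] + O[k]) >> shift);
            out[31 - k] = sat16((E[k] - O[k]) >> shift);
        }
    }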
5201
5202
5203
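
The transpose passes above are built entirely from unpacklo/unpackhi interleaves of
16-, 32- and 64-bit lanes. As a self-contained illustration of the same pattern at a
smaller size, a 4x4 transpose of 32-bit lanes (an assumed standalone example, not
code from this file):

    #include <emmintrin.h> // SSE2

    /* Transpose four rows of four int32_t values in place, using the same
     * interleave pattern the 16-bit code above applies at larger scale. */
    static void transpose4x4_epi32(__m128i r[4])
    {
        __m128i t0 = _mm_unpacklo_epi32(r[0], r[1]); /* a0 b0 a1 b1 */
        __m128i t1 = _mm_unpacklo_epi32(r[2], r[3]); /* c0 d0 c1 d1 */
        __m128i t2 = _mm_unpackhi_epi32(r[0], r[1]); /* a2 b2 a3 b3 */
        __m128i t3 = _mm_unpackhi_epi32(r[2], r[3]); /* c2 d2 c3 d3 */
        r[0] = _mm_unpacklo_epi64(t0, t1);           /* a0 b0 c0 d0 */
        r[1] = _mm_unpackhi_epi64(t0, t1);           /* a1 b1 c1 d1 */
        r[2] = _mm_unpacklo_epi64(t2, t3);           /* a2 b2 c2 d2 */
        r[3] = _mm_unpackhi_epi64(t2, t3);           /* a3 b3 c3 d3 */
    }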
#if 0
5204
void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
5205
        ptrdiff_t _stride) {
5206
    int i, j;
5207
    uint16_t *dst = (uint16_t*) _dst;
5208
    ptrdiff_t stride = _stride / 2;
5209
    int shift;
5210
    uint8_t shift_2nd = 10; //20 - bit depth
5211
    uint16_t add_2nd = 1<<9; // 1 << (shift_2nd - 1)
5212
    const int16_t *src = coeffs;
5213
5214
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
5215
            m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
5216
            m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
5217
            m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
5218
            E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
5219
            O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
5220
            E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
5221
    __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
5222
    __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
5223
            EEE0l, EEE1l, EEE0h, EEE1h;
5224
    __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
5225
            m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
5226
            m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
5227
            m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
5228
            O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
5229
            O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
5230
            EE4l, EE7h, EE6h, EE5h, EE4h;
5231
    m128iS0 = _mm_load_si128((__m128i *) (src));
5232
    m128iS1 = _mm_load_si128((__m128i *) (src + 32));
5233
    m128iS2 = _mm_load_si128((__m128i *) (src + 64));
5234
    m128iS3 = _mm_load_si128((__m128i *) (src + 96));
5235
    m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
5236
    m128iS5 = _mm_load_si128((__m128i *) (src + 160));
5237
    m128iS6 = _mm_load_si128((__m128i *) (src + 192));
5238
    m128iS7 = _mm_load_si128((__m128i *) (src + 224));
5239
    m128iS8 = _mm_load_si128((__m128i *) (src + 256));
5240
    m128iS9 = _mm_load_si128((__m128i *) (src + 288));
5241
    m128iS10 = _mm_load_si128((__m128i *) (src + 320));
5242
    m128iS11 = _mm_load_si128((__m128i *) (src + 352));
5243
    m128iS12 = _mm_loadu_si128((__m128i *) (src + 384));
5244
    m128iS13 = _mm_load_si128((__m128i *) (src + 416));
5245
    m128iS14 = _mm_load_si128((__m128i *) (src + 448));
5246
    m128iS15 = _mm_load_si128((__m128i *) (src + 480));
5247
    m128iS16 = _mm_load_si128((__m128i *) (src + 512));
5248
    m128iS17 = _mm_load_si128((__m128i *) (src + 544));
5249
    m128iS18 = _mm_load_si128((__m128i *) (src + 576));
5250
    m128iS19 = _mm_load_si128((__m128i *) (src + 608));
5251
    m128iS20 = _mm_load_si128((__m128i *) (src + 640));
5252
    m128iS21 = _mm_load_si128((__m128i *) (src + 672));
5253
    m128iS22 = _mm_load_si128((__m128i *) (src + 704));
5254
    m128iS23 = _mm_load_si128((__m128i *) (src + 736));
5255
    m128iS24 = _mm_load_si128((__m128i *) (src + 768));
5256
    m128iS25 = _mm_load_si128((__m128i *) (src + 800));
5257
    m128iS26 = _mm_load_si128((__m128i *) (src + 832));
5258
    m128iS27 = _mm_load_si128((__m128i *) (src + 864));
5259
    m128iS28 = _mm_load_si128((__m128i *) (src + 896));
5260
    m128iS29 = _mm_load_si128((__m128i *) (src + 928));
5261
    m128iS30 = _mm_load_si128((__m128i *) (src + 960));
5262
    m128iS31 = _mm_load_si128((__m128i *) (src + 992));
5263
5264
    shift = shift_1st;
5265
    m128iAdd = _mm_set1_epi32(add_1st);
5266
5267
    for (j = 0; j < 2; j++) {
5268
        for (i = 0; i < 32; i += 8) {
5269
            m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
5270
            E0l = _mm_madd_epi16(m128Tmp0,
5271
                    _mm_load_si128((__m128i *) (transform32x32[0][0])));
5272
            m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
5273
            E0h = _mm_madd_epi16(m128Tmp1,
5274
                    _mm_load_si128((__m128i *) (transform32x32[0][0])));
5275
5276
            m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
5277
            E1l = _mm_madd_epi16(m128Tmp2,
5278
                    _mm_load_si128((__m128i *) (transform32x32[1][0])));
5279
            m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
5280
            E1h = _mm_madd_epi16(m128Tmp3,
5281
                    _mm_load_si128((__m128i *) (transform32x32[1][0])));
5282
5283
            m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
5284
            E2l = _mm_madd_epi16(m128Tmp4,
5285
                    _mm_load_si128((__m128i *) (transform32x32[2][0])));
5286
            m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
5287
            E2h = _mm_madd_epi16(m128Tmp5,
5288
                    _mm_load_si128((__m128i *) (transform32x32[2][0])));
5289
5290
            m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
5291
            E3l = _mm_madd_epi16(m128Tmp6,
5292
                    _mm_load_si128((__m128i *) (transform32x32[3][0])));
5293
            m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
5294
            E3h = _mm_madd_epi16(m128Tmp7,
5295
                    _mm_load_si128((__m128i *) (transform32x32[3][0])));
5296
5297
            m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
5298
            E4l = _mm_madd_epi16(m128Tmp8,
5299
                    _mm_load_si128((__m128i *) (transform32x32[4][0])));
5300
            m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
5301
            E4h = _mm_madd_epi16(m128Tmp9,
5302
                    _mm_load_si128((__m128i *) (transform32x32[4][0])));
5303
5304
            m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
5305
            E5l = _mm_madd_epi16(m128Tmp10,
5306
                    _mm_load_si128((__m128i *) (transform32x32[5][0])));
5307
            m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
5308
            E5h = _mm_madd_epi16(m128Tmp11,
5309
                    _mm_load_si128((__m128i *) (transform32x32[5][0])));
5310
5311
            m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
5312
            E6l = _mm_madd_epi16(m128Tmp12,
5313
                    _mm_load_si128((__m128i *) (transform32x32[6][0])));
5314
            m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
5315
            E6h = _mm_madd_epi16(m128Tmp13,
5316
                    _mm_load_si128((__m128i *) (transform32x32[6][0])));
5317
5318
            m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
5319
            E7l = _mm_madd_epi16(m128Tmp14,
5320
                    _mm_load_si128((__m128i *) (transform32x32[7][0])));
5321
            m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
5322
            E7h = _mm_madd_epi16(m128Tmp15,
5323
                    _mm_load_si128((__m128i *) (transform32x32[7][0])));
5324
5325
            O0l = _mm_add_epi32(E0l, E1l);
5326
            O0l = _mm_add_epi32(O0l, E2l);
5327
            O0l = _mm_add_epi32(O0l, E3l);
5328
            O0l = _mm_add_epi32(O0l, E4l);
5329
            O0l = _mm_add_epi32(O0l, E5l);
5330
            O0l = _mm_add_epi32(O0l, E6l);
5331
            O0l = _mm_add_epi32(O0l, E7l);
5332
5333
            O0h = _mm_add_epi32(E0h, E1h);
5334
            O0h = _mm_add_epi32(O0h, E2h);
5335
            O0h = _mm_add_epi32(O0h, E3h);
5336
            O0h = _mm_add_epi32(O0h, E4h);
5337
            O0h = _mm_add_epi32(O0h, E5h);
5338
            O0h = _mm_add_epi32(O0h, E6h);
5339
            O0h = _mm_add_epi32(O0h, E7h);
5340
5341
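            /* O1..O15 below repeat the same 8-term multiply-accumulate,
               reusing the unpacked odd-row temporaries m128Tmp0..m128Tmp15
               with the next column of transform32x32 coefficients. */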
            /* Compute O1*/
5342
            E0l = _mm_madd_epi16(m128Tmp0,
5343
                    _mm_load_si128((__m128i *) (transform32x32[0][1])));
5344
            E0h = _mm_madd_epi16(m128Tmp1,
5345
                    _mm_load_si128((__m128i *) (transform32x32[0][1])));
5346
            E1l = _mm_madd_epi16(m128Tmp2,
5347
                    _mm_load_si128((__m128i *) (transform32x32[1][1])));
5348
            E1h = _mm_madd_epi16(m128Tmp3,
5349
                    _mm_load_si128((__m128i *) (transform32x32[1][1])));
5350
            E2l = _mm_madd_epi16(m128Tmp4,
5351
                    _mm_load_si128((__m128i *) (transform32x32[2][1])));
5352
            E2h = _mm_madd_epi16(m128Tmp5,
5353
                    _mm_load_si128((__m128i *) (transform32x32[2][1])));
5354
            E3l = _mm_madd_epi16(m128Tmp6,
5355
                    _mm_load_si128((__m128i *) (transform32x32[3][1])));
5356
            E3h = _mm_madd_epi16(m128Tmp7,
5357
                    _mm_load_si128((__m128i *) (transform32x32[3][1])));
5358
5359
            E4l = _mm_madd_epi16(m128Tmp8,
5360
                    _mm_load_si128((__m128i *) (transform32x32[4][1])));
5361
            E4h = _mm_madd_epi16(m128Tmp9,
5362
                    _mm_load_si128((__m128i *) (transform32x32[4][1])));
5363
            E5l = _mm_madd_epi16(m128Tmp10,
5364
                    _mm_load_si128((__m128i *) (transform32x32[5][1])));
5365
            E5h = _mm_madd_epi16(m128Tmp11,
5366
                    _mm_load_si128((__m128i *) (transform32x32[5][1])));
5367
            E6l = _mm_madd_epi16(m128Tmp12,
5368
                    _mm_load_si128((__m128i *) (transform32x32[6][1])));
5369
            E6h = _mm_madd_epi16(m128Tmp13,
5370
                    _mm_load_si128((__m128i *) (transform32x32[6][1])));
5371
            E7l = _mm_madd_epi16(m128Tmp14,
5372
                    _mm_load_si128((__m128i *) (transform32x32[7][1])));
5373
            E7h = _mm_madd_epi16(m128Tmp15,
5374
                    _mm_load_si128((__m128i *) (transform32x32[7][1])));
5375
5376
            O1l = _mm_add_epi32(E0l, E1l);
5377
            O1l = _mm_add_epi32(O1l, E2l);
5378
            O1l = _mm_add_epi32(O1l, E3l);
5379
            O1l = _mm_add_epi32(O1l, E4l);
5380
            O1l = _mm_add_epi32(O1l, E5l);
5381
            O1l = _mm_add_epi32(O1l, E6l);
5382
            O1l = _mm_add_epi32(O1l, E7l);
5383
5384
            O1h = _mm_add_epi32(E0h, E1h);
5385
            O1h = _mm_add_epi32(O1h, E2h);
5386
            O1h = _mm_add_epi32(O1h, E3h);
5387
            O1h = _mm_add_epi32(O1h, E4h);
5388
            O1h = _mm_add_epi32(O1h, E5h);
5389
            O1h = _mm_add_epi32(O1h, E6h);
5390
            O1h = _mm_add_epi32(O1h, E7h);
5391
            /* Compute O2*/
5392
            E0l = _mm_madd_epi16(m128Tmp0,
5393
                    _mm_load_si128((__m128i *) (transform32x32[0][2])));
5394
            E0h = _mm_madd_epi16(m128Tmp1,
5395
                    _mm_load_si128((__m128i *) (transform32x32[0][2])));
5396
            E1l = _mm_madd_epi16(m128Tmp2,
5397
                    _mm_load_si128((__m128i *) (transform32x32[1][2])));
5398
            E1h = _mm_madd_epi16(m128Tmp3,
5399
                    _mm_load_si128((__m128i *) (transform32x32[1][2])));
5400
            E2l = _mm_madd_epi16(m128Tmp4,
5401
                    _mm_load_si128((__m128i *) (transform32x32[2][2])));
5402
            E2h = _mm_madd_epi16(m128Tmp5,
5403
                    _mm_load_si128((__m128i *) (transform32x32[2][2])));
5404
            E3l = _mm_madd_epi16(m128Tmp6,
5405
                    _mm_load_si128((__m128i *) (transform32x32[3][2])));
5406
            E3h = _mm_madd_epi16(m128Tmp7,
5407
                    _mm_load_si128((__m128i *) (transform32x32[3][2])));
5408
5409
            E4l = _mm_madd_epi16(m128Tmp8,
5410
                    _mm_load_si128((__m128i *) (transform32x32[4][2])));
5411
            E4h = _mm_madd_epi16(m128Tmp9,
5412
                    _mm_load_si128((__m128i *) (transform32x32[4][2])));
5413
            E5l = _mm_madd_epi16(m128Tmp10,
5414
                    _mm_load_si128((__m128i *) (transform32x32[5][2])));
5415
            E5h = _mm_madd_epi16(m128Tmp11,
5416
                    _mm_load_si128((__m128i *) (transform32x32[5][2])));
5417
            E6l = _mm_madd_epi16(m128Tmp12,
5418
                    _mm_load_si128((__m128i *) (transform32x32[6][2])));
5419
            E6h = _mm_madd_epi16(m128Tmp13,
5420
                    _mm_load_si128((__m128i *) (transform32x32[6][2])));
5421
            E7l = _mm_madd_epi16(m128Tmp14,
5422
                    _mm_load_si128((__m128i *) (transform32x32[7][2])));
5423
            E7h = _mm_madd_epi16(m128Tmp15,
5424
                    _mm_load_si128((__m128i *) (transform32x32[7][2])));
5425
5426
            O2l = _mm_add_epi32(E0l, E1l);
5427
            O2l = _mm_add_epi32(O2l, E2l);
5428
            O2l = _mm_add_epi32(O2l, E3l);
5429
            O2l = _mm_add_epi32(O2l, E4l);
5430
            O2l = _mm_add_epi32(O2l, E5l);
5431
            O2l = _mm_add_epi32(O2l, E6l);
5432
            O2l = _mm_add_epi32(O2l, E7l);
5433
5434
            O2h = _mm_add_epi32(E0h, E1h);
5435
            O2h = _mm_add_epi32(O2h, E2h);
5436
            O2h = _mm_add_epi32(O2h, E3h);
5437
            O2h = _mm_add_epi32(O2h, E4h);
5438
            O2h = _mm_add_epi32(O2h, E5h);
5439
            O2h = _mm_add_epi32(O2h, E6h);
5440
            O2h = _mm_add_epi32(O2h, E7h);
5441
            /* Compute O3*/
5442
            E0l = _mm_madd_epi16(m128Tmp0,
5443
                    _mm_load_si128((__m128i *) (transform32x32[0][3])));
5444
            E0h = _mm_madd_epi16(m128Tmp1,
5445
                    _mm_load_si128((__m128i *) (transform32x32[0][3])));
5446
            E1l = _mm_madd_epi16(m128Tmp2,
5447
                    _mm_load_si128((__m128i *) (transform32x32[1][3])));
5448
            E1h = _mm_madd_epi16(m128Tmp3,
5449
                    _mm_load_si128((__m128i *) (transform32x32[1][3])));
5450
            E2l = _mm_madd_epi16(m128Tmp4,
5451
                    _mm_load_si128((__m128i *) (transform32x32[2][3])));
5452
            E2h = _mm_madd_epi16(m128Tmp5,
5453
                    _mm_load_si128((__m128i *) (transform32x32[2][3])));
5454
            E3l = _mm_madd_epi16(m128Tmp6,
5455
                    _mm_load_si128((__m128i *) (transform32x32[3][3])));
5456
            E3h = _mm_madd_epi16(m128Tmp7,
5457
                    _mm_load_si128((__m128i *) (transform32x32[3][3])));
5458
5459
            E4l = _mm_madd_epi16(m128Tmp8,
5460
                    _mm_load_si128((__m128i *) (transform32x32[4][3])));
5461
            E4h = _mm_madd_epi16(m128Tmp9,
5462
                    _mm_load_si128((__m128i *) (transform32x32[4][3])));
5463
            E5l = _mm_madd_epi16(m128Tmp10,
5464
                    _mm_load_si128((__m128i *) (transform32x32[5][3])));
5465
            E5h = _mm_madd_epi16(m128Tmp11,
5466
                    _mm_load_si128((__m128i *) (transform32x32[5][3])));
5467
            E6l = _mm_madd_epi16(m128Tmp12,
5468
                    _mm_load_si128((__m128i *) (transform32x32[6][3])));
5469
            E6h = _mm_madd_epi16(m128Tmp13,
5470
                    _mm_load_si128((__m128i *) (transform32x32[6][3])));
5471
            E7l = _mm_madd_epi16(m128Tmp14,
5472
                    _mm_load_si128((__m128i *) (transform32x32[7][3])));
5473
            E7h = _mm_madd_epi16(m128Tmp15,
5474
                    _mm_load_si128((__m128i *) (transform32x32[7][3])));
5475
5476
            O3l = _mm_add_epi32(E0l, E1l);
5477
            O3l = _mm_add_epi32(O3l, E2l);
5478
            O3l = _mm_add_epi32(O3l, E3l);
5479
            O3l = _mm_add_epi32(O3l, E4l);
5480
            O3l = _mm_add_epi32(O3l, E5l);
5481
            O3l = _mm_add_epi32(O3l, E6l);
5482
            O3l = _mm_add_epi32(O3l, E7l);
5483
5484
            O3h = _mm_add_epi32(E0h, E1h);
5485
            O3h = _mm_add_epi32(O3h, E2h);
5486
            O3h = _mm_add_epi32(O3h, E3h);
5487
            O3h = _mm_add_epi32(O3h, E4h);
5488
            O3h = _mm_add_epi32(O3h, E5h);
5489
            O3h = _mm_add_epi32(O3h, E6h);
5490
            O3h = _mm_add_epi32(O3h, E7h);
5491
            /* Compute O4*/
5492
5493
            E0l = _mm_madd_epi16(m128Tmp0,
5494
                    _mm_load_si128((__m128i *) (transform32x32[0][4])));
5495
            E0h = _mm_madd_epi16(m128Tmp1,
5496
                    _mm_load_si128((__m128i *) (transform32x32[0][4])));
5497
            E1l = _mm_madd_epi16(m128Tmp2,
5498
                    _mm_load_si128((__m128i *) (transform32x32[1][4])));
5499
            E1h = _mm_madd_epi16(m128Tmp3,
5500
                    _mm_load_si128((__m128i *) (transform32x32[1][4])));
5501
            E2l = _mm_madd_epi16(m128Tmp4,
5502
                    _mm_load_si128((__m128i *) (transform32x32[2][4])));
5503
            E2h = _mm_madd_epi16(m128Tmp5,
5504
                    _mm_load_si128((__m128i *) (transform32x32[2][4])));
5505
            E3l = _mm_madd_epi16(m128Tmp6,
5506
                    _mm_load_si128((__m128i *) (transform32x32[3][4])));
5507
            E3h = _mm_madd_epi16(m128Tmp7,
5508
                    _mm_load_si128((__m128i *) (transform32x32[3][4])));
5509
5510
            E4l = _mm_madd_epi16(m128Tmp8,
5511
                    _mm_load_si128((__m128i *) (transform32x32[4][4])));
5512
            E4h = _mm_madd_epi16(m128Tmp9,
5513
                    _mm_load_si128((__m128i *) (transform32x32[4][4])));
5514
            E5l = _mm_madd_epi16(m128Tmp10,
5515
                    _mm_load_si128((__m128i *) (transform32x32[5][4])));
5516
            E5h = _mm_madd_epi16(m128Tmp11,
5517
                    _mm_load_si128((__m128i *) (transform32x32[5][4])));
5518
            E6l = _mm_madd_epi16(m128Tmp12,
5519
                    _mm_load_si128((__m128i *) (transform32x32[6][4])));
5520
            E6h = _mm_madd_epi16(m128Tmp13,
5521
                    _mm_load_si128((__m128i *) (transform32x32[6][4])));
5522
            E7l = _mm_madd_epi16(m128Tmp14,
5523
                    _mm_load_si128((__m128i *) (transform32x32[7][4])));
5524
            E7h = _mm_madd_epi16(m128Tmp15,
5525
                    _mm_load_si128((__m128i *) (transform32x32[7][4])));
5526
5527
            O4l = _mm_add_epi32(E0l, E1l);
5528
            O4l = _mm_add_epi32(O4l, E2l);
5529
            O4l = _mm_add_epi32(O4l, E3l);
5530
            O4l = _mm_add_epi32(O4l, E4l);
5531
            O4l = _mm_add_epi32(O4l, E5l);
5532
            O4l = _mm_add_epi32(O4l, E6l);
5533
            O4l = _mm_add_epi32(O4l, E7l);
5534
5535
            O4h = _mm_add_epi32(E0h, E1h);
5536
            O4h = _mm_add_epi32(O4h, E2h);
5537
            O4h = _mm_add_epi32(O4h, E3h);
5538
            O4h = _mm_add_epi32(O4h, E4h);
5539
            O4h = _mm_add_epi32(O4h, E5h);
5540
            O4h = _mm_add_epi32(O4h, E6h);
5541
            O4h = _mm_add_epi32(O4h, E7h);
5542
5543
            /* Compute O5*/
5544
            E0l = _mm_madd_epi16(m128Tmp0,
5545
                    _mm_load_si128((__m128i *) (transform32x32[0][5])));
5546
            E0h = _mm_madd_epi16(m128Tmp1,
5547
                    _mm_load_si128((__m128i *) (transform32x32[0][5])));
5548
            E1l = _mm_madd_epi16(m128Tmp2,
5549
                    _mm_load_si128((__m128i *) (transform32x32[1][5])));
5550
            E1h = _mm_madd_epi16(m128Tmp3,
5551
                    _mm_load_si128((__m128i *) (transform32x32[1][5])));
5552
            E2l = _mm_madd_epi16(m128Tmp4,
5553
                    _mm_load_si128((__m128i *) (transform32x32[2][5])));
5554
            E2h = _mm_madd_epi16(m128Tmp5,
5555
                    _mm_load_si128((__m128i *) (transform32x32[2][5])));
5556
            E3l = _mm_madd_epi16(m128Tmp6,
5557
                    _mm_load_si128((__m128i *) (transform32x32[3][5])));
5558
            E3h = _mm_madd_epi16(m128Tmp7,
5559
                    _mm_load_si128((__m128i *) (transform32x32[3][5])));
5560
5561
            E4l = _mm_madd_epi16(m128Tmp8,
5562
                    _mm_load_si128((__m128i *) (transform32x32[4][5])));
5563
            E4h = _mm_madd_epi16(m128Tmp9,
5564
                    _mm_load_si128((__m128i *) (transform32x32[4][5])));
5565
            E5l = _mm_madd_epi16(m128Tmp10,
5566
                    _mm_load_si128((__m128i *) (transform32x32[5][5])));
5567
            E5h = _mm_madd_epi16(m128Tmp11,
5568
                    _mm_load_si128((__m128i *) (transform32x32[5][5])));
5569
            E6l = _mm_madd_epi16(m128Tmp12,
5570
                    _mm_load_si128((__m128i *) (transform32x32[6][5])));
5571
            E6h = _mm_madd_epi16(m128Tmp13,
5572
                    _mm_load_si128((__m128i *) (transform32x32[6][5])));
5573
            E7l = _mm_madd_epi16(m128Tmp14,
5574
                    _mm_load_si128((__m128i *) (transform32x32[7][5])));
5575
            E7h = _mm_madd_epi16(m128Tmp15,
5576
                    _mm_load_si128((__m128i *) (transform32x32[7][5])));
5577
5578
            O5l = _mm_add_epi32(E0l, E1l);
5579
            O5l = _mm_add_epi32(O5l, E2l);
5580
            O5l = _mm_add_epi32(O5l, E3l);
5581
            O5l = _mm_add_epi32(O5l, E4l);
5582
            O5l = _mm_add_epi32(O5l, E5l);
5583
            O5l = _mm_add_epi32(O5l, E6l);
5584
            O5l = _mm_add_epi32(O5l, E7l);
5585
5586
            O5h = _mm_add_epi32(E0h, E1h);
5587
            O5h = _mm_add_epi32(O5h, E2h);
5588
            O5h = _mm_add_epi32(O5h, E3h);
5589
            O5h = _mm_add_epi32(O5h, E4h);
5590
            O5h = _mm_add_epi32(O5h, E5h);
5591
            O5h = _mm_add_epi32(O5h, E6h);
5592
            O5h = _mm_add_epi32(O5h, E7h);
5593
5594
            /* Compute O6*/
5595
5596
            E0l = _mm_madd_epi16(m128Tmp0,
5597
                    _mm_load_si128((__m128i *) (transform32x32[0][6])));
5598
            E0h = _mm_madd_epi16(m128Tmp1,
5599
                    _mm_load_si128((__m128i *) (transform32x32[0][6])));
5600
            E1l = _mm_madd_epi16(m128Tmp2,
5601
                    _mm_load_si128((__m128i *) (transform32x32[1][6])));
5602
            E1h = _mm_madd_epi16(m128Tmp3,
5603
                    _mm_load_si128((__m128i *) (transform32x32[1][6])));
5604
            E2l = _mm_madd_epi16(m128Tmp4,
5605
                    _mm_load_si128((__m128i *) (transform32x32[2][6])));
5606
            E2h = _mm_madd_epi16(m128Tmp5,
5607
                    _mm_load_si128((__m128i *) (transform32x32[2][6])));
5608
            E3l = _mm_madd_epi16(m128Tmp6,
5609
                    _mm_load_si128((__m128i *) (transform32x32[3][6])));
5610
            E3h = _mm_madd_epi16(m128Tmp7,
5611
                    _mm_load_si128((__m128i *) (transform32x32[3][6])));
5612
5613
            E4l = _mm_madd_epi16(m128Tmp8,
5614
                    _mm_load_si128((__m128i *) (transform32x32[4][6])));
5615
            E4h = _mm_madd_epi16(m128Tmp9,
5616
                    _mm_load_si128((__m128i *) (transform32x32[4][6])));
5617
            E5l = _mm_madd_epi16(m128Tmp10,
5618
                    _mm_load_si128((__m128i *) (transform32x32[5][6])));
5619
            E5h = _mm_madd_epi16(m128Tmp11,
5620
                    _mm_load_si128((__m128i *) (transform32x32[5][6])));
5621
            E6l = _mm_madd_epi16(m128Tmp12,
5622
                    _mm_load_si128((__m128i *) (transform32x32[6][6])));
5623
            E6h = _mm_madd_epi16(m128Tmp13,
5624
                    _mm_load_si128((__m128i *) (transform32x32[6][6])));
5625
            E7l = _mm_madd_epi16(m128Tmp14,
5626
                    _mm_load_si128((__m128i *) (transform32x32[7][6])));
5627
            E7h = _mm_madd_epi16(m128Tmp15,
5628
                    _mm_load_si128((__m128i *) (transform32x32[7][6])));
5629
5630
            O6l = _mm_add_epi32(E0l, E1l);
5631
            O6l = _mm_add_epi32(O6l, E2l);
5632
            O6l = _mm_add_epi32(O6l, E3l);
5633
            O6l = _mm_add_epi32(O6l, E4l);
5634
            O6l = _mm_add_epi32(O6l, E5l);
5635
            O6l = _mm_add_epi32(O6l, E6l);
5636
            O6l = _mm_add_epi32(O6l, E7l);
5637
5638
            O6h = _mm_add_epi32(E0h, E1h);
5639
            O6h = _mm_add_epi32(O6h, E2h);
5640
            O6h = _mm_add_epi32(O6h, E3h);
5641
            O6h = _mm_add_epi32(O6h, E4h);
5642
            O6h = _mm_add_epi32(O6h, E5h);
5643
            O6h = _mm_add_epi32(O6h, E6h);
5644
            O6h = _mm_add_epi32(O6h, E7h);
5645
5646
            /* Compute O7*/
5647
5648
            E0l = _mm_madd_epi16(m128Tmp0,
5649
                    _mm_load_si128((__m128i *) (transform32x32[0][7])));
5650
            E0h = _mm_madd_epi16(m128Tmp1,
5651
                    _mm_load_si128((__m128i *) (transform32x32[0][7])));
5652
            E1l = _mm_madd_epi16(m128Tmp2,
5653
                    _mm_load_si128((__m128i *) (transform32x32[1][7])));
5654
            E1h = _mm_madd_epi16(m128Tmp3,
5655
                    _mm_load_si128((__m128i *) (transform32x32[1][7])));
5656
            E2l = _mm_madd_epi16(m128Tmp4,
5657
                    _mm_load_si128((__m128i *) (transform32x32[2][7])));
5658
            E2h = _mm_madd_epi16(m128Tmp5,
5659
                    _mm_load_si128((__m128i *) (transform32x32[2][7])));
5660
            E3l = _mm_madd_epi16(m128Tmp6,
5661
                    _mm_load_si128((__m128i *) (transform32x32[3][7])));
5662
            E3h = _mm_madd_epi16(m128Tmp7,
5663
                    _mm_load_si128((__m128i *) (transform32x32[3][7])));
5664
5665
            E4l = _mm_madd_epi16(m128Tmp8,
5666
                    _mm_load_si128((__m128i *) (transform32x32[4][7])));
5667
            E4h = _mm_madd_epi16(m128Tmp9,
5668
                    _mm_load_si128((__m128i *) (transform32x32[4][7])));
5669
            E5l = _mm_madd_epi16(m128Tmp10,
5670
                    _mm_load_si128((__m128i *) (transform32x32[5][7])));
5671
            E5h = _mm_madd_epi16(m128Tmp11,
5672
                    _mm_load_si128((__m128i *) (transform32x32[5][7])));
5673
            E6l = _mm_madd_epi16(m128Tmp12,
5674
                    _mm_load_si128((__m128i *) (transform32x32[6][7])));
5675
            E6h = _mm_madd_epi16(m128Tmp13,
5676
                    _mm_load_si128((__m128i *) (transform32x32[6][7])));
5677
            E7l = _mm_madd_epi16(m128Tmp14,
5678
                    _mm_load_si128((__m128i *) (transform32x32[7][7])));
5679
            E7h = _mm_madd_epi16(m128Tmp15,
5680
                    _mm_load_si128((__m128i *) (transform32x32[7][7])));
5681
5682
            O7l = _mm_add_epi32(E0l, E1l);
5683
            O7l = _mm_add_epi32(O7l, E2l);
5684
            O7l = _mm_add_epi32(O7l, E3l);
5685
            O7l = _mm_add_epi32(O7l, E4l);
5686
            O7l = _mm_add_epi32(O7l, E5l);
5687
            O7l = _mm_add_epi32(O7l, E6l);
5688
            O7l = _mm_add_epi32(O7l, E7l);
5689
5690
            O7h = _mm_add_epi32(E0h, E1h);
5691
            O7h = _mm_add_epi32(O7h, E2h);
5692
            O7h = _mm_add_epi32(O7h, E3h);
5693
            O7h = _mm_add_epi32(O7h, E4h);
5694
            O7h = _mm_add_epi32(O7h, E5h);
5695
            O7h = _mm_add_epi32(O7h, E6h);
5696
            O7h = _mm_add_epi32(O7h, E7h);
5697
5698
            /* Compute O8*/
5699
5700
            E0l = _mm_madd_epi16(m128Tmp0,
5701
                    _mm_load_si128((__m128i *) (transform32x32[0][8])));
5702
            E0h = _mm_madd_epi16(m128Tmp1,
5703
                    _mm_load_si128((__m128i *) (transform32x32[0][8])));
5704
            E1l = _mm_madd_epi16(m128Tmp2,
5705
                    _mm_load_si128((__m128i *) (transform32x32[1][8])));
5706
            E1h = _mm_madd_epi16(m128Tmp3,
5707
                    _mm_load_si128((__m128i *) (transform32x32[1][8])));
5708
            E2l = _mm_madd_epi16(m128Tmp4,
5709
                    _mm_load_si128((__m128i *) (transform32x32[2][8])));
5710
            E2h = _mm_madd_epi16(m128Tmp5,
5711
                    _mm_load_si128((__m128i *) (transform32x32[2][8])));
5712
            E3l = _mm_madd_epi16(m128Tmp6,
5713
                    _mm_load_si128((__m128i *) (transform32x32[3][8])));
5714
            E3h = _mm_madd_epi16(m128Tmp7,
5715
                    _mm_load_si128((__m128i *) (transform32x32[3][8])));
5716
5717
            E4l = _mm_madd_epi16(m128Tmp8,
5718
                    _mm_load_si128((__m128i *) (transform32x32[4][8])));
5719
            E4h = _mm_madd_epi16(m128Tmp9,
5720
                    _mm_load_si128((__m128i *) (transform32x32[4][8])));
5721
            E5l = _mm_madd_epi16(m128Tmp10,
5722
                    _mm_load_si128((__m128i *) (transform32x32[5][8])));
5723
            E5h = _mm_madd_epi16(m128Tmp11,
5724
                    _mm_load_si128((__m128i *) (transform32x32[5][8])));
5725
            E6l = _mm_madd_epi16(m128Tmp12,
5726
                    _mm_load_si128((__m128i *) (transform32x32[6][8])));
5727
            E6h = _mm_madd_epi16(m128Tmp13,
5728
                    _mm_load_si128((__m128i *) (transform32x32[6][8])));
5729
            E7l = _mm_madd_epi16(m128Tmp14,
5730
                    _mm_load_si128((__m128i *) (transform32x32[7][8])));
5731
            E7h = _mm_madd_epi16(m128Tmp15,
5732
                    _mm_load_si128((__m128i *) (transform32x32[7][8])));
5733
5734
            O8l = _mm_add_epi32(E0l, E1l);
5735
            O8l = _mm_add_epi32(O8l, E2l);
5736
            O8l = _mm_add_epi32(O8l, E3l);
5737
            O8l = _mm_add_epi32(O8l, E4l);
5738
            O8l = _mm_add_epi32(O8l, E5l);
5739
            O8l = _mm_add_epi32(O8l, E6l);
5740
            O8l = _mm_add_epi32(O8l, E7l);
5741
5742
            O8h = _mm_add_epi32(E0h, E1h);
5743
            O8h = _mm_add_epi32(O8h, E2h);
5744
            O8h = _mm_add_epi32(O8h, E3h);
5745
            O8h = _mm_add_epi32(O8h, E4h);
5746
            O8h = _mm_add_epi32(O8h, E5h);
5747
            O8h = _mm_add_epi32(O8h, E6h);
5748
            O8h = _mm_add_epi32(O8h, E7h);
5749
5750
            /* Compute O9*/
5751
5752
            E0l = _mm_madd_epi16(m128Tmp0,
5753
                    _mm_load_si128((__m128i *) (transform32x32[0][9])));
5754
            E0h = _mm_madd_epi16(m128Tmp1,
5755
                    _mm_load_si128((__m128i *) (transform32x32[0][9])));
5756
            E1l = _mm_madd_epi16(m128Tmp2,
5757
                    _mm_load_si128((__m128i *) (transform32x32[1][9])));
5758
            E1h = _mm_madd_epi16(m128Tmp3,
5759
                    _mm_load_si128((__m128i *) (transform32x32[1][9])));
5760
            E2l = _mm_madd_epi16(m128Tmp4,
5761
                    _mm_load_si128((__m128i *) (transform32x32[2][9])));
5762
            E2h = _mm_madd_epi16(m128Tmp5,
5763
                    _mm_load_si128((__m128i *) (transform32x32[2][9])));
5764
            E3l = _mm_madd_epi16(m128Tmp6,
5765
                    _mm_load_si128((__m128i *) (transform32x32[3][9])));
5766
            E3h = _mm_madd_epi16(m128Tmp7,
5767
                    _mm_load_si128((__m128i *) (transform32x32[3][9])));
5768
5769
            E4l = _mm_madd_epi16(m128Tmp8,
5770
                    _mm_load_si128((__m128i *) (transform32x32[4][9])));
5771
            E4h = _mm_madd_epi16(m128Tmp9,
5772
                    _mm_load_si128((__m128i *) (transform32x32[4][9])));
5773
            E5l = _mm_madd_epi16(m128Tmp10,
5774
                    _mm_load_si128((__m128i *) (transform32x32[5][9])));
5775
            E5h = _mm_madd_epi16(m128Tmp11,
5776
                    _mm_load_si128((__m128i *) (transform32x32[5][9])));
5777
            E6l = _mm_madd_epi16(m128Tmp12,
5778
                    _mm_load_si128((__m128i *) (transform32x32[6][9])));
5779
            E6h = _mm_madd_epi16(m128Tmp13,
5780
                    _mm_load_si128((__m128i *) (transform32x32[6][9])));
5781
            E7l = _mm_madd_epi16(m128Tmp14,
5782
                    _mm_load_si128((__m128i *) (transform32x32[7][9])));
5783
            E7h = _mm_madd_epi16(m128Tmp15,
5784
                    _mm_load_si128((__m128i *) (transform32x32[7][9])));
5785
5786
            O9l = _mm_add_epi32(E0l, E1l);
5787
            O9l = _mm_add_epi32(O9l, E2l);
5788
            O9l = _mm_add_epi32(O9l, E3l);
5789
            O9l = _mm_add_epi32(O9l, E4l);
5790
            O9l = _mm_add_epi32(O9l, E5l);
5791
            O9l = _mm_add_epi32(O9l, E6l);
5792
            O9l = _mm_add_epi32(O9l, E7l);
5793
5794
            O9h = _mm_add_epi32(E0h, E1h);
5795
            O9h = _mm_add_epi32(O9h, E2h);
5796
            O9h = _mm_add_epi32(O9h, E3h);
5797
            O9h = _mm_add_epi32(O9h, E4h);
5798
            O9h = _mm_add_epi32(O9h, E5h);
5799
            O9h = _mm_add_epi32(O9h, E6h);
5800
            O9h = _mm_add_epi32(O9h, E7h);
5801
5802
            /* Compute O10 */
5803
5804
            E0l = _mm_madd_epi16(m128Tmp0,
5805
                    _mm_load_si128((__m128i *) (transform32x32[0][10])));
5806
            E0h = _mm_madd_epi16(m128Tmp1,
5807
                    _mm_load_si128((__m128i *) (transform32x32[0][10])));
5808
            E1l = _mm_madd_epi16(m128Tmp2,
5809
                    _mm_load_si128((__m128i *) (transform32x32[1][10])));
5810
            E1h = _mm_madd_epi16(m128Tmp3,
5811
                    _mm_load_si128((__m128i *) (transform32x32[1][10])));
5812
            E2l = _mm_madd_epi16(m128Tmp4,
5813
                    _mm_load_si128((__m128i *) (transform32x32[2][10])));
5814
            E2h = _mm_madd_epi16(m128Tmp5,
5815
                    _mm_load_si128((__m128i *) (transform32x32[2][10])));
5816
            E3l = _mm_madd_epi16(m128Tmp6,
5817
                    _mm_load_si128((__m128i *) (transform32x32[3][10])));
5818
            E3h = _mm_madd_epi16(m128Tmp7,
5819
                    _mm_load_si128((__m128i *) (transform32x32[3][10])));
5820
5821
            E4l = _mm_madd_epi16(m128Tmp8,
5822
                    _mm_load_si128((__m128i *) (transform32x32[4][10])));
5823
            E4h = _mm_madd_epi16(m128Tmp9,
5824
                    _mm_load_si128((__m128i *) (transform32x32[4][10])));
5825
            E5l = _mm_madd_epi16(m128Tmp10,
5826
                    _mm_load_si128((__m128i *) (transform32x32[5][10])));
5827
            E5h = _mm_madd_epi16(m128Tmp11,
5828
                    _mm_load_si128((__m128i *) (transform32x32[5][10])));
5829
            E6l = _mm_madd_epi16(m128Tmp12,
5830
                    _mm_load_si128((__m128i *) (transform32x32[6][10])));
5831
            E6h = _mm_madd_epi16(m128Tmp13,
5832
                    _mm_load_si128((__m128i *) (transform32x32[6][10])));
5833
            E7l = _mm_madd_epi16(m128Tmp14,
5834
                    _mm_load_si128((__m128i *) (transform32x32[7][10])));
5835
            E7h = _mm_madd_epi16(m128Tmp15,
5836
                    _mm_load_si128((__m128i *) (transform32x32[7][10])));
5837
5838
            O10l = _mm_add_epi32(E0l, E1l);
5839
            O10l = _mm_add_epi32(O10l, E2l);
5840
            O10l = _mm_add_epi32(O10l, E3l);
5841
            O10l = _mm_add_epi32(O10l, E4l);
5842
            O10l = _mm_add_epi32(O10l, E5l);
5843
            O10l = _mm_add_epi32(O10l, E6l);
5844
            O10l = _mm_add_epi32(O10l, E7l);
5845
5846
            O10h = _mm_add_epi32(E0h, E1h);
5847
            O10h = _mm_add_epi32(O10h, E2h);
5848
            O10h = _mm_add_epi32(O10h, E3h);
5849
            O10h = _mm_add_epi32(O10h, E4h);
5850
            O10h = _mm_add_epi32(O10h, E5h);
5851
            O10h = _mm_add_epi32(O10h, E6h);
5852
            O10h = _mm_add_epi32(O10h, E7h);
5853
5854
            /* Compute O11 */
5855
5856
            E0l = _mm_madd_epi16(m128Tmp0,
5857
                    _mm_load_si128((__m128i *) (transform32x32[0][11])));
5858
            E0h = _mm_madd_epi16(m128Tmp1,
5859
                    _mm_load_si128((__m128i *) (transform32x32[0][11])));
5860
            E1l = _mm_madd_epi16(m128Tmp2,
5861
                    _mm_load_si128((__m128i *) (transform32x32[1][11])));
5862
            E1h = _mm_madd_epi16(m128Tmp3,
5863
                    _mm_load_si128((__m128i *) (transform32x32[1][11])));
5864
            E2l = _mm_madd_epi16(m128Tmp4,
5865
                    _mm_load_si128((__m128i *) (transform32x32[2][11])));
5866
            E2h = _mm_madd_epi16(m128Tmp5,
5867
                    _mm_load_si128((__m128i *) (transform32x32[2][11])));
5868
            E3l = _mm_madd_epi16(m128Tmp6,
5869
                    _mm_load_si128((__m128i *) (transform32x32[3][11])));
5870
            E3h = _mm_madd_epi16(m128Tmp7,
5871
                    _mm_load_si128((__m128i *) (transform32x32[3][11])));
5872
5873
            E4l = _mm_madd_epi16(m128Tmp8,
5874
                    _mm_load_si128((__m128i *) (transform32x32[4][11])));
5875
            E4h = _mm_madd_epi16(m128Tmp9,
5876
                    _mm_load_si128((__m128i *) (transform32x32[4][11])));
5877
            E5l = _mm_madd_epi16(m128Tmp10,
5878
                    _mm_load_si128((__m128i *) (transform32x32[5][11])));
5879
            E5h = _mm_madd_epi16(m128Tmp11,
5880
                    _mm_load_si128((__m128i *) (transform32x32[5][11])));
5881
            E6l = _mm_madd_epi16(m128Tmp12,
5882
                    _mm_load_si128((__m128i *) (transform32x32[6][11])));
5883
            E6h = _mm_madd_epi16(m128Tmp13,
5884
                    _mm_load_si128((__m128i *) (transform32x32[6][11])));
5885
            E7l = _mm_madd_epi16(m128Tmp14,
5886
                    _mm_load_si128((__m128i *) (transform32x32[7][11])));
5887
            E7h = _mm_madd_epi16(m128Tmp15,
5888
                    _mm_load_si128((__m128i *) (transform32x32[7][11])));
5889
5890
            O11l = _mm_add_epi32(E0l, E1l);
5891
            O11l = _mm_add_epi32(O11l, E2l);
5892
            O11l = _mm_add_epi32(O11l, E3l);
5893
            O11l = _mm_add_epi32(O11l, E4l);
5894
            O11l = _mm_add_epi32(O11l, E5l);
5895
            O11l = _mm_add_epi32(O11l, E6l);
5896
            O11l = _mm_add_epi32(O11l, E7l);
5897
5898
            O11h = _mm_add_epi32(E0h, E1h);
5899
            O11h = _mm_add_epi32(O11h, E2h);
5900
            O11h = _mm_add_epi32(O11h, E3h);
5901
            O11h = _mm_add_epi32(O11h, E4h);
5902
            O11h = _mm_add_epi32(O11h, E5h);
5903
            O11h = _mm_add_epi32(O11h, E6h);
5904
            O11h = _mm_add_epi32(O11h, E7h);
5905
5906
            /* Compute O12 */
5907
5908
            E0l = _mm_madd_epi16(m128Tmp0,
5909
                    _mm_load_si128((__m128i *) (transform32x32[0][12])));
5910
            E0h = _mm_madd_epi16(m128Tmp1,
5911
                    _mm_load_si128((__m128i *) (transform32x32[0][12])));
5912
            E1l = _mm_madd_epi16(m128Tmp2,
5913
                    _mm_load_si128((__m128i *) (transform32x32[1][12])));
5914
            E1h = _mm_madd_epi16(m128Tmp3,
5915
                    _mm_load_si128((__m128i *) (transform32x32[1][12])));
5916
            E2l = _mm_madd_epi16(m128Tmp4,
5917
                    _mm_load_si128((__m128i *) (transform32x32[2][12])));
5918
            E2h = _mm_madd_epi16(m128Tmp5,
5919
                    _mm_load_si128((__m128i *) (transform32x32[2][12])));
5920
            E3l = _mm_madd_epi16(m128Tmp6,
5921
                    _mm_load_si128((__m128i *) (transform32x32[3][12])));
5922
            E3h = _mm_madd_epi16(m128Tmp7,
5923
                    _mm_load_si128((__m128i *) (transform32x32[3][12])));
5924
5925
            E4l = _mm_madd_epi16(m128Tmp8,
5926
                    _mm_load_si128((__m128i *) (transform32x32[4][12])));
5927
            E4h = _mm_madd_epi16(m128Tmp9,
5928
                    _mm_load_si128((__m128i *) (transform32x32[4][12])));
5929
            E5l = _mm_madd_epi16(m128Tmp10,
5930
                    _mm_load_si128((__m128i *) (transform32x32[5][12])));
5931
            E5h = _mm_madd_epi16(m128Tmp11,
5932
                    _mm_load_si128((__m128i *) (transform32x32[5][12])));
5933
            E6l = _mm_madd_epi16(m128Tmp12,
5934
                    _mm_load_si128((__m128i *) (transform32x32[6][12])));
5935
            E6h = _mm_madd_epi16(m128Tmp13,
5936
                    _mm_load_si128((__m128i *) (transform32x32[6][12])));
5937
            E7l = _mm_madd_epi16(m128Tmp14,
5938
                    _mm_load_si128((__m128i *) (transform32x32[7][12])));
5939
            E7h = _mm_madd_epi16(m128Tmp15,
5940
                    _mm_load_si128((__m128i *) (transform32x32[7][12])));
5941
5942
            O12l = _mm_add_epi32(E0l, E1l);
5943
            O12l = _mm_add_epi32(O12l, E2l);
5944
            O12l = _mm_add_epi32(O12l, E3l);
5945
            O12l = _mm_add_epi32(O12l, E4l);
5946
            O12l = _mm_add_epi32(O12l, E5l);
5947
            O12l = _mm_add_epi32(O12l, E6l);
5948
            O12l = _mm_add_epi32(O12l, E7l);
5949
5950
            O12h = _mm_add_epi32(E0h, E1h);
5951
            O12h = _mm_add_epi32(O12h, E2h);
5952
            O12h = _mm_add_epi32(O12h, E3h);
5953
            O12h = _mm_add_epi32(O12h, E4h);
5954
            O12h = _mm_add_epi32(O12h, E5h);
5955
            O12h = _mm_add_epi32(O12h, E6h);
5956
            O12h = _mm_add_epi32(O12h, E7h);
5957
5958
            /* Compute O13 */
5959
5960
            E0l = _mm_madd_epi16(m128Tmp0,
5961
                    _mm_load_si128((__m128i *) (transform32x32[0][13])));
5962
            E0h = _mm_madd_epi16(m128Tmp1,
5963
                    _mm_load_si128((__m128i *) (transform32x32[0][13])));
5964
            E1l = _mm_madd_epi16(m128Tmp2,
5965
                    _mm_load_si128((__m128i *) (transform32x32[1][13])));
5966
            E1h = _mm_madd_epi16(m128Tmp3,
5967
                    _mm_load_si128((__m128i *) (transform32x32[1][13])));
5968
            E2l = _mm_madd_epi16(m128Tmp4,
5969
                    _mm_load_si128((__m128i *) (transform32x32[2][13])));
5970
            E2h = _mm_madd_epi16(m128Tmp5,
5971
                    _mm_load_si128((__m128i *) (transform32x32[2][13])));
5972
            E3l = _mm_madd_epi16(m128Tmp6,
5973
                    _mm_load_si128((__m128i *) (transform32x32[3][13])));
5974
            E3h = _mm_madd_epi16(m128Tmp7,
5975
                    _mm_load_si128((__m128i *) (transform32x32[3][13])));
5976
5977
            E4l = _mm_madd_epi16(m128Tmp8,
5978
                    _mm_load_si128((__m128i *) (transform32x32[4][13])));
5979
            E4h = _mm_madd_epi16(m128Tmp9,
5980
                    _mm_load_si128((__m128i *) (transform32x32[4][13])));
5981
            E5l = _mm_madd_epi16(m128Tmp10,
5982
                    _mm_load_si128((__m128i *) (transform32x32[5][13])));
5983
            E5h = _mm_madd_epi16(m128Tmp11,
5984
                    _mm_load_si128((__m128i *) (transform32x32[5][13])));
5985
            E6l = _mm_madd_epi16(m128Tmp12,
5986
                    _mm_load_si128((__m128i *) (transform32x32[6][13])));
5987
            E6h = _mm_madd_epi16(m128Tmp13,
5988
                    _mm_load_si128((__m128i *) (transform32x32[6][13])));
5989
            E7l = _mm_madd_epi16(m128Tmp14,
5990
                    _mm_load_si128((__m128i *) (transform32x32[7][13])));
5991
            E7h = _mm_madd_epi16(m128Tmp15,
5992
                    _mm_load_si128((__m128i *) (transform32x32[7][13])));
5993
5994
            O13l = _mm_add_epi32(E0l, E1l);
5995
            O13l = _mm_add_epi32(O13l, E2l);
5996
            O13l = _mm_add_epi32(O13l, E3l);
5997
            O13l = _mm_add_epi32(O13l, E4l);
5998
            O13l = _mm_add_epi32(O13l, E5l);
5999
            O13l = _mm_add_epi32(O13l, E6l);
6000
            O13l = _mm_add_epi32(O13l, E7l);
6001
6002
            O13h = _mm_add_epi32(E0h, E1h);
6003
            O13h = _mm_add_epi32(O13h, E2h);
6004
            O13h = _mm_add_epi32(O13h, E3h);
6005
            O13h = _mm_add_epi32(O13h, E4h);
6006
            O13h = _mm_add_epi32(O13h, E5h);
6007
            O13h = _mm_add_epi32(O13h, E6h);
6008
            O13h = _mm_add_epi32(O13h, E7h);
6009
6010
            /* Compute O14  */
6011
6012
            E0l = _mm_madd_epi16(m128Tmp0,
6013
                    _mm_load_si128((__m128i *) (transform32x32[0][14])));
6014
            E0h = _mm_madd_epi16(m128Tmp1,
6015
                    _mm_load_si128((__m128i *) (transform32x32[0][14])));
6016
            E1l = _mm_madd_epi16(m128Tmp2,
6017
                    _mm_load_si128((__m128i *) (transform32x32[1][14])));
6018
            E1h = _mm_madd_epi16(m128Tmp3,
6019
                    _mm_load_si128((__m128i *) (transform32x32[1][14])));
6020
            E2l = _mm_madd_epi16(m128Tmp4,
6021
                    _mm_load_si128((__m128i *) (transform32x32[2][14])));
6022
            E2h = _mm_madd_epi16(m128Tmp5,
6023
                    _mm_load_si128((__m128i *) (transform32x32[2][14])));
6024
            E3l = _mm_madd_epi16(m128Tmp6,
6025
                    _mm_load_si128((__m128i *) (transform32x32[3][14])));
6026
            E3h = _mm_madd_epi16(m128Tmp7,
6027
                    _mm_load_si128((__m128i *) (transform32x32[3][14])));
6028
6029
            E4l = _mm_madd_epi16(m128Tmp8,
6030
                    _mm_load_si128((__m128i *) (transform32x32[4][14])));
6031
            E4h = _mm_madd_epi16(m128Tmp9,
6032
                    _mm_load_si128((__m128i *) (transform32x32[4][14])));
6033
            E5l = _mm_madd_epi16(m128Tmp10,
6034
                    _mm_load_si128((__m128i *) (transform32x32[5][14])));
6035
            E5h = _mm_madd_epi16(m128Tmp11,
6036
                    _mm_load_si128((__m128i *) (transform32x32[5][14])));
6037
            E6l = _mm_madd_epi16(m128Tmp12,
6038
                    _mm_load_si128((__m128i *) (transform32x32[6][14])));
6039
            E6h = _mm_madd_epi16(m128Tmp13,
6040
                    _mm_load_si128((__m128i *) (transform32x32[6][14])));
6041
            E7l = _mm_madd_epi16(m128Tmp14,
6042
                    _mm_load_si128((__m128i *) (transform32x32[7][14])));
6043
            E7h = _mm_madd_epi16(m128Tmp15,
6044
                    _mm_load_si128((__m128i *) (transform32x32[7][14])));
6045
6046
            O14l = _mm_add_epi32(E0l, E1l);
6047
            O14l = _mm_add_epi32(O14l, E2l);
6048
            O14l = _mm_add_epi32(O14l, E3l);
6049
            O14l = _mm_add_epi32(O14l, E4l);
6050
            O14l = _mm_add_epi32(O14l, E5l);
6051
            O14l = _mm_add_epi32(O14l, E6l);
6052
            O14l = _mm_add_epi32(O14l, E7l);
6053
6054
            O14h = _mm_add_epi32(E0h, E1h);
6055
            O14h = _mm_add_epi32(O14h, E2h);
6056
            O14h = _mm_add_epi32(O14h, E3h);
6057
            O14h = _mm_add_epi32(O14h, E4h);
6058
            O14h = _mm_add_epi32(O14h, E5h);
6059
            O14h = _mm_add_epi32(O14h, E6h);
6060
            O14h = _mm_add_epi32(O14h, E7h);
6061
6062
            /* Compute O15*/
6063
6064
            E0l = _mm_madd_epi16(m128Tmp0,
6065
                    _mm_load_si128((__m128i *) (transform32x32[0][15])));
6066
            E0h = _mm_madd_epi16(m128Tmp1,
6067
                    _mm_load_si128((__m128i *) (transform32x32[0][15])));
6068
            E1l = _mm_madd_epi16(m128Tmp2,
6069
                    _mm_load_si128((__m128i *) (transform32x32[1][15])));
6070
            E1h = _mm_madd_epi16(m128Tmp3,
6071
                    _mm_load_si128((__m128i *) (transform32x32[1][15])));
6072
            E2l = _mm_madd_epi16(m128Tmp4,
6073
                    _mm_load_si128((__m128i *) (transform32x32[2][15])));
6074
            E2h = _mm_madd_epi16(m128Tmp5,
6075
                    _mm_load_si128((__m128i *) (transform32x32[2][15])));
6076
            E3l = _mm_madd_epi16(m128Tmp6,
6077
                    _mm_load_si128((__m128i *) (transform32x32[3][15])));
6078
            E3h = _mm_madd_epi16(m128Tmp7,
6079
                    _mm_load_si128((__m128i *) (transform32x32[3][15])));
6080
6081
            E4l = _mm_madd_epi16(m128Tmp8,
6082
                    _mm_load_si128((__m128i *) (transform32x32[4][15])));
6083
            E4h = _mm_madd_epi16(m128Tmp9,
6084
                    _mm_load_si128((__m128i *) (transform32x32[4][15])));
6085
            E5l = _mm_madd_epi16(m128Tmp10,
6086
                    _mm_load_si128((__m128i *) (transform32x32[5][15])));
6087
            E5h = _mm_madd_epi16(m128Tmp11,
6088
                    _mm_load_si128((__m128i *) (transform32x32[5][15])));
6089
            E6l = _mm_madd_epi16(m128Tmp12,
6090
                    _mm_load_si128((__m128i *) (transform32x32[6][15])));
6091
            E6h = _mm_madd_epi16(m128Tmp13,
6092
                    _mm_load_si128((__m128i *) (transform32x32[6][15])));
6093
            E7l = _mm_madd_epi16(m128Tmp14,
6094
                    _mm_load_si128((__m128i *) (transform32x32[7][15])));
6095
            E7h = _mm_madd_epi16(m128Tmp15,
6096
                    _mm_load_si128((__m128i *) (transform32x32[7][15])));
6097
6098
            O15l = _mm_add_epi32(E0l, E1l);
6099
            O15l = _mm_add_epi32(O15l, E2l);
6100
            O15l = _mm_add_epi32(O15l, E3l);
6101
            O15l = _mm_add_epi32(O15l, E4l);
6102
            O15l = _mm_add_epi32(O15l, E5l);
6103
            O15l = _mm_add_epi32(O15l, E6l);
6104
            O15l = _mm_add_epi32(O15l, E7l);
6105
6106
            O15h = _mm_add_epi32(E0h, E1h);
6107
            O15h = _mm_add_epi32(O15h, E2h);
6108
            O15h = _mm_add_epi32(O15h, E3h);
6109
            O15h = _mm_add_epi32(O15h, E4h);
6110
            O15h = _mm_add_epi32(O15h, E5h);
6111
            O15h = _mm_add_epi32(O15h, E6h);
6112
            O15h = _mm_add_epi32(O15h, E7h);
6113
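            /* Even part: rows S2/S6, S10/S14, S18/S22 and S26/S30 are
               paired and accumulated against transform16x16_1 to form
               E0..E7. */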
            /*  Compute E0  */
6114
6115
            m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
6116
            E0l = _mm_madd_epi16(m128Tmp0,
6117
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6118
            m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
6119
            E0h = _mm_madd_epi16(m128Tmp1,
6120
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6121
6122
            m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
6123
            E0l = _mm_add_epi32(E0l,
6124
                    _mm_madd_epi16(m128Tmp2,
6125
                            _mm_load_si128(
6126
                                    (__m128i *) (transform16x16_1[1][0]))));
6127
            m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
6128
            E0h = _mm_add_epi32(E0h,
6129
                    _mm_madd_epi16(m128Tmp3,
6130
                            _mm_load_si128(
6131
                                    (__m128i *) (transform16x16_1[1][0]))));
6132
6133
            m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
6134
            E0l = _mm_add_epi32(E0l,
6135
                    _mm_madd_epi16(m128Tmp4,
6136
                            _mm_load_si128(
6137
                                    (__m128i *) (transform16x16_1[2][0]))));
6138
            m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
6139
            E0h = _mm_add_epi32(E0h,
6140
                    _mm_madd_epi16(m128Tmp5,
6141
                            _mm_load_si128(
6142
                                    (__m128i *) (transform16x16_1[2][0]))));
6143
6144
            m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
6145
            E0l = _mm_add_epi32(E0l,
6146
                    _mm_madd_epi16(m128Tmp6,
6147
                            _mm_load_si128(
6148
                                    (__m128i *) (transform16x16_1[3][0]))));
6149
            m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
6150
            E0h = _mm_add_epi32(E0h,
6151
                    _mm_madd_epi16(m128Tmp7,
6152
                            _mm_load_si128(
6153
                                    (__m128i *) (transform16x16_1[3][0]))));
6154
6155
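            /* E1..E7 reuse the even-row temporaries m128Tmp0..m128Tmp7 with
               successive transform16x16_1 columns. */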
            /*  Compute E1  */
6156
            E1l = _mm_madd_epi16(m128Tmp0,
6157
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6158
            E1h = _mm_madd_epi16(m128Tmp1,
6159
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6160
            E1l = _mm_add_epi32(E1l,
6161
                    _mm_madd_epi16(m128Tmp2,
6162
                            _mm_load_si128(
6163
                                    (__m128i *) (transform16x16_1[1][1]))));
6164
            E1h = _mm_add_epi32(E1h,
6165
                    _mm_madd_epi16(m128Tmp3,
6166
                            _mm_load_si128(
6167
                                    (__m128i *) (transform16x16_1[1][1]))));
6168
            E1l = _mm_add_epi32(E1l,
6169
                    _mm_madd_epi16(m128Tmp4,
6170
                            _mm_load_si128(
6171
                                    (__m128i *) (transform16x16_1[2][1]))));
6172
            E1h = _mm_add_epi32(E1h,
6173
                    _mm_madd_epi16(m128Tmp5,
6174
                            _mm_load_si128(
6175
                                    (__m128i *) (transform16x16_1[2][1]))));
6176
            E1l = _mm_add_epi32(E1l,
6177
                    _mm_madd_epi16(m128Tmp6,
6178
                            _mm_load_si128(
6179
                                    (__m128i *) (transform16x16_1[3][1]))));
6180
            E1h = _mm_add_epi32(E1h,
6181
                    _mm_madd_epi16(m128Tmp7,
6182
                            _mm_load_si128(
6183
                                    (__m128i *) (transform16x16_1[3][1]))));
6184
6185
            /*  Compute E2  */
6186
            E2l = _mm_madd_epi16(m128Tmp0,
6187
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6188
            E2h = _mm_madd_epi16(m128Tmp1,
6189
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6190
            E2l = _mm_add_epi32(E2l,
6191
                    _mm_madd_epi16(m128Tmp2,
6192
                            _mm_load_si128(
6193
                                    (__m128i *) (transform16x16_1[1][2]))));
6194
            E2h = _mm_add_epi32(E2h,
6195
                    _mm_madd_epi16(m128Tmp3,
6196
                            _mm_load_si128(
6197
                                    (__m128i *) (transform16x16_1[1][2]))));
6198
            E2l = _mm_add_epi32(E2l,
6199
                    _mm_madd_epi16(m128Tmp4,
6200
                            _mm_load_si128(
6201
                                    (__m128i *) (transform16x16_1[2][2]))));
6202
            E2h = _mm_add_epi32(E2h,
6203
                    _mm_madd_epi16(m128Tmp5,
6204
                            _mm_load_si128(
6205
                                    (__m128i *) (transform16x16_1[2][2]))));
6206
            E2l = _mm_add_epi32(E2l,
6207
                    _mm_madd_epi16(m128Tmp6,
6208
                            _mm_load_si128(
6209
                                    (__m128i *) (transform16x16_1[3][2]))));
6210
            E2h = _mm_add_epi32(E2h,
6211
                    _mm_madd_epi16(m128Tmp7,
6212
                            _mm_load_si128(
6213
                                    (__m128i *) (transform16x16_1[3][2]))));
6214
6215
            /*  Compute E3  */
6216
            E3l = _mm_madd_epi16(m128Tmp0,
6217
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6218
            E3h = _mm_madd_epi16(m128Tmp1,
6219
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6220
            E3l = _mm_add_epi32(E3l,
6221
                    _mm_madd_epi16(m128Tmp2,
6222
                            _mm_load_si128(
6223
                                    (__m128i *) (transform16x16_1[1][3]))));
6224
            E3h = _mm_add_epi32(E3h,
6225
                    _mm_madd_epi16(m128Tmp3,
6226
                            _mm_load_si128(
6227
                                    (__m128i *) (transform16x16_1[1][3]))));
6228
            E3l = _mm_add_epi32(E3l,
6229
                    _mm_madd_epi16(m128Tmp4,
6230
                            _mm_load_si128(
6231
                                    (__m128i *) (transform16x16_1[2][3]))));
6232
            E3h = _mm_add_epi32(E3h,
6233
                    _mm_madd_epi16(m128Tmp5,
6234
                            _mm_load_si128(
6235
                                    (__m128i *) (transform16x16_1[2][3]))));
6236
            E3l = _mm_add_epi32(E3l,
6237
                    _mm_madd_epi16(m128Tmp6,
6238
                            _mm_load_si128(
6239
                                    (__m128i *) (transform16x16_1[3][3]))));
6240
            E3h = _mm_add_epi32(E3h,
6241
                    _mm_madd_epi16(m128Tmp7,
6242
                            _mm_load_si128(
6243
                                    (__m128i *) (transform16x16_1[3][3]))));
6244
6245
            /*  Compute E4  */
6246
            E4l = _mm_madd_epi16(m128Tmp0,
6247
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6248
            E4h = _mm_madd_epi16(m128Tmp1,
6249
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6250
            E4l = _mm_add_epi32(E4l,
6251
                    _mm_madd_epi16(m128Tmp2,
6252
                            _mm_load_si128(
6253
                                    (__m128i *) (transform16x16_1[1][4]))));
6254
            E4h = _mm_add_epi32(E4h,
6255
                    _mm_madd_epi16(m128Tmp3,
6256
                            _mm_load_si128(
6257
                                    (__m128i *) (transform16x16_1[1][4]))));
6258
            E4l = _mm_add_epi32(E4l,
6259
                    _mm_madd_epi16(m128Tmp4,
6260
                            _mm_load_si128(
6261
                                    (__m128i *) (transform16x16_1[2][4]))));
6262
            E4h = _mm_add_epi32(E4h,
6263
                    _mm_madd_epi16(m128Tmp5,
6264
                            _mm_load_si128(
6265
                                    (__m128i *) (transform16x16_1[2][4]))));
6266
            E4l = _mm_add_epi32(E4l,
6267
                    _mm_madd_epi16(m128Tmp6,
6268
                            _mm_load_si128(
6269
                                    (__m128i *) (transform16x16_1[3][4]))));
6270
            E4h = _mm_add_epi32(E4h,
6271
                    _mm_madd_epi16(m128Tmp7,
6272
                            _mm_load_si128(
6273
                                    (__m128i *) (transform16x16_1[3][4]))));
6274
6275
            /*  Compute E5  */
6276
            E5l = _mm_madd_epi16(m128Tmp0,
6277
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6278
            E5h = _mm_madd_epi16(m128Tmp1,
6279
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6280
            E5l = _mm_add_epi32(E5l,
6281
                    _mm_madd_epi16(m128Tmp2,
6282
                            _mm_load_si128(
6283
                                    (__m128i *) (transform16x16_1[1][5]))));
6284
            E5h = _mm_add_epi32(E5h,
6285
                    _mm_madd_epi16(m128Tmp3,
6286
                            _mm_load_si128(
6287
                                    (__m128i *) (transform16x16_1[1][5]))));
6288
            E5l = _mm_add_epi32(E5l,
6289
                    _mm_madd_epi16(m128Tmp4,
6290
                            _mm_load_si128(
6291
                                    (__m128i *) (transform16x16_1[2][5]))));
6292
            E5h = _mm_add_epi32(E5h,
6293
                    _mm_madd_epi16(m128Tmp5,
6294
                            _mm_load_si128(
6295
                                    (__m128i *) (transform16x16_1[2][5]))));
6296
            E5l = _mm_add_epi32(E5l,
6297
                    _mm_madd_epi16(m128Tmp6,
6298
                            _mm_load_si128(
6299
                                    (__m128i *) (transform16x16_1[3][5]))));
6300
            E5h = _mm_add_epi32(E5h,
6301
                    _mm_madd_epi16(m128Tmp7,
6302
                            _mm_load_si128(
6303
                                    (__m128i *) (transform16x16_1[3][5]))));
6304
6305
            /*  Compute E6  */
6306
            E6l = _mm_madd_epi16(m128Tmp0,
6307
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
6308
            E6h = _mm_madd_epi16(m128Tmp1,
6309
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
6310
            E6l = _mm_add_epi32(E6l,
6311
                    _mm_madd_epi16(m128Tmp2,
6312
                            _mm_load_si128(
6313
                                    (__m128i *) (transform16x16_1[1][6]))));
6314
            E6h = _mm_add_epi32(E6h,
6315
                    _mm_madd_epi16(m128Tmp3,
6316
                            _mm_load_si128(
6317
                                    (__m128i *) (transform16x16_1[1][6]))));
6318
            E6l = _mm_add_epi32(E6l,
6319
                    _mm_madd_epi16(m128Tmp4,
6320
                            _mm_load_si128(
6321
                                    (__m128i *) (transform16x16_1[2][6]))));
6322
            E6h = _mm_add_epi32(E6h,
6323
                    _mm_madd_epi16(m128Tmp5,
6324
                            _mm_load_si128(
6325
                                    (__m128i *) (transform16x16_1[2][6]))));
6326
            E6l = _mm_add_epi32(E6l,
6327
                    _mm_madd_epi16(m128Tmp6,
6328
                            _mm_load_si128(
6329
                                    (__m128i *) (transform16x16_1[3][6]))));
6330
            E6h = _mm_add_epi32(E6h,
6331
                    _mm_madd_epi16(m128Tmp7,
6332
                            _mm_load_si128(
6333
                                    (__m128i *) (transform16x16_1[3][6]))));
6334
6335
            /*  Compute E7  */
6336
            E7l = _mm_madd_epi16(m128Tmp0,
6337
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
6338
            E7h = _mm_madd_epi16(m128Tmp1,
6339
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
6340
            E7l = _mm_add_epi32(E7l,
6341
                    _mm_madd_epi16(m128Tmp2,
6342
                            _mm_load_si128(
6343
                                    (__m128i *) (transform16x16_1[1][7]))));
6344
            E7h = _mm_add_epi32(E7h,
6345
                    _mm_madd_epi16(m128Tmp3,
6346
                            _mm_load_si128(
6347
                                    (__m128i *) (transform16x16_1[1][7]))));
6348
            E7l = _mm_add_epi32(E7l,
6349
                    _mm_madd_epi16(m128Tmp4,
6350
                            _mm_load_si128(
6351
                                    (__m128i *) (transform16x16_1[2][7]))));
6352
            E7h = _mm_add_epi32(E7h,
6353
                    _mm_madd_epi16(m128Tmp5,
6354
                            _mm_load_si128(
6355
                                    (__m128i *) (transform16x16_1[2][7]))));
6356
            E7l = _mm_add_epi32(E7l,
6357
                    _mm_madd_epi16(m128Tmp6,
6358
                            _mm_load_si128(
6359
                                    (__m128i *) (transform16x16_1[3][7]))));
6360
            E7h = _mm_add_epi32(E7h,
6361
                    _mm_madd_epi16(m128Tmp7,
6362
                            _mm_load_si128(
6363
                                    (__m128i *) (transform16x16_1[3][7]))));
6364
6365
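            /* Even-even stage: rows S4/S12 and S20/S28 feed E00..E03 via
               transform16x16_2; rows S8/S24 and S0/S16 feed EE0/EE1 and
               EEE0/EEE1 via transform16x16_3. */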
            /*  Compute E00..E03  */
6366
6367
            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
6368
            E00l = _mm_madd_epi16(m128Tmp0,
6369
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
6370
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
6371
            E00h = _mm_madd_epi16(m128Tmp1,
6372
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
6373
6374
            m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
6375
            E00l = _mm_add_epi32(E00l,
6376
                    _mm_madd_epi16(m128Tmp2,
6377
                            _mm_load_si128(
6378
                                    (__m128i *) (transform16x16_2[1][0]))));
6379
            m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
6380
            E00h = _mm_add_epi32(E00h,
6381
                    _mm_madd_epi16(m128Tmp3,
6382
                            _mm_load_si128(
6383
                                    (__m128i *) (transform16x16_2[1][0]))));
6384
6385
            E01l = _mm_madd_epi16(m128Tmp0,
6386
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
6387
            E01h = _mm_madd_epi16(m128Tmp1,
6388
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
6389
            E01l = _mm_add_epi32(E01l,
6390
                    _mm_madd_epi16(m128Tmp2,
6391
                            _mm_load_si128(
6392
                                    (__m128i *) (transform16x16_2[1][1]))));
6393
            E01h = _mm_add_epi32(E01h,
6394
                    _mm_madd_epi16(m128Tmp3,
6395
                            _mm_load_si128(
6396
                                    (__m128i *) (transform16x16_2[1][1]))));
6397
6398
            E02l = _mm_madd_epi16(m128Tmp0,
6399
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
6400
            E02h = _mm_madd_epi16(m128Tmp1,
6401
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
6402
            E02l = _mm_add_epi32(E02l,
6403
                    _mm_madd_epi16(m128Tmp2,
6404
                            _mm_load_si128(
6405
                                    (__m128i *) (transform16x16_2[1][2]))));
6406
            E02h = _mm_add_epi32(E02h,
6407
                    _mm_madd_epi16(m128Tmp3,
6408
                            _mm_load_si128(
6409
                                    (__m128i *) (transform16x16_2[1][2]))));
6410
6411
            E03l = _mm_madd_epi16(m128Tmp0,
6412
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
6413
            E03h = _mm_madd_epi16(m128Tmp1,
6414
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
6415
            E03l = _mm_add_epi32(E03l,
6416
                    _mm_madd_epi16(m128Tmp2,
6417
                            _mm_load_si128(
6418
                                    (__m128i *) (transform16x16_2[1][3]))));
6419
            E03h = _mm_add_epi32(E03h,
6420
                    _mm_madd_epi16(m128Tmp3,
6421
                            _mm_load_si128(
6422
                                    (__m128i *) (transform16x16_2[1][3]))));
6423
6424
            /*  Compute EE0 and EEE */
6425
6426
            m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
6427
            EE0l = _mm_madd_epi16(m128Tmp0,
6428
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
6429
            m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
6430
            EE0h = _mm_madd_epi16(m128Tmp1,
6431
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
6432
6433
            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
6434
            EEE0l = _mm_madd_epi16(m128Tmp2,
6435
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
6436
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
6437
            EEE0h = _mm_madd_epi16(m128Tmp3,
6438
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
6439
6440
            EE1l = _mm_madd_epi16(m128Tmp0,
6441
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
6442
            EE1h = _mm_madd_epi16(m128Tmp1,
6443
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
6444
6445
            EEE1l = _mm_madd_epi16(m128Tmp2,
6446
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
6447
            EEE1h = _mm_madd_epi16(m128Tmp3,
6448
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
6449
6450
            /*  Compute EE    */
6451
6452
            EE2l = _mm_sub_epi32(EEE1l, EE1l);
6453
            EE3l = _mm_sub_epi32(EEE0l, EE0l);
6454
            EE2h = _mm_sub_epi32(EEE1h, EE1h);
6455
            EE3h = _mm_sub_epi32(EEE0h, EE0h);
6456
6457
            EE0l = _mm_add_epi32(EEE0l, EE0l);
6458
            EE1l = _mm_add_epi32(EEE1l, EE1l);
6459
            EE0h = _mm_add_epi32(EEE0h, EE0h);
6460
            EE1h = _mm_add_epi32(EEE1h, EE1h);
6461
            /**/
6462
6463
            EE7l = _mm_sub_epi32(EE0l, E00l);
6464
            EE6l = _mm_sub_epi32(EE1l, E01l);
6465
            EE5l = _mm_sub_epi32(EE2l, E02l);
6466
            EE4l = _mm_sub_epi32(EE3l, E03l);
6467
6468
            EE7h = _mm_sub_epi32(EE0h, E00h);
6469
            EE6h = _mm_sub_epi32(EE1h, E01h);
6470
            EE5h = _mm_sub_epi32(EE2h, E02h);
6471
            EE4h = _mm_sub_epi32(EE3h, E03h);
6472
6473
            EE0l = _mm_add_epi32(EE0l, E00l);
6474
            EE1l = _mm_add_epi32(EE1l, E01l);
6475
            EE2l = _mm_add_epi32(EE2l, E02l);
6476
            EE3l = _mm_add_epi32(EE3l, E03l);
6477
6478
            EE0h = _mm_add_epi32(EE0h, E00h);
6479
            EE1h = _mm_add_epi32(EE1h, E01h);
6480
            EE2h = _mm_add_epi32(EE2h, E02h);
6481
            EE3h = _mm_add_epi32(EE3h, E03h);
6482
            /*      Compute E       */
6483
6484
            E15l = _mm_sub_epi32(EE0l, E0l);
6485
            E15l = _mm_add_epi32(E15l, m128iAdd);
6486
            E14l = _mm_sub_epi32(EE1l, E1l);
6487
            E14l = _mm_add_epi32(E14l, m128iAdd);
6488
            E13l = _mm_sub_epi32(EE2l, E2l);
6489
            E13l = _mm_add_epi32(E13l, m128iAdd);
6490
            E12l = _mm_sub_epi32(EE3l, E3l);
6491
            E12l = _mm_add_epi32(E12l, m128iAdd);
6492
            E11l = _mm_sub_epi32(EE4l, E4l);
6493
            E11l = _mm_add_epi32(E11l, m128iAdd);
6494
            E10l = _mm_sub_epi32(EE5l, E5l);
6495
            E10l = _mm_add_epi32(E10l, m128iAdd);
6496
            E9l = _mm_sub_epi32(EE6l, E6l);
6497
            E9l = _mm_add_epi32(E9l, m128iAdd);
6498
            E8l = _mm_sub_epi32(EE7l, E7l);
6499
            E8l = _mm_add_epi32(E8l, m128iAdd);
6500
6501
            E0l = _mm_add_epi32(EE0l, E0l);
6502
            E0l = _mm_add_epi32(E0l, m128iAdd);
6503
            E1l = _mm_add_epi32(EE1l, E1l);
6504
            E1l = _mm_add_epi32(E1l, m128iAdd);
6505
            E2l = _mm_add_epi32(EE2l, E2l);
6506
            E2l = _mm_add_epi32(E2l, m128iAdd);
6507
            E3l = _mm_add_epi32(EE3l, E3l);
6508
            E3l = _mm_add_epi32(E3l, m128iAdd);
6509
            E4l = _mm_add_epi32(EE4l, E4l);
6510
            E4l = _mm_add_epi32(E4l, m128iAdd);
6511
            E5l = _mm_add_epi32(EE5l, E5l);
6512
            E5l = _mm_add_epi32(E5l, m128iAdd);
6513
            E6l = _mm_add_epi32(EE6l, E6l);
6514
            E6l = _mm_add_epi32(E6l, m128iAdd);
6515
            E7l = _mm_add_epi32(EE7l, E7l);
6516
            E7l = _mm_add_epi32(E7l, m128iAdd);
6517
6518
            E15h = _mm_sub_epi32(EE0h, E0h);
6519
            E15h = _mm_add_epi32(E15h, m128iAdd);
6520
            E14h = _mm_sub_epi32(EE1h, E1h);
6521
            E14h = _mm_add_epi32(E14h, m128iAdd);
6522
            E13h = _mm_sub_epi32(EE2h, E2h);
6523
            E13h = _mm_add_epi32(E13h, m128iAdd);
6524
            E12h = _mm_sub_epi32(EE3h, E3h);
6525
            E12h = _mm_add_epi32(E12h, m128iAdd);
6526
            E11h = _mm_sub_epi32(EE4h, E4h);
6527
            E11h = _mm_add_epi32(E11h, m128iAdd);
6528
            E10h = _mm_sub_epi32(EE5h, E5h);
6529
            E10h = _mm_add_epi32(E10h, m128iAdd);
6530
            E9h = _mm_sub_epi32(EE6h, E6h);
6531
            E9h = _mm_add_epi32(E9h, m128iAdd);
6532
            E8h = _mm_sub_epi32(EE7h, E7h);
6533
            E8h = _mm_add_epi32(E8h, m128iAdd);
6534
6535
            E0h = _mm_add_epi32(EE0h, E0h);
6536
            E0h = _mm_add_epi32(E0h, m128iAdd);
6537
            E1h = _mm_add_epi32(EE1h, E1h);
6538
            E1h = _mm_add_epi32(E1h, m128iAdd);
6539
            E2h = _mm_add_epi32(EE2h, E2h);
6540
            E2h = _mm_add_epi32(E2h, m128iAdd);
6541
            E3h = _mm_add_epi32(EE3h, E3h);
6542
            E3h = _mm_add_epi32(E3h, m128iAdd);
6543
            E4h = _mm_add_epi32(EE4h, E4h);
6544
            E4h = _mm_add_epi32(E4h, m128iAdd);
6545
            E5h = _mm_add_epi32(EE5h, E5h);
6546
            E5h = _mm_add_epi32(E5h, m128iAdd);
6547
            E6h = _mm_add_epi32(EE6h, E6h);
6548
            E6h = _mm_add_epi32(E6h, m128iAdd);
6549
            E7h = _mm_add_epi32(EE7h, E7h);
6550
            E7h = _mm_add_epi32(E7h, m128iAdd);
6551
6552
            m128iS0 = _mm_packs_epi32(
6553
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
6554
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
6555
            m128iS1 = _mm_packs_epi32(
6556
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
6557
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
6558
            m128iS2 = _mm_packs_epi32(
6559
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
6560
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
6561
            m128iS3 = _mm_packs_epi32(
6562
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
6563
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
6564
            m128iS4 = _mm_packs_epi32(
6565
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
6566
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
6567
            m128iS5 = _mm_packs_epi32(
6568
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
6569
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
6570
            m128iS6 = _mm_packs_epi32(
6571
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
6572
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
6573
            m128iS7 = _mm_packs_epi32(
6574
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
6575
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
6576
            m128iS8 = _mm_packs_epi32(
6577
                    _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
6578
                    _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
6579
            m128iS9 = _mm_packs_epi32(
6580
                    _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
6581
                    _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
6582
            m128iS10 = _mm_packs_epi32(
6583
                    _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
6584
                    _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
6585
            m128iS11 = _mm_packs_epi32(
6586
                    _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
6587
                    _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
6588
            m128iS12 = _mm_packs_epi32(
6589
                    _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
6590
                    _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
6591
            m128iS13 = _mm_packs_epi32(
6592
                    _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
6593
                    _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
6594
            m128iS14 = _mm_packs_epi32(
6595
                    _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
6596
                    _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
6597
            m128iS15 = _mm_packs_epi32(
6598
                    _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
6599
                    _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
6600
6601
            m128iS31 = _mm_packs_epi32(
6602
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
6603
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
6604
            m128iS30 = _mm_packs_epi32(
6605
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
6606
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
6607
            m128iS29 = _mm_packs_epi32(
6608
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
6609
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
6610
            m128iS28 = _mm_packs_epi32(
6611
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
6612
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
6613
            m128iS27 = _mm_packs_epi32(
6614
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
6615
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
6616
            m128iS26 = _mm_packs_epi32(
6617
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
6618
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
6619
            m128iS25 = _mm_packs_epi32(
6620
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
6621
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
6622
            m128iS24 = _mm_packs_epi32(
6623
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
6624
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
6625
            m128iS23 = _mm_packs_epi32(
6626
                    _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
6627
                    _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
6628
            m128iS22 = _mm_packs_epi32(
6629
                    _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
6630
                    _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
6631
            m128iS21 = _mm_packs_epi32(
6632
                    _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
6633
                    _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
6634
            m128iS20 = _mm_packs_epi32(
6635
                    _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
6636
                    _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
6637
            m128iS19 = _mm_packs_epi32(
6638
                    _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
6639
                    _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
6640
            m128iS18 = _mm_packs_epi32(
6641
                    _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
6642
                    _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
6643
            m128iS17 = _mm_packs_epi32(
6644
                    _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
6645
                    _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
6646
            m128iS16 = _mm_packs_epi32(
6647
                    _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
6648
                    _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
6649
6650
            if (!j) {
6651
                /*      Transpose the matrix      */
6652
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
6653
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
6654
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
6655
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
6656
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
6657
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
6658
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
6659
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
6660
                E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
6661
                E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
6662
                E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
6663
                E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
6664
                E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
6665
                E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
6666
                E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
6667
                E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
6668
6669
                O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
6670
                O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
6671
                O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
6672
                O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
6673
                O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
6674
                O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
6675
                O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
6676
                O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
6677
                O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
6678
                O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
6679
                O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
6680
                O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
6681
                O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
6682
                O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
6683
                O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
6684
                O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
6685
6686
                E0h = _mm_unpacklo_epi16(E0l, E8l);
6687
                E1h = _mm_unpacklo_epi16(E1l, E9l);
6688
                E2h = _mm_unpacklo_epi16(E2l, E10l);
6689
                E3h = _mm_unpacklo_epi16(E3l, E11l);
6690
                E4h = _mm_unpacklo_epi16(E4l, E12l);
6691
                E5h = _mm_unpacklo_epi16(E5l, E13l);
6692
                E6h = _mm_unpacklo_epi16(E6l, E14l);
6693
                E7h = _mm_unpacklo_epi16(E7l, E15l);
6694
6695
                E8h = _mm_unpackhi_epi16(E0l, E8l);
6696
                E9h = _mm_unpackhi_epi16(E1l, E9l);
6697
                E10h = _mm_unpackhi_epi16(E2l, E10l);
6698
                E11h = _mm_unpackhi_epi16(E3l, E11l);
6699
                E12h = _mm_unpackhi_epi16(E4l, E12l);
6700
                E13h = _mm_unpackhi_epi16(E5l, E13l);
6701
                E14h = _mm_unpackhi_epi16(E6l, E14l);
6702
                E15h = _mm_unpackhi_epi16(E7l, E15l);
6703
6704
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
6705
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
6706
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
6707
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
6708
6709
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6710
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6711
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6712
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6713
6714
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6715
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6716
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6717
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6718
6719
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
6720
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
6721
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
6722
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
6723
6724
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6725
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6726
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6727
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6728
6729
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6730
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6731
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6732
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6733
6734
                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
6735
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
6736
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
6737
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
6738
6739
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6740
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6741
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6742
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6743
6744
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6745
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6746
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6747
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6748
6749
                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
6750
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
6751
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
6752
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
6753
6754
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6755
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6756
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6757
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6758
6759
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6760
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6761
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6762
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6763
6764
                /*  */
6765
                E0h = _mm_unpacklo_epi16(O0l, O8l);
6766
                E1h = _mm_unpacklo_epi16(O1l, O9l);
6767
                E2h = _mm_unpacklo_epi16(O2l, O10l);
6768
                E3h = _mm_unpacklo_epi16(O3l, O11l);
6769
                E4h = _mm_unpacklo_epi16(O4l, O12l);
6770
                E5h = _mm_unpacklo_epi16(O5l, O13l);
6771
                E6h = _mm_unpacklo_epi16(O6l, O14l);
6772
                E7h = _mm_unpacklo_epi16(O7l, O15l);
6773
6774
                E8h = _mm_unpackhi_epi16(O0l, O8l);
6775
                E9h = _mm_unpackhi_epi16(O1l, O9l);
6776
                E10h = _mm_unpackhi_epi16(O2l, O10l);
6777
                E11h = _mm_unpackhi_epi16(O3l, O11l);
6778
                E12h = _mm_unpackhi_epi16(O4l, O12l);
6779
                E13h = _mm_unpackhi_epi16(O5l, O13l);
6780
                E14h = _mm_unpackhi_epi16(O6l, O14l);
6781
                E15h = _mm_unpackhi_epi16(O7l, O15l);
6782
6783
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
6784
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
6785
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
6786
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
6787
6788
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6789
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6790
                m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6791
                m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6792
6793
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6794
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6795
                m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6796
                m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6797
6798
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
6799
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
6800
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
6801
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
6802
6803
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6804
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6805
                m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6806
                m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6807
6808
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6809
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6810
                m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6811
                m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6812
6813
                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
6814
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
6815
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
6816
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
6817
6818
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6819
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6820
                m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6821
                m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6822
6823
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6824
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6825
                m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6826
                m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6827
6828
                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
6829
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
6830
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
6831
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
6832
6833
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6834
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6835
                m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6836
                m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6837
6838
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6839
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6840
                m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6841
                m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6842
                /*  */
6843
                _mm_store_si128((__m128i *) (src + i), m128iS0);
6844
                _mm_store_si128((__m128i *) (src + 32 + i), m128iS1);
6845
                _mm_store_si128((__m128i *) (src + 64 + i), m128iS2);
6846
                _mm_store_si128((__m128i *) (src + 96 + i), m128iS3);
6847
                _mm_store_si128((__m128i *) (src + 128 + i), m128iS4);
6848
                _mm_store_si128((__m128i *) (src + 160 + i), m128iS5);
6849
                _mm_store_si128((__m128i *) (src + 192 + i), m128iS6);
6850
                _mm_store_si128((__m128i *) (src + 224 + i), m128iS7);
6851
                _mm_store_si128((__m128i *) (src + 256 + i), m128iS8);
6852
                _mm_store_si128((__m128i *) (src + 288 + i), m128iS9);
6853
                _mm_store_si128((__m128i *) (src + 320 + i), m128iS10);
6854
                _mm_store_si128((__m128i *) (src + 352 + i), m128iS11);
6855
                _mm_store_si128((__m128i *) (src + 384 + i), m128iS12);
6856
                _mm_store_si128((__m128i *) (src + 416 + i), m128iS13);
6857
                _mm_store_si128((__m128i *) (src + 448 + i), m128iS14);
6858
                _mm_store_si128((__m128i *) (src + 480 + i), m128iS15);
6859
                _mm_store_si128((__m128i *) (src + 512 + i), m128iS16);
6860
                _mm_store_si128((__m128i *) (src + 544 + i), m128iS17);
6861
                _mm_store_si128((__m128i *) (src + 576 + i), m128iS18);
6862
                _mm_store_si128((__m128i *) (src + 608 + i), m128iS19);
6863
                _mm_store_si128((__m128i *) (src + 640 + i), m128iS20);
6864
                _mm_store_si128((__m128i *) (src + 672 + i), m128iS21);
6865
                _mm_store_si128((__m128i *) (src + 704 + i), m128iS22);
6866
                _mm_store_si128((__m128i *) (src + 736 + i), m128iS23);
6867
                _mm_store_si128((__m128i *) (src + 768 + i), m128iS24);
6868
                _mm_store_si128((__m128i *) (src + 800 + i), m128iS25);
6869
                _mm_store_si128((__m128i *) (src + 832 + i), m128iS26);
6870
                _mm_store_si128((__m128i *) (src + 864 + i), m128iS27);
6871
                _mm_store_si128((__m128i *) (src + 896 + i), m128iS28);
6872
                _mm_store_si128((__m128i *) (src + 928 + i), m128iS29);
6873
                _mm_store_si128((__m128i *) (src + 960 + i), m128iS30);
6874
                _mm_store_si128((__m128i *) (src + 992 + i), m128iS31);
6875
6876
                if (i <= 16) {
6877
                    int k = i + 8;
6878
                    m128iS0 = _mm_load_si128((__m128i *) (src + k));
6879
                    m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
6880
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
6881
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
6882
                    m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
6883
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
6884
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
6885
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
6886
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
6887
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
6888
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
6889
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
6890
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
6891
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
6892
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
6893
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));
6894
6895
                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
6896
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
6897
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
6898
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
6899
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
6900
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
6901
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
6902
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
6903
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
6904
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
6905
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
6906
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
6907
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
6908
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
6909
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
6910
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
6911
                } else {
6912
                    m128iS0 = _mm_load_si128((__m128i *) (src));
6913
                    m128iS1 = _mm_load_si128((__m128i *) (src + 128));
6914
                    m128iS2 = _mm_load_si128((__m128i *) (src + 256));
6915
                    m128iS3 = _mm_load_si128((__m128i *) (src + 384));
6916
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 512));
6917
                    m128iS5 = _mm_load_si128((__m128i *) (src + 640));
6918
                    m128iS6 = _mm_load_si128((__m128i *) (src + 768));
6919
                    m128iS7 = _mm_load_si128((__m128i *) (src + 896));
6920
                    m128iS8 = _mm_load_si128((__m128i *) (src + 8));
6921
                    m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8));
6922
                    m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8));
6923
                    m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8));
6924
                    m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8));
6925
                    m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8));
6926
                    m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8));
6927
                    m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8));
6928
                    m128iS16 = _mm_load_si128((__m128i *) (src + 16));
6929
                    m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16));
6930
                    m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16));
6931
                    m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16));
6932
                    m128iS20 = _mm_loadu_si128((__m128i *) (src + 512 + 16));
6933
                    m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16));
6934
                    m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16));
6935
                    m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16));
6936
                    m128iS24 = _mm_load_si128((__m128i *) (src + 24));
6937
                    m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24));
6938
                    m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24));
6939
                    m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24));
6940
                    m128iS28 = _mm_loadu_si128((__m128i *) (src + 512 + 24));
6941
                    m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24));
6942
                    m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24));
6943
                    m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24));
6944
                    shift = shift_2nd;
6945
                    m128iAdd = _mm_set1_epi32(add_2nd);
6946
                }
6947
6948
            } else {
6949
                int k, m = 0;
6950
                _mm_storeu_si128((__m128i *) (src), m128iS0);
6951
                _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
6952
                _mm_storeu_si128((__m128i *) (src + 16), m128iS2);
6953
                _mm_storeu_si128((__m128i *) (src + 24), m128iS3);
6954
                _mm_storeu_si128((__m128i *) (src + 128), m128iS4);
6955
                _mm_storeu_si128((__m128i *) (src + 128 + 8), m128iS5);
6956
                _mm_storeu_si128((__m128i *) (src + 128 + 16), m128iS6);
6957
                _mm_storeu_si128((__m128i *) (src + 128 + 24), m128iS7);
6958
                _mm_storeu_si128((__m128i *) (src + 256), m128iS8);
6959
                _mm_storeu_si128((__m128i *) (src + 256 + 8), m128iS9);
6960
                _mm_storeu_si128((__m128i *) (src + 256 + 16), m128iS10);
6961
                _mm_storeu_si128((__m128i *) (src + 256 + 24), m128iS11);
6962
                _mm_storeu_si128((__m128i *) (src + 384), m128iS12);
6963
                _mm_storeu_si128((__m128i *) (src + 384 + 8), m128iS13);
6964
                _mm_storeu_si128((__m128i *) (src + 384 + 16), m128iS14);
6965
                _mm_storeu_si128((__m128i *) (src + 384 + 24), m128iS15);
6966
6967
                _mm_storeu_si128((__m128i *) (src + 512), m128iS16);
6968
                _mm_storeu_si128((__m128i *) (src + 512 + 8), m128iS17);
6969
                _mm_storeu_si128((__m128i *) (src + 512 + 16), m128iS18);
6970
                _mm_storeu_si128((__m128i *) (src + 512 + 24), m128iS19);
6971
                _mm_storeu_si128((__m128i *) (src + 640), m128iS20);
6972
                _mm_storeu_si128((__m128i *) (src + 640 + 8), m128iS21);
6973
                _mm_storeu_si128((__m128i *) (src + 640 + 16), m128iS22);
6974
                _mm_storeu_si128((__m128i *) (src + 640 + 24), m128iS23);
6975
                _mm_storeu_si128((__m128i *) (src + 768), m128iS24);
6976
                _mm_storeu_si128((__m128i *) (src + 768 + 8), m128iS25);
6977
                _mm_storeu_si128((__m128i *) (src + 768 + 16), m128iS26);
6978
                _mm_storeu_si128((__m128i *) (src + 768 + 24), m128iS27);
6979
                _mm_storeu_si128((__m128i *) (src + 896), m128iS28);
6980
                _mm_storeu_si128((__m128i *) (src + 896 + 8), m128iS29);
6981
                _mm_storeu_si128((__m128i *) (src + 896 + 16), m128iS30);
6982
                _mm_storeu_si128((__m128i *) (src + 896 + 24), m128iS31);
6983
                dst = (uint16_t*) _dst + (i * stride);
6984
                for (k = 0; k < 8; k++) {
6985
                    dst[0] = av_clip_uintp2(dst[0] + src[m],10);
6986
                    dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
6987
                    dst[2] = av_clip_uintp2(dst[2] + src[m + 16],10);
6988
                    dst[3] = av_clip_uintp2(dst[3] + src[m + 24],10);
6989
                    dst[4] = av_clip_uintp2(
6990
                            dst[4] + src[m + 128],10);
6991
                    dst[5] = av_clip_uintp2(
6992
                            dst[5] + src[m + 128 + 8],10);
6993
                    dst[6] = av_clip_uintp2(
6994
                            dst[6] + src[m + 128 + 16],10);
6995
                    dst[7] = av_clip_uintp2(
6996
                            dst[7] + src[m + 128 + 24],10);
6997
6998
                    dst[8] = av_clip_uintp2(
6999
                            dst[8] + src[m + 256],10);
7000
                    dst[9] = av_clip_uintp2(
7001
                            dst[9] + src[m + 256 + 8],10);
7002
                    dst[10] = av_clip_uintp2(
7003
                            dst[10] + src[m + 256 + 16],10);
7004
                    dst[11] = av_clip_uintp2(
7005
                            dst[11] + src[m + 256 + 24],10);
7006
                    dst[12] = av_clip_uintp2(
7007
                            dst[12] + src[m + 384],10);
7008
                    dst[13] = av_clip_uintp2(
7009
                            dst[13] + src[m + 384 + 8],10);
7010
                    dst[14] = av_clip_uintp2(
7011
                            dst[14] + src[m + 384 + 16],10);
7012
                    dst[15] = av_clip_uintp2(
7013
                            dst[15] + src[m + 384 + 24],10);
7014
7015
                    dst[16] = av_clip_uintp2(
7016
                            dst[16] + src[m + 512],10);
7017
                    dst[17] = av_clip_uintp2(
7018
                            dst[17] + src[m + 512 + 8],10);
7019
                    dst[18] = av_clip_uintp2(
7020
                            dst[18] + src[m + 512 + 16],10);
7021
                    dst[19] = av_clip_uintp2(
7022
                            dst[19] + src[m + 512 + 24],10);
7023
                    dst[20] = av_clip_uintp2(
7024
                            dst[20] + src[m + 640],10);
7025
                    dst[21] = av_clip_uintp2(
7026
                            dst[21] + src[m + 640 + 8],10);
7027
                    dst[22] = av_clip_uintp2(
7028
                            dst[22] + src[m + 640 + 16],10);
7029
                    dst[23] = av_clip_uintp2(
7030
                            dst[23] + src[m + 640 + 24],10);
7031
7032
                    dst[24] = av_clip_uintp2(
7033
                            dst[24] + src[m + 768],10);
7034
                    dst[25] = av_clip_uintp2(
7035
                            dst[25] + src[m + 768 + 8],10);
7036
                    dst[26] = av_clip_uintp2(
7037
                            dst[26] + src[m + 768 + 16],10);
7038
                    dst[27] = av_clip_uintp2(
7039
                            dst[27] + src[m + 768 + 24],10);
7040
                    dst[28] = av_clip_uintp2(
7041
                            dst[28] + src[m + 896],10);
7042
                    dst[29] = av_clip_uintp2(
7043
                            dst[29] + src[m + 896 + 8],10);
7044
                    dst[30] = av_clip_uintp2(
7045
                            dst[30] + src[m + 896 + 16],10);
7046
                    dst[31] = av_clip_uintp2(
7047
                            dst[31] + src[m + 896 + 24],10);
7048
7049
                    m += 1;
7050
                    dst += stride;
7051
                }
7052
                if (i <= 16) {
7053
                    int k = (i + 8) * 4;
7054
                    m128iS0 = _mm_load_si128((__m128i *) (src + k));
7055
                    m128iS1 = _mm_load_si128((__m128i *) (src + 128 + k));
7056
                    m128iS2 = _mm_load_si128((__m128i *) (src + 256 + k));
7057
                    m128iS3 = _mm_load_si128((__m128i *) (src + 384 + k));
7058
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 512 + k));
7059
                    m128iS5 = _mm_load_si128((__m128i *) (src + 640 + k));
7060
                    m128iS6 = _mm_load_si128((__m128i *) (src + 768 + k));
7061
                    m128iS7 = _mm_load_si128((__m128i *) (src + 896 + k));
7062
                    m128iS8 = _mm_load_si128((__m128i *) (src + 8 + k));
7063
                    m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8 + k));
7064
                    m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8 + k));
7065
                    m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8 + k));
7066
                    m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8 + k));
7067
                    m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8 + k));
7068
                    m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8 + k));
7069
                    m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8 + k));
7070
                    m128iS16 = _mm_load_si128((__m128i *) (src + 16 + k));
7071
                    m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16 + k));
7072
                    m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16 + k));
7073
                    m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16 + k));
7074
                    m128iS20 = _mm_loadu_si128(
7075
                            (__m128i *) (src + 512 + 16 + k));
7076
                    m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16 + k));
7077
                    m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16 + k));
7078
                    m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16 + k));
7079
                    m128iS24 = _mm_load_si128((__m128i *) (src + 24 + k));
7080
                    m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24 + k));
7081
                    m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24 + k));
7082
                    m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24 + k));
7083
                    m128iS28 = _mm_loadu_si128(
7084
                            (__m128i *) (src + 512 + 24 + k));
7085
                    m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24 + k));
7086
                    m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24 + k));
7087
                    m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24 + k));
7088
                }
7089
            }
7090
        }
7091
    }
7092
}
7093
#endif
7094
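
Note (not part of the generated report): the stores above all follow the same even/odd butterfly used throughout this file. Each output row pair is formed as (E + O + rounding) >> shift and (E - O + rounding) >> shift, saturated to 16 bits by _mm_packs_epi32, and in the second pass the residual is added into the 10-bit destination via av_clip_uintp2. A minimal scalar sketch of that per-coefficient step is shown below; the names sat16, clip_uintp2 and butterfly_pair are hypothetical stand-ins for the SSE intrinsics, not symbols from the source.

#include <stdint.h>

/* Saturate a 32-bit intermediate to int16_t, as _mm_packs_epi32 does. */
static inline int16_t sat16(int32_t v)
{
    if (v >  32767) return  32767;
    if (v < -32768) return -32768;
    return (int16_t) v;
}

/* Clamp to an unsigned 'bits'-bit range, as av_clip_uintp2(v, 10) does
   for the 10-bit destination samples in the residual-add loop. */
static inline uint16_t clip_uintp2(int v, int bits)
{
    int max = (1 << bits) - 1;
    if (v < 0)   return 0;
    if (v > max) return (uint16_t) max;
    return (uint16_t) v;
}

/* One even/odd butterfly: e_plus_round already contains the rounding
   offset (1 << (shift - 1)), mirroring how m128iAdd is pre-added to
   every E term before the final shifts. */
static inline void butterfly_pair(int32_t e_plus_round, int32_t o, int shift,
                                  int16_t *out_lo, int16_t *out_hi)
{
    *out_lo = sat16((e_plus_round + o) >> shift); /* rows 0..15  (m128iS0..15)  */
    *out_hi = sat16((e_plus_round - o) >> shift); /* rows 31..16 (m128iS31..16) */
}

Folding the rounding constant into E ahead of time keeps the final stage down to one add/sub, one shift and one pack per lane, which is the shape the vectorized code above takes.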