Coverage Report

Created: 2026-06-15 06:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/libde265/libde265/x86/sse-dct.cc
Line
Count
Source
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2013 openHEVC contributors
4
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
5
 *
6
 * This file is part of libde265.
7
 *
8
 * libde265 is free software: you can redistribute it and/or modify
9
 * it under the terms of the GNU Lesser General Public License as
10
 * published by the Free Software Foundation, either version 3 of
11
 * the License, or (at your option) any later version.
12
 *
13
 * libde265 is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public License
19
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
20
 */
21
22
#include "x86/sse-dct.h"
23
#include "libde265/util.h"
24
25
#ifdef HAVE_CONFIG_H
26
#include "config.h"
27
#endif
28
#include <string.h>    // memcpy
29
#include <emmintrin.h> // SSE2
30
#include <tmmintrin.h> // SSSE3
31
32
#if HAVE_SSE4_1
33
#include <smmintrin.h> // SSE4.1
34
#endif
35
36
37
/* Coefficient table read by the 4x4 luma transform functions in this file
 * (e.g. ff_hevc_transform_4x4_luma_add_8_sse4).
 *
 * Every row holds one pair of 16-bit coefficients repeated four times, so a
 * single _mm_madd_epi16 applies that pair to four interleaved input pairs at
 * once.  Rows 2k and 2k+1 are used together: the first is multiplied with the
 * unpacklo-interleaved inputs, the second with the unpackhi-interleaved
 * inputs, and the two products are summed to give output k (k = 0..3).
 *
 * Rows are fetched with the aligned _mm_load_si128, which is why the table is
 * declared through ALIGNED_16. */
ALIGNED_16(static const int16_t) transform4x4_luma[8][8] =
38
{
39
    {   29, +84, 29,  +84,  29, +84,  29, +84 },
40
    {  +74, +55, +74, +55, +74, +55, +74, +55 },
41
    {   55, -29,  55, -29,  55, -29,  55, -29 },
42
    {  +74, -84, +74, -84, +74, -84, +74, -84 },
43
    {   74, -74,  74, -74,  74, -74,  74, -74 },
44
    {    0, +74,   0, +74,   0, +74,   0, +74 },
45
    {   84, +55,  84, +55,  84, +55,  84, +55 },
46
    {  -74, -29, -74, -29, -74, -29, -74, -29 }
47
};
48
49
/* Coefficient pairs for the 4x4 transform (see
 * ff_hevc_transform_4x4_add_8_sse4), each pair repeated four times so one
 * _mm_madd_epi16 covers four interleaved input pairs.
 * Rows 0-1 are multiplied with the unpacklo-interleaved inputs and yield the
 * E1/E2 terms; rows 2-3 are multiplied with the unpackhi-interleaved inputs
 * and yield the O1/O2 terms. */
ALIGNED_16(static const int16_t) transform4x4[4][8] = {
50
    { 64,  64, 64,  64, 64,  64, 64,  64 },
51
    { 64, -64, 64, -64, 64, -64, 64, -64 },
52
    { 83,  36, 83,  36, 83,  36, 83,  36 },
53
    { 36, -83, 36, -83, 36, -83, 36, -83 }
54
};
55
56
/* Coefficient pairs for the 8x8 transform, stored like transform4x4: one
 * 16-bit pair per row, repeated four times.  Rows 8-11 carry the same values
 * as the four rows of transform4x4.
 * NOTE(review): the code reading this table is outside the visible part of
 * the file; presumably rows 0-7 feed the odd half of the 8-point butterfly --
 * confirm against the 8x8 transform function. */
ALIGNED_16(static const int16_t) transform8x8[12][8] =
57
{
58
    {  89,  75,  89,  75, 89,  75, 89,  75 },
59
    {  50,  18,  50,  18, 50,  18, 50,  18 },
60
    {  75, -18,  75, -18, 75, -18, 75, -18 },
61
    { -89, -50, -89, -50,-89, -50,-89, -50 },
62
    {  50, -89,  50, -89, 50, -89, 50, -89 },
63
    {  18,  75,  18,  75, 18,  75, 18,  75 },
64
    {  18, -50,  18, -50, 18, -50, 18, -50 },
65
    {  75, -89,  75, -89, 75, -89, 75, -89 },
66
    {  64,  64,  64,  64, 64,  64, 64,  64 },
67
    {  64, -64,  64, -64, 64, -64, 64, -64 },
68
    {  83,  36,  83,  36, 83,  36, 83,  36 },
69
    {  36, -83,  36, -83, 36, -83, 36, -83 }
70
};
71
72
/* Four groups of eight pair-replicated coefficient rows (one 16-bit pair
 * repeated four times per row, the layout _mm_madd_epi16 consumes).
 * NOTE(review): the users of this table are outside the visible part of the
 * file.  The two comments on each group presumably name the pair of input
 * rows the group is multiplied with, first for the 16x16 transform and then
 * for the 32x32 transform -- confirm against those functions. */
ALIGNED_16(static const int16_t) transform16x16_1[4][8][8] =
73
{
74
    {/*1-3*/ /*2-6*/
75
        { 90,  87,  90,  87,  90,  87,  90,  87 },
76
        { 87,  57,  87,  57,  87,  57,  87,  57 },
77
        { 80,   9,  80,   9,  80,   9,  80,   9 },
78
        { 70, -43,  70, -43,  70, -43,  70, -43 },
79
        { 57, -80,  57, -80,  57, -80,  57, -80 },
80
        { 43, -90,  43, -90,  43, -90,  43, -90 },
81
        { 25, -70,  25, -70,  25, -70,  25, -70 },
82
        { 9,  -25,   9, -25,   9, -25,   9, -25 },
83
    },{ /*5-7*/ /*10-14*/
84
        {  80,  70,  80,  70,  80,  70,  80,  70 },
85
        {   9, -43,   9, -43,   9, -43,   9, -43 },
86
        { -70, -87, -70, -87, -70, -87, -70, -87 },
87
        { -87,   9, -87,   9, -87,   9, -87,   9 },
88
        { -25,  90, -25,  90, -25,  90, -25,  90 },
89
        {  57,  25,  57,  25,  57,  25,  57,  25 },
90
        {  90, -80,  90, -80,  90, -80,  90, -80 },
91
        {  43, -57,  43, -57,  43, -57,  43, -57 },
92
    },{ /*9-11*/ /*18-22*/
93
        {  57,  43,  57,  43,  57,  43,  57,  43 },
94
        { -80, -90, -80, -90, -80, -90, -80, -90 },
95
        { -25,  57, -25,  57, -25,  57, -25,  57 },
96
        {  90,  25,  90,  25,  90,  25,  90,  25 },
97
        {  -9,  -87, -9,  -87, -9,  -87, -9, -87 },
98
        { -87,  70, -87,  70, -87,  70, -87,  70 },
99
        {  43,   9,  43,   9,  43,   9,  43,   9 },
100
        {  70, -80,  70, -80,  70, -80,  70, -80 },
101
    },{/*13-15*/ /*  26-30   */
102
        {  25,   9,  25,   9,  25,   9,  25,   9 },
103
        { -70, -25, -70, -25, -70, -25, -70, -25 },
104
        {  90,  43,  90,  43,  90,  43,  90,  43 },
105
        { -80, -57, -80, -57, -80, -57, -80, -57 },
106
        {  43,  70,  43,  70,  43,  70,  43,  70 },
107
        {  9,  -80,   9, -80,   9, -80,   9, -80 },
108
        { -57,  87, -57,  87, -57,  87, -57,  87 },
109
        {  87, -90,  87, -90,  87, -90,  87, -90 },
110
    }
111
};
112
113
/* Two groups of four pair-replicated coefficient rows (one 16-bit pair
 * repeated four times per row).
 * NOTE(review): the users of this table are outside the visible part of the
 * file; the group comments presumably name the input-row pairs for the 16x16
 * and 32x32 transforms respectively -- confirm against those functions. */
ALIGNED_16(static const int16_t) transform16x16_2[2][4][8] =
114
{
115
    { /*2-6*/ /*4-12*/
116
        { 89,  75,  89,  75, 89,  75, 89,  75 },
117
        { 75, -18,  75, -18, 75, -18, 75, -18 },
118
        { 50, -89,  50, -89, 50, -89, 50, -89 },
119
        { 18, -50,  18, -50, 18, -50, 18, -50 },
120
    },{ /*10-14*/  /*20-28*/
121
        {  50,  18,  50,  18,  50,  18,  50,  18 },
122
        { -89, -50, -89, -50, -89, -50, -89, -50 },
123
        {  18,  75,  18,  75,  18,  75,  18,  75 },
124
        {  75, -89,  75, -89,  75, -89,  75, -89 },
125
    }
126
};
127
128
/* Two groups of two pair-replicated coefficient rows.  Group 0 carries the
 * same values as rows 2-3 of transform4x4, group 1 the same values as rows
 * 0-1 of transform4x4.
 * NOTE(review): the users of this table are outside the visible part of the
 * file; the group comments presumably name the input-row pairs for the 16x16
 * and 32x32 transforms respectively -- confirm against those functions. */
ALIGNED_16(static const int16_t) transform16x16_3[2][2][8] =
129
{
130
    {/*4-12*/ /*8-24*/
131
        {  83,  36,  83,  36,  83,  36,  83,  36 },
132
        {  36, -83,  36, -83,  36, -83,  36, -83 },
133
    },{ /*0-8*/  /*0-16*/
134
        { 64,  64, 64,  64, 64,  64, 64,  64 },
135
        { 64, -64, 64, -64, 64, -64, 64, -64 },
136
    }
137
};
138
139
140
/* Eight groups of sixteen pair-replicated coefficient rows (one 16-bit pair
 * repeated four times per row, the layout _mm_madd_epi16 consumes).
 * NOTE(review): the users of this table are outside the visible part of the
 * file; the comment on each group presumably names the pair of odd input rows
 * of the 32x32 transform that the group is multiplied with -- confirm against
 * that function. */
ALIGNED_16(static const int16_t) transform32x32[8][16][8] =
141
{
142
    { /*   1-3     */
143
        { 90,  90, 90,  90, 90,  90, 90,  90 },
144
        { 90,  82, 90,  82, 90,  82, 90,  82 },
145
        { 88,  67, 88,  67, 88,  67, 88,  67 },
146
        { 85,  46, 85,  46, 85,  46, 85,  46 },
147
        { 82,  22, 82,  22, 82,  22, 82,  22 },
148
        { 78,  -4, 78,  -4, 78,  -4, 78,  -4 },
149
        { 73, -31, 73, -31, 73, -31, 73, -31 },
150
        { 67, -54, 67, -54, 67, -54, 67, -54 },
151
        { 61, -73, 61, -73, 61, -73, 61, -73 },
152
        { 54, -85, 54, -85, 54, -85, 54, -85 },
153
        { 46, -90, 46, -90, 46, -90, 46, -90 },
154
        { 38, -88, 38, -88, 38, -88, 38, -88 },
155
        { 31, -78, 31, -78, 31, -78, 31, -78 },
156
        { 22, -61, 22, -61, 22, -61, 22, -61 },
157
        { 13, -38, 13, -38, 13, -38, 13, -38 },
158
        { 4,  -13,  4, -13,  4, -13,  4, -13 },
159
    },{/*  5-7 */
160
        {  88,  85,  88,  85,  88,  85,  88,  85 },
161
        {  67,  46,  67,  46,  67,  46,  67,  46 },
162
        {  31, -13,  31, -13,  31, -13,  31, -13 },
163
        { -13, -67, -13, -67, -13, -67, -13, -67 },
164
        { -54, -90, -54, -90, -54, -90, -54, -90 },
165
        { -82, -73, -82, -73, -82, -73, -82, -73 },
166
        { -90, -22, -90, -22, -90, -22, -90, -22 },
167
        { -78,  38, -78,  38, -78,  38, -78,  38 },
168
        { -46,  82, -46,  82, -46,  82, -46,  82 },
169
        {  -4,  88,  -4,  88,  -4,  88,  -4,  88 },
170
        {  38,  54,  38,  54,  38,  54,  38,  54 },
171
        {  73,  -4,  73,  -4,  73,  -4,  73,  -4 },
172
        {  90, -61,  90, -61,  90, -61,  90, -61 },
173
        {  85, -90,  85, -90,  85, -90,  85, -90 },
174
        {  61, -78,  61, -78,  61, -78,  61, -78 },
175
        {  22, -31,  22, -31,  22, -31,  22, -31 },
176
    },{/*  9-11   */
177
        {  82,  78,  82,  78,  82,  78,  82,  78 },
178
        {  22,  -4,  22,  -4,  22,  -4,  22,  -4 },
179
        { -54, -82, -54, -82, -54, -82, -54, -82 },
180
        { -90, -73, -90, -73, -90, -73, -90, -73 },
181
        { -61,  13, -61,  13, -61,  13, -61,  13 },
182
        {  13,  85,  13,  85,  13,  85,  13,  85 },
183
        {  78,  67,  78,  67,  78,  67,  78,  67 },
184
        {  85, -22,  85, -22,  85, -22,  85, -22 },
185
        {  31, -88,  31, -88,  31, -88,  31, -88 },
186
        { -46, -61, -46, -61, -46, -61, -46, -61 },
187
        { -90,  31, -90,  31, -90,  31, -90,  31 },
188
        { -67,  90, -67,  90, -67,  90, -67,  90 },
189
        {   4,  54,   4,  54,   4,  54,   4,  54 },
190
        {  73, -38,  73, -38,  73, -38,  73, -38 },
191
        {  88, -90,  88, -90,  88, -90,  88, -90 },
192
        {  38, -46,  38, -46,  38, -46,  38, -46 },
193
    },{/*  13-15   */
194
        {  73,  67,  73,  67,  73,  67,  73,  67 },
195
        { -31, -54, -31, -54, -31, -54, -31, -54 },
196
        { -90, -78, -90, -78, -90, -78, -90, -78 },
197
        { -22,  38, -22,  38, -22,  38, -22,  38 },
198
        {  78,  85,  78,  85,  78,  85,  78,  85 },
199
        {  67, -22,  67, -22,  67, -22,  67, -22 },
200
        { -38, -90, -38, -90, -38, -90, -38, -90 },
201
        { -90,   4, -90,   4, -90,   4, -90,   4 },
202
        { -13,  90, -13,  90, -13,  90, -13,  90 },
203
        {  82,  13,  82,  13,  82,  13,  82,  13 },
204
        {  61, -88,  61, -88,  61, -88,  61, -88 },
205
        { -46, -31, -46, -31, -46, -31, -46, -31 },
206
        { -88,  82, -88,  82, -88,  82, -88,  82 },
207
        { -4,   46, -4,   46, -4,   46, -4,   46 },
208
        {  85, -73,  85, -73,  85, -73,  85, -73 },
209
        {  54, -61,  54, -61,  54, -61,  54, -61 },
210
    },{/*  17-19   */
211
        {  61,  54,  61,  54,  61,  54,  61,  54 },
212
        { -73, -85, -73, -85, -73, -85, -73, -85 },
213
        { -46,  -4, -46,  -4, -46,  -4, -46,  -4 },
214
        {  82,  88,  82,  88,  82,  88,  82,  88 },
215
        {  31, -46,  31, -46,  31, -46,  31, -46 },
216
        { -88, -61, -88, -61, -88, -61, -88, -61 },
217
        { -13,  82, -13,  82, -13,  82, -13,  82 },
218
        {  90,  13,  90,  13,  90,  13,  90,  13 },
219
        { -4, -90,  -4, -90,  -4, -90,  -4, -90 },
220
        { -90,  38, -90,  38, -90,  38, -90,  38 },
221
        {  22,  67,  22,  67,  22,  67,  22,  67 },
222
        {  85, -78,  85, -78,  85, -78,  85, -78 },
223
        { -38, -22, -38, -22, -38, -22, -38, -22 },
224
        { -78,  90, -78,  90, -78,  90, -78,  90 },
225
        {  54, -31,  54, -31,  54, -31,  54, -31 },
226
        {  67, -73,  67, -73,  67, -73,  67, -73 },
227
    },{ /*  21-23   */
228
        {  46,  38,  46,  38,  46,  38,  46,  38 },
229
        { -90, -88, -90, -88, -90, -88, -90, -88 },
230
        {  38,  73,  38,  73,  38,  73,  38,  73 },
231
        {  54,  -4,  54,  -4,  54,  -4,  54,  -4 },
232
        { -90, -67, -90, -67, -90, -67, -90, -67 },
233
        {  31,  90,  31,  90,  31,  90,  31,  90 },
234
        {  61, -46,  61, -46,  61, -46,  61, -46 },
235
        { -88, -31, -88, -31, -88, -31, -88, -31 },
236
        {  22,  85,  22,  85,  22,  85,  22,  85 },
237
        {  67, -78,  67, -78,  67, -78,  67, -78 },
238
        { -85,  13, -85,  13, -85,  13, -85,  13 },
239
        {  13,  61,  13,  61,  13,  61,  13,  61 },
240
        {  73, -90,  73, -90,  73, -90,  73, -90 },
241
        { -82,  54, -82,  54, -82,  54, -82,  54 },
242
        {   4,  22,   4,  22,   4,  22,   4,  22 },
243
        {  78, -82,  78, -82,  78, -82,  78, -82 },
244
    },{ /*  25-27   */
245
        {  31,  22,  31,  22,  31,  22,  31,  22 },
246
        { -78, -61, -78, -61, -78, -61, -78, -61 },
247
        {  90,  85,  90,  85,  90,  85,  90,  85 },
248
        { -61, -90, -61, -90, -61, -90, -61, -90 },
249
        {   4,  73,   4,  73,   4,  73,   4,  73 },
250
        {  54, -38,  54, -38,  54, -38,  54, -38 },
251
        { -88,  -4, -88,  -4, -88,  -4, -88,  -4 },
252
        {  82,  46,  82,  46,  82,  46,  82,  46 },
253
        { -38, -78, -38, -78, -38, -78, -38, -78 },
254
        { -22,  90, -22,  90, -22,  90, -22,  90 },
255
        {  73, -82,  73, -82,  73, -82,  73, -82 },
256
        { -90,  54, -90,  54, -90,  54, -90,  54 },
257
        {  67, -13,  67, -13,  67, -13,  67, -13 },
258
        { -13, -31, -13, -31, -13, -31, -13, -31 },
259
        { -46,  67, -46,  67, -46,  67, -46,  67 },
260
        {  85, -88,  85, -88,  85, -88,  85, -88 },
261
    },{/*  29-31   */
262
        {  13,   4,  13,   4,  13,   4,  13,   4 },
263
        { -38, -13, -38, -13, -38, -13, -38, -13 },
264
        {  61,  22,  61,  22,  61,  22,  61,  22 },
265
        { -78, -31, -78, -31, -78, -31, -78, -31 },
266
        {  88,  38,  88,  38,  88,  38,  88,  38 },
267
        { -90, -46, -90, -46, -90, -46, -90, -46 },
268
        {  85,  54,  85,  54,  85,  54,  85,  54 },
269
        { -73, -61, -73, -61, -73, -61, -73, -61 },
270
        {  54,  67,  54,  67,  54,  67,  54,  67 },
271
        { -31, -73, -31, -73, -31, -73, -31, -73 },
272
        {   4,  78,   4,  78,   4,  78,   4,  78 },
273
        {  22, -82,  22, -82,  22, -82,  22, -82 },
274
        { -46,  85, -46,  85, -46,  85, -46,  85 },
275
        {  67, -88,  67, -88,  67, -88,  67, -88 },
276
        { -82,  90, -82,  90, -82,  90, -82,  90 },
277
        {  90, -90,  90, -90,  90, -90,  90, -90 },
278
    }
279
};
280
281
0
/* Right shift applied to the 32-bit sums of the first 1-D transform pass. */
#define shift_1st 7
282
0
/* Rounding offset added before that shift: half of (1 << shift_1st), i.e. 64. */
#define add_1st (1 << (shift_1st - 1))
283
284
285
/**
 * Transform-skip residual add for one 4x4 block of 8-bit samples.
 *
 * Every coefficient is rounded as (coeff + 16) >> 5 (the addition saturates
 * at the int16_t range) and added to the pixel at the same position; the sum
 * is clamped to 0..255 and written back.
 *
 * Each row is read and written as exactly four bytes through memcpy.  The
 * earlier 8-byte row loads touched four bytes beyond the block, and the
 * stores went through a uint32_t lvalue, which assumed 4-byte alignment of
 * the destination and type-punned the uint8_t storage.
 *
 * @param _dst     top-left pixel of the block; no alignment is required
 * @param coeffs   16 coefficients, four per row; must be 16-byte aligned
 *                 because they are fetched with _mm_load_si128
 * @param _stride  distance in bytes between two consecutive rows of _dst
 */
void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride)
{
    const int shift = 5;
    const int offset = 16;
    const __m128i zero = _mm_setzero_si128();
    const __m128i rounding = _mm_set1_epi16(offset);
    __m128i res01, res23, pix01, pix23, packed;
    int32_t rowbits[4];
    int row;

    /* Residual rows 0-1 and 2-3: (coeff + offset) >> shift. */
    res01 = _mm_load_si128((const __m128i *)(coeffs));
    res23 = _mm_load_si128((const __m128i *)(coeffs + 8));
    res01 = _mm_srai_epi16(_mm_adds_epi16(res01, rounding), shift);
    res23 = _mm_srai_epi16(_mm_adds_epi16(res23, rounding), shift);

    /* Fetch the four pixels of every row, nothing more. */
    for (row = 0; row < 4; row++) {
        memcpy(&rowbits[row], _dst + row * _stride, sizeof rowbits[row]);
    }

    /* Widen the pixels to 16 bit: two rows per register. */
    pix01 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, rowbits[1], rowbits[0]), zero);
    pix23 = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, rowbits[3], rowbits[2]), zero);

    /* Add the residual and clamp to 0..255 while packing back to bytes. */
    packed = _mm_packus_epi16(_mm_adds_epi16(pix01, res01),
                              _mm_adds_epi16(pix23, res23));

    /* The packed register holds rows 0..3 as four consecutive 32-bit lanes. */
    for (row = 0; row < 4; row++) {
        const int32_t out = _mm_cvtsi128_si32(packed);
        memcpy(_dst + row * _stride, &out, sizeof out);
        packed = _mm_srli_si128(packed, 4);
    }
}
341
342
343
344
#if HAVE_SSE4_1
345
void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
346
0
                                           ptrdiff_t _stride) {
347
348
0
    uint8_t shift_2nd = 12; // 20 - Bit depth
349
0
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
350
351
0
    uint8_t *dst = (uint8_t*) _dst;
352
0
    ptrdiff_t stride = _stride;
353
0
    const int16_t *src = coeffs;
354
0
    __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
355
0
            m128iD;
356
0
    m128iAdd = _mm_set1_epi32(64);
357
358
0
    S0 = _mm_load_si128((__m128i *) (src));
359
0
    S8 = _mm_load_si128((__m128i *) (src + 8));
360
361
0
    m128iAC = _mm_unpacklo_epi16(S0, S8);
362
0
    m128iBD = _mm_unpackhi_epi16(S0, S8);
363
364
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
365
0
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
366
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
367
0
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
368
0
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
369
0
    S0 = _mm_add_epi32(S0, m128iAdd);
370
0
    S0 = _mm_srai_epi32(S0, shift_1st);
371
372
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
373
0
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
374
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
375
0
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
376
0
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
377
0
    S8 = _mm_add_epi32(S8, m128iAdd);
378
0
    S8 = _mm_srai_epi32(S8, shift_1st);
379
380
0
    m128iA = _mm_packs_epi32(S0, S8);
381
382
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
383
0
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
384
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
385
0
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
386
0
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
387
0
    S0 = _mm_add_epi32(S0, m128iAdd);
388
0
    S0 = _mm_srai_epi32(S0, shift_1st);
389
390
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
391
0
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
392
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
393
0
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
394
0
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
395
0
    S8 = _mm_add_epi32(S8, m128iAdd);
396
0
    S8 = _mm_srai_epi32(S8, shift_1st);
397
398
0
    m128iD = _mm_packs_epi32(S0, S8);
399
400
0
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
401
0
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);
402
403
0
    m128iA = _mm_unpacklo_epi16(S0, S8);
404
0
    m128iD = _mm_unpackhi_epi16(S0, S8);
405
406
    /*   ###################    */
407
0
    m128iAdd = _mm_set1_epi32(add_2nd);
408
409
0
    m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
410
0
    m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
411
412
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
413
0
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
414
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
415
0
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
416
0
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
417
0
    S0 = _mm_add_epi32(S0, m128iAdd);
418
0
    S0 = _mm_srai_epi32(S0, shift_2nd);
419
420
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
421
0
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
422
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
423
0
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
424
0
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
425
0
    S8 = _mm_add_epi32(S8, m128iAdd);
426
0
    S8 = _mm_srai_epi32(S8, shift_2nd);
427
428
0
    m128iA = _mm_packs_epi32(S0, S8);
429
430
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
431
0
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
432
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
433
0
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
434
0
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
435
0
    S0 = _mm_add_epi32(S0, m128iAdd);
436
0
    S0 = _mm_srai_epi32(S0, shift_2nd);
437
438
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
439
0
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
440
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
441
0
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
442
0
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
443
0
    S8 = _mm_add_epi32(S8, m128iAdd);
444
0
    S8 = _mm_srai_epi32(S8, shift_2nd);
445
446
0
    m128iD = _mm_packs_epi32(S0, S8);
447
448
//    _mm_storeu_si128((__m128i *) (src), m128iA);
449
//    _mm_storeu_si128((__m128i *) (src + 8), m128iD);
450
451
0
    S0 = _mm_move_epi64(m128iA); //contains row 0
452
0
    S8 = _mm_move_epi64(m128iD); //row 2
453
0
    m128iA = _mm_srli_si128(m128iA, 8); // row 1
454
0
    m128iD = _mm_srli_si128(m128iD, 8); // row 3
455
0
    m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
456
0
    m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
457
0
    S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
458
0
    S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
459
460
    //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1);   //mask to store 4 * 8bit data
461
462
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
463
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
464
0
    m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
465
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
466
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
467
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
468
469
0
    dst += stride;
470
471
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
472
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
473
0
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
474
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
475
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
476
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
477
478
0
    dst += stride;
479
480
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
481
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
482
0
    m128iTmp1 = _mm_adds_epi16(S8, m128iA);
483
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
484
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
485
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
486
487
0
    dst += stride;
488
489
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
490
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
491
0
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
492
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
493
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
494
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
495
0
}
496
#endif // SSE4.1
497
498
#if 0
499
void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
500
        ptrdiff_t _stride) {
501
    int i,j;
502
    uint8_t shift_2nd = 10; // 20 - Bit depth
503
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
504
505
    uint16_t *dst = (uint16_t*) _dst;
506
    ptrdiff_t stride = _stride/(sizeof(uint16_t));
507
    int16_t *src = coeffs;
508
    __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
509
            m128iD;
510
511
    m128iAdd = _mm_set1_epi32(64);
512
513
    S0 = _mm_loadu_si128((__m128i *) (src));
514
    S8 = _mm_loadu_si128((__m128i *) (src + 8));
515
516
    m128iAC = _mm_unpacklo_epi16(S0, S8);
517
    m128iBD = _mm_unpackhi_epi16(S0, S8);
518
519
    m128iTmp1 = _mm_madd_epi16(m128iAC,
520
            _mm_loadu_si128((__m128i *) (transform4x4_luma[0])));
521
    m128iTmp2 = _mm_madd_epi16(m128iBD,
522
            _mm_loadu_si128((__m128i *) (transform4x4_luma[1])));
523
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
524
    S0 = _mm_add_epi32(S0, m128iAdd);
525
    S0 = _mm_srai_epi32(S0, shift_1st);
526
527
    m128iTmp1 = _mm_madd_epi16(m128iAC,
528
            _mm_loadu_si128((__m128i *) (transform4x4_luma[2])));
529
    m128iTmp2 = _mm_madd_epi16(m128iBD,
530
            _mm_loadu_si128((__m128i *) (transform4x4_luma[3])));
531
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
532
    S8 = _mm_add_epi32(S8, m128iAdd);
533
    S8 = _mm_srai_epi32(S8, shift_1st);
534
535
    m128iA = _mm_packs_epi32(S0, S8);
536
537
    m128iTmp1 = _mm_madd_epi16(m128iAC,
538
            _mm_loadu_si128((__m128i *) (transform4x4_luma[4])));
539
    m128iTmp2 = _mm_madd_epi16(m128iBD,
540
            _mm_loadu_si128((__m128i *) (transform4x4_luma[5])));
541
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
542
    S0 = _mm_add_epi32(S0, m128iAdd);
543
    S0 = _mm_srai_epi32(S0, shift_1st);
544
545
    m128iTmp1 = _mm_madd_epi16(m128iAC,
546
            _mm_loadu_si128((__m128i *) (transform4x4_luma[6])));
547
    m128iTmp2 = _mm_madd_epi16(m128iBD,
548
            _mm_loadu_si128((__m128i *) (transform4x4_luma[7])));
549
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
550
    S8 = _mm_add_epi32(S8, m128iAdd);
551
    S8 = _mm_srai_epi32(S8, shift_1st);
552
553
    m128iD = _mm_packs_epi32(S0, S8);
554
555
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
556
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);
557
558
    m128iA = _mm_unpacklo_epi16(S0, S8);
559
    m128iD = _mm_unpackhi_epi16(S0, S8);
560
561
    /*   ###################    */
562
    m128iAdd = _mm_set1_epi32(add_2nd);
563
564
    m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
565
    m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
566
567
    m128iTmp1 = _mm_madd_epi16(m128iAC,
568
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
569
    m128iTmp2 = _mm_madd_epi16(m128iBD,
570
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
571
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
572
    S0 = _mm_add_epi32(S0, m128iAdd);
573
    S0 = _mm_srai_epi32(S0, shift_2nd);
574
575
    m128iTmp1 = _mm_madd_epi16(m128iAC,
576
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
577
    m128iTmp2 = _mm_madd_epi16(m128iBD,
578
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
579
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
580
    S8 = _mm_add_epi32(S8, m128iAdd);
581
    S8 = _mm_srai_epi32(S8, shift_2nd);
582
583
    m128iA = _mm_packs_epi32(S0, S8);
584
585
    m128iTmp1 = _mm_madd_epi16(m128iAC,
586
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
587
    m128iTmp2 = _mm_madd_epi16(m128iBD,
588
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
589
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
590
    S0 = _mm_add_epi32(S0, m128iAdd);
591
    S0 = _mm_srai_epi32(S0, shift_2nd);
592
593
    m128iTmp1 = _mm_madd_epi16(m128iAC,
594
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
595
    m128iTmp2 = _mm_madd_epi16(m128iBD,
596
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
597
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
598
    S8 = _mm_add_epi32(S8, m128iAdd);
599
    S8 = _mm_srai_epi32(S8, shift_2nd);
600
601
    m128iD = _mm_packs_epi32(S0, S8);
602
603
    _mm_storeu_si128((__m128i *) (src), m128iA);
604
    _mm_storeu_si128((__m128i *) (src + 8), m128iD);
605
    j = 0;
606
    for (i = 0; i < 2; i++) {
607
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
608
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
609
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
610
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
611
        j += 1;
612
        dst += stride;
613
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
614
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
615
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
616
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
617
        j += 1;
618
        dst += stride;
619
    }
620
621
}
622
#endif
623
624
625
#if HAVE_SSE4_1
626
void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
627
0
        ptrdiff_t _stride) {
628
0
    uint8_t shift_2nd = 12; // 20 - Bit depth
629
0
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
630
631
0
    uint8_t *dst = (uint8_t*) _dst;
632
0
    ptrdiff_t stride = _stride;
633
0
    const int16_t *src = coeffs;
634
635
0
    __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD, m128iTmp1,m128iTmp2;
636
0
    S0 = _mm_load_si128((__m128i *) (src));
637
0
    S8 = _mm_load_si128((__m128i *) (src + 8));
638
0
    m128iAdd = _mm_set1_epi32(add_1st);
639
640
0
    m128Tmp = _mm_unpacklo_epi16(S0, S8);
641
0
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
642
0
    E1 = _mm_add_epi32(E1, m128iAdd);
643
644
0
    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
645
0
    E2 = _mm_add_epi32(E2, m128iAdd);
646
647
0
    m128Tmp = _mm_unpackhi_epi16(S0, S8);
648
0
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
649
0
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
650
651
0
    m128iA = _mm_add_epi32(E1, O1);
652
0
    m128iA = _mm_srai_epi32(m128iA, shift_1st);        // Sum = Sum >> iShiftNum
653
0
    m128Tmp = _mm_add_epi32(E2, O2);
654
0
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
655
0
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);
656
657
0
    m128iD = _mm_sub_epi32(E2, O2);
658
0
    m128iD = _mm_srai_epi32(m128iD, shift_1st);        // Sum = Sum >> iShiftNum
659
660
0
    m128Tmp = _mm_sub_epi32(E1, O1);
661
0
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
662
663
0
    m128iD = _mm_packs_epi32(m128iD, m128Tmp);
664
665
0
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
666
0
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);
667
668
0
    m128iA = _mm_unpacklo_epi16(S0, S8);
669
0
    m128iD = _mm_unpackhi_epi16(S0, S8);
670
671
    /*  ##########################  */
672
673
0
    m128iAdd = _mm_set1_epi32(add_2nd);
674
0
    m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
675
0
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
676
0
    E1 = _mm_add_epi32(E1, m128iAdd);
677
678
0
    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
679
0
    E2 = _mm_add_epi32(E2, m128iAdd);
680
681
0
    m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
682
0
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
683
0
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
684
685
0
    m128iA = _mm_add_epi32(E1, O1);
686
0
    m128iA = _mm_srai_epi32(m128iA, shift_2nd);
687
0
    m128Tmp = _mm_add_epi32(E2, O2);
688
0
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
689
0
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);
690
691
0
    m128iD = _mm_sub_epi32(E2, O2);
692
0
    m128iD = _mm_srai_epi32(m128iD, shift_2nd);
693
694
0
    m128Tmp = _mm_sub_epi32(E1, O1);
695
0
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
696
697
0
    m128iD = _mm_packs_epi32(m128iD, m128Tmp);
698
699
0
    S0 = _mm_move_epi64(m128iA); //contains row 0
700
0
    S8 = _mm_move_epi64(m128iD); //row 2
701
0
    m128iA = _mm_srli_si128(m128iA, 8); // row 1
702
0
    m128iD = _mm_srli_si128(m128iD, 8); // row 3
703
0
    m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
704
0
    m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
705
0
    S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
706
0
    S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
707
708
    //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1);   //mask to store 4 * 8bit data
709
710
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
711
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
712
0
    m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
713
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
714
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
715
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
716
717
0
    dst += stride;
718
719
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
720
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
721
0
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
722
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
723
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
724
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
725
726
0
    dst += stride;
727
728
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
729
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
730
0
    m128iTmp1 = _mm_adds_epi16(S8, m128iA);
731
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
732
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
733
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
734
735
0
    dst += stride;
736
737
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
738
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
739
0
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
740
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
741
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
742
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
743
0
}
744
#endif
745
746
#if 0
747
void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
748
        ptrdiff_t _stride) {
749
    int i;
750
    uint8_t shift_2nd = 10; // 20 - Bit depth
751
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
752
753
    uint16_t *dst = (uint16_t*) _dst;
754
    ptrdiff_t stride = _stride/2;
755
    int16_t *src = coeffs;
756
757
    int j;
758
        __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD;
759
        S0 = _mm_load_si128((__m128i *) (src));
760
        S8 = _mm_load_si128((__m128i *) (src + 8));
761
        m128iAdd = _mm_set1_epi32(add_1st);
762
763
        m128Tmp = _mm_unpacklo_epi16(S0, S8);
764
        E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
765
        E1 = _mm_add_epi32(E1, m128iAdd);
766
767
        E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
768
        E2 = _mm_add_epi32(E2, m128iAdd);
769
770
        m128Tmp = _mm_unpackhi_epi16(S0, S8);
771
        O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
772
        O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
773
774
        m128iA = _mm_add_epi32(E1, O1);
775
        m128iA = _mm_srai_epi32(m128iA, shift_1st);        // Sum = Sum >> iShiftNum
776
        m128Tmp = _mm_add_epi32(E2, O2);
777
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
778
        m128iA = _mm_packs_epi32(m128iA, m128Tmp);
779
780
        m128iD = _mm_sub_epi32(E2, O2);
781
        m128iD = _mm_srai_epi32(m128iD, shift_1st);        // Sum = Sum >> iShiftNum
782
783
        m128Tmp = _mm_sub_epi32(E1, O1);
784
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
785
786
        m128iD = _mm_packs_epi32(m128iD, m128Tmp);
787
788
        S0 = _mm_unpacklo_epi16(m128iA, m128iD);
789
        S8 = _mm_unpackhi_epi16(m128iA, m128iD);
790
791
        m128iA = _mm_unpacklo_epi16(S0, S8);
792
        m128iD = _mm_unpackhi_epi16(S0, S8);
793
794
        /*  ##########################  */
795
796
        m128iAdd = _mm_set1_epi32(add_2nd);
797
        m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
798
        E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
799
        E1 = _mm_add_epi32(E1, m128iAdd);
800
801
        E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
802
        E2 = _mm_add_epi32(E2, m128iAdd);
803
804
        m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
805
        O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
806
        O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
807
808
        m128iA = _mm_add_epi32(E1, O1);
809
        m128iA = _mm_srai_epi32(m128iA, shift_2nd);
810
        m128Tmp = _mm_add_epi32(E2, O2);
811
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
812
        m128iA = _mm_packs_epi32(m128iA, m128Tmp);
813
814
        m128iD = _mm_sub_epi32(E2, O2);
815
        m128iD = _mm_srai_epi32(m128iD, shift_2nd);
816
817
        m128Tmp = _mm_sub_epi32(E1, O1);
818
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
819
820
        m128iD = _mm_packs_epi32(m128iD, m128Tmp);
821
        _mm_storeu_si128((__m128i *) (src), m128iA);
822
        _mm_storeu_si128((__m128i *) (src + 8), m128iD);
823
        j = 0;
824
        for (i = 0; i < 2; i++) {
825
            dst[0] = av_clip_uintp2(dst[0] + src[j],10);
826
            dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
827
            dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
828
            dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
829
            j += 1;
830
            dst += stride;
831
            dst[0] = av_clip_uintp2(dst[0] + src[j],10);
832
            dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
833
            dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
834
            dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
835
            j += 1;
836
            dst += stride;
837
        }
838
}
839
#endif
840
841
#if HAVE_SSE4_1
842
void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
843
0
        ptrdiff_t _stride) {
844
0
    uint8_t shift_2nd = 12; // 20 - Bit depth
845
0
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
846
847
0
    uint8_t *dst = (uint8_t*) _dst;
848
0
    ptrdiff_t stride = _stride / sizeof(uint8_t);
849
0
    const int16_t *src = coeffs;
850
0
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
851
0
            m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
852
0
            E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
853
854
0
            O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h,
855
0
            T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11;
856
0
    T0= _mm_load_si128((__m128i *) (transform8x8[0]));
857
0
    T1= _mm_load_si128((__m128i *) (transform8x8[1]));
858
0
    T2= _mm_load_si128((__m128i *) (transform8x8[2]));
859
0
    T3= _mm_load_si128((__m128i *) (transform8x8[3]));
860
0
    T4= _mm_load_si128((__m128i *) (transform8x8[4]));
861
0
    T5= _mm_load_si128((__m128i *) (transform8x8[5]));
862
0
    T6= _mm_load_si128((__m128i *) (transform8x8[6]));
863
0
    T7= _mm_load_si128((__m128i *) (transform8x8[7]));
864
0
    T8= _mm_load_si128((__m128i *) (transform8x8[8]));
865
0
    T9= _mm_load_si128((__m128i *) (transform8x8[9]));
866
0
    T10= _mm_load_si128((__m128i *) (transform8x8[10]));
867
0
    T11= _mm_load_si128((__m128i *) (transform8x8[11]));
868
869
0
    m128iAdd = _mm_set1_epi32(add_1st);
870
871
0
    m128iS1 = _mm_load_si128((__m128i *) (src + 8));
872
0
    m128iS3 = _mm_load_si128((__m128i *) (src + 24));
873
0
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
874
0
    E1l = _mm_madd_epi16(m128Tmp0, T0);
875
0
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
876
0
    E1h = _mm_madd_epi16(m128Tmp1, T0);
877
0
    m128iS5 = _mm_load_si128((__m128i *) (src + 40));
878
0
    m128iS7 = _mm_load_si128((__m128i *) (src + 56));
879
0
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
880
0
    E2l = _mm_madd_epi16(m128Tmp2, T1);
881
0
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
882
0
    E2h = _mm_madd_epi16(m128Tmp3, T1);
883
0
    O0l = _mm_add_epi32(E1l, E2l);
884
0
    O0h = _mm_add_epi32(E1h, E2h);
885
886
0
    E1l = _mm_madd_epi16(m128Tmp0, T2);
887
0
    E1h = _mm_madd_epi16(m128Tmp1, T2);
888
0
    E2l = _mm_madd_epi16(m128Tmp2, T3);
889
0
    E2h = _mm_madd_epi16(m128Tmp3, T3);
890
891
0
    O1l = _mm_add_epi32(E1l, E2l);
892
0
    O1h = _mm_add_epi32(E1h, E2h);
893
894
0
    E1l = _mm_madd_epi16(m128Tmp0, T4);
895
0
    E1h = _mm_madd_epi16(m128Tmp1, T4);
896
0
    E2l = _mm_madd_epi16(m128Tmp2, T5);
897
0
    E2h = _mm_madd_epi16(m128Tmp3, T5);
898
0
    O2l = _mm_add_epi32(E1l, E2l);
899
0
    O2h = _mm_add_epi32(E1h, E2h);
900
901
0
    E1l = _mm_madd_epi16(m128Tmp0, T6);
902
0
    E1h = _mm_madd_epi16(m128Tmp1, T6);
903
0
    E2l = _mm_madd_epi16(m128Tmp2, T7);
904
0
    E2h = _mm_madd_epi16(m128Tmp3, T7);
905
0
    O3h = _mm_add_epi32(E1h, E2h);
906
0
    O3l = _mm_add_epi32(E1l, E2l);
907
908
    /*    -------     */
909
910
0
    m128iS0 = _mm_load_si128((__m128i *) (src + 0));
911
0
    m128iS4 = _mm_load_si128((__m128i *) (src + 32));
912
0
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
913
0
    EE0l = _mm_madd_epi16(m128Tmp0, T8);
914
0
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
915
0
    EE0h = _mm_madd_epi16(m128Tmp1, T8);
916
917
0
    EE1l = _mm_madd_epi16(m128Tmp0, T9);
918
0
    EE1h = _mm_madd_epi16(m128Tmp1, T9);
919
920
    /*    -------     */
921
922
0
    m128iS2 = _mm_load_si128((__m128i *) (src + 16));
923
0
    m128iS6 = _mm_load_si128((__m128i *) (src + 48));
924
0
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
925
0
    E00l = _mm_madd_epi16(m128Tmp0, T10);
926
0
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
927
0
    E00h = _mm_madd_epi16(m128Tmp1, T10);
928
0
    E01l = _mm_madd_epi16(m128Tmp0, T11);
929
0
    E01h = _mm_madd_epi16(m128Tmp1, T11);
930
0
    E0l = _mm_add_epi32(EE0l, E00l);
931
0
    E0l = _mm_add_epi32(E0l, m128iAdd);
932
0
    E0h = _mm_add_epi32(EE0h, E00h);
933
0
    E0h = _mm_add_epi32(E0h, m128iAdd);
934
0
    E3l = _mm_sub_epi32(EE0l, E00l);
935
0
    E3l = _mm_add_epi32(E3l, m128iAdd);
936
0
    E3h = _mm_sub_epi32(EE0h, E00h);
937
0
    E3h = _mm_add_epi32(E3h, m128iAdd);
938
939
0
    E1l = _mm_add_epi32(EE1l, E01l);
940
0
    E1l = _mm_add_epi32(E1l, m128iAdd);
941
0
    E1h = _mm_add_epi32(EE1h, E01h);
942
0
    E1h = _mm_add_epi32(E1h, m128iAdd);
943
0
    E2l = _mm_sub_epi32(EE1l, E01l);
944
0
    E2l = _mm_add_epi32(E2l, m128iAdd);
945
0
    E2h = _mm_sub_epi32(EE1h, E01h);
946
0
    E2h = _mm_add_epi32(E2h, m128iAdd);
947
0
    m128iS0 = _mm_packs_epi32(
948
0
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
949
0
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
950
0
    m128iS1 = _mm_packs_epi32(
951
0
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
952
0
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
953
0
    m128iS2 = _mm_packs_epi32(
954
0
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
955
0
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
956
0
    m128iS3 = _mm_packs_epi32(
957
0
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
958
0
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
959
0
    m128iS4 = _mm_packs_epi32(
960
0
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
961
0
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
962
0
    m128iS5 = _mm_packs_epi32(
963
0
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
964
0
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
965
0
    m128iS6 = _mm_packs_epi32(
966
0
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
967
0
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
968
0
    m128iS7 = _mm_packs_epi32(
969
0
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
970
0
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
971
    /*  Invers matrix   */
972
973
0
    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
974
0
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
975
0
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
976
0
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
977
0
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
978
0
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
979
0
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
980
0
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
981
0
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
982
0
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
983
0
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
984
0
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
985
0
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
986
0
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
987
0
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
988
0
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
989
0
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
990
0
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
991
0
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
992
0
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
993
0
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
994
0
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
995
0
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
996
0
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
997
998
0
    m128iAdd = _mm_set1_epi32(add_2nd);
999
1000
0
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1001
0
    E1l = _mm_madd_epi16(m128Tmp0, T0);
1002
0
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1003
0
    E1h = _mm_madd_epi16(m128Tmp1, T0);
1004
0
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1005
0
    E2l = _mm_madd_epi16(m128Tmp2, T1);
1006
0
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1007
0
    E2h = _mm_madd_epi16(m128Tmp3, T1);
1008
0
    O0l = _mm_add_epi32(E1l, E2l);
1009
0
    O0h = _mm_add_epi32(E1h, E2h);
1010
0
    E1l = _mm_madd_epi16(m128Tmp0, T2);
1011
0
    E1h = _mm_madd_epi16(m128Tmp1, T2);
1012
0
    E2l = _mm_madd_epi16(m128Tmp2, T3);
1013
0
    E2h = _mm_madd_epi16(m128Tmp3, T3);
1014
0
    O1l = _mm_add_epi32(E1l, E2l);
1015
0
    O1h = _mm_add_epi32(E1h, E2h);
1016
0
    E1l = _mm_madd_epi16(m128Tmp0, T4);
1017
0
    E1h = _mm_madd_epi16(m128Tmp1, T4);
1018
0
    E2l = _mm_madd_epi16(m128Tmp2, T5);
1019
0
    E2h = _mm_madd_epi16(m128Tmp3, T5);
1020
0
    O2l = _mm_add_epi32(E1l, E2l);
1021
0
    O2h = _mm_add_epi32(E1h, E2h);
1022
0
    E1l = _mm_madd_epi16(m128Tmp0, T6);
1023
0
    E1h = _mm_madd_epi16(m128Tmp1, T6);
1024
0
    E2l = _mm_madd_epi16(m128Tmp2, T7);
1025
0
    E2h = _mm_madd_epi16(m128Tmp3, T7);
1026
0
    O3h = _mm_add_epi32(E1h, E2h);
1027
0
    O3l = _mm_add_epi32(E1l, E2l);
1028
1029
0
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1030
0
    EE0l = _mm_madd_epi16(m128Tmp0, T8);
1031
0
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1032
0
    EE0h = _mm_madd_epi16(m128Tmp1, T8);
1033
0
    EE1l = _mm_madd_epi16(m128Tmp0, T9);
1034
0
    EE1h = _mm_madd_epi16(m128Tmp1, T9);
1035
1036
0
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1037
0
    E00l = _mm_madd_epi16(m128Tmp0, T10);
1038
0
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1039
0
    E00h = _mm_madd_epi16(m128Tmp1, T10);
1040
0
    E01l = _mm_madd_epi16(m128Tmp0, T11);
1041
0
    E01h = _mm_madd_epi16(m128Tmp1, T11);
1042
0
    E0l = _mm_add_epi32(EE0l, E00l);
1043
0
    E0l = _mm_add_epi32(E0l, m128iAdd);
1044
0
    E0h = _mm_add_epi32(EE0h, E00h);
1045
0
    E0h = _mm_add_epi32(E0h, m128iAdd);
1046
0
    E3l = _mm_sub_epi32(EE0l, E00l);
1047
0
    E3l = _mm_add_epi32(E3l, m128iAdd);
1048
0
    E3h = _mm_sub_epi32(EE0h, E00h);
1049
0
    E3h = _mm_add_epi32(E3h, m128iAdd);
1050
0
    E1l = _mm_add_epi32(EE1l, E01l);
1051
0
    E1l = _mm_add_epi32(E1l, m128iAdd);
1052
0
    E1h = _mm_add_epi32(EE1h, E01h);
1053
0
    E1h = _mm_add_epi32(E1h, m128iAdd);
1054
0
    E2l = _mm_sub_epi32(EE1l, E01l);
1055
0
    E2l = _mm_add_epi32(E2l, m128iAdd);
1056
0
    E2h = _mm_sub_epi32(EE1h, E01h);
1057
0
    E2h = _mm_add_epi32(E2h, m128iAdd);
1058
1059
0
    m128iS0 = _mm_packs_epi32(
1060
0
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
1061
0
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
1062
0
    m128iS1 = _mm_packs_epi32(
1063
0
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
1064
0
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
1065
0
    m128iS2 = _mm_packs_epi32(
1066
0
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
1067
0
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
1068
0
    m128iS3 = _mm_packs_epi32(
1069
0
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
1070
0
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
1071
0
    m128iS4 = _mm_packs_epi32(
1072
0
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
1073
0
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
1074
0
    m128iS5 = _mm_packs_epi32(
1075
0
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
1076
0
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
1077
0
    m128iS6 = _mm_packs_epi32(
1078
0
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
1079
0
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
1080
0
    m128iS7 = _mm_packs_epi32(
1081
0
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
1082
0
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));
1083
1084
0
    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
1085
0
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
1086
0
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
1087
0
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
1088
0
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
1089
0
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
1090
0
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
1091
0
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
1092
0
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
1093
0
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
1094
0
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1095
0
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1096
0
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
1097
0
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
1098
0
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1099
0
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1100
0
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
1101
0
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
1102
0
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1103
0
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1104
0
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
1105
0
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
1106
0
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1107
0
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1108
1109
0
    E0l = _mm_loadl_epi64((__m128i *) dst);
1110
0
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1111
1112
0
    E0l = _mm_adds_epi16(E0l, m128iS0);
1113
0
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1114
0
    _mm_storel_epi64((__m128i *) dst, E0l);
1115
0
    dst += stride;
1116
1117
0
    E0l = _mm_loadl_epi64((__m128i *) dst);
1118
0
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1119
1120
0
    E0l = _mm_adds_epi16(E0l, m128iS1);
1121
0
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1122
0
    _mm_storel_epi64((__m128i *) dst, E0l);
1123
0
    dst += stride;
1124
1125
0
    E0l = _mm_loadl_epi64((__m128i *) dst);
1126
0
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1127
1128
0
    E0l = _mm_adds_epi16(E0l, m128iS2);
1129
0
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1130
0
    _mm_storel_epi64((__m128i *) dst, E0l);
1131
0
    dst += stride;
1132
1133
0
    E0l = _mm_loadl_epi64((__m128i *) dst);
1134
0
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1135
1136
0
    E0l = _mm_adds_epi16(E0l, m128iS3);
1137
0
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1138
0
    _mm_storel_epi64((__m128i *) dst, E0l);
1139
0
    dst += stride;
1140
1141
0
    E0l = _mm_loadl_epi64((__m128i *) dst);
1142
0
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1143
1144
0
    E0l = _mm_adds_epi16(E0l, m128iS4);
1145
0
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1146
0
    _mm_storel_epi64((__m128i *) dst, E0l);
1147
0
    dst += stride;
1148
1149
0
    E0l = _mm_loadl_epi64((__m128i *) dst);
1150
0
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1151
1152
0
    E0l = _mm_adds_epi16(E0l, m128iS5);
1153
0
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1154
0
    _mm_storel_epi64((__m128i *) dst, E0l);
1155
0
    dst += stride;
1156
1157
0
    E0l = _mm_loadl_epi64((__m128i *) dst);
1158
0
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1159
1160
0
    E0l = _mm_adds_epi16(E0l, m128iS6);
1161
0
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1162
0
    _mm_storel_epi64((__m128i *) dst, E0l);
1163
0
    dst += stride;
1164
1165
0
    E0l = _mm_loadl_epi64((__m128i *) dst);
1166
0
    E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1167
1168
0
    E0l = _mm_adds_epi16(E0l, m128iS7);
1169
0
    E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1170
0
    _mm_storel_epi64((__m128i *) dst, E0l);
1171
0
    dst += stride;
1172
1173
0
}
1174
#endif
1175
1176
#if 0
1177
void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
1178
        ptrdiff_t _stride) {
1179
    int i;
1180
    uint16_t *dst = (uint16_t*) _dst;
1181
    ptrdiff_t stride = _stride / sizeof(uint16_t);
1182
    int16_t *src = coeffs;
1183
    uint8_t shift_2nd = 10; // 20 - Bit depth
1184
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
1185
1186
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1187
            m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
1188
            E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
1189
            O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
1190
    int j;
1191
    m128iAdd = _mm_set1_epi32(add_1st);
1192
1193
    m128iS1 = _mm_load_si128((__m128i *) (src + 8));
1194
    m128iS3 = _mm_load_si128((__m128i *) (src + 24));
1195
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1196
    E1l = _mm_madd_epi16(m128Tmp0,
1197
            _mm_load_si128((__m128i *) (transform8x8[0])));
1198
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1199
    E1h = _mm_madd_epi16(m128Tmp1,
1200
            _mm_load_si128((__m128i *) (transform8x8[0])));
1201
    m128iS5 = _mm_load_si128((__m128i *) (src + 40));
1202
    m128iS7 = _mm_load_si128((__m128i *) (src + 56));
1203
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1204
    E2l = _mm_madd_epi16(m128Tmp2,
1205
            _mm_load_si128((__m128i *) (transform8x8[1])));
1206
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1207
    E2h = _mm_madd_epi16(m128Tmp3,
1208
            _mm_load_si128((__m128i *) (transform8x8[1])));
1209
    O0l = _mm_add_epi32(E1l, E2l);
1210
    O0h = _mm_add_epi32(E1h, E2h);
1211
1212
    E1l = _mm_madd_epi16(m128Tmp0,
1213
            _mm_load_si128((__m128i *) (transform8x8[2])));
1214
    E1h = _mm_madd_epi16(m128Tmp1,
1215
            _mm_load_si128((__m128i *) (transform8x8[2])));
1216
    E2l = _mm_madd_epi16(m128Tmp2,
1217
            _mm_load_si128((__m128i *) (transform8x8[3])));
1218
    E2h = _mm_madd_epi16(m128Tmp3,
1219
            _mm_load_si128((__m128i *) (transform8x8[3])));
1220
1221
    O1l = _mm_add_epi32(E1l, E2l);
1222
    O1h = _mm_add_epi32(E1h, E2h);
1223
1224
    E1l = _mm_madd_epi16(m128Tmp0,
1225
            _mm_load_si128((__m128i *) (transform8x8[4])));
1226
    E1h = _mm_madd_epi16(m128Tmp1,
1227
            _mm_load_si128((__m128i *) (transform8x8[4])));
1228
    E2l = _mm_madd_epi16(m128Tmp2,
1229
            _mm_load_si128((__m128i *) (transform8x8[5])));
1230
    E2h = _mm_madd_epi16(m128Tmp3,
1231
            _mm_load_si128((__m128i *) (transform8x8[5])));
1232
    O2l = _mm_add_epi32(E1l, E2l);
1233
    O2h = _mm_add_epi32(E1h, E2h);
1234
1235
    E1l = _mm_madd_epi16(m128Tmp0,
1236
            _mm_load_si128((__m128i *) (transform8x8[6])));
1237
    E1h = _mm_madd_epi16(m128Tmp1,
1238
            _mm_load_si128((__m128i *) (transform8x8[6])));
1239
    E2l = _mm_madd_epi16(m128Tmp2,
1240
            _mm_load_si128((__m128i *) (transform8x8[7])));
1241
    E2h = _mm_madd_epi16(m128Tmp3,
1242
            _mm_load_si128((__m128i *) (transform8x8[7])));
1243
    O3h = _mm_add_epi32(E1h, E2h);
1244
    O3l = _mm_add_epi32(E1l, E2l);
1245
1246
    /*    -------     */
1247
1248
    m128iS0 = _mm_load_si128((__m128i *) (src + 0));
1249
    m128iS4 = _mm_load_si128((__m128i *) (src + 32));
1250
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1251
    EE0l = _mm_madd_epi16(m128Tmp0,
1252
            _mm_load_si128((__m128i *) (transform8x8[8])));
1253
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1254
    EE0h = _mm_madd_epi16(m128Tmp1,
1255
            _mm_load_si128((__m128i *) (transform8x8[8])));
1256
1257
    EE1l = _mm_madd_epi16(m128Tmp0,
1258
            _mm_load_si128((__m128i *) (transform8x8[9])));
1259
    EE1h = _mm_madd_epi16(m128Tmp1,
1260
            _mm_load_si128((__m128i *) (transform8x8[9])));
1261
1262
    /*    -------     */
1263
1264
    m128iS2 = _mm_load_si128((__m128i *) (src + 16));
1265
    m128iS6 = _mm_load_si128((__m128i *) (src + 48));
1266
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1267
    E00l = _mm_madd_epi16(m128Tmp0,
1268
            _mm_load_si128((__m128i *) (transform8x8[10])));
1269
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1270
    E00h = _mm_madd_epi16(m128Tmp1,
1271
            _mm_load_si128((__m128i *) (transform8x8[10])));
1272
    E01l = _mm_madd_epi16(m128Tmp0,
1273
            _mm_load_si128((__m128i *) (transform8x8[11])));
1274
    E01h = _mm_madd_epi16(m128Tmp1,
1275
            _mm_load_si128((__m128i *) (transform8x8[11])));
1276
    E0l = _mm_add_epi32(EE0l, E00l);
1277
    E0l = _mm_add_epi32(E0l, m128iAdd);
1278
    E0h = _mm_add_epi32(EE0h, E00h);
1279
    E0h = _mm_add_epi32(E0h, m128iAdd);
1280
    E3l = _mm_sub_epi32(EE0l, E00l);
1281
    E3l = _mm_add_epi32(E3l, m128iAdd);
1282
    E3h = _mm_sub_epi32(EE0h, E00h);
1283
    E3h = _mm_add_epi32(E3h, m128iAdd);
1284
1285
    E1l = _mm_add_epi32(EE1l, E01l);
1286
    E1l = _mm_add_epi32(E1l, m128iAdd);
1287
    E1h = _mm_add_epi32(EE1h, E01h);
1288
    E1h = _mm_add_epi32(E1h, m128iAdd);
1289
    E2l = _mm_sub_epi32(EE1l, E01l);
1290
    E2l = _mm_add_epi32(E2l, m128iAdd);
1291
    E2h = _mm_sub_epi32(EE1h, E01h);
1292
    E2h = _mm_add_epi32(E2h, m128iAdd);
1293
    m128iS0 = _mm_packs_epi32(
1294
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
1295
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
1296
    m128iS1 = _mm_packs_epi32(
1297
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
1298
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
1299
    m128iS2 = _mm_packs_epi32(
1300
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
1301
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
1302
    m128iS3 = _mm_packs_epi32(
1303
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
1304
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
1305
    m128iS4 = _mm_packs_epi32(
1306
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
1307
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
1308
    m128iS5 = _mm_packs_epi32(
1309
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
1310
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
1311
    m128iS6 = _mm_packs_epi32(
1312
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
1313
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
1314
    m128iS7 = _mm_packs_epi32(
1315
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
1316
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
1317
    /*  Invers matrix   */
1318
1319
    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
1320
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
1321
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
1322
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
1323
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
1324
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
1325
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
1326
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
1327
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
1328
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
1329
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1330
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1331
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
1332
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
1333
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1334
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1335
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
1336
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
1337
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1338
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1339
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
1340
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
1341
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1342
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1343
1344
    m128iAdd = _mm_set1_epi32(add_2nd);
1345
1346
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1347
    E1l = _mm_madd_epi16(m128Tmp0,
1348
            _mm_load_si128((__m128i *) (transform8x8[0])));
1349
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1350
    E1h = _mm_madd_epi16(m128Tmp1,
1351
            _mm_load_si128((__m128i *) (transform8x8[0])));
1352
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1353
    E2l = _mm_madd_epi16(m128Tmp2,
1354
            _mm_load_si128((__m128i *) (transform8x8[1])));
1355
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1356
    E2h = _mm_madd_epi16(m128Tmp3,
1357
            _mm_load_si128((__m128i *) (transform8x8[1])));
1358
    O0l = _mm_add_epi32(E1l, E2l);
1359
    O0h = _mm_add_epi32(E1h, E2h);
1360
    E1l = _mm_madd_epi16(m128Tmp0,
1361
            _mm_load_si128((__m128i *) (transform8x8[2])));
1362
    E1h = _mm_madd_epi16(m128Tmp1,
1363
            _mm_load_si128((__m128i *) (transform8x8[2])));
1364
    E2l = _mm_madd_epi16(m128Tmp2,
1365
            _mm_load_si128((__m128i *) (transform8x8[3])));
1366
    E2h = _mm_madd_epi16(m128Tmp3,
1367
            _mm_load_si128((__m128i *) (transform8x8[3])));
1368
    O1l = _mm_add_epi32(E1l, E2l);
1369
    O1h = _mm_add_epi32(E1h, E2h);
1370
    E1l = _mm_madd_epi16(m128Tmp0,
1371
            _mm_load_si128((__m128i *) (transform8x8[4])));
1372
    E1h = _mm_madd_epi16(m128Tmp1,
1373
            _mm_load_si128((__m128i *) (transform8x8[4])));
1374
    E2l = _mm_madd_epi16(m128Tmp2,
1375
            _mm_load_si128((__m128i *) (transform8x8[5])));
1376
    E2h = _mm_madd_epi16(m128Tmp3,
1377
            _mm_load_si128((__m128i *) (transform8x8[5])));
1378
    O2l = _mm_add_epi32(E1l, E2l);
1379
    O2h = _mm_add_epi32(E1h, E2h);
1380
    E1l = _mm_madd_epi16(m128Tmp0,
1381
            _mm_load_si128((__m128i *) (transform8x8[6])));
1382
    E1h = _mm_madd_epi16(m128Tmp1,
1383
            _mm_load_si128((__m128i *) (transform8x8[6])));
1384
    E2l = _mm_madd_epi16(m128Tmp2,
1385
            _mm_load_si128((__m128i *) (transform8x8[7])));
1386
    E2h = _mm_madd_epi16(m128Tmp3,
1387
            _mm_load_si128((__m128i *) (transform8x8[7])));
1388
    O3h = _mm_add_epi32(E1h, E2h);
1389
    O3l = _mm_add_epi32(E1l, E2l);
1390
1391
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1392
    EE0l = _mm_madd_epi16(m128Tmp0,
1393
            _mm_load_si128((__m128i *) (transform8x8[8])));
1394
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1395
    EE0h = _mm_madd_epi16(m128Tmp1,
1396
            _mm_load_si128((__m128i *) (transform8x8[8])));
1397
    EE1l = _mm_madd_epi16(m128Tmp0,
1398
            _mm_load_si128((__m128i *) (transform8x8[9])));
1399
    EE1h = _mm_madd_epi16(m128Tmp1,
1400
            _mm_load_si128((__m128i *) (transform8x8[9])));
1401
1402
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1403
    E00l = _mm_madd_epi16(m128Tmp0,
1404
            _mm_load_si128((__m128i *) (transform8x8[10])));
1405
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1406
    E00h = _mm_madd_epi16(m128Tmp1,
1407
            _mm_load_si128((__m128i *) (transform8x8[10])));
1408
    E01l = _mm_madd_epi16(m128Tmp0,
1409
            _mm_load_si128((__m128i *) (transform8x8[11])));
1410
    E01h = _mm_madd_epi16(m128Tmp1,
1411
            _mm_load_si128((__m128i *) (transform8x8[11])));
1412
    E0l = _mm_add_epi32(EE0l, E00l);
1413
    E0l = _mm_add_epi32(E0l, m128iAdd);
1414
    E0h = _mm_add_epi32(EE0h, E00h);
1415
    E0h = _mm_add_epi32(E0h, m128iAdd);
1416
    E3l = _mm_sub_epi32(EE0l, E00l);
1417
    E3l = _mm_add_epi32(E3l, m128iAdd);
1418
    E3h = _mm_sub_epi32(EE0h, E00h);
1419
    E3h = _mm_add_epi32(E3h, m128iAdd);
1420
    E1l = _mm_add_epi32(EE1l, E01l);
1421
    E1l = _mm_add_epi32(E1l, m128iAdd);
1422
    E1h = _mm_add_epi32(EE1h, E01h);
1423
    E1h = _mm_add_epi32(E1h, m128iAdd);
1424
    E2l = _mm_sub_epi32(EE1l, E01l);
1425
    E2l = _mm_add_epi32(E2l, m128iAdd);
1426
    E2h = _mm_sub_epi32(EE1h, E01h);
1427
    E2h = _mm_add_epi32(E2h, m128iAdd);
1428
1429
    m128iS0 = _mm_packs_epi32(
1430
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
1431
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
1432
    m128iS1 = _mm_packs_epi32(
1433
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
1434
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
1435
    m128iS2 = _mm_packs_epi32(
1436
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
1437
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
1438
    m128iS3 = _mm_packs_epi32(
1439
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
1440
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
1441
    m128iS4 = _mm_packs_epi32(
1442
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
1443
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
1444
    m128iS5 = _mm_packs_epi32(
1445
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
1446
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
1447
    m128iS6 = _mm_packs_epi32(
1448
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
1449
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
1450
    m128iS7 = _mm_packs_epi32(
1451
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
1452
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));
1453
1454
    _mm_store_si128((__m128i *) (src), m128iS0);
1455
    _mm_store_si128((__m128i *) (src + 8), m128iS1);
1456
    _mm_store_si128((__m128i *) (src + 16), m128iS2);
1457
    _mm_store_si128((__m128i *) (src + 24), m128iS3);
1458
    _mm_store_si128((__m128i *) (src + 32), m128iS4);
1459
    _mm_store_si128((__m128i *) (src + 40), m128iS5);
1460
    _mm_store_si128((__m128i *) (src + 48), m128iS6);
1461
    _mm_store_si128((__m128i *) (src + 56), m128iS7);
1462
1463
    j = 0;
1464
    for (i = 0; i < 4; i++) {
1465
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
1466
        dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
1467
        dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
1468
        dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
1469
        dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
1470
        dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
1471
        dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
1472
        dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
1473
        j += 1;
1474
        dst += stride;
1475
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
1476
        dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
1477
        dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
1478
        dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
1479
        dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
1480
        dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
1481
        dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
1482
        dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
1483
        j += 1;
1484
        dst += stride;
1485
    }
1486
1487
}
1488
#endif
1489
1490
1491
#if HAVE_SSE4_1
1492
void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
1493
0
        ptrdiff_t _stride) {
1494
0
    uint8_t shift_2nd = 12; // 20 - Bit depth
1495
0
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
1496
0
    int i;
1497
0
    uint8_t *dst = (uint8_t*) _dst;
1498
0
    ptrdiff_t stride = _stride / sizeof(uint8_t);
1499
0
    const int16_t *src = coeffs;
1500
0
    int32_t shift;
1501
0
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1502
0
            m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
1503
0
            m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
1504
0
            m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
1505
0
            E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
1506
0
            O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
1507
0
            E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
1508
0
    __m128i E4l, E5l, E6l, E7l;
1509
0
    __m128i E4h, E5h, E6h, E7h;
1510
0
    __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;
1511
0
    __m128i r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
1512
1513
1514
    /*__m128i T00,T01, T02, T03, T04, T05, T06, T07;
1515
    __m128i T10,T11, T12, T13, T14, T15, T16, T17;
1516
    __m128i T20,T21, T22, T23, T24, T25, T26, T27;
1517
    __m128i T30,T31, T32, T33, T34, T35, T36, T37;
1518
1519
    __m128i U00,U01, U02, U03, U10, U11, U12, U13;
1520
1521
    __m128i V00,V01, V10, V11;*/
1522
1523
1524
0
    const __m128i T00 = _mm_load_si128((__m128i *) (transform16x16_1[0][0]));
1525
0
    const __m128i T01 = _mm_load_si128((__m128i *) (transform16x16_1[0][1]));
1526
0
    const __m128i T02 = _mm_load_si128((__m128i *) (transform16x16_1[0][2]));
1527
0
    const __m128i T03 = _mm_load_si128((__m128i *) (transform16x16_1[0][3]));
1528
0
    const __m128i T04 = _mm_load_si128((__m128i *) (transform16x16_1[0][4]));
1529
0
    const __m128i T05 = _mm_load_si128((__m128i *) (transform16x16_1[0][5]));
1530
0
    const __m128i T06 = _mm_load_si128((__m128i *) (transform16x16_1[0][6]));
1531
0
    const __m128i T07 = _mm_load_si128((__m128i *) (transform16x16_1[0][7]));
1532
0
    const __m128i T10 = _mm_load_si128((__m128i *) (transform16x16_1[1][0]));
1533
0
    const __m128i T11 = _mm_load_si128((__m128i *) (transform16x16_1[1][1]));
1534
0
    const __m128i T12 = _mm_load_si128((__m128i *) (transform16x16_1[1][2]));
1535
0
    const __m128i T13 = _mm_load_si128((__m128i *) (transform16x16_1[1][3]));
1536
0
    const __m128i T14 = _mm_load_si128((__m128i *) (transform16x16_1[1][4]));
1537
0
    const __m128i T15 = _mm_load_si128((__m128i *) (transform16x16_1[1][5]));
1538
0
    const __m128i T16 = _mm_load_si128((__m128i *) (transform16x16_1[1][6]));
1539
0
    const __m128i T17 = _mm_load_si128((__m128i *) (transform16x16_1[1][7]));
1540
0
    const __m128i T20 = _mm_load_si128((__m128i *) (transform16x16_1[2][0]));
1541
0
    const __m128i T21 = _mm_load_si128((__m128i *) (transform16x16_1[2][1]));
1542
0
    const __m128i T22 = _mm_load_si128((__m128i *) (transform16x16_1[2][2]));
1543
0
    const __m128i T23 = _mm_load_si128((__m128i *) (transform16x16_1[2][3]));
1544
0
    const __m128i T24 = _mm_load_si128((__m128i *) (transform16x16_1[2][4]));
1545
0
    const __m128i T25 = _mm_load_si128((__m128i *) (transform16x16_1[2][5]));
1546
0
    const __m128i T26 = _mm_load_si128((__m128i *) (transform16x16_1[2][6]));
1547
0
    const __m128i T27 = _mm_load_si128((__m128i *) (transform16x16_1[2][7]));
1548
0
    const __m128i T30 = _mm_load_si128((__m128i *) (transform16x16_1[3][0]));
1549
0
    const __m128i T31 = _mm_load_si128((__m128i *) (transform16x16_1[3][1]));
1550
0
    const __m128i T32 = _mm_load_si128((__m128i *) (transform16x16_1[3][2]));
1551
0
    const __m128i T33 = _mm_load_si128((__m128i *) (transform16x16_1[3][3]));
1552
0
    const __m128i T34 = _mm_load_si128((__m128i *) (transform16x16_1[3][4]));
1553
0
    const __m128i T35 = _mm_load_si128((__m128i *) (transform16x16_1[3][5]));
1554
0
    const __m128i T36 = _mm_load_si128((__m128i *) (transform16x16_1[3][6]));
1555
0
    const __m128i T37 = _mm_load_si128((__m128i *) (transform16x16_1[3][7]));
1556
1557
0
    const __m128i U00 = _mm_load_si128((__m128i *) (transform16x16_2[0][0]));
1558
0
    const __m128i U01 = _mm_load_si128((__m128i *) (transform16x16_2[0][1]));
1559
0
    const __m128i U02 = _mm_load_si128((__m128i *) (transform16x16_2[0][2]));
1560
0
    const __m128i U03 = _mm_load_si128((__m128i *) (transform16x16_2[0][3]));
1561
0
    const __m128i U10 = _mm_load_si128((__m128i *) (transform16x16_2[1][0]));
1562
0
    const __m128i U11 = _mm_load_si128((__m128i *) (transform16x16_2[1][1]));
1563
0
    const __m128i U12 = _mm_load_si128((__m128i *) (transform16x16_2[1][2]));
1564
0
    const __m128i U13 = _mm_load_si128((__m128i *) (transform16x16_2[1][3]));
1565
1566
0
    const __m128i V00 = _mm_load_si128((__m128i *) (transform16x16_3[0][0]));
1567
0
    const __m128i V01 = _mm_load_si128((__m128i *) (transform16x16_3[0][1]));
1568
0
    const __m128i V10 = _mm_load_si128((__m128i *) (transform16x16_3[1][0]));
1569
0
    const __m128i V11 = _mm_load_si128((__m128i *) (transform16x16_3[1][1]));
1570
1571
1572
1573
0
    int j;
1574
0
    m128iS0 = _mm_load_si128((__m128i *) (src));
1575
0
    m128iS1 = _mm_load_si128((__m128i *) (src + 16));
1576
0
    m128iS2 = _mm_load_si128((__m128i *) (src + 32));
1577
0
    m128iS3 = _mm_load_si128((__m128i *) (src + 48));
1578
0
    m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
1579
0
    m128iS5 = _mm_load_si128((__m128i *) (src + 80));
1580
0
    m128iS6 = _mm_load_si128((__m128i *) (src + 96));
1581
0
    m128iS7 = _mm_load_si128((__m128i *) (src + 112));
1582
0
    m128iS8 = _mm_load_si128((__m128i *) (src + 128));
1583
0
    m128iS9 = _mm_load_si128((__m128i *) (src + 144));
1584
0
    m128iS10 = _mm_load_si128((__m128i *) (src + 160));
1585
0
    m128iS11 = _mm_load_si128((__m128i *) (src + 176));
1586
0
    m128iS12 = _mm_load_si128((__m128i *) (src + 192));
1587
0
    m128iS13 = _mm_load_si128((__m128i *) (src + 208));
1588
0
    m128iS14 = _mm_load_si128((__m128i *) (src + 224));
1589
0
    m128iS15 = _mm_load_si128((__m128i *) (src + 240));
1590
0
    shift = shift_1st;
1591
0
    m128iAdd = _mm_set1_epi32(add_1st);
1592
1593
0
    for (j = 0; j < 2; j++) {
1594
0
        for (i = 0; i < 16; i += 8) {
1595
1596
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1597
0
            E0l = _mm_madd_epi16(m128Tmp0,T00);
1598
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1599
0
            E0h = _mm_madd_epi16(m128Tmp1,T00);
1600
1601
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1602
0
            E1l = _mm_madd_epi16(m128Tmp2,T10);
1603
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1604
0
            E1h = _mm_madd_epi16(m128Tmp3,T10);
1605
1606
0
            m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
1607
0
            E2l = _mm_madd_epi16(m128Tmp4,T20);
1608
0
            m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
1609
0
            E2h = _mm_madd_epi16(m128Tmp5,T20);
1610
1611
0
            m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
1612
0
            E3l = _mm_madd_epi16(m128Tmp6,T30);
1613
0
            m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
1614
0
            E3h = _mm_madd_epi16(m128Tmp7,T30);
1615
1616
0
            O0l = _mm_add_epi32(E0l, E1l);
1617
0
            O0l = _mm_add_epi32(O0l, E2l);
1618
0
            O0l = _mm_add_epi32(O0l, E3l);
1619
1620
0
            O0h = _mm_add_epi32(E0h, E1h);
1621
0
            O0h = _mm_add_epi32(O0h, E2h);
1622
0
            O0h = _mm_add_epi32(O0h, E3h);
1623
1624
            /* Compute O1*/
1625
0
            E0l = _mm_madd_epi16(m128Tmp0,T01);
1626
0
            E0h = _mm_madd_epi16(m128Tmp1,T01);
1627
0
            E1l = _mm_madd_epi16(m128Tmp2,T11);
1628
0
            E1h = _mm_madd_epi16(m128Tmp3,T11);
1629
0
            E2l = _mm_madd_epi16(m128Tmp4,T21);
1630
0
            E2h = _mm_madd_epi16(m128Tmp5,T21);
1631
0
            E3l = _mm_madd_epi16(m128Tmp6,T31);
1632
0
            E3h = _mm_madd_epi16(m128Tmp7,T31);
1633
0
            O1l = _mm_add_epi32(E0l, E1l);
1634
0
            O1l = _mm_add_epi32(O1l, E2l);
1635
0
            O1l = _mm_add_epi32(O1l, E3l);
1636
0
            O1h = _mm_add_epi32(E0h, E1h);
1637
0
            O1h = _mm_add_epi32(O1h, E2h);
1638
0
            O1h = _mm_add_epi32(O1h, E3h);
1639
1640
            /* Compute O2*/
1641
0
            E0l = _mm_madd_epi16(m128Tmp0,T02);
1642
0
            E0h = _mm_madd_epi16(m128Tmp1,T02);
1643
0
            E1l = _mm_madd_epi16(m128Tmp2,T12);
1644
0
            E1h = _mm_madd_epi16(m128Tmp3,T12);
1645
0
            E2l = _mm_madd_epi16(m128Tmp4,T22);
1646
0
            E2h = _mm_madd_epi16(m128Tmp5,T22);
1647
0
            E3l = _mm_madd_epi16(m128Tmp6,T32);
1648
0
            E3h = _mm_madd_epi16(m128Tmp7,T32);
1649
0
            O2l = _mm_add_epi32(E0l, E1l);
1650
0
            O2l = _mm_add_epi32(O2l, E2l);
1651
0
            O2l = _mm_add_epi32(O2l, E3l);
1652
1653
0
            O2h = _mm_add_epi32(E0h, E1h);
1654
0
            O2h = _mm_add_epi32(O2h, E2h);
1655
0
            O2h = _mm_add_epi32(O2h, E3h);
1656
1657
            /* Compute O3*/
1658
0
            E0l = _mm_madd_epi16(m128Tmp0,T03);
1659
0
            E0h = _mm_madd_epi16(m128Tmp1,T03);
1660
0
            E1l = _mm_madd_epi16(m128Tmp2,T13);
1661
0
            E1h = _mm_madd_epi16(m128Tmp3,T13);
1662
0
            E2l = _mm_madd_epi16(m128Tmp4,T23);
1663
0
            E2h = _mm_madd_epi16(m128Tmp5,T23);
1664
0
            E3l = _mm_madd_epi16(m128Tmp6,T33);
1665
0
            E3h = _mm_madd_epi16(m128Tmp7,T33);
1666
1667
0
            O3l = _mm_add_epi32(E0l, E1l);
1668
0
            O3l = _mm_add_epi32(O3l, E2l);
1669
0
            O3l = _mm_add_epi32(O3l, E3l);
1670
1671
0
            O3h = _mm_add_epi32(E0h, E1h);
1672
0
            O3h = _mm_add_epi32(O3h, E2h);
1673
0
            O3h = _mm_add_epi32(O3h, E3h);
1674
1675
            /* Compute O4*/
1676
1677
0
            E0l = _mm_madd_epi16(m128Tmp0,T04);
1678
0
            E0h = _mm_madd_epi16(m128Tmp1,T04);
1679
0
            E1l = _mm_madd_epi16(m128Tmp2,T14);
1680
0
            E1h = _mm_madd_epi16(m128Tmp3,T14);
1681
0
            E2l = _mm_madd_epi16(m128Tmp4,T24);
1682
0
            E2h = _mm_madd_epi16(m128Tmp5,T24);
1683
0
            E3l = _mm_madd_epi16(m128Tmp6,T34);
1684
0
            E3h = _mm_madd_epi16(m128Tmp7,T34);
1685
1686
0
            O4l = _mm_add_epi32(E0l, E1l);
1687
0
            O4l = _mm_add_epi32(O4l, E2l);
1688
0
            O4l = _mm_add_epi32(O4l, E3l);
1689
1690
0
            O4h = _mm_add_epi32(E0h, E1h);
1691
0
            O4h = _mm_add_epi32(O4h, E2h);
1692
0
            O4h = _mm_add_epi32(O4h, E3h);
1693
1694
            /* Compute O5*/
1695
0
            E0l = _mm_madd_epi16(m128Tmp0,T05);
1696
0
            E0h = _mm_madd_epi16(m128Tmp1,T05);
1697
0
            E1l = _mm_madd_epi16(m128Tmp2,T15);
1698
0
            E1h = _mm_madd_epi16(m128Tmp3,T15);
1699
0
            E2l = _mm_madd_epi16(m128Tmp4,T25);
1700
0
            E2h = _mm_madd_epi16(m128Tmp5,T25);
1701
0
            E3l = _mm_madd_epi16(m128Tmp6,T35);
1702
0
            E3h = _mm_madd_epi16(m128Tmp7,T35);
1703
1704
0
            O5l = _mm_add_epi32(E0l, E1l);
1705
0
            O5l = _mm_add_epi32(O5l, E2l);
1706
0
            O5l = _mm_add_epi32(O5l, E3l);
1707
1708
0
            O5h = _mm_add_epi32(E0h, E1h);
1709
0
            O5h = _mm_add_epi32(O5h, E2h);
1710
0
            O5h = _mm_add_epi32(O5h, E3h);
1711
1712
            /* Compute O6*/
1713
1714
0
            E0l = _mm_madd_epi16(m128Tmp0,T06);
1715
0
            E0h = _mm_madd_epi16(m128Tmp1,T06);
1716
0
            E1l = _mm_madd_epi16(m128Tmp2,T16);
1717
0
            E1h = _mm_madd_epi16(m128Tmp3,T16);
1718
0
            E2l = _mm_madd_epi16(m128Tmp4,T26);
1719
0
            E2h = _mm_madd_epi16(m128Tmp5,T26);
1720
0
            E3l = _mm_madd_epi16(m128Tmp6,T36);
1721
0
            E3h = _mm_madd_epi16(m128Tmp7,T36);
1722
1723
0
            O6l = _mm_add_epi32(E0l, E1l);
1724
0
            O6l = _mm_add_epi32(O6l, E2l);
1725
0
            O6l = _mm_add_epi32(O6l, E3l);
1726
1727
0
            O6h = _mm_add_epi32(E0h, E1h);
1728
0
            O6h = _mm_add_epi32(O6h, E2h);
1729
0
            O6h = _mm_add_epi32(O6h, E3h);
1730
1731
            /* Compute O7*/
1732
1733
0
            E0l = _mm_madd_epi16(m128Tmp0,T07);
1734
0
            E0h = _mm_madd_epi16(m128Tmp1,T07);
1735
0
            E1l = _mm_madd_epi16(m128Tmp2,T17);
1736
0
            E1h = _mm_madd_epi16(m128Tmp3,T17);
1737
0
            E2l = _mm_madd_epi16(m128Tmp4,T27);
1738
0
            E2h = _mm_madd_epi16(m128Tmp5,T27);
1739
0
            E3l = _mm_madd_epi16(m128Tmp6,T37);
1740
0
            E3h = _mm_madd_epi16(m128Tmp7,T37);
1741
1742
0
            O7l = _mm_add_epi32(E0l, E1l);
1743
0
            O7l = _mm_add_epi32(O7l, E2l);
1744
0
            O7l = _mm_add_epi32(O7l, E3l);
1745
1746
0
            O7h = _mm_add_epi32(E0h, E1h);
1747
0
            O7h = _mm_add_epi32(O7h, E2h);
1748
0
            O7h = _mm_add_epi32(O7h, E3h);
1749
1750
            /*  Compute E0  */
1751
1752
1753
1754
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1755
0
            E0l = _mm_madd_epi16(m128Tmp0,U00);
1756
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1757
0
            E0h = _mm_madd_epi16(m128Tmp1,U00);
1758
1759
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
1760
0
            E0l = _mm_add_epi32(E0l,
1761
0
                    _mm_madd_epi16(m128Tmp2,U10));
1762
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
1763
0
            E0h = _mm_add_epi32(E0h,
1764
0
                    _mm_madd_epi16(m128Tmp3,U10));
1765
1766
            /*  Compute E1  */
1767
0
            E1l = _mm_madd_epi16(m128Tmp0,U01);
1768
0
            E1h = _mm_madd_epi16(m128Tmp1,U01);
1769
0
            E1l = _mm_add_epi32(E1l,
1770
0
                    _mm_madd_epi16(m128Tmp2,U11));
1771
0
            E1h = _mm_add_epi32(E1h,
1772
0
                    _mm_madd_epi16(m128Tmp3,U11));
1773
1774
            /*  Compute E2  */
1775
0
            E2l = _mm_madd_epi16(m128Tmp0,U02);
1776
0
            E2h = _mm_madd_epi16(m128Tmp1,U02);
1777
0
            E2l = _mm_add_epi32(E2l,
1778
0
                    _mm_madd_epi16(m128Tmp2,U12));
1779
0
            E2h = _mm_add_epi32(E2h,
1780
0
                    _mm_madd_epi16(m128Tmp3,U12));
1781
            /*  Compute E3  */
1782
0
            E3l = _mm_madd_epi16(m128Tmp0,U03);
1783
0
            E3h = _mm_madd_epi16(m128Tmp1,U03);
1784
0
            E3l = _mm_add_epi32(E3l,
1785
0
                    _mm_madd_epi16(m128Tmp2,U13));
1786
0
            E3h = _mm_add_epi32(E3h,
1787
0
                    _mm_madd_epi16(m128Tmp3,U13));
1788
1789
            /*  Compute EE0 and EEE */
1790
1791
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
1792
0
            E00l = _mm_madd_epi16(m128Tmp0,V00);
1793
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
1794
0
            E00h = _mm_madd_epi16(m128Tmp1,V00);
1795
1796
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
1797
0
            EE0l = _mm_madd_epi16(m128Tmp2,V10);
1798
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
1799
0
            EE0h = _mm_madd_epi16(m128Tmp3,V10);
1800
1801
0
            E01l = _mm_madd_epi16(m128Tmp0,V01);
1802
0
            E01h = _mm_madd_epi16(m128Tmp1,V01);
1803
1804
0
            EE1l = _mm_madd_epi16(m128Tmp2,V11);
1805
0
            EE1h = _mm_madd_epi16(m128Tmp3,V11);
1806
1807
            /*  Compute EE    */
1808
0
            EE2l = _mm_sub_epi32(EE1l, E01l);
1809
0
            EE3l = _mm_sub_epi32(EE0l, E00l);
1810
0
            EE2h = _mm_sub_epi32(EE1h, E01h);
1811
0
            EE3h = _mm_sub_epi32(EE0h, E00h);
1812
1813
0
            EE0l = _mm_add_epi32(EE0l, E00l);
1814
0
            EE1l = _mm_add_epi32(EE1l, E01l);
1815
0
            EE0h = _mm_add_epi32(EE0h, E00h);
1816
0
            EE1h = _mm_add_epi32(EE1h, E01h);
1817
1818
            /*      Compute E       */
1819
1820
0
            E4l = _mm_sub_epi32(EE3l, E3l);
1821
0
            E4l = _mm_add_epi32(E4l, m128iAdd);
1822
1823
0
            E5l = _mm_sub_epi32(EE2l, E2l);
1824
0
            E5l = _mm_add_epi32(E5l, m128iAdd);
1825
1826
0
            E6l = _mm_sub_epi32(EE1l, E1l);
1827
0
            E6l = _mm_add_epi32(E6l, m128iAdd);
1828
1829
0
            E7l = _mm_sub_epi32(EE0l, E0l);
1830
0
            E7l = _mm_add_epi32(E7l, m128iAdd);
1831
1832
0
            E4h = _mm_sub_epi32(EE3h, E3h);
1833
0
            E4h = _mm_add_epi32(E4h, m128iAdd);
1834
1835
0
            E5h = _mm_sub_epi32(EE2h, E2h);
1836
0
            E5h = _mm_add_epi32(E5h, m128iAdd);
1837
1838
0
            E6h = _mm_sub_epi32(EE1h, E1h);
1839
0
            E6h = _mm_add_epi32(E6h, m128iAdd);
1840
1841
0
            E7h = _mm_sub_epi32(EE0h, E0h);
1842
0
            E7h = _mm_add_epi32(E7h, m128iAdd);
1843
1844
0
            E0l = _mm_add_epi32(EE0l, E0l);
1845
0
            E0l = _mm_add_epi32(E0l, m128iAdd);
1846
1847
0
            E1l = _mm_add_epi32(EE1l, E1l);
1848
0
            E1l = _mm_add_epi32(E1l, m128iAdd);
1849
1850
0
            E2l = _mm_add_epi32(EE2l, E2l);
1851
0
            E2l = _mm_add_epi32(E2l, m128iAdd);
1852
1853
0
            E3l = _mm_add_epi32(EE3l, E3l);
1854
0
            E3l = _mm_add_epi32(E3l, m128iAdd);
1855
1856
0
            E0h = _mm_add_epi32(EE0h, E0h);
1857
0
            E0h = _mm_add_epi32(E0h, m128iAdd);
1858
1859
0
            E1h = _mm_add_epi32(EE1h, E1h);
1860
0
            E1h = _mm_add_epi32(E1h, m128iAdd);
1861
1862
0
            E2h = _mm_add_epi32(EE2h, E2h);
1863
0
            E2h = _mm_add_epi32(E2h, m128iAdd);
1864
1865
0
            E3h = _mm_add_epi32(EE3h, E3h);
1866
0
            E3h = _mm_add_epi32(E3h, m128iAdd);
1867
1868
0
            m128iS0 = _mm_packs_epi32(
1869
0
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
1870
0
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
1871
0
            m128iS1 = _mm_packs_epi32(
1872
0
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
1873
0
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
1874
0
            m128iS2 = _mm_packs_epi32(
1875
0
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
1876
0
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
1877
0
            m128iS3 = _mm_packs_epi32(
1878
0
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
1879
0
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
1880
1881
0
            m128iS4 = _mm_packs_epi32(
1882
0
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
1883
0
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
1884
0
            m128iS5 = _mm_packs_epi32(
1885
0
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
1886
0
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
1887
0
            m128iS6 = _mm_packs_epi32(
1888
0
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
1889
0
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
1890
0
            m128iS7 = _mm_packs_epi32(
1891
0
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
1892
0
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
1893
1894
0
            m128iS15 = _mm_packs_epi32(
1895
0
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
1896
0
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
1897
0
            m128iS14 = _mm_packs_epi32(
1898
0
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
1899
0
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
1900
0
            m128iS13 = _mm_packs_epi32(
1901
0
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
1902
0
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
1903
0
            m128iS12 = _mm_packs_epi32(
1904
0
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
1905
0
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
1906
1907
0
            m128iS11 = _mm_packs_epi32(
1908
0
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
1909
0
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
1910
0
            m128iS10 = _mm_packs_epi32(
1911
0
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
1912
0
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
1913
0
            m128iS9 = _mm_packs_epi32(
1914
0
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
1915
0
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
1916
0
            m128iS8 = _mm_packs_epi32(
1917
0
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
1918
0
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
1919
1920
1921
1922
0
            if (!j) { //first pass
1923
1924
                /*      Transpose the matrix      */
1925
0
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
1926
0
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
1927
0
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
1928
0
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
1929
0
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
1930
0
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
1931
0
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
1932
0
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
1933
1934
0
                E0h = _mm_unpackhi_epi16(m128iS0, m128iS8);
1935
0
                E1h = _mm_unpackhi_epi16(m128iS1, m128iS9);
1936
0
                E2h = _mm_unpackhi_epi16(m128iS2, m128iS10);
1937
0
                E3h = _mm_unpackhi_epi16(m128iS3, m128iS11);
1938
0
                E4h = _mm_unpackhi_epi16(m128iS4, m128iS12);
1939
0
                E5h = _mm_unpackhi_epi16(m128iS5, m128iS13);
1940
0
                E6h = _mm_unpackhi_epi16(m128iS6, m128iS14);
1941
0
                E7h = _mm_unpackhi_epi16(m128iS7, m128iS15);
1942
1943
0
                m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
1944
0
                m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
1945
0
                m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
1946
0
                m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
1947
1948
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1949
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1950
0
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1951
0
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1952
1953
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1954
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1955
0
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1956
0
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1957
1958
0
                m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
1959
0
                m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
1960
0
                m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
1961
0
                m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
1962
1963
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1964
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1965
0
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1966
0
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1967
1968
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1969
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1970
0
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1971
0
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1972
1973
0
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
1974
0
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
1975
0
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
1976
0
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
1977
1978
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1979
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1980
0
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1981
0
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1982
1983
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1984
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1985
0
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1986
0
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1987
1988
0
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
1989
0
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
1990
0
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
1991
0
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
1992
1993
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1994
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1995
0
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1996
0
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1997
1998
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1999
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2000
0
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2001
0
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2002
2003
0
                if (!i) {
2004
2005
0
                    r0= m128iS0;    //0
2006
0
                    r1= m128iS1;    //16
2007
0
                    r2= m128iS2;    //32
2008
0
                    r3= m128iS3;    //48
2009
0
                    r4= m128iS4;    //64
2010
0
                    r5= m128iS5;    //80
2011
0
                    r6= m128iS6;    //96
2012
0
                    r7= m128iS7;    //112
2013
0
                    r8= m128iS8;    //128
2014
0
                    r9= m128iS9;    //144
2015
0
                    r10= m128iS10;  //160
2016
0
                    r11= m128iS11;  //176
2017
0
                    r12= m128iS12;  //192
2018
0
                    r13= m128iS13;  //208
2019
0
                    r14= m128iS14;  //224
2020
0
                    r15= m128iS15;  //240
2021
2022
2023
2024
0
                    m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2025
0
                    m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2026
0
                    m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2027
0
                    m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2028
0
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2029
0
                    m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2030
0
                    m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2031
0
                    m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2032
0
                    m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2033
0
                    m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2034
0
                    m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2035
0
                    m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2036
0
                    m128iS12 = _mm_load_si128((__m128i *) (src + 200));
2037
0
                    m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2038
0
                    m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2039
0
                    m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2040
0
                } else {
2041
2042
0
                    r16= m128iS0;    //8
2043
0
                    r17= m128iS1;    //24
2044
0
                    r18= m128iS2;    //40
2045
0
                    r19= m128iS3;    //56
2046
0
                    r20= m128iS4;    //72
2047
0
                    r21= m128iS5;    //88
2048
0
                    r22= m128iS6;    //104
2049
0
                    r23= m128iS7;    //120
2050
0
                    r24= m128iS8;    //136
2051
0
                    r25= m128iS9;    //152
2052
0
                    r26= m128iS10;  //168
2053
0
                    r27= m128iS11;  //184
2054
0
                    r28= m128iS12;  //200
2055
0
                    r29= m128iS13;  //216
2056
0
                    r30= m128iS14;  //232
2057
0
                    r31= m128iS15;  //248
2058
2059
                    //prepare next iteration :
2060
2061
0
                    m128iS0= r0;
2062
0
                    m128iS1= r2;
2063
0
                    m128iS2= r4;
2064
0
                    m128iS3= r6;
2065
0
                    m128iS4= r8;
2066
0
                    m128iS5= r10;
2067
0
                    m128iS6= r12;
2068
0
                    m128iS7= r14;
2069
0
                    m128iS8= r16;
2070
0
                    m128iS9= r18;
2071
0
                    m128iS10=r20;
2072
0
                    m128iS11=r22;
2073
0
                    m128iS12=r24;
2074
0
                    m128iS13=r26;
2075
0
                    m128iS14=r28;
2076
0
                    m128iS15=r30;
2077
2078
0
                    shift = shift_2nd;
2079
0
                    m128iAdd = _mm_set1_epi32(add_2nd);
2080
0
                }
2081
2082
0
            } else {
2083
2084
                //transpose half matrix :
2085
                //instead of having 1 register = 1 half-column,
2086
                //1 register = 1 half-row.
2087
0
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS1);
2088
0
                E1l = _mm_unpacklo_epi16(m128iS2, m128iS3);
2089
0
                E2l = _mm_unpacklo_epi16(m128iS4, m128iS5);
2090
0
                E3l = _mm_unpacklo_epi16(m128iS6, m128iS7);
2091
0
                E4l = _mm_unpacklo_epi16(m128iS8, m128iS9);
2092
0
                E5l = _mm_unpacklo_epi16(m128iS10, m128iS11);
2093
0
                E6l = _mm_unpacklo_epi16(m128iS12, m128iS13);
2094
0
                E7l = _mm_unpacklo_epi16(m128iS14, m128iS15);
2095
2096
0
                O0l = _mm_unpackhi_epi16(m128iS0, m128iS1);
2097
0
                O1l = _mm_unpackhi_epi16(m128iS2, m128iS3);
2098
0
                O2l = _mm_unpackhi_epi16(m128iS4, m128iS5);
2099
0
                O3l = _mm_unpackhi_epi16(m128iS6, m128iS7);
2100
0
                O4l = _mm_unpackhi_epi16(m128iS8, m128iS9);
2101
0
                O5l = _mm_unpackhi_epi16(m128iS10, m128iS11);
2102
0
                O6l = _mm_unpackhi_epi16(m128iS12, m128iS13);
2103
0
                O7l = _mm_unpackhi_epi16(m128iS14, m128iS15);
2104
2105
2106
0
                m128Tmp0 = _mm_unpacklo_epi32(E0l, E1l);
2107
0
                m128Tmp1 = _mm_unpacklo_epi32(E2l, E3l);
2108
2109
0
                m128Tmp2 = _mm_unpacklo_epi32(E4l, E5l);
2110
0
                m128Tmp3 = _mm_unpacklo_epi32(E6l, E7l);
2111
2112
0
                r0 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);    //1st half 1st row
2113
0
                r2 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);    //2nd half 1st row
2114
2115
2116
0
                r4 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);    //1st half 2nd row
2117
0
                r6 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);    //2nd half 2nd row
2118
2119
0
                m128Tmp0 = _mm_unpackhi_epi32(E0l, E1l);
2120
0
                m128Tmp1 = _mm_unpackhi_epi32(E2l, E3l);
2121
0
                m128Tmp2 = _mm_unpackhi_epi32(E4l, E5l);
2122
0
                m128Tmp3 = _mm_unpackhi_epi32(E6l, E7l);
2123
2124
2125
0
                r8 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2126
0
                r10 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2127
2128
0
                r12 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2129
0
                r14 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2130
2131
0
                m128Tmp0 = _mm_unpacklo_epi32(O0l, O1l);
2132
0
                m128Tmp1 = _mm_unpacklo_epi32(O2l, O3l);
2133
0
                m128Tmp2 = _mm_unpacklo_epi32(O4l, O5l);
2134
0
                m128Tmp3 = _mm_unpacklo_epi32(O6l, O7l);
2135
2136
0
                r16 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2137
0
                r18 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2138
2139
2140
0
                r20 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2141
0
                r22 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2142
2143
0
                m128Tmp0 = _mm_unpackhi_epi32(O0l, O1l);
2144
0
                m128Tmp1 = _mm_unpackhi_epi32(O2l, O3l);
2145
0
                m128Tmp2 = _mm_unpackhi_epi32(O4l, O5l);
2146
0
                m128Tmp3 = _mm_unpackhi_epi32(O6l, O7l);
2147
2148
0
                r24 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2149
0
                r26 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2150
2151
2152
0
                r28 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2153
0
                r30 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2154
2155
0
                dst = (uint8_t*) (_dst + (i*stride));
2156
0
                m128Tmp0= _mm_setzero_si128();
2157
0
                m128Tmp1= _mm_load_si128((__m128i*)dst);
2158
0
                m128Tmp2= _mm_load_si128((__m128i*)(dst+stride));
2159
0
                m128Tmp3= _mm_load_si128((__m128i*)(dst+2*stride));
2160
0
                m128Tmp4= _mm_load_si128((__m128i*)(dst+3*stride));
2161
0
                m128Tmp5= _mm_load_si128((__m128i*)(dst+4*stride));
2162
0
                m128Tmp6= _mm_load_si128((__m128i*)(dst+5*stride));
2163
0
                m128Tmp7= _mm_load_si128((__m128i*)(dst+6*stride));
2164
0
                E0l= _mm_load_si128((__m128i*)(dst+7*stride));
2165
2166
2167
0
                r0= _mm_adds_epi16(r0,_mm_unpacklo_epi8(m128Tmp1,m128Tmp0));
2168
0
                r2= _mm_adds_epi16(r2,_mm_unpackhi_epi8(m128Tmp1,m128Tmp0));
2169
0
                r0= _mm_packus_epi16(r0,r2);
2170
2171
2172
2173
2174
0
                r4= _mm_adds_epi16(r4,_mm_unpacklo_epi8(m128Tmp2,m128Tmp0));
2175
0
                r6= _mm_adds_epi16(r6,_mm_unpackhi_epi8(m128Tmp2,m128Tmp0));
2176
0
                r4= _mm_packus_epi16(r4,r6);
2177
2178
2179
0
                r8= _mm_adds_epi16(r8,_mm_unpacklo_epi8(m128Tmp3,m128Tmp0));
2180
0
                r10= _mm_adds_epi16(r10,_mm_unpackhi_epi8(m128Tmp3,m128Tmp0));
2181
0
                r8= _mm_packus_epi16(r8,r10);
2182
2183
2184
0
                r12= _mm_adds_epi16(r12,_mm_unpacklo_epi8(m128Tmp4,m128Tmp0));
2185
0
                r14= _mm_adds_epi16(r14,_mm_unpackhi_epi8(m128Tmp4,m128Tmp0));
2186
0
                r12= _mm_packus_epi16(r12,r14);
2187
2188
2189
0
                r16= _mm_adds_epi16(r16,_mm_unpacklo_epi8(m128Tmp5,m128Tmp0));
2190
0
                r18= _mm_adds_epi16(r18,_mm_unpackhi_epi8(m128Tmp5,m128Tmp0));
2191
0
                r16= _mm_packus_epi16(r16,r18);
2192
2193
2194
0
                r20= _mm_adds_epi16(r20,_mm_unpacklo_epi8(m128Tmp6,m128Tmp0));
2195
0
                r22= _mm_adds_epi16(r22,_mm_unpackhi_epi8(m128Tmp6,m128Tmp0));
2196
0
                r20= _mm_packus_epi16(r20,r22);
2197
2198
2199
0
                r24= _mm_adds_epi16(r24,_mm_unpacklo_epi8(m128Tmp7,m128Tmp0));
2200
0
                r26= _mm_adds_epi16(r26,_mm_unpackhi_epi8(m128Tmp7,m128Tmp0));
2201
0
                r24= _mm_packus_epi16(r24,r26);
2202
2203
2204
2205
0
                r28= _mm_adds_epi16(r28,_mm_unpacklo_epi8(E0l,m128Tmp0));
2206
0
                r30= _mm_adds_epi16(r30,_mm_unpackhi_epi8(E0l,m128Tmp0));
2207
0
                r28= _mm_packus_epi16(r28,r30);
2208
2209
0
                _mm_store_si128((__m128i*)dst,r0);
2210
0
                _mm_store_si128((__m128i*)(dst+stride),r4);
2211
0
                _mm_store_si128((__m128i*)(dst+2*stride),r8);
2212
0
                _mm_store_si128((__m128i*)(dst+3*stride),r12);
2213
0
                _mm_store_si128((__m128i*)(dst+4*stride),r16);
2214
0
                _mm_store_si128((__m128i*)(dst+5*stride),r20);
2215
0
                _mm_store_si128((__m128i*)(dst+6*stride),r24);
2216
0
                _mm_store_si128((__m128i*)(dst+7*stride),r28);
2217
2218
2219
2220
0
                if (!i) {
2221
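                    // first half has already been stored to dst above; copy r1, r3, ..., r31 into m128iS0..m128iS15 (presumably the input for the next pass)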
2222
2223
2224
0
                    m128iS0= r1;
2225
0
                    m128iS1= r3;
2226
0
                    m128iS2= r5;
2227
0
                    m128iS3= r7;
2228
0
                    m128iS4= r9;
2229
0
                    m128iS5= r11;
2230
0
                    m128iS6= r13;
2231
0
                    m128iS7= r15;
2232
0
                    m128iS8= r17;
2233
0
                    m128iS9= r19;
2234
0
                    m128iS10=r21;
2235
0
                    m128iS11=r23;
2236
0
                    m128iS12=r25;
2237
0
                    m128iS13=r27;
2238
0
                    m128iS14=r29;
2239
0
                    m128iS15=r31;
2240
0
                }
2241
0
            }
2242
0
        }
2243
0
    }
2244
0
}
2245
#endif
2246
2247
2248
#if 0
2249
void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
2250
        ptrdiff_t _stride) {
2251
    int i;
2252
    uint16_t *dst = (uint16_t*) _dst;
2253
    ptrdiff_t stride = _stride / 2;
2254
    int16_t *src = coeffs;
2255
    int32_t shift;
2256
    uint8_t shift_2nd = 10; //20 - bit depth
2257
    uint16_t add_2nd = 1 << 9; //shift - 1;
2258
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2259
            m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2260
            m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2261
            m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2262
            E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2263
            O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2264
            E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2265
    __m128i E4l, E5l, E6l, E7l;
2266
    __m128i E4h, E5h, E6h, E7h;
2267
    int j;
2268
    m128iS0 = _mm_load_si128((__m128i *) (src));
2269
    m128iS1 = _mm_load_si128((__m128i *) (src + 16));
2270
    m128iS2 = _mm_load_si128((__m128i *) (src + 32));
2271
    m128iS3 = _mm_load_si128((__m128i *) (src + 48));
2272
    m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
2273
    m128iS5 = _mm_load_si128((__m128i *) (src + 80));
2274
    m128iS6 = _mm_load_si128((__m128i *) (src + 96));
2275
    m128iS7 = _mm_load_si128((__m128i *) (src + 112));
2276
    m128iS8 = _mm_load_si128((__m128i *) (src + 128));
2277
    m128iS9 = _mm_load_si128((__m128i *) (src + 144));
2278
    m128iS10 = _mm_load_si128((__m128i *) (src + 160));
2279
    m128iS11 = _mm_load_si128((__m128i *) (src + 176));
2280
    m128iS12 = _mm_loadu_si128((__m128i *) (src + 192));
2281
    m128iS13 = _mm_load_si128((__m128i *) (src + 208));
2282
    m128iS14 = _mm_load_si128((__m128i *) (src + 224));
2283
    m128iS15 = _mm_load_si128((__m128i *) (src + 240));
2284
    shift = shift_1st;
2285
    m128iAdd = _mm_set1_epi32(add_1st);
2286
2287
    for (j = 0; j < 2; j++) {
2288
        for (i = 0; i < 16; i += 8) {
2289
2290
            m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2291
            E0l = _mm_madd_epi16(m128Tmp0,
2292
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2293
            m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2294
            E0h = _mm_madd_epi16(m128Tmp1,
2295
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2296
2297
            m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2298
            E1l = _mm_madd_epi16(m128Tmp2,
2299
                    _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2300
            m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2301
            E1h = _mm_madd_epi16(m128Tmp3,
2302
                    _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2303
2304
            m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2305
            E2l = _mm_madd_epi16(m128Tmp4,
2306
                    _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2307
            m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
2308
            E2h = _mm_madd_epi16(m128Tmp5,
2309
                    _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2310
2311
            m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
2312
            E3l = _mm_madd_epi16(m128Tmp6,
2313
                    _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2314
            m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
2315
            E3h = _mm_madd_epi16(m128Tmp7,
2316
                    _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2317
2318
            O0l = _mm_add_epi32(E0l, E1l);
2319
            O0l = _mm_add_epi32(O0l, E2l);
2320
            O0l = _mm_add_epi32(O0l, E3l);
2321
2322
            O0h = _mm_add_epi32(E0h, E1h);
2323
            O0h = _mm_add_epi32(O0h, E2h);
2324
            O0h = _mm_add_epi32(O0h, E3h);
2325
2326
            /* Compute O1*/
2327
            E0l = _mm_madd_epi16(m128Tmp0,
2328
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2329
            E0h = _mm_madd_epi16(m128Tmp1,
2330
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2331
            E1l = _mm_madd_epi16(m128Tmp2,
2332
                    _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2333
            E1h = _mm_madd_epi16(m128Tmp3,
2334
                    _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2335
            E2l = _mm_madd_epi16(m128Tmp4,
2336
                    _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2337
            E2h = _mm_madd_epi16(m128Tmp5,
2338
                    _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2339
            E3l = _mm_madd_epi16(m128Tmp6,
2340
                    _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2341
            E3h = _mm_madd_epi16(m128Tmp7,
2342
                    _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2343
            O1l = _mm_add_epi32(E0l, E1l);
2344
            O1l = _mm_add_epi32(O1l, E2l);
2345
            O1l = _mm_add_epi32(O1l, E3l);
2346
            O1h = _mm_add_epi32(E0h, E1h);
2347
            O1h = _mm_add_epi32(O1h, E2h);
2348
            O1h = _mm_add_epi32(O1h, E3h);
2349
2350
            /* Compute O2*/
2351
            E0l = _mm_madd_epi16(m128Tmp0,
2352
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2353
            E0h = _mm_madd_epi16(m128Tmp1,
2354
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2355
            E1l = _mm_madd_epi16(m128Tmp2,
2356
                    _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2357
            E1h = _mm_madd_epi16(m128Tmp3,
2358
                    _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2359
            E2l = _mm_madd_epi16(m128Tmp4,
2360
                    _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2361
            E2h = _mm_madd_epi16(m128Tmp5,
2362
                    _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2363
            E3l = _mm_madd_epi16(m128Tmp6,
2364
                    _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2365
            E3h = _mm_madd_epi16(m128Tmp7,
2366
                    _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2367
            O2l = _mm_add_epi32(E0l, E1l);
2368
            O2l = _mm_add_epi32(O2l, E2l);
2369
            O2l = _mm_add_epi32(O2l, E3l);
2370
2371
            O2h = _mm_add_epi32(E0h, E1h);
2372
            O2h = _mm_add_epi32(O2h, E2h);
2373
            O2h = _mm_add_epi32(O2h, E3h);
2374
2375
            /* Compute O3*/
2376
            E0l = _mm_madd_epi16(m128Tmp0,
2377
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2378
            E0h = _mm_madd_epi16(m128Tmp1,
2379
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2380
            E1l = _mm_madd_epi16(m128Tmp2,
2381
                    _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2382
            E1h = _mm_madd_epi16(m128Tmp3,
2383
                    _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2384
            E2l = _mm_madd_epi16(m128Tmp4,
2385
                    _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2386
            E2h = _mm_madd_epi16(m128Tmp5,
2387
                    _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2388
            E3l = _mm_madd_epi16(m128Tmp6,
2389
                    _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2390
            E3h = _mm_madd_epi16(m128Tmp7,
2391
                    _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2392
2393
            O3l = _mm_add_epi32(E0l, E1l);
2394
            O3l = _mm_add_epi32(O3l, E2l);
2395
            O3l = _mm_add_epi32(O3l, E3l);
2396
2397
            O3h = _mm_add_epi32(E0h, E1h);
2398
            O3h = _mm_add_epi32(O3h, E2h);
2399
            O3h = _mm_add_epi32(O3h, E3h);
2400
2401
            /* Compute O4*/
2402
2403
            E0l = _mm_madd_epi16(m128Tmp0,
2404
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2405
            E0h = _mm_madd_epi16(m128Tmp1,
2406
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2407
            E1l = _mm_madd_epi16(m128Tmp2,
2408
                    _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2409
            E1h = _mm_madd_epi16(m128Tmp3,
2410
                    _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2411
            E2l = _mm_madd_epi16(m128Tmp4,
2412
                    _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2413
            E2h = _mm_madd_epi16(m128Tmp5,
2414
                    _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2415
            E3l = _mm_madd_epi16(m128Tmp6,
2416
                    _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2417
            E3h = _mm_madd_epi16(m128Tmp7,
2418
                    _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2419
2420
            O4l = _mm_add_epi32(E0l, E1l);
2421
            O4l = _mm_add_epi32(O4l, E2l);
2422
            O4l = _mm_add_epi32(O4l, E3l);
2423
2424
            O4h = _mm_add_epi32(E0h, E1h);
2425
            O4h = _mm_add_epi32(O4h, E2h);
2426
            O4h = _mm_add_epi32(O4h, E3h);
2427
2428
            /* Compute O5*/
2429
            E0l = _mm_madd_epi16(m128Tmp0,
2430
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2431
            E0h = _mm_madd_epi16(m128Tmp1,
2432
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2433
            E1l = _mm_madd_epi16(m128Tmp2,
2434
                    _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2435
            E1h = _mm_madd_epi16(m128Tmp3,
2436
                    _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2437
            E2l = _mm_madd_epi16(m128Tmp4,
2438
                    _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2439
            E2h = _mm_madd_epi16(m128Tmp5,
2440
                    _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2441
            E3l = _mm_madd_epi16(m128Tmp6,
2442
                    _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2443
            E3h = _mm_madd_epi16(m128Tmp7,
2444
                    _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2445
2446
            O5l = _mm_add_epi32(E0l, E1l);
2447
            O5l = _mm_add_epi32(O5l, E2l);
2448
            O5l = _mm_add_epi32(O5l, E3l);
2449
2450
            O5h = _mm_add_epi32(E0h, E1h);
2451
            O5h = _mm_add_epi32(O5h, E2h);
2452
            O5h = _mm_add_epi32(O5h, E3h);
2453
2454
            /* Compute O6*/
2455
2456
            E0l = _mm_madd_epi16(m128Tmp0,
2457
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2458
            E0h = _mm_madd_epi16(m128Tmp1,
2459
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2460
            E1l = _mm_madd_epi16(m128Tmp2,
2461
                    _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2462
            E1h = _mm_madd_epi16(m128Tmp3,
2463
                    _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2464
            E2l = _mm_madd_epi16(m128Tmp4,
2465
                    _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2466
            E2h = _mm_madd_epi16(m128Tmp5,
2467
                    _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2468
            E3l = _mm_madd_epi16(m128Tmp6,
2469
                    _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2470
            E3h = _mm_madd_epi16(m128Tmp7,
2471
                    _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2472
2473
            O6l = _mm_add_epi32(E0l, E1l);
2474
            O6l = _mm_add_epi32(O6l, E2l);
2475
            O6l = _mm_add_epi32(O6l, E3l);
2476
2477
            O6h = _mm_add_epi32(E0h, E1h);
2478
            O6h = _mm_add_epi32(O6h, E2h);
2479
            O6h = _mm_add_epi32(O6h, E3h);
2480
2481
            /* Compute O7*/
2482
2483
            E0l = _mm_madd_epi16(m128Tmp0,
2484
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2485
            E0h = _mm_madd_epi16(m128Tmp1,
2486
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2487
            E1l = _mm_madd_epi16(m128Tmp2,
2488
                    _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2489
            E1h = _mm_madd_epi16(m128Tmp3,
2490
                    _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2491
            E2l = _mm_madd_epi16(m128Tmp4,
2492
                    _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2493
            E2h = _mm_madd_epi16(m128Tmp5,
2494
                    _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2495
            E3l = _mm_madd_epi16(m128Tmp6,
2496
                    _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2497
            E3h = _mm_madd_epi16(m128Tmp7,
2498
                    _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2499
2500
            O7l = _mm_add_epi32(E0l, E1l);
2501
            O7l = _mm_add_epi32(O7l, E2l);
2502
            O7l = _mm_add_epi32(O7l, E3l);
2503
2504
            O7h = _mm_add_epi32(E0h, E1h);
2505
            O7h = _mm_add_epi32(O7h, E2h);
2506
            O7h = _mm_add_epi32(O7h, E3h);
2507
2508
            /*  Compute E0  */
2509
2510
            m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
2511
            E0l = _mm_madd_epi16(m128Tmp0,
2512
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2513
            m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
2514
            E0h = _mm_madd_epi16(m128Tmp1,
2515
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2516
2517
            m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
2518
            E0l = _mm_add_epi32(E0l,
2519
                    _mm_madd_epi16(m128Tmp2,
2520
                            _mm_load_si128(
2521
                                    (__m128i *) (transform16x16_2[1][0]))));
2522
            m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
2523
            E0h = _mm_add_epi32(E0h,
2524
                    _mm_madd_epi16(m128Tmp3,
2525
                            _mm_load_si128(
2526
                                    (__m128i *) (transform16x16_2[1][0]))));
2527
2528
            /*  Compute E1  */
2529
            E1l = _mm_madd_epi16(m128Tmp0,
2530
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2531
            E1h = _mm_madd_epi16(m128Tmp1,
2532
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2533
            E1l = _mm_add_epi32(E1l,
2534
                    _mm_madd_epi16(m128Tmp2,
2535
                            _mm_load_si128(
2536
                                    (__m128i *) (transform16x16_2[1][1]))));
2537
            E1h = _mm_add_epi32(E1h,
2538
                    _mm_madd_epi16(m128Tmp3,
2539
                            _mm_load_si128(
2540
                                    (__m128i *) (transform16x16_2[1][1]))));
2541
2542
            /*  Compute E2  */
2543
            E2l = _mm_madd_epi16(m128Tmp0,
2544
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2545
            E2h = _mm_madd_epi16(m128Tmp1,
2546
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2547
            E2l = _mm_add_epi32(E2l,
2548
                    _mm_madd_epi16(m128Tmp2,
2549
                            _mm_load_si128(
2550
                                    (__m128i *) (transform16x16_2[1][2]))));
2551
            E2h = _mm_add_epi32(E2h,
2552
                    _mm_madd_epi16(m128Tmp3,
2553
                            _mm_load_si128(
2554
                                    (__m128i *) (transform16x16_2[1][2]))));
2555
            /*  Compute E3  */
2556
            E3l = _mm_madd_epi16(m128Tmp0,
2557
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2558
            E3h = _mm_madd_epi16(m128Tmp1,
2559
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2560
            E3l = _mm_add_epi32(E3l,
2561
                    _mm_madd_epi16(m128Tmp2,
2562
                            _mm_load_si128(
2563
                                    (__m128i *) (transform16x16_2[1][3]))));
2564
            E3h = _mm_add_epi32(E3h,
2565
                    _mm_madd_epi16(m128Tmp3,
2566
                            _mm_load_si128(
2567
                                    (__m128i *) (transform16x16_2[1][3]))));
2568
2569
            /*  Compute EE0 and EEE */
2570
2571
            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
2572
            E00l = _mm_madd_epi16(m128Tmp0,
2573
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2574
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
2575
            E00h = _mm_madd_epi16(m128Tmp1,
2576
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2577
2578
            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
2579
            EE0l = _mm_madd_epi16(m128Tmp2,
2580
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2581
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
2582
            EE0h = _mm_madd_epi16(m128Tmp3,
2583
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2584
2585
            E01l = _mm_madd_epi16(m128Tmp0,
2586
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2587
            E01h = _mm_madd_epi16(m128Tmp1,
2588
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2589
2590
            EE1l = _mm_madd_epi16(m128Tmp2,
2591
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2592
            EE1h = _mm_madd_epi16(m128Tmp3,
2593
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2594
2595
            /*  Compute EE    */
2596
            EE2l = _mm_sub_epi32(EE1l, E01l);
2597
            EE3l = _mm_sub_epi32(EE0l, E00l);
2598
            EE2h = _mm_sub_epi32(EE1h, E01h);
2599
            EE3h = _mm_sub_epi32(EE0h, E00h);
2600
2601
            EE0l = _mm_add_epi32(EE0l, E00l);
2602
            EE1l = _mm_add_epi32(EE1l, E01l);
2603
            EE0h = _mm_add_epi32(EE0h, E00h);
2604
            EE1h = _mm_add_epi32(EE1h, E01h);
2605
2606
            /*      Compute E       */
2607
2608
            E4l = _mm_sub_epi32(EE3l, E3l);
2609
            E4l = _mm_add_epi32(E4l, m128iAdd);
2610
2611
            E5l = _mm_sub_epi32(EE2l, E2l);
2612
            E5l = _mm_add_epi32(E5l, m128iAdd);
2613
2614
            E6l = _mm_sub_epi32(EE1l, E1l);
2615
            E6l = _mm_add_epi32(E6l, m128iAdd);
2616
2617
            E7l = _mm_sub_epi32(EE0l, E0l);
2618
            E7l = _mm_add_epi32(E7l, m128iAdd);
2619
2620
            E4h = _mm_sub_epi32(EE3h, E3h);
2621
            E4h = _mm_add_epi32(E4h, m128iAdd);
2622
2623
            E5h = _mm_sub_epi32(EE2h, E2h);
2624
            E5h = _mm_add_epi32(E5h, m128iAdd);
2625
2626
            E6h = _mm_sub_epi32(EE1h, E1h);
2627
            E6h = _mm_add_epi32(E6h, m128iAdd);
2628
2629
            E7h = _mm_sub_epi32(EE0h, E0h);
2630
            E7h = _mm_add_epi32(E7h, m128iAdd);
2631
2632
            E0l = _mm_add_epi32(EE0l, E0l);
2633
            E0l = _mm_add_epi32(E0l, m128iAdd);
2634
2635
            E1l = _mm_add_epi32(EE1l, E1l);
2636
            E1l = _mm_add_epi32(E1l, m128iAdd);
2637
2638
            E2l = _mm_add_epi32(EE2l, E2l);
2639
            E2l = _mm_add_epi32(E2l, m128iAdd);
2640
2641
            E3l = _mm_add_epi32(EE3l, E3l);
2642
            E3l = _mm_add_epi32(E3l, m128iAdd);
2643
2644
            E0h = _mm_add_epi32(EE0h, E0h);
2645
            E0h = _mm_add_epi32(E0h, m128iAdd);
2646
2647
            E1h = _mm_add_epi32(EE1h, E1h);
2648
            E1h = _mm_add_epi32(E1h, m128iAdd);
2649
2650
            E2h = _mm_add_epi32(EE2h, E2h);
2651
            E2h = _mm_add_epi32(E2h, m128iAdd);
2652
2653
            E3h = _mm_add_epi32(EE3h, E3h);
2654
            E3h = _mm_add_epi32(E3h, m128iAdd);
2655
2656
            m128iS0 = _mm_packs_epi32(
2657
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
2658
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
2659
            m128iS1 = _mm_packs_epi32(
2660
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
2661
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
2662
            m128iS2 = _mm_packs_epi32(
2663
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
2664
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
2665
            m128iS3 = _mm_packs_epi32(
2666
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
2667
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
2668
2669
            m128iS4 = _mm_packs_epi32(
2670
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
2671
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
2672
            m128iS5 = _mm_packs_epi32(
2673
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
2674
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
2675
            m128iS6 = _mm_packs_epi32(
2676
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
2677
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
2678
            m128iS7 = _mm_packs_epi32(
2679
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
2680
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
2681
2682
            m128iS15 = _mm_packs_epi32(
2683
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
2684
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
2685
            m128iS14 = _mm_packs_epi32(
2686
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
2687
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
2688
            m128iS13 = _mm_packs_epi32(
2689
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
2690
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
2691
            m128iS12 = _mm_packs_epi32(
2692
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
2693
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
2694
2695
            m128iS11 = _mm_packs_epi32(
2696
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
2697
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
2698
            m128iS10 = _mm_packs_epi32(
2699
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
2700
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
2701
            m128iS9 = _mm_packs_epi32(
2702
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
2703
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
2704
            m128iS8 = _mm_packs_epi32(
2705
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
2706
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
2707
2708
            if (!j) {
2709
                /*      Transpose the matrix      */
2710
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
2711
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
2712
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
2713
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
2714
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
2715
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
2716
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
2717
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
2718
2719
                O0l = _mm_unpackhi_epi16(m128iS0, m128iS8);
2720
                O1l = _mm_unpackhi_epi16(m128iS1, m128iS9);
2721
                O2l = _mm_unpackhi_epi16(m128iS2, m128iS10);
2722
                O3l = _mm_unpackhi_epi16(m128iS3, m128iS11);
2723
                O4l = _mm_unpackhi_epi16(m128iS4, m128iS12);
2724
                O5l = _mm_unpackhi_epi16(m128iS5, m128iS13);
2725
                O6l = _mm_unpackhi_epi16(m128iS6, m128iS14);
2726
                O7l = _mm_unpackhi_epi16(m128iS7, m128iS15);
2727
2728
                m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
2729
                m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
2730
                m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
2731
                m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
2732
2733
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2734
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2735
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2736
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2737
2738
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2739
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2740
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2741
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2742
2743
                m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
2744
                m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
2745
                m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
2746
                m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
2747
2748
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2749
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2750
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2751
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2752
2753
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2754
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2755
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2756
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2757
2758
                m128Tmp0 = _mm_unpacklo_epi16(O0l, O4l);
2759
                m128Tmp1 = _mm_unpacklo_epi16(O1l, O5l);
2760
                m128Tmp2 = _mm_unpacklo_epi16(O2l, O6l);
2761
                m128Tmp3 = _mm_unpacklo_epi16(O3l, O7l);
2762
2763
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2764
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2765
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2766
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2767
2768
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2769
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2770
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2771
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2772
2773
                m128Tmp0 = _mm_unpackhi_epi16(O0l, O4l);
2774
                m128Tmp1 = _mm_unpackhi_epi16(O1l, O5l);
2775
                m128Tmp2 = _mm_unpackhi_epi16(O2l, O6l);
2776
                m128Tmp3 = _mm_unpackhi_epi16(O3l, O7l);
2777
2778
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2779
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2780
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2781
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2782
2783
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2784
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2785
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2786
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2787
2788
                /*  */
2789
                _mm_store_si128((__m128i *) (src + i), m128iS0);
2790
                _mm_store_si128((__m128i *) (src + 16 + i), m128iS1);
2791
                _mm_store_si128((__m128i *) (src + 32 + i), m128iS2);
2792
                _mm_store_si128((__m128i *) (src + 48 + i), m128iS3);
2793
                _mm_store_si128((__m128i *) (src + 64 + i), m128iS4);
2794
                _mm_store_si128((__m128i *) (src + 80 + i), m128iS5);
2795
                _mm_store_si128((__m128i *) (src + 96 + i), m128iS6);
2796
                _mm_store_si128((__m128i *) (src + 112 + i), m128iS7);
2797
                _mm_store_si128((__m128i *) (src + 128 + i), m128iS8);
2798
                _mm_store_si128((__m128i *) (src + 144 + i), m128iS9);
2799
                _mm_store_si128((__m128i *) (src + 160 + i), m128iS10);
2800
                _mm_store_si128((__m128i *) (src + 176 + i), m128iS11);
2801
                _mm_store_si128((__m128i *) (src + 192 + i), m128iS12);
2802
                _mm_store_si128((__m128i *) (src + 208 + i), m128iS13);
2803
                _mm_store_si128((__m128i *) (src + 224 + i), m128iS14);
2804
                _mm_store_si128((__m128i *) (src + 240 + i), m128iS15);
2805
2806
                if (!i) {
2807
                    m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2808
                    m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2809
                    m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2810
                    m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2811
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2812
                    m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2813
                    m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2814
                    m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2815
                    m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2816
                    m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2817
                    m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2818
                    m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2819
                    m128iS12 = _mm_loadu_si128((__m128i *) (src + 200));
2820
                    m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2821
                    m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2822
                    m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2823
                } else {
2824
                    m128iS0 = _mm_load_si128((__m128i *) (src));
2825
                    m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2826
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2827
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2828
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2829
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2830
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2831
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2832
                    m128iS8 = _mm_load_si128((__m128i *) (src + 8));
2833
                    m128iS9 = _mm_load_si128((__m128i *) (src + 32 + 8));
2834
                    m128iS10 = _mm_load_si128((__m128i *) (src + 64 + 8));
2835
                    m128iS11 = _mm_load_si128((__m128i *) (src + 96 + 8));
2836
                    m128iS12 = _mm_loadu_si128((__m128i *) (src + 128 + 8));
2837
                    m128iS13 = _mm_load_si128((__m128i *) (src + 160 + 8));
2838
                    m128iS14 = _mm_load_si128((__m128i *) (src + 192 + 8));
2839
                    m128iS15 = _mm_load_si128((__m128i *) (src + 224 + 8));
2840
                    shift = shift_2nd;
2841
                    m128iAdd = _mm_set1_epi32(add_2nd);
2842
                }
2843
2844
            } else {
2845
                int k, m = 0;
2846
                _mm_storeu_si128((__m128i *) (src), m128iS0);
2847
                _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
2848
                _mm_storeu_si128((__m128i *) (src + 32), m128iS2);
2849
                _mm_storeu_si128((__m128i *) (src + 40), m128iS3);
2850
                _mm_storeu_si128((__m128i *) (src + 64), m128iS4);
2851
                _mm_storeu_si128((__m128i *) (src + 72), m128iS5);
2852
                _mm_storeu_si128((__m128i *) (src + 96), m128iS6);
2853
                _mm_storeu_si128((__m128i *) (src + 104), m128iS7);
2854
                _mm_storeu_si128((__m128i *) (src + 128), m128iS8);
2855
                _mm_storeu_si128((__m128i *) (src + 136), m128iS9);
2856
                _mm_storeu_si128((__m128i *) (src + 160), m128iS10);
2857
                _mm_storeu_si128((__m128i *) (src + 168), m128iS11);
2858
                _mm_storeu_si128((__m128i *) (src + 192), m128iS12);
2859
                _mm_storeu_si128((__m128i *) (src + 200), m128iS13);
2860
                _mm_storeu_si128((__m128i *) (src + 224), m128iS14);
2861
                _mm_storeu_si128((__m128i *) (src + 232), m128iS15);
2862
                dst = (uint16_t*) _dst + (i * stride);
2863
2864
                for (k = 0; k < 8; k++) {
2865
                    dst[0] = av_clip_uintp2(dst[0] + src[m],10);
2866
                    dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
2867
                    dst[2] = av_clip_uintp2(dst[2] + src[m + 32],10);
2868
                    dst[3] = av_clip_uintp2(dst[3] + src[m + 40],10);
2869
                    dst[4] = av_clip_uintp2(dst[4] + src[m + 64],10);
2870
                    dst[5] = av_clip_uintp2(dst[5] + src[m + 72],10);
2871
                    dst[6] = av_clip_uintp2(dst[6] + src[m + 96],10);
2872
                    dst[7] = av_clip_uintp2(dst[7] + src[m + 104],10);
2873
2874
                    dst[8] = av_clip_uintp2(dst[8] + src[m + 128],10);
2875
                    dst[9] = av_clip_uintp2(dst[9] + src[m + 136],10);
2876
                    dst[10] = av_clip_uintp2(dst[10] + src[m + 160],10);
2877
                    dst[11] = av_clip_uintp2(dst[11] + src[m + 168],10);
2878
                    dst[12] = av_clip_uintp2(dst[12] + src[m + 192],10);
2879
                    dst[13] = av_clip_uintp2(dst[13] + src[m + 200],10);
2880
                    dst[14] = av_clip_uintp2(dst[14] + src[m + 224],10);
2881
                    dst[15] = av_clip_uintp2(dst[15] + src[m + 232],10);
2882
                    m += 1;
2883
                    dst += stride;
2884
                }
2885
                if (!i) {
2886
                    m128iS0 = _mm_load_si128((__m128i *) (src + 16));
2887
                    m128iS1 = _mm_load_si128((__m128i *) (src + 48));
2888
                    m128iS2 = _mm_load_si128((__m128i *) (src + 80));
2889
                    m128iS3 = _mm_loadu_si128((__m128i *) (src + 112));
2890
                    m128iS4 = _mm_load_si128((__m128i *) (src + 144));
2891
                    m128iS5 = _mm_load_si128((__m128i *) (src + 176));
2892
                    m128iS6 = _mm_load_si128((__m128i *) (src + 208));
2893
                    m128iS7 = _mm_load_si128((__m128i *) (src + 240));
2894
                    m128iS8 = _mm_load_si128((__m128i *) (src + 24));
2895
                    m128iS9 = _mm_load_si128((__m128i *) (src + 56));
2896
                    m128iS10 = _mm_load_si128((__m128i *) (src + 88));
2897
                    m128iS11 = _mm_loadu_si128((__m128i *) (src + 120));
2898
                    m128iS12 = _mm_load_si128((__m128i *) (src + 152));
2899
                    m128iS13 = _mm_load_si128((__m128i *) (src + 184));
2900
                    m128iS14 = _mm_load_si128((__m128i *) (src + 216));
2901
                    m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2902
                }
2903
            }
2904
        }
2905
    }
2906
2907
}
2908
#endif
2909
2910
2911
#if HAVE_SSE4_1
2912
// All of m128iS0..m128iS31 are loaded unconditionally at function entry, before
2913
// any use. This function is very large once the SSE intrinsics are inlined into it,
2914
// so GCC's path analysis gives up and emits spurious -Wmaybe-uninitialized warnings for them. Suppress them here.
2915
#if defined(__GNUC__) && !defined(__clang__)
2916
#pragma GCC diagnostic push
2917
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
2918
#endif
2919
void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
2920
0
        ptrdiff_t _stride) {
2921
0
    uint8_t shift_2nd = 12; // 20 - Bit depth
2922
0
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
2923
0
    int i, j;
2924
0
    uint8_t *dst = (uint8_t*) _dst;
2925
0
    ptrdiff_t stride = _stride / sizeof(uint8_t);
2926
0
    int shift;
2927
0
    const int16_t *src = coeffs;
2928
2929
0
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2930
0
            m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2931
0
            m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2932
0
            m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2933
0
            E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2934
0
            O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2935
0
            E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2936
0
    __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
2937
0
    __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
2938
0
            EEE0l, EEE1l, EEE0h, EEE1h;
2939
0
    __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
2940
0
            m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
2941
0
            m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
2942
0
            m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
2943
0
            O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
2944
0
            O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
2945
0
            EE4l, EE7h, EE6h, EE5h, EE4h;
2946
2947
0
    __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
2948
0
    __m128i r32,r33,r34,r35,r36,r37,r38,r39,r40,r41,r42,r43,r44,r45,r46,r47,r48,r49,r50,r51,r52,r53,r54,r55,r56,r57,r58,r59,r60,r61,r62,r63;
2949
0
    __m128i r64,r65,r66,r67,r68,r69,r70,r71,r72,r73,r74,r75,r76,r77,r78,r79,r80,r81,r82,r83,r84,r85,r86,r87,r88,r89,r90,r91,r92,r93,r94,r95;
2950
0
    __m128i r96,r97,r98,r99,r100,r101,r102,r103,r104,r105,r106,r107,r108,r109,r110,r111,r112,r113,r114,r115,r116,r117,r118,r119,r120,r121,r122,r123,r124,r125,r126,r127;
2951
2952
2953
0
    m128iS0 = _mm_load_si128((__m128i *) (src));
2954
0
    m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2955
0
    m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2956
0
    m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2957
0
    m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2958
0
    m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2959
0
    m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2960
0
    m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2961
0
    m128iS8 = _mm_load_si128((__m128i *) (src + 256));
2962
0
    m128iS9 = _mm_load_si128((__m128i *) (src + 288));
2963
0
    m128iS10 = _mm_load_si128((__m128i *) (src + 320));
2964
0
    m128iS11 = _mm_load_si128((__m128i *) (src + 352));
2965
0
    m128iS12 = _mm_load_si128((__m128i *) (src + 384));
2966
0
    m128iS13 = _mm_load_si128((__m128i *) (src + 416));
2967
0
    m128iS14 = _mm_load_si128((__m128i *) (src + 448));
2968
0
    m128iS15 = _mm_load_si128((__m128i *) (src + 480));
2969
0
    m128iS16 = _mm_load_si128((__m128i *) (src + 512));
2970
0
    m128iS17 = _mm_load_si128((__m128i *) (src + 544));
2971
0
    m128iS18 = _mm_load_si128((__m128i *) (src + 576));
2972
0
    m128iS19 = _mm_load_si128((__m128i *) (src + 608));
2973
0
    m128iS20 = _mm_load_si128((__m128i *) (src + 640));
2974
0
    m128iS21 = _mm_load_si128((__m128i *) (src + 672));
2975
0
    m128iS22 = _mm_load_si128((__m128i *) (src + 704));
2976
0
    m128iS23 = _mm_load_si128((__m128i *) (src + 736));
2977
0
    m128iS24 = _mm_load_si128((__m128i *) (src + 768));
2978
0
    m128iS25 = _mm_load_si128((__m128i *) (src + 800));
2979
0
    m128iS26 = _mm_load_si128((__m128i *) (src + 832));
2980
0
    m128iS27 = _mm_load_si128((__m128i *) (src + 864));
2981
0
    m128iS28 = _mm_load_si128((__m128i *) (src + 896));
2982
0
    m128iS29 = _mm_load_si128((__m128i *) (src + 928));
2983
0
    m128iS30 = _mm_load_si128((__m128i *) (src + 960));
2984
0
    m128iS31 = _mm_load_si128((__m128i *) (src + 992));
2985
2986
0
    shift = shift_1st;
2987
0
    m128iAdd = _mm_set1_epi32(add_1st);
2988
2989
0
    for (j = 0; j < 2; j++) {
2990
0
        for (i = 0; i < 32; i += 8) {
2991
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2992
0
            E0l = _mm_madd_epi16(m128Tmp0,
2993
0
                    _mm_load_si128((__m128i *) (transform32x32[0][0])));
2994
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2995
0
            E0h = _mm_madd_epi16(m128Tmp1,
2996
0
                    _mm_load_si128((__m128i *) (transform32x32[0][0])));
2997
2998
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2999
0
            E1l = _mm_madd_epi16(m128Tmp2,
3000
0
                    _mm_load_si128((__m128i *) (transform32x32[1][0])));
3001
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
3002
0
            E1h = _mm_madd_epi16(m128Tmp3,
3003
0
                    _mm_load_si128((__m128i *) (transform32x32[1][0])));
3004
3005
0
            m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
3006
0
            E2l = _mm_madd_epi16(m128Tmp4,
3007
0
                    _mm_load_si128((__m128i *) (transform32x32[2][0])));
3008
0
            m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
3009
0
            E2h = _mm_madd_epi16(m128Tmp5,
3010
0
                    _mm_load_si128((__m128i *) (transform32x32[2][0])));
3011
3012
0
            m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
3013
0
            E3l = _mm_madd_epi16(m128Tmp6,
3014
0
                    _mm_load_si128((__m128i *) (transform32x32[3][0])));
3015
0
            m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
3016
0
            E3h = _mm_madd_epi16(m128Tmp7,
3017
0
                    _mm_load_si128((__m128i *) (transform32x32[3][0])));
3018
3019
0
            m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
3020
0
            E4l = _mm_madd_epi16(m128Tmp8,
3021
0
                    _mm_load_si128((__m128i *) (transform32x32[4][0])));
3022
0
            m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
3023
0
            E4h = _mm_madd_epi16(m128Tmp9,
3024
0
                    _mm_load_si128((__m128i *) (transform32x32[4][0])));
3025
3026
0
            m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
3027
0
            E5l = _mm_madd_epi16(m128Tmp10,
3028
0
                    _mm_load_si128((__m128i *) (transform32x32[5][0])));
3029
0
            m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
3030
0
            E5h = _mm_madd_epi16(m128Tmp11,
3031
0
                    _mm_load_si128((__m128i *) (transform32x32[5][0])));
3032
3033
0
            m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
3034
0
            E6l = _mm_madd_epi16(m128Tmp12,
3035
0
                    _mm_load_si128((__m128i *) (transform32x32[6][0])));
3036
0
            m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
3037
0
            E6h = _mm_madd_epi16(m128Tmp13,
3038
0
                    _mm_load_si128((__m128i *) (transform32x32[6][0])));
3039
3040
0
            m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
3041
0
            E7l = _mm_madd_epi16(m128Tmp14,
3042
0
                    _mm_load_si128((__m128i *) (transform32x32[7][0])));
3043
0
            m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
3044
0
            E7h = _mm_madd_epi16(m128Tmp15,
3045
0
                    _mm_load_si128((__m128i *) (transform32x32[7][0])));
3046
3047
0
            O0l = _mm_add_epi32(E0l, E1l);
3048
0
            O0l = _mm_add_epi32(O0l, E2l);
3049
0
            O0l = _mm_add_epi32(O0l, E3l);
3050
0
            O0l = _mm_add_epi32(O0l, E4l);
3051
0
            O0l = _mm_add_epi32(O0l, E5l);
3052
0
            O0l = _mm_add_epi32(O0l, E6l);
3053
0
            O0l = _mm_add_epi32(O0l, E7l);
3054
3055
0
            O0h = _mm_add_epi32(E0h, E1h);
3056
0
            O0h = _mm_add_epi32(O0h, E2h);
3057
0
            O0h = _mm_add_epi32(O0h, E3h);
3058
0
            O0h = _mm_add_epi32(O0h, E4h);
3059
0
            O0h = _mm_add_epi32(O0h, E5h);
3060
0
            O0h = _mm_add_epi32(O0h, E6h);
3061
0
            O0h = _mm_add_epi32(O0h, E7h);
3062
3063
            /* Compute O1*/
3064
0
            E0l = _mm_madd_epi16(m128Tmp0,
3065
0
                    _mm_load_si128((__m128i *) (transform32x32[0][1])));
3066
0
            E0h = _mm_madd_epi16(m128Tmp1,
3067
0
                    _mm_load_si128((__m128i *) (transform32x32[0][1])));
3068
0
            E1l = _mm_madd_epi16(m128Tmp2,
3069
0
                    _mm_load_si128((__m128i *) (transform32x32[1][1])));
3070
0
            E1h = _mm_madd_epi16(m128Tmp3,
3071
0
                    _mm_load_si128((__m128i *) (transform32x32[1][1])));
3072
0
            E2l = _mm_madd_epi16(m128Tmp4,
3073
0
                    _mm_load_si128((__m128i *) (transform32x32[2][1])));
3074
0
            E2h = _mm_madd_epi16(m128Tmp5,
3075
0
                    _mm_load_si128((__m128i *) (transform32x32[2][1])));
3076
0
            E3l = _mm_madd_epi16(m128Tmp6,
3077
0
                    _mm_load_si128((__m128i *) (transform32x32[3][1])));
3078
0
            E3h = _mm_madd_epi16(m128Tmp7,
3079
0
                    _mm_load_si128((__m128i *) (transform32x32[3][1])));
3080
3081
0
            E4l = _mm_madd_epi16(m128Tmp8,
3082
0
                    _mm_load_si128((__m128i *) (transform32x32[4][1])));
3083
0
            E4h = _mm_madd_epi16(m128Tmp9,
3084
0
                    _mm_load_si128((__m128i *) (transform32x32[4][1])));
3085
0
            E5l = _mm_madd_epi16(m128Tmp10,
3086
0
                    _mm_load_si128((__m128i *) (transform32x32[5][1])));
3087
0
            E5h = _mm_madd_epi16(m128Tmp11,
3088
0
                    _mm_load_si128((__m128i *) (transform32x32[5][1])));
3089
0
            E6l = _mm_madd_epi16(m128Tmp12,
3090
0
                    _mm_load_si128((__m128i *) (transform32x32[6][1])));
3091
0
            E6h = _mm_madd_epi16(m128Tmp13,
3092
0
                    _mm_load_si128((__m128i *) (transform32x32[6][1])));
3093
0
            E7l = _mm_madd_epi16(m128Tmp14,
3094
0
                    _mm_load_si128((__m128i *) (transform32x32[7][1])));
3095
0
            E7h = _mm_madd_epi16(m128Tmp15,
3096
0
                    _mm_load_si128((__m128i *) (transform32x32[7][1])));
3097
3098
0
            O1l = _mm_add_epi32(E0l, E1l);
3099
0
            O1l = _mm_add_epi32(O1l, E2l);
3100
0
            O1l = _mm_add_epi32(O1l, E3l);
3101
0
            O1l = _mm_add_epi32(O1l, E4l);
3102
0
            O1l = _mm_add_epi32(O1l, E5l);
3103
0
            O1l = _mm_add_epi32(O1l, E6l);
3104
0
            O1l = _mm_add_epi32(O1l, E7l);
3105
3106
0
            O1h = _mm_add_epi32(E0h, E1h);
3107
0
            O1h = _mm_add_epi32(O1h, E2h);
3108
0
            O1h = _mm_add_epi32(O1h, E3h);
3109
0
            O1h = _mm_add_epi32(O1h, E4h);
3110
0
            O1h = _mm_add_epi32(O1h, E5h);
3111
0
            O1h = _mm_add_epi32(O1h, E6h);
3112
0
            O1h = _mm_add_epi32(O1h, E7h);
3113
            /* Compute O2*/
3114
0
            E0l = _mm_madd_epi16(m128Tmp0,
3115
0
                    _mm_load_si128((__m128i *) (transform32x32[0][2])));
3116
0
            E0h = _mm_madd_epi16(m128Tmp1,
3117
0
                    _mm_load_si128((__m128i *) (transform32x32[0][2])));
3118
0
            E1l = _mm_madd_epi16(m128Tmp2,
3119
0
                    _mm_load_si128((__m128i *) (transform32x32[1][2])));
3120
0
            E1h = _mm_madd_epi16(m128Tmp3,
3121
0
                    _mm_load_si128((__m128i *) (transform32x32[1][2])));
3122
0
            E2l = _mm_madd_epi16(m128Tmp4,
3123
0
                    _mm_load_si128((__m128i *) (transform32x32[2][2])));
3124
0
            E2h = _mm_madd_epi16(m128Tmp5,
3125
0
                    _mm_load_si128((__m128i *) (transform32x32[2][2])));
3126
0
            E3l = _mm_madd_epi16(m128Tmp6,
3127
0
                    _mm_load_si128((__m128i *) (transform32x32[3][2])));
3128
0
            E3h = _mm_madd_epi16(m128Tmp7,
3129
0
                    _mm_load_si128((__m128i *) (transform32x32[3][2])));
3130
3131
0
            E4l = _mm_madd_epi16(m128Tmp8,
3132
0
                    _mm_load_si128((__m128i *) (transform32x32[4][2])));
3133
0
            E4h = _mm_madd_epi16(m128Tmp9,
3134
0
                    _mm_load_si128((__m128i *) (transform32x32[4][2])));
3135
0
            E5l = _mm_madd_epi16(m128Tmp10,
3136
0
                    _mm_load_si128((__m128i *) (transform32x32[5][2])));
3137
0
            E5h = _mm_madd_epi16(m128Tmp11,
3138
0
                    _mm_load_si128((__m128i *) (transform32x32[5][2])));
3139
0
            E6l = _mm_madd_epi16(m128Tmp12,
3140
0
                    _mm_load_si128((__m128i *) (transform32x32[6][2])));
3141
0
            E6h = _mm_madd_epi16(m128Tmp13,
3142
0
                    _mm_load_si128((__m128i *) (transform32x32[6][2])));
3143
0
            E7l = _mm_madd_epi16(m128Tmp14,
3144
0
                    _mm_load_si128((__m128i *) (transform32x32[7][2])));
3145
0
            E7h = _mm_madd_epi16(m128Tmp15,
3146
0
                    _mm_load_si128((__m128i *) (transform32x32[7][2])));
3147
3148
0
            O2l = _mm_add_epi32(E0l, E1l);
3149
0
            O2l = _mm_add_epi32(O2l, E2l);
3150
0
            O2l = _mm_add_epi32(O2l, E3l);
3151
0
            O2l = _mm_add_epi32(O2l, E4l);
3152
0
            O2l = _mm_add_epi32(O2l, E5l);
3153
0
            O2l = _mm_add_epi32(O2l, E6l);
3154
0
            O2l = _mm_add_epi32(O2l, E7l);
3155
3156
0
            O2h = _mm_add_epi32(E0h, E1h);
3157
0
            O2h = _mm_add_epi32(O2h, E2h);
3158
0
            O2h = _mm_add_epi32(O2h, E3h);
3159
0
            O2h = _mm_add_epi32(O2h, E4h);
3160
0
            O2h = _mm_add_epi32(O2h, E5h);
3161
0
            O2h = _mm_add_epi32(O2h, E6h);
3162
0
            O2h = _mm_add_epi32(O2h, E7h);
3163
            /* Compute O3*/
3164
0
            E0l = _mm_madd_epi16(m128Tmp0,
3165
0
                    _mm_load_si128((__m128i *) (transform32x32[0][3])));
3166
0
            E0h = _mm_madd_epi16(m128Tmp1,
3167
0
                    _mm_load_si128((__m128i *) (transform32x32[0][3])));
3168
0
            E1l = _mm_madd_epi16(m128Tmp2,
3169
0
                    _mm_load_si128((__m128i *) (transform32x32[1][3])));
3170
0
            E1h = _mm_madd_epi16(m128Tmp3,
3171
0
                    _mm_load_si128((__m128i *) (transform32x32[1][3])));
3172
0
            E2l = _mm_madd_epi16(m128Tmp4,
3173
0
                    _mm_load_si128((__m128i *) (transform32x32[2][3])));
3174
0
            E2h = _mm_madd_epi16(m128Tmp5,
3175
0
                    _mm_load_si128((__m128i *) (transform32x32[2][3])));
3176
0
            E3l = _mm_madd_epi16(m128Tmp6,
3177
0
                    _mm_load_si128((__m128i *) (transform32x32[3][3])));
3178
0
            E3h = _mm_madd_epi16(m128Tmp7,
3179
0
                    _mm_load_si128((__m128i *) (transform32x32[3][3])));
3180
3181
0
            E4l = _mm_madd_epi16(m128Tmp8,
3182
0
                    _mm_load_si128((__m128i *) (transform32x32[4][3])));
3183
0
            E4h = _mm_madd_epi16(m128Tmp9,
3184
0
                    _mm_load_si128((__m128i *) (transform32x32[4][3])));
3185
0
            E5l = _mm_madd_epi16(m128Tmp10,
3186
0
                    _mm_load_si128((__m128i *) (transform32x32[5][3])));
3187
0
            E5h = _mm_madd_epi16(m128Tmp11,
3188
0
                    _mm_load_si128((__m128i *) (transform32x32[5][3])));
3189
0
            E6l = _mm_madd_epi16(m128Tmp12,
3190
0
                    _mm_load_si128((__m128i *) (transform32x32[6][3])));
3191
0
            E6h = _mm_madd_epi16(m128Tmp13,
3192
0
                    _mm_load_si128((__m128i *) (transform32x32[6][3])));
3193
0
            E7l = _mm_madd_epi16(m128Tmp14,
3194
0
                    _mm_load_si128((__m128i *) (transform32x32[7][3])));
3195
0
            E7h = _mm_madd_epi16(m128Tmp15,
3196
0
                    _mm_load_si128((__m128i *) (transform32x32[7][3])));
3197
3198
0
            O3l = _mm_add_epi32(E0l, E1l);
3199
0
            O3l = _mm_add_epi32(O3l, E2l);
3200
0
            O3l = _mm_add_epi32(O3l, E3l);
3201
0
            O3l = _mm_add_epi32(O3l, E4l);
3202
0
            O3l = _mm_add_epi32(O3l, E5l);
3203
0
            O3l = _mm_add_epi32(O3l, E6l);
3204
0
            O3l = _mm_add_epi32(O3l, E7l);
3205
3206
0
            O3h = _mm_add_epi32(E0h, E1h);
3207
0
            O3h = _mm_add_epi32(O3h, E2h);
3208
0
            O3h = _mm_add_epi32(O3h, E3h);
3209
0
            O3h = _mm_add_epi32(O3h, E4h);
3210
0
            O3h = _mm_add_epi32(O3h, E5h);
3211
0
            O3h = _mm_add_epi32(O3h, E6h);
3212
0
            O3h = _mm_add_epi32(O3h, E7h);
3213
            /* Compute O4*/
3214
3215
0
            E0l = _mm_madd_epi16(m128Tmp0,
3216
0
                    _mm_load_si128((__m128i *) (transform32x32[0][4])));
3217
0
            E0h = _mm_madd_epi16(m128Tmp1,
3218
0
                    _mm_load_si128((__m128i *) (transform32x32[0][4])));
3219
0
            E1l = _mm_madd_epi16(m128Tmp2,
3220
0
                    _mm_load_si128((__m128i *) (transform32x32[1][4])));
3221
0
            E1h = _mm_madd_epi16(m128Tmp3,
3222
0
                    _mm_load_si128((__m128i *) (transform32x32[1][4])));
3223
0
            E2l = _mm_madd_epi16(m128Tmp4,
3224
0
                    _mm_load_si128((__m128i *) (transform32x32[2][4])));
3225
0
            E2h = _mm_madd_epi16(m128Tmp5,
3226
0
                    _mm_load_si128((__m128i *) (transform32x32[2][4])));
3227
0
            E3l = _mm_madd_epi16(m128Tmp6,
3228
0
                    _mm_load_si128((__m128i *) (transform32x32[3][4])));
3229
0
            E3h = _mm_madd_epi16(m128Tmp7,
3230
0
                    _mm_load_si128((__m128i *) (transform32x32[3][4])));
3231
3232
0
            E4l = _mm_madd_epi16(m128Tmp8,
3233
0
                    _mm_load_si128((__m128i *) (transform32x32[4][4])));
3234
0
            E4h = _mm_madd_epi16(m128Tmp9,
3235
0
                    _mm_load_si128((__m128i *) (transform32x32[4][4])));
3236
0
            E5l = _mm_madd_epi16(m128Tmp10,
3237
0
                    _mm_load_si128((__m128i *) (transform32x32[5][4])));
3238
0
            E5h = _mm_madd_epi16(m128Tmp11,
3239
0
                    _mm_load_si128((__m128i *) (transform32x32[5][4])));
3240
0
            E6l = _mm_madd_epi16(m128Tmp12,
3241
0
                    _mm_load_si128((__m128i *) (transform32x32[6][4])));
3242
0
            E6h = _mm_madd_epi16(m128Tmp13,
3243
0
                    _mm_load_si128((__m128i *) (transform32x32[6][4])));
3244
0
            E7l = _mm_madd_epi16(m128Tmp14,
3245
0
                    _mm_load_si128((__m128i *) (transform32x32[7][4])));
3246
0
            E7h = _mm_madd_epi16(m128Tmp15,
3247
0
                    _mm_load_si128((__m128i *) (transform32x32[7][4])));
3248
3249
0
            O4l = _mm_add_epi32(E0l, E1l);
3250
0
            O4l = _mm_add_epi32(O4l, E2l);
3251
0
            O4l = _mm_add_epi32(O4l, E3l);
3252
0
            O4l = _mm_add_epi32(O4l, E4l);
3253
0
            O4l = _mm_add_epi32(O4l, E5l);
3254
0
            O4l = _mm_add_epi32(O4l, E6l);
3255
0
            O4l = _mm_add_epi32(O4l, E7l);
3256
3257
0
            O4h = _mm_add_epi32(E0h, E1h);
3258
0
            O4h = _mm_add_epi32(O4h, E2h);
3259
0
            O4h = _mm_add_epi32(O4h, E3h);
3260
0
            O4h = _mm_add_epi32(O4h, E4h);
3261
0
            O4h = _mm_add_epi32(O4h, E5h);
3262
0
            O4h = _mm_add_epi32(O4h, E6h);
3263
0
            O4h = _mm_add_epi32(O4h, E7h);
3264
3265
            /* Compute O5*/
3266
0
            E0l = _mm_madd_epi16(m128Tmp0,
3267
0
                    _mm_load_si128((__m128i *) (transform32x32[0][5])));
3268
0
            E0h = _mm_madd_epi16(m128Tmp1,
3269
0
                    _mm_load_si128((__m128i *) (transform32x32[0][5])));
3270
0
            E1l = _mm_madd_epi16(m128Tmp2,
3271
0
                    _mm_load_si128((__m128i *) (transform32x32[1][5])));
3272
0
            E1h = _mm_madd_epi16(m128Tmp3,
3273
0
                    _mm_load_si128((__m128i *) (transform32x32[1][5])));
3274
0
            E2l = _mm_madd_epi16(m128Tmp4,
3275
0
                    _mm_load_si128((__m128i *) (transform32x32[2][5])));
3276
0
            E2h = _mm_madd_epi16(m128Tmp5,
3277
0
                    _mm_load_si128((__m128i *) (transform32x32[2][5])));
3278
0
            E3l = _mm_madd_epi16(m128Tmp6,
3279
0
                    _mm_load_si128((__m128i *) (transform32x32[3][5])));
3280
0
            E3h = _mm_madd_epi16(m128Tmp7,
3281
0
                    _mm_load_si128((__m128i *) (transform32x32[3][5])));
3282
3283
0
            E4l = _mm_madd_epi16(m128Tmp8,
3284
0
                    _mm_load_si128((__m128i *) (transform32x32[4][5])));
3285
0
            E4h = _mm_madd_epi16(m128Tmp9,
3286
0
                    _mm_load_si128((__m128i *) (transform32x32[4][5])));
3287
0
            E5l = _mm_madd_epi16(m128Tmp10,
3288
0
                    _mm_load_si128((__m128i *) (transform32x32[5][5])));
3289
0
            E5h = _mm_madd_epi16(m128Tmp11,
3290
0
                    _mm_load_si128((__m128i *) (transform32x32[5][5])));
3291
0
            E6l = _mm_madd_epi16(m128Tmp12,
3292
0
                    _mm_load_si128((__m128i *) (transform32x32[6][5])));
3293
0
            E6h = _mm_madd_epi16(m128Tmp13,
3294
0
                    _mm_load_si128((__m128i *) (transform32x32[6][5])));
3295
0
            E7l = _mm_madd_epi16(m128Tmp14,
3296
0
                    _mm_load_si128((__m128i *) (transform32x32[7][5])));
3297
0
            E7h = _mm_madd_epi16(m128Tmp15,
3298
0
                    _mm_load_si128((__m128i *) (transform32x32[7][5])));
3299
3300
0
            O5l = _mm_add_epi32(E0l, E1l);
3301
0
            O5l = _mm_add_epi32(O5l, E2l);
3302
0
            O5l = _mm_add_epi32(O5l, E3l);
3303
0
            O5l = _mm_add_epi32(O5l, E4l);
3304
0
            O5l = _mm_add_epi32(O5l, E5l);
3305
0
            O5l = _mm_add_epi32(O5l, E6l);
3306
0
            O5l = _mm_add_epi32(O5l, E7l);
3307
3308
0
            O5h = _mm_add_epi32(E0h, E1h);
3309
0
            O5h = _mm_add_epi32(O5h, E2h);
3310
0
            O5h = _mm_add_epi32(O5h, E3h);
3311
0
            O5h = _mm_add_epi32(O5h, E4h);
3312
0
            O5h = _mm_add_epi32(O5h, E5h);
3313
0
            O5h = _mm_add_epi32(O5h, E6h);
3314
0
            O5h = _mm_add_epi32(O5h, E7h);
3315
3316
            /* Compute O6*/
3317
3318
0
            E0l = _mm_madd_epi16(m128Tmp0,
3319
0
                    _mm_load_si128((__m128i *) (transform32x32[0][6])));
3320
0
            E0h = _mm_madd_epi16(m128Tmp1,
3321
0
                    _mm_load_si128((__m128i *) (transform32x32[0][6])));
3322
0
            E1l = _mm_madd_epi16(m128Tmp2,
3323
0
                    _mm_load_si128((__m128i *) (transform32x32[1][6])));
3324
0
            E1h = _mm_madd_epi16(m128Tmp3,
3325
0
                    _mm_load_si128((__m128i *) (transform32x32[1][6])));
3326
0
            E2l = _mm_madd_epi16(m128Tmp4,
3327
0
                    _mm_load_si128((__m128i *) (transform32x32[2][6])));
3328
0
            E2h = _mm_madd_epi16(m128Tmp5,
3329
0
                    _mm_load_si128((__m128i *) (transform32x32[2][6])));
3330
0
            E3l = _mm_madd_epi16(m128Tmp6,
3331
0
                    _mm_load_si128((__m128i *) (transform32x32[3][6])));
3332
0
            E3h = _mm_madd_epi16(m128Tmp7,
3333
0
                    _mm_load_si128((__m128i *) (transform32x32[3][6])));
3334
3335
0
            E4l = _mm_madd_epi16(m128Tmp8,
3336
0
                    _mm_load_si128((__m128i *) (transform32x32[4][6])));
3337
0
            E4h = _mm_madd_epi16(m128Tmp9,
3338
0
                    _mm_load_si128((__m128i *) (transform32x32[4][6])));
3339
0
            E5l = _mm_madd_epi16(m128Tmp10,
3340
0
                    _mm_load_si128((__m128i *) (transform32x32[5][6])));
3341
0
            E5h = _mm_madd_epi16(m128Tmp11,
3342
0
                    _mm_load_si128((__m128i *) (transform32x32[5][6])));
3343
0
            E6l = _mm_madd_epi16(m128Tmp12,
3344
0
                    _mm_load_si128((__m128i *) (transform32x32[6][6])));
3345
0
            E6h = _mm_madd_epi16(m128Tmp13,
3346
0
                    _mm_load_si128((__m128i *) (transform32x32[6][6])));
3347
0
            E7l = _mm_madd_epi16(m128Tmp14,
3348
0
                    _mm_load_si128((__m128i *) (transform32x32[7][6])));
3349
0
            E7h = _mm_madd_epi16(m128Tmp15,
3350
0
                    _mm_load_si128((__m128i *) (transform32x32[7][6])));
3351
3352
0
            O6l = _mm_add_epi32(E0l, E1l);
3353
0
            O6l = _mm_add_epi32(O6l, E2l);
3354
0
            O6l = _mm_add_epi32(O6l, E3l);
3355
0
            O6l = _mm_add_epi32(O6l, E4l);
3356
0
            O6l = _mm_add_epi32(O6l, E5l);
3357
0
            O6l = _mm_add_epi32(O6l, E6l);
3358
0
            O6l = _mm_add_epi32(O6l, E7l);
3359
3360
0
            O6h = _mm_add_epi32(E0h, E1h);
3361
0
            O6h = _mm_add_epi32(O6h, E2h);
3362
0
            O6h = _mm_add_epi32(O6h, E3h);
3363
0
            O6h = _mm_add_epi32(O6h, E4h);
3364
0
            O6h = _mm_add_epi32(O6h, E5h);
3365
0
            O6h = _mm_add_epi32(O6h, E6h);
3366
0
            O6h = _mm_add_epi32(O6h, E7h);
3367
3368
            /* Compute O7*/
3369
3370
0
            E0l = _mm_madd_epi16(m128Tmp0,
3371
0
                    _mm_load_si128((__m128i *) (transform32x32[0][7])));
3372
0
            E0h = _mm_madd_epi16(m128Tmp1,
3373
0
                    _mm_load_si128((__m128i *) (transform32x32[0][7])));
3374
0
            E1l = _mm_madd_epi16(m128Tmp2,
3375
0
                    _mm_load_si128((__m128i *) (transform32x32[1][7])));
3376
0
            E1h = _mm_madd_epi16(m128Tmp3,
3377
0
                    _mm_load_si128((__m128i *) (transform32x32[1][7])));
3378
0
            E2l = _mm_madd_epi16(m128Tmp4,
3379
0
                    _mm_load_si128((__m128i *) (transform32x32[2][7])));
3380
0
            E2h = _mm_madd_epi16(m128Tmp5,
3381
0
                    _mm_load_si128((__m128i *) (transform32x32[2][7])));
3382
0
            E3l = _mm_madd_epi16(m128Tmp6,
3383
0
                    _mm_load_si128((__m128i *) (transform32x32[3][7])));
3384
0
            E3h = _mm_madd_epi16(m128Tmp7,
3385
0
                    _mm_load_si128((__m128i *) (transform32x32[3][7])));
3386
3387
0
            E4l = _mm_madd_epi16(m128Tmp8,
3388
0
                    _mm_load_si128((__m128i *) (transform32x32[4][7])));
3389
0
            E4h = _mm_madd_epi16(m128Tmp9,
3390
0
                    _mm_load_si128((__m128i *) (transform32x32[4][7])));
3391
0
            E5l = _mm_madd_epi16(m128Tmp10,
3392
0
                    _mm_load_si128((__m128i *) (transform32x32[5][7])));
3393
0
            E5h = _mm_madd_epi16(m128Tmp11,
3394
0
                    _mm_load_si128((__m128i *) (transform32x32[5][7])));
3395
0
            E6l = _mm_madd_epi16(m128Tmp12,
3396
0
                    _mm_load_si128((__m128i *) (transform32x32[6][7])));
3397
0
            E6h = _mm_madd_epi16(m128Tmp13,
3398
0
                    _mm_load_si128((__m128i *) (transform32x32[6][7])));
3399
0
            E7l = _mm_madd_epi16(m128Tmp14,
3400
0
                    _mm_load_si128((__m128i *) (transform32x32[7][7])));
3401
0
            E7h = _mm_madd_epi16(m128Tmp15,
3402
0
                    _mm_load_si128((__m128i *) (transform32x32[7][7])));
3403
3404
0
            O7l = _mm_add_epi32(E0l, E1l);
3405
0
            O7l = _mm_add_epi32(O7l, E2l);
3406
0
            O7l = _mm_add_epi32(O7l, E3l);
3407
0
            O7l = _mm_add_epi32(O7l, E4l);
3408
0
            O7l = _mm_add_epi32(O7l, E5l);
3409
0
            O7l = _mm_add_epi32(O7l, E6l);
3410
0
            O7l = _mm_add_epi32(O7l, E7l);
3411
3412
0
            O7h = _mm_add_epi32(E0h, E1h);
3413
0
            O7h = _mm_add_epi32(O7h, E2h);
3414
0
            O7h = _mm_add_epi32(O7h, E3h);
3415
0
            O7h = _mm_add_epi32(O7h, E4h);
3416
0
            O7h = _mm_add_epi32(O7h, E5h);
3417
0
            O7h = _mm_add_epi32(O7h, E6h);
3418
0
            O7h = _mm_add_epi32(O7h, E7h);
3419
3420
            /* Compute O8*/
3421
3422
0
            E0l = _mm_madd_epi16(m128Tmp0,
3423
0
                    _mm_load_si128((__m128i *) (transform32x32[0][8])));
3424
0
            E0h = _mm_madd_epi16(m128Tmp1,
3425
0
                    _mm_load_si128((__m128i *) (transform32x32[0][8])));
3426
0
            E1l = _mm_madd_epi16(m128Tmp2,
3427
0
                    _mm_load_si128((__m128i *) (transform32x32[1][8])));
3428
0
            E1h = _mm_madd_epi16(m128Tmp3,
3429
0
                    _mm_load_si128((__m128i *) (transform32x32[1][8])));
3430
0
            E2l = _mm_madd_epi16(m128Tmp4,
3431
0
                    _mm_load_si128((__m128i *) (transform32x32[2][8])));
3432
0
            E2h = _mm_madd_epi16(m128Tmp5,
3433
0
                    _mm_load_si128((__m128i *) (transform32x32[2][8])));
3434
0
            E3l = _mm_madd_epi16(m128Tmp6,
3435
0
                    _mm_load_si128((__m128i *) (transform32x32[3][8])));
3436
0
            E3h = _mm_madd_epi16(m128Tmp7,
3437
0
                    _mm_load_si128((__m128i *) (transform32x32[3][8])));
3438
3439
0
            E4l = _mm_madd_epi16(m128Tmp8,
3440
0
                    _mm_load_si128((__m128i *) (transform32x32[4][8])));
3441
0
            E4h = _mm_madd_epi16(m128Tmp9,
3442
0
                    _mm_load_si128((__m128i *) (transform32x32[4][8])));
3443
0
            E5l = _mm_madd_epi16(m128Tmp10,
3444
0
                    _mm_load_si128((__m128i *) (transform32x32[5][8])));
3445
0
            E5h = _mm_madd_epi16(m128Tmp11,
3446
0
                    _mm_load_si128((__m128i *) (transform32x32[5][8])));
3447
0
            E6l = _mm_madd_epi16(m128Tmp12,
3448
0
                    _mm_load_si128((__m128i *) (transform32x32[6][8])));
3449
0
            E6h = _mm_madd_epi16(m128Tmp13,
3450
0
                    _mm_load_si128((__m128i *) (transform32x32[6][8])));
3451
0
            E7l = _mm_madd_epi16(m128Tmp14,
3452
0
                    _mm_load_si128((__m128i *) (transform32x32[7][8])));
3453
0
            E7h = _mm_madd_epi16(m128Tmp15,
3454
0
                    _mm_load_si128((__m128i *) (transform32x32[7][8])));
3455
3456
0
            O8l = _mm_add_epi32(E0l, E1l);
3457
0
            O8l = _mm_add_epi32(O8l, E2l);
3458
0
            O8l = _mm_add_epi32(O8l, E3l);
3459
0
            O8l = _mm_add_epi32(O8l, E4l);
3460
0
            O8l = _mm_add_epi32(O8l, E5l);
3461
0
            O8l = _mm_add_epi32(O8l, E6l);
3462
0
            O8l = _mm_add_epi32(O8l, E7l);
3463
3464
0
            O8h = _mm_add_epi32(E0h, E1h);
3465
0
            O8h = _mm_add_epi32(O8h, E2h);
3466
0
            O8h = _mm_add_epi32(O8h, E3h);
3467
0
            O8h = _mm_add_epi32(O8h, E4h);
3468
0
            O8h = _mm_add_epi32(O8h, E5h);
3469
0
            O8h = _mm_add_epi32(O8h, E6h);
3470
0
            O8h = _mm_add_epi32(O8h, E7h);
3471
3472
            /* Compute O9*/
3473
3474
0
            E0l = _mm_madd_epi16(m128Tmp0,
3475
0
                    _mm_load_si128((__m128i *) (transform32x32[0][9])));
3476
0
            E0h = _mm_madd_epi16(m128Tmp1,
3477
0
                    _mm_load_si128((__m128i *) (transform32x32[0][9])));
3478
0
            E1l = _mm_madd_epi16(m128Tmp2,
3479
0
                    _mm_load_si128((__m128i *) (transform32x32[1][9])));
3480
0
            E1h = _mm_madd_epi16(m128Tmp3,
3481
0
                    _mm_load_si128((__m128i *) (transform32x32[1][9])));
3482
0
            E2l = _mm_madd_epi16(m128Tmp4,
3483
0
                    _mm_load_si128((__m128i *) (transform32x32[2][9])));
3484
0
            E2h = _mm_madd_epi16(m128Tmp5,
3485
0
                    _mm_load_si128((__m128i *) (transform32x32[2][9])));
3486
0
            E3l = _mm_madd_epi16(m128Tmp6,
3487
0
                    _mm_load_si128((__m128i *) (transform32x32[3][9])));
3488
0
            E3h = _mm_madd_epi16(m128Tmp7,
3489
0
                    _mm_load_si128((__m128i *) (transform32x32[3][9])));
3490
3491
0
            E4l = _mm_madd_epi16(m128Tmp8,
3492
0
                    _mm_load_si128((__m128i *) (transform32x32[4][9])));
3493
0
            E4h = _mm_madd_epi16(m128Tmp9,
3494
0
                    _mm_load_si128((__m128i *) (transform32x32[4][9])));
3495
0
            E5l = _mm_madd_epi16(m128Tmp10,
3496
0
                    _mm_load_si128((__m128i *) (transform32x32[5][9])));
3497
0
            E5h = _mm_madd_epi16(m128Tmp11,
3498
0
                    _mm_load_si128((__m128i *) (transform32x32[5][9])));
3499
0
            E6l = _mm_madd_epi16(m128Tmp12,
3500
0
                    _mm_load_si128((__m128i *) (transform32x32[6][9])));
3501
0
            E6h = _mm_madd_epi16(m128Tmp13,
3502
0
                    _mm_load_si128((__m128i *) (transform32x32[6][9])));
3503
0
            E7l = _mm_madd_epi16(m128Tmp14,
3504
0
                    _mm_load_si128((__m128i *) (transform32x32[7][9])));
3505
0
            E7h = _mm_madd_epi16(m128Tmp15,
3506
0
                    _mm_load_si128((__m128i *) (transform32x32[7][9])));
3507
3508
0
            O9l = _mm_add_epi32(E0l, E1l);
3509
0
            O9l = _mm_add_epi32(O9l, E2l);
3510
0
            O9l = _mm_add_epi32(O9l, E3l);
3511
0
            O9l = _mm_add_epi32(O9l, E4l);
3512
0
            O9l = _mm_add_epi32(O9l, E5l);
3513
0
            O9l = _mm_add_epi32(O9l, E6l);
3514
0
            O9l = _mm_add_epi32(O9l, E7l);
3515
3516
0
            O9h = _mm_add_epi32(E0h, E1h);
3517
0
            O9h = _mm_add_epi32(O9h, E2h);
3518
0
            O9h = _mm_add_epi32(O9h, E3h);
3519
0
            O9h = _mm_add_epi32(O9h, E4h);
3520
0
            O9h = _mm_add_epi32(O9h, E5h);
3521
0
            O9h = _mm_add_epi32(O9h, E6h);
3522
0
            O9h = _mm_add_epi32(O9h, E7h);
3523
3524
            /* Compute O10*/
3525
3526
0
            E0l = _mm_madd_epi16(m128Tmp0,
3527
0
                    _mm_load_si128((__m128i *) (transform32x32[0][10])));
3528
0
            E0h = _mm_madd_epi16(m128Tmp1,
3529
0
                    _mm_load_si128((__m128i *) (transform32x32[0][10])));
3530
0
            E1l = _mm_madd_epi16(m128Tmp2,
3531
0
                    _mm_load_si128((__m128i *) (transform32x32[1][10])));
3532
0
            E1h = _mm_madd_epi16(m128Tmp3,
3533
0
                    _mm_load_si128((__m128i *) (transform32x32[1][10])));
3534
0
            E2l = _mm_madd_epi16(m128Tmp4,
3535
0
                    _mm_load_si128((__m128i *) (transform32x32[2][10])));
3536
0
            E2h = _mm_madd_epi16(m128Tmp5,
3537
0
                    _mm_load_si128((__m128i *) (transform32x32[2][10])));
3538
0
            E3l = _mm_madd_epi16(m128Tmp6,
3539
0
                    _mm_load_si128((__m128i *) (transform32x32[3][10])));
3540
0
            E3h = _mm_madd_epi16(m128Tmp7,
3541
0
                    _mm_load_si128((__m128i *) (transform32x32[3][10])));
3542
3543
0
            E4l = _mm_madd_epi16(m128Tmp8,
3544
0
                    _mm_load_si128((__m128i *) (transform32x32[4][10])));
3545
0
            E4h = _mm_madd_epi16(m128Tmp9,
3546
0
                    _mm_load_si128((__m128i *) (transform32x32[4][10])));
3547
0
            E5l = _mm_madd_epi16(m128Tmp10,
3548
0
                    _mm_load_si128((__m128i *) (transform32x32[5][10])));
3549
0
            E5h = _mm_madd_epi16(m128Tmp11,
3550
0
                    _mm_load_si128((__m128i *) (transform32x32[5][10])));
3551
0
            E6l = _mm_madd_epi16(m128Tmp12,
3552
0
                    _mm_load_si128((__m128i *) (transform32x32[6][10])));
3553
0
            E6h = _mm_madd_epi16(m128Tmp13,
3554
0
                    _mm_load_si128((__m128i *) (transform32x32[6][10])));
3555
0
            E7l = _mm_madd_epi16(m128Tmp14,
3556
0
                    _mm_load_si128((__m128i *) (transform32x32[7][10])));
3557
0
            E7h = _mm_madd_epi16(m128Tmp15,
3558
0
                    _mm_load_si128((__m128i *) (transform32x32[7][10])));
3559
3560
0
            O10l = _mm_add_epi32(E0l, E1l);
3561
0
            O10l = _mm_add_epi32(O10l, E2l);
3562
0
            O10l = _mm_add_epi32(O10l, E3l);
3563
0
            O10l = _mm_add_epi32(O10l, E4l);
3564
0
            O10l = _mm_add_epi32(O10l, E5l);
3565
0
            O10l = _mm_add_epi32(O10l, E6l);
3566
0
            O10l = _mm_add_epi32(O10l, E7l);
3567
3568
0
            O10h = _mm_add_epi32(E0h, E1h);
3569
0
            O10h = _mm_add_epi32(O10h, E2h);
3570
0
            O10h = _mm_add_epi32(O10h, E3h);
3571
0
            O10h = _mm_add_epi32(O10h, E4h);
3572
0
            O10h = _mm_add_epi32(O10h, E5h);
3573
0
            O10h = _mm_add_epi32(O10h, E6h);
3574
0
            O10h = _mm_add_epi32(O10h, E7h);
3575
3576
            /* Compute O11*/
3577
3578
0
            E0l = _mm_madd_epi16(m128Tmp0,
3579
0
                    _mm_load_si128((__m128i *) (transform32x32[0][11])));
3580
0
            E0h = _mm_madd_epi16(m128Tmp1,
3581
0
                    _mm_load_si128((__m128i *) (transform32x32[0][11])));
3582
0
            E1l = _mm_madd_epi16(m128Tmp2,
3583
0
                    _mm_load_si128((__m128i *) (transform32x32[1][11])));
3584
0
            E1h = _mm_madd_epi16(m128Tmp3,
3585
0
                    _mm_load_si128((__m128i *) (transform32x32[1][11])));
3586
0
            E2l = _mm_madd_epi16(m128Tmp4,
3587
0
                    _mm_load_si128((__m128i *) (transform32x32[2][11])));
3588
0
            E2h = _mm_madd_epi16(m128Tmp5,
3589
0
                    _mm_load_si128((__m128i *) (transform32x32[2][11])));
3590
0
            E3l = _mm_madd_epi16(m128Tmp6,
3591
0
                    _mm_load_si128((__m128i *) (transform32x32[3][11])));
3592
0
            E3h = _mm_madd_epi16(m128Tmp7,
3593
0
                    _mm_load_si128((__m128i *) (transform32x32[3][11])));
3594
3595
0
            E4l = _mm_madd_epi16(m128Tmp8,
3596
0
                    _mm_load_si128((__m128i *) (transform32x32[4][11])));
3597
0
            E4h = _mm_madd_epi16(m128Tmp9,
3598
0
                    _mm_load_si128((__m128i *) (transform32x32[4][11])));
3599
0
            E5l = _mm_madd_epi16(m128Tmp10,
3600
0
                    _mm_load_si128((__m128i *) (transform32x32[5][11])));
3601
0
            E5h = _mm_madd_epi16(m128Tmp11,
3602
0
                    _mm_load_si128((__m128i *) (transform32x32[5][11])));
3603
0
            E6l = _mm_madd_epi16(m128Tmp12,
3604
0
                    _mm_load_si128((__m128i *) (transform32x32[6][11])));
3605
0
            E6h = _mm_madd_epi16(m128Tmp13,
3606
0
                    _mm_load_si128((__m128i *) (transform32x32[6][11])));
3607
0
            E7l = _mm_madd_epi16(m128Tmp14,
3608
0
                    _mm_load_si128((__m128i *) (transform32x32[7][11])));
3609
0
            E7h = _mm_madd_epi16(m128Tmp15,
3610
0
                    _mm_load_si128((__m128i *) (transform32x32[7][11])));
3611
3612
0
            O11l = _mm_add_epi32(E0l, E1l);
3613
0
            O11l = _mm_add_epi32(O11l, E2l);
3614
0
            O11l = _mm_add_epi32(O11l, E3l);
3615
0
            O11l = _mm_add_epi32(O11l, E4l);
3616
0
            O11l = _mm_add_epi32(O11l, E5l);
3617
0
            O11l = _mm_add_epi32(O11l, E6l);
3618
0
            O11l = _mm_add_epi32(O11l, E7l);
3619
3620
0
            O11h = _mm_add_epi32(E0h, E1h);
3621
0
            O11h = _mm_add_epi32(O11h, E2h);
3622
0
            O11h = _mm_add_epi32(O11h, E3h);
3623
0
            O11h = _mm_add_epi32(O11h, E4h);
3624
0
            O11h = _mm_add_epi32(O11h, E5h);
3625
0
            O11h = _mm_add_epi32(O11h, E6h);
3626
0
            O11h = _mm_add_epi32(O11h, E7h);
3627
3628
            /* Compute O12*/
3629
3630
0
            E0l = _mm_madd_epi16(m128Tmp0,
3631
0
                    _mm_load_si128((__m128i *) (transform32x32[0][12])));
3632
0
            E0h = _mm_madd_epi16(m128Tmp1,
3633
0
                    _mm_load_si128((__m128i *) (transform32x32[0][12])));
3634
0
            E1l = _mm_madd_epi16(m128Tmp2,
3635
0
                    _mm_load_si128((__m128i *) (transform32x32[1][12])));
3636
0
            E1h = _mm_madd_epi16(m128Tmp3,
3637
0
                    _mm_load_si128((__m128i *) (transform32x32[1][12])));
3638
0
            E2l = _mm_madd_epi16(m128Tmp4,
3639
0
                    _mm_load_si128((__m128i *) (transform32x32[2][12])));
3640
0
            E2h = _mm_madd_epi16(m128Tmp5,
3641
0
                    _mm_load_si128((__m128i *) (transform32x32[2][12])));
3642
0
            E3l = _mm_madd_epi16(m128Tmp6,
3643
0
                    _mm_load_si128((__m128i *) (transform32x32[3][12])));
3644
0
            E3h = _mm_madd_epi16(m128Tmp7,
3645
0
                    _mm_load_si128((__m128i *) (transform32x32[3][12])));
3646
3647
0
            E4l = _mm_madd_epi16(m128Tmp8,
3648
0
                    _mm_load_si128((__m128i *) (transform32x32[4][12])));
3649
0
            E4h = _mm_madd_epi16(m128Tmp9,
3650
0
                    _mm_load_si128((__m128i *) (transform32x32[4][12])));
3651
0
            E5l = _mm_madd_epi16(m128Tmp10,
3652
0
                    _mm_load_si128((__m128i *) (transform32x32[5][12])));
3653
0
            E5h = _mm_madd_epi16(m128Tmp11,
3654
0
                    _mm_load_si128((__m128i *) (transform32x32[5][12])));
3655
0
            E6l = _mm_madd_epi16(m128Tmp12,
3656
0
                    _mm_load_si128((__m128i *) (transform32x32[6][12])));
3657
0
            E6h = _mm_madd_epi16(m128Tmp13,
3658
0
                    _mm_load_si128((__m128i *) (transform32x32[6][12])));
3659
0
            E7l = _mm_madd_epi16(m128Tmp14,
3660
0
                    _mm_load_si128((__m128i *) (transform32x32[7][12])));
3661
0
            E7h = _mm_madd_epi16(m128Tmp15,
3662
0
                    _mm_load_si128((__m128i *) (transform32x32[7][12])));
3663
3664
0
            O12l = _mm_add_epi32(E0l, E1l);
3665
0
            O12l = _mm_add_epi32(O12l, E2l);
3666
0
            O12l = _mm_add_epi32(O12l, E3l);
3667
0
            O12l = _mm_add_epi32(O12l, E4l);
3668
0
            O12l = _mm_add_epi32(O12l, E5l);
3669
0
            O12l = _mm_add_epi32(O12l, E6l);
3670
0
            O12l = _mm_add_epi32(O12l, E7l);
3671
3672
0
            O12h = _mm_add_epi32(E0h, E1h);
3673
0
            O12h = _mm_add_epi32(O12h, E2h);
3674
0
            O12h = _mm_add_epi32(O12h, E3h);
3675
0
            O12h = _mm_add_epi32(O12h, E4h);
3676
0
            O12h = _mm_add_epi32(O12h, E5h);
3677
0
            O12h = _mm_add_epi32(O12h, E6h);
3678
0
            O12h = _mm_add_epi32(O12h, E7h);
3679
3680
            /* Compute O13*/
3681
3682
0
            E0l = _mm_madd_epi16(m128Tmp0,
3683
0
                    _mm_load_si128((__m128i *) (transform32x32[0][13])));
3684
0
            E0h = _mm_madd_epi16(m128Tmp1,
3685
0
                    _mm_load_si128((__m128i *) (transform32x32[0][13])));
3686
0
            E1l = _mm_madd_epi16(m128Tmp2,
3687
0
                    _mm_load_si128((__m128i *) (transform32x32[1][13])));
3688
0
            E1h = _mm_madd_epi16(m128Tmp3,
3689
0
                    _mm_load_si128((__m128i *) (transform32x32[1][13])));
3690
0
            E2l = _mm_madd_epi16(m128Tmp4,
3691
0
                    _mm_load_si128((__m128i *) (transform32x32[2][13])));
3692
0
            E2h = _mm_madd_epi16(m128Tmp5,
3693
0
                    _mm_load_si128((__m128i *) (transform32x32[2][13])));
3694
0
            E3l = _mm_madd_epi16(m128Tmp6,
3695
0
                    _mm_load_si128((__m128i *) (transform32x32[3][13])));
3696
0
            E3h = _mm_madd_epi16(m128Tmp7,
3697
0
                    _mm_load_si128((__m128i *) (transform32x32[3][13])));
3698
3699
0
            E4l = _mm_madd_epi16(m128Tmp8,
3700
0
                    _mm_load_si128((__m128i *) (transform32x32[4][13])));
3701
0
            E4h = _mm_madd_epi16(m128Tmp9,
3702
0
                    _mm_load_si128((__m128i *) (transform32x32[4][13])));
3703
0
            E5l = _mm_madd_epi16(m128Tmp10,
3704
0
                    _mm_load_si128((__m128i *) (transform32x32[5][13])));
3705
0
            E5h = _mm_madd_epi16(m128Tmp11,
3706
0
                    _mm_load_si128((__m128i *) (transform32x32[5][13])));
3707
0
            E6l = _mm_madd_epi16(m128Tmp12,
3708
0
                    _mm_load_si128((__m128i *) (transform32x32[6][13])));
3709
0
            E6h = _mm_madd_epi16(m128Tmp13,
3710
0
                    _mm_load_si128((__m128i *) (transform32x32[6][13])));
3711
0
            E7l = _mm_madd_epi16(m128Tmp14,
3712
0
                    _mm_load_si128((__m128i *) (transform32x32[7][13])));
3713
0
            E7h = _mm_madd_epi16(m128Tmp15,
3714
0
                    _mm_load_si128((__m128i *) (transform32x32[7][13])));
3715
3716
0
            O13l = _mm_add_epi32(E0l, E1l);
3717
0
            O13l = _mm_add_epi32(O13l, E2l);
3718
0
            O13l = _mm_add_epi32(O13l, E3l);
3719
0
            O13l = _mm_add_epi32(O13l, E4l);
3720
0
            O13l = _mm_add_epi32(O13l, E5l);
3721
0
            O13l = _mm_add_epi32(O13l, E6l);
3722
0
            O13l = _mm_add_epi32(O13l, E7l);
3723
3724
0
            O13h = _mm_add_epi32(E0h, E1h);
3725
0
            O13h = _mm_add_epi32(O13h, E2h);
3726
0
            O13h = _mm_add_epi32(O13h, E3h);
3727
0
            O13h = _mm_add_epi32(O13h, E4h);
3728
0
            O13h = _mm_add_epi32(O13h, E5h);
3729
0
            O13h = _mm_add_epi32(O13h, E6h);
3730
0
            O13h = _mm_add_epi32(O13h, E7h);
3731
3732
            /* Compute O14  */
3733
3734
0
            E0l = _mm_madd_epi16(m128Tmp0,
3735
0
                    _mm_load_si128((__m128i *) (transform32x32[0][14])));
3736
0
            E0h = _mm_madd_epi16(m128Tmp1,
3737
0
                    _mm_load_si128((__m128i *) (transform32x32[0][14])));
3738
0
            E1l = _mm_madd_epi16(m128Tmp2,
3739
0
                    _mm_load_si128((__m128i *) (transform32x32[1][14])));
3740
0
            E1h = _mm_madd_epi16(m128Tmp3,
3741
0
                    _mm_load_si128((__m128i *) (transform32x32[1][14])));
3742
0
            E2l = _mm_madd_epi16(m128Tmp4,
3743
0
                    _mm_load_si128((__m128i *) (transform32x32[2][14])));
3744
0
            E2h = _mm_madd_epi16(m128Tmp5,
3745
0
                    _mm_load_si128((__m128i *) (transform32x32[2][14])));
3746
0
            E3l = _mm_madd_epi16(m128Tmp6,
3747
0
                    _mm_load_si128((__m128i *) (transform32x32[3][14])));
3748
0
            E3h = _mm_madd_epi16(m128Tmp7,
3749
0
                    _mm_load_si128((__m128i *) (transform32x32[3][14])));
3750
3751
0
            E4l = _mm_madd_epi16(m128Tmp8,
3752
0
                    _mm_load_si128((__m128i *) (transform32x32[4][14])));
3753
0
            E4h = _mm_madd_epi16(m128Tmp9,
3754
0
                    _mm_load_si128((__m128i *) (transform32x32[4][14])));
3755
0
            E5l = _mm_madd_epi16(m128Tmp10,
3756
0
                    _mm_load_si128((__m128i *) (transform32x32[5][14])));
3757
0
            E5h = _mm_madd_epi16(m128Tmp11,
3758
0
                    _mm_load_si128((__m128i *) (transform32x32[5][14])));
3759
0
            E6l = _mm_madd_epi16(m128Tmp12,
3760
0
                    _mm_load_si128((__m128i *) (transform32x32[6][14])));
3761
0
            E6h = _mm_madd_epi16(m128Tmp13,
3762
0
                    _mm_load_si128((__m128i *) (transform32x32[6][14])));
3763
0
            E7l = _mm_madd_epi16(m128Tmp14,
3764
0
                    _mm_load_si128((__m128i *) (transform32x32[7][14])));
3765
0
            E7h = _mm_madd_epi16(m128Tmp15,
3766
0
                    _mm_load_si128((__m128i *) (transform32x32[7][14])));
3767
3768
0
            O14l = _mm_add_epi32(E0l, E1l);
3769
0
            O14l = _mm_add_epi32(O14l, E2l);
3770
0
            O14l = _mm_add_epi32(O14l, E3l);
3771
0
            O14l = _mm_add_epi32(O14l, E4l);
3772
0
            O14l = _mm_add_epi32(O14l, E5l);
3773
0
            O14l = _mm_add_epi32(O14l, E6l);
3774
0
            O14l = _mm_add_epi32(O14l, E7l);
3775
3776
0
            O14h = _mm_add_epi32(E0h, E1h);
3777
0
            O14h = _mm_add_epi32(O14h, E2h);
3778
0
            O14h = _mm_add_epi32(O14h, E3h);
3779
0
            O14h = _mm_add_epi32(O14h, E4h);
3780
0
            O14h = _mm_add_epi32(O14h, E5h);
3781
0
            O14h = _mm_add_epi32(O14h, E6h);
3782
0
            O14h = _mm_add_epi32(O14h, E7h);
3783
3784
            /* Compute O15*/
3785
3786
0
            E0l = _mm_madd_epi16(m128Tmp0,
3787
0
                    _mm_load_si128((__m128i *) (transform32x32[0][15])));
3788
0
            E0h = _mm_madd_epi16(m128Tmp1,
3789
0
                    _mm_load_si128((__m128i *) (transform32x32[0][15])));
3790
0
            E1l = _mm_madd_epi16(m128Tmp2,
3791
0
                    _mm_load_si128((__m128i *) (transform32x32[1][15])));
3792
0
            E1h = _mm_madd_epi16(m128Tmp3,
3793
0
                    _mm_load_si128((__m128i *) (transform32x32[1][15])));
3794
0
            E2l = _mm_madd_epi16(m128Tmp4,
3795
0
                    _mm_load_si128((__m128i *) (transform32x32[2][15])));
3796
0
            E2h = _mm_madd_epi16(m128Tmp5,
3797
0
                    _mm_load_si128((__m128i *) (transform32x32[2][15])));
3798
0
            E3l = _mm_madd_epi16(m128Tmp6,
3799
0
                    _mm_load_si128((__m128i *) (transform32x32[3][15])));
3800
0
            E3h = _mm_madd_epi16(m128Tmp7,
3801
0
                    _mm_load_si128((__m128i *) (transform32x32[3][15])));
3802
3803
0
            E4l = _mm_madd_epi16(m128Tmp8,
3804
0
                    _mm_load_si128((__m128i *) (transform32x32[4][15])));
3805
0
            E4h = _mm_madd_epi16(m128Tmp9,
3806
0
                    _mm_load_si128((__m128i *) (transform32x32[4][15])));
3807
0
            E5l = _mm_madd_epi16(m128Tmp10,
3808
0
                    _mm_load_si128((__m128i *) (transform32x32[5][15])));
3809
0
            E5h = _mm_madd_epi16(m128Tmp11,
3810
0
                    _mm_load_si128((__m128i *) (transform32x32[5][15])));
3811
0
            E6l = _mm_madd_epi16(m128Tmp12,
3812
0
                    _mm_load_si128((__m128i *) (transform32x32[6][15])));
3813
0
            E6h = _mm_madd_epi16(m128Tmp13,
3814
0
                    _mm_load_si128((__m128i *) (transform32x32[6][15])));
3815
0
            E7l = _mm_madd_epi16(m128Tmp14,
3816
0
                    _mm_load_si128((__m128i *) (transform32x32[7][15])));
3817
0
            E7h = _mm_madd_epi16(m128Tmp15,
3818
0
                    _mm_load_si128((__m128i *) (transform32x32[7][15])));
3819
3820
0
            O15l = _mm_add_epi32(E0l, E1l);
3821
0
            O15l = _mm_add_epi32(O15l, E2l);
3822
0
            O15l = _mm_add_epi32(O15l, E3l);
3823
0
            O15l = _mm_add_epi32(O15l, E4l);
3824
0
            O15l = _mm_add_epi32(O15l, E5l);
3825
0
            O15l = _mm_add_epi32(O15l, E6l);
3826
0
            O15l = _mm_add_epi32(O15l, E7l);
3827
3828
0
            O15h = _mm_add_epi32(E0h, E1h);
3829
0
            O15h = _mm_add_epi32(O15h, E2h);
3830
0
            O15h = _mm_add_epi32(O15h, E3h);
3831
0
            O15h = _mm_add_epi32(O15h, E4h);
3832
0
            O15h = _mm_add_epi32(O15h, E5h);
3833
0
            O15h = _mm_add_epi32(O15h, E6h);
3834
0
            O15h = _mm_add_epi32(O15h, E7h);
3835
            /*  Compute E0  */
3836
3837
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
3838
0
            E0l = _mm_madd_epi16(m128Tmp0,
3839
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3840
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
3841
0
            E0h = _mm_madd_epi16(m128Tmp1,
3842
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3843
3844
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
3845
0
            E0l = _mm_add_epi32(E0l,
3846
0
                    _mm_madd_epi16(m128Tmp2,
3847
0
                            _mm_load_si128(
3848
0
                                    (__m128i *) (transform16x16_1[1][0]))));
3849
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
3850
0
            E0h = _mm_add_epi32(E0h,
3851
0
                    _mm_madd_epi16(m128Tmp3,
3852
0
                            _mm_load_si128(
3853
0
                                    (__m128i *) (transform16x16_1[1][0]))));
3854
3855
0
            m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
3856
0
            E0l = _mm_add_epi32(E0l,
3857
0
                    _mm_madd_epi16(m128Tmp4,
3858
0
                            _mm_load_si128(
3859
0
                                    (__m128i *) (transform16x16_1[2][0]))));
3860
0
            m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
3861
0
            E0h = _mm_add_epi32(E0h,
3862
0
                    _mm_madd_epi16(m128Tmp5,
3863
0
                            _mm_load_si128(
3864
0
                                    (__m128i *) (transform16x16_1[2][0]))));
3865
3866
0
            m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
3867
0
            E0l = _mm_add_epi32(E0l,
3868
0
                    _mm_madd_epi16(m128Tmp6,
3869
0
                            _mm_load_si128(
3870
0
                                    (__m128i *) (transform16x16_1[3][0]))));
3871
0
            m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
3872
0
            E0h = _mm_add_epi32(E0h,
3873
0
                    _mm_madd_epi16(m128Tmp7,
3874
0
                            _mm_load_si128(
3875
0
                                    (__m128i *) (transform16x16_1[3][0]))));
3876
3877
            /*  Compute E1  */
3878
0
            E1l = _mm_madd_epi16(m128Tmp0,
3879
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3880
0
            E1h = _mm_madd_epi16(m128Tmp1,
3881
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3882
0
            E1l = _mm_add_epi32(E1l,
3883
0
                    _mm_madd_epi16(m128Tmp2,
3884
0
                            _mm_load_si128(
3885
0
                                    (__m128i *) (transform16x16_1[1][1]))));
3886
0
            E1h = _mm_add_epi32(E1h,
3887
0
                    _mm_madd_epi16(m128Tmp3,
3888
0
                            _mm_load_si128(
3889
0
                                    (__m128i *) (transform16x16_1[1][1]))));
3890
0
            E1l = _mm_add_epi32(E1l,
3891
0
                    _mm_madd_epi16(m128Tmp4,
3892
0
                            _mm_load_si128(
3893
0
                                    (__m128i *) (transform16x16_1[2][1]))));
3894
0
            E1h = _mm_add_epi32(E1h,
3895
0
                    _mm_madd_epi16(m128Tmp5,
3896
0
                            _mm_load_si128(
3897
0
                                    (__m128i *) (transform16x16_1[2][1]))));
3898
0
            E1l = _mm_add_epi32(E1l,
3899
0
                    _mm_madd_epi16(m128Tmp6,
3900
0
                            _mm_load_si128(
3901
0
                                    (__m128i *) (transform16x16_1[3][1]))));
3902
0
            E1h = _mm_add_epi32(E1h,
3903
0
                    _mm_madd_epi16(m128Tmp7,
3904
0
                            _mm_load_si128(
3905
0
                                    (__m128i *) (transform16x16_1[3][1]))));
3906
3907
            /*  Compute E2  */
3908
0
            E2l = _mm_madd_epi16(m128Tmp0,
3909
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3910
0
            E2h = _mm_madd_epi16(m128Tmp1,
3911
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3912
0
            E2l = _mm_add_epi32(E2l,
3913
0
                    _mm_madd_epi16(m128Tmp2,
3914
0
                            _mm_load_si128(
3915
0
                                    (__m128i *) (transform16x16_1[1][2]))));
3916
0
            E2h = _mm_add_epi32(E2h,
3917
0
                    _mm_madd_epi16(m128Tmp3,
3918
0
                            _mm_load_si128(
3919
0
                                    (__m128i *) (transform16x16_1[1][2]))));
3920
0
            E2l = _mm_add_epi32(E2l,
3921
0
                    _mm_madd_epi16(m128Tmp4,
3922
0
                            _mm_load_si128(
3923
0
                                    (__m128i *) (transform16x16_1[2][2]))));
3924
0
            E2h = _mm_add_epi32(E2h,
3925
0
                    _mm_madd_epi16(m128Tmp5,
3926
0
                            _mm_load_si128(
3927
0
                                    (__m128i *) (transform16x16_1[2][2]))));
3928
0
            E2l = _mm_add_epi32(E2l,
3929
0
                    _mm_madd_epi16(m128Tmp6,
3930
0
                            _mm_load_si128(
3931
0
                                    (__m128i *) (transform16x16_1[3][2]))));
3932
0
            E2h = _mm_add_epi32(E2h,
3933
0
                    _mm_madd_epi16(m128Tmp7,
3934
0
                            _mm_load_si128(
3935
0
                                    (__m128i *) (transform16x16_1[3][2]))));
3936
3937
            /*  Compute E3  */
3938
0
            E3l = _mm_madd_epi16(m128Tmp0,
3939
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3940
0
            E3h = _mm_madd_epi16(m128Tmp1,
3941
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3942
0
            E3l = _mm_add_epi32(E3l,
3943
0
                    _mm_madd_epi16(m128Tmp2,
3944
0
                            _mm_load_si128(
3945
0
                                    (__m128i *) (transform16x16_1[1][3]))));
3946
0
            E3h = _mm_add_epi32(E3h,
3947
0
                    _mm_madd_epi16(m128Tmp3,
3948
0
                            _mm_load_si128(
3949
0
                                    (__m128i *) (transform16x16_1[1][3]))));
3950
0
            E3l = _mm_add_epi32(E3l,
3951
0
                    _mm_madd_epi16(m128Tmp4,
3952
0
                            _mm_load_si128(
3953
0
                                    (__m128i *) (transform16x16_1[2][3]))));
3954
0
            E3h = _mm_add_epi32(E3h,
3955
0
                    _mm_madd_epi16(m128Tmp5,
3956
0
                            _mm_load_si128(
3957
0
                                    (__m128i *) (transform16x16_1[2][3]))));
3958
0
            E3l = _mm_add_epi32(E3l,
3959
0
                    _mm_madd_epi16(m128Tmp6,
3960
0
                            _mm_load_si128(
3961
0
                                    (__m128i *) (transform16x16_1[3][3]))));
3962
0
            E3h = _mm_add_epi32(E3h,
3963
0
                    _mm_madd_epi16(m128Tmp7,
3964
0
                            _mm_load_si128(
3965
0
                                    (__m128i *) (transform16x16_1[3][3]))));
3966
3967
            /*  Compute E4  */
3968
0
            E4l = _mm_madd_epi16(m128Tmp0,
3969
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3970
0
            E4h = _mm_madd_epi16(m128Tmp1,
3971
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3972
0
            E4l = _mm_add_epi32(E4l,
3973
0
                    _mm_madd_epi16(m128Tmp2,
3974
0
                            _mm_load_si128(
3975
0
                                    (__m128i *) (transform16x16_1[1][4]))));
3976
0
            E4h = _mm_add_epi32(E4h,
3977
0
                    _mm_madd_epi16(m128Tmp3,
3978
0
                            _mm_load_si128(
3979
0
                                    (__m128i *) (transform16x16_1[1][4]))));
3980
0
            E4l = _mm_add_epi32(E4l,
3981
0
                    _mm_madd_epi16(m128Tmp4,
3982
0
                            _mm_load_si128(
3983
0
                                    (__m128i *) (transform16x16_1[2][4]))));
3984
0
            E4h = _mm_add_epi32(E4h,
3985
0
                    _mm_madd_epi16(m128Tmp5,
3986
0
                            _mm_load_si128(
3987
0
                                    (__m128i *) (transform16x16_1[2][4]))));
3988
0
            E4l = _mm_add_epi32(E4l,
3989
0
                    _mm_madd_epi16(m128Tmp6,
3990
0
                            _mm_load_si128(
3991
0
                                    (__m128i *) (transform16x16_1[3][4]))));
3992
0
            E4h = _mm_add_epi32(E4h,
3993
0
                    _mm_madd_epi16(m128Tmp7,
3994
0
                            _mm_load_si128(
3995
0
                                    (__m128i *) (transform16x16_1[3][4]))));
3996
3997
            /*  Compute E5  */
3998
0
            E5l = _mm_madd_epi16(m128Tmp0,
3999
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
4000
0
            E5h = _mm_madd_epi16(m128Tmp1,
4001
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
4002
0
            E5l = _mm_add_epi32(E5l,
4003
0
                    _mm_madd_epi16(m128Tmp2,
4004
0
                            _mm_load_si128(
4005
0
                                    (__m128i *) (transform16x16_1[1][5]))));
4006
0
            E5h = _mm_add_epi32(E5h,
4007
0
                    _mm_madd_epi16(m128Tmp3,
4008
0
                            _mm_load_si128(
4009
0
                                    (__m128i *) (transform16x16_1[1][5]))));
4010
0
            E5l = _mm_add_epi32(E5l,
4011
0
                    _mm_madd_epi16(m128Tmp4,
4012
0
                            _mm_load_si128(
4013
0
                                    (__m128i *) (transform16x16_1[2][5]))));
4014
0
            E5h = _mm_add_epi32(E5h,
4015
0
                    _mm_madd_epi16(m128Tmp5,
4016
0
                            _mm_load_si128(
4017
0
                                    (__m128i *) (transform16x16_1[2][5]))));
4018
0
            E5l = _mm_add_epi32(E5l,
4019
0
                    _mm_madd_epi16(m128Tmp6,
4020
0
                            _mm_load_si128(
4021
0
                                    (__m128i *) (transform16x16_1[3][5]))));
4022
0
            E5h = _mm_add_epi32(E5h,
4023
0
                    _mm_madd_epi16(m128Tmp7,
4024
0
                            _mm_load_si128(
4025
0
                                    (__m128i *) (transform16x16_1[3][5]))));
4026
4027
            /*  Compute E6  */
4028
0
            E6l = _mm_madd_epi16(m128Tmp0,
4029
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4030
0
            E6h = _mm_madd_epi16(m128Tmp1,
4031
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4032
0
            E6l = _mm_add_epi32(E6l,
4033
0
                    _mm_madd_epi16(m128Tmp2,
4034
0
                            _mm_load_si128(
4035
0
                                    (__m128i *) (transform16x16_1[1][6]))));
4036
0
            E6h = _mm_add_epi32(E6h,
4037
0
                    _mm_madd_epi16(m128Tmp3,
4038
0
                            _mm_load_si128(
4039
0
                                    (__m128i *) (transform16x16_1[1][6]))));
4040
0
            E6l = _mm_add_epi32(E6l,
4041
0
                    _mm_madd_epi16(m128Tmp4,
4042
0
                            _mm_load_si128(
4043
0
                                    (__m128i *) (transform16x16_1[2][6]))));
4044
0
            E6h = _mm_add_epi32(E6h,
4045
0
                    _mm_madd_epi16(m128Tmp5,
4046
0
                            _mm_load_si128(
4047
0
                                    (__m128i *) (transform16x16_1[2][6]))));
4048
0
            E6l = _mm_add_epi32(E6l,
4049
0
                    _mm_madd_epi16(m128Tmp6,
4050
0
                            _mm_load_si128(
4051
0
                                    (__m128i *) (transform16x16_1[3][6]))));
4052
0
            E6h = _mm_add_epi32(E6h,
4053
0
                    _mm_madd_epi16(m128Tmp7,
4054
0
                            _mm_load_si128(
4055
0
                                    (__m128i *) (transform16x16_1[3][6]))));
4056
4057
            /*  Compute E7  */
4058
0
            E7l = _mm_madd_epi16(m128Tmp0,
4059
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4060
0
            E7h = _mm_madd_epi16(m128Tmp1,
4061
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4062
0
            E7l = _mm_add_epi32(E7l,
4063
0
                    _mm_madd_epi16(m128Tmp2,
4064
0
                            _mm_load_si128(
4065
0
                                    (__m128i *) (transform16x16_1[1][7]))));
4066
0
            E7h = _mm_add_epi32(E7h,
4067
0
                    _mm_madd_epi16(m128Tmp3,
4068
0
                            _mm_load_si128(
4069
0
                                    (__m128i *) (transform16x16_1[1][7]))));
4070
0
            E7l = _mm_add_epi32(E7l,
4071
0
                    _mm_madd_epi16(m128Tmp4,
4072
0
                            _mm_load_si128(
4073
0
                                    (__m128i *) (transform16x16_1[2][7]))));
4074
0
            E7h = _mm_add_epi32(E7h,
4075
0
                    _mm_madd_epi16(m128Tmp5,
4076
0
                            _mm_load_si128(
4077
0
                                    (__m128i *) (transform16x16_1[2][7]))));
4078
0
            E7l = _mm_add_epi32(E7l,
4079
0
                    _mm_madd_epi16(m128Tmp6,
4080
0
                            _mm_load_si128(
4081
0
                                    (__m128i *) (transform16x16_1[3][7]))));
4082
0
            E7h = _mm_add_epi32(E7h,
4083
0
                    _mm_madd_epi16(m128Tmp7,
4084
0
                            _mm_load_si128(
4085
0
                                    (__m128i *) (transform16x16_1[3][7]))));
4086
4087
            /*  Compute E00, E01, E02 and E03 */
4088
4089
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
4090
0
            E00l = _mm_madd_epi16(m128Tmp0,
4091
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4092
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
4093
0
            E00h = _mm_madd_epi16(m128Tmp1,
4094
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4095
4096
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
4097
0
            E00l = _mm_add_epi32(E00l,
4098
0
                    _mm_madd_epi16(m128Tmp2,
4099
0
                            _mm_load_si128(
4100
0
                                    (__m128i *) (transform16x16_2[1][0]))));
4101
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
4102
0
            E00h = _mm_add_epi32(E00h,
4103
0
                    _mm_madd_epi16(m128Tmp3,
4104
0
                            _mm_load_si128(
4105
0
                                    (__m128i *) (transform16x16_2[1][0]))));
4106
4107
0
            E01l = _mm_madd_epi16(m128Tmp0,
4108
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4109
0
            E01h = _mm_madd_epi16(m128Tmp1,
4110
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4111
0
            E01l = _mm_add_epi32(E01l,
4112
0
                    _mm_madd_epi16(m128Tmp2,
4113
0
                            _mm_load_si128(
4114
0
                                    (__m128i *) (transform16x16_2[1][1]))));
4115
0
            E01h = _mm_add_epi32(E01h,
4116
0
                    _mm_madd_epi16(m128Tmp3,
4117
0
                            _mm_load_si128(
4118
0
                                    (__m128i *) (transform16x16_2[1][1]))));
4119
4120
0
            E02l = _mm_madd_epi16(m128Tmp0,
4121
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4122
0
            E02h = _mm_madd_epi16(m128Tmp1,
4123
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4124
0
            E02l = _mm_add_epi32(E02l,
4125
0
                    _mm_madd_epi16(m128Tmp2,
4126
0
                            _mm_load_si128(
4127
0
                                    (__m128i *) (transform16x16_2[1][2]))));
4128
0
            E02h = _mm_add_epi32(E02h,
4129
0
                    _mm_madd_epi16(m128Tmp3,
4130
0
                            _mm_load_si128(
4131
0
                                    (__m128i *) (transform16x16_2[1][2]))));
4132
4133
0
            E03l = _mm_madd_epi16(m128Tmp0,
4134
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4135
0
            E03h = _mm_madd_epi16(m128Tmp1,
4136
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4137
0
            E03l = _mm_add_epi32(E03l,
4138
0
                    _mm_madd_epi16(m128Tmp2,
4139
0
                            _mm_load_si128(
4140
0
                                    (__m128i *) (transform16x16_2[1][3]))));
4141
0
            E03h = _mm_add_epi32(E03h,
4142
0
                    _mm_madd_epi16(m128Tmp3,
4143
0
                            _mm_load_si128(
4144
0
                                    (__m128i *) (transform16x16_2[1][3]))));
4145
4146
            /*  Compute EE0 and EEE */
4147
4148
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
4149
0
            EE0l = _mm_madd_epi16(m128Tmp0,
4150
0
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4151
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
4152
0
            EE0h = _mm_madd_epi16(m128Tmp1,
4153
0
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4154
4155
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
4156
0
            EEE0l = _mm_madd_epi16(m128Tmp2,
4157
0
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4158
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
4159
0
            EEE0h = _mm_madd_epi16(m128Tmp3,
4160
0
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4161
4162
0
            EE1l = _mm_madd_epi16(m128Tmp0,
4163
0
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4164
0
            EE1h = _mm_madd_epi16(m128Tmp1,
4165
0
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4166
4167
0
            EEE1l = _mm_madd_epi16(m128Tmp2,
4168
0
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4169
0
            EEE1h = _mm_madd_epi16(m128Tmp3,
4170
0
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4171
4172
            /*  Compute EE    */
4173
4174
0
            EE2l = _mm_sub_epi32(EEE1l, EE1l);
4175
0
            EE3l = _mm_sub_epi32(EEE0l, EE0l);
4176
0
            EE2h = _mm_sub_epi32(EEE1h, EE1h);
4177
0
            EE3h = _mm_sub_epi32(EEE0h, EE0h);
4178
4179
0
            EE0l = _mm_add_epi32(EEE0l, EE0l);
4180
0
            EE1l = _mm_add_epi32(EEE1l, EE1l);
4181
0
            EE0h = _mm_add_epi32(EEE0h, EE0h);
4182
0
            EE1h = _mm_add_epi32(EEE1h, EE1h);
4183
            /**/
4184
4185
0
            EE7l = _mm_sub_epi32(EE0l, E00l);
4186
0
            EE6l = _mm_sub_epi32(EE1l, E01l);
4187
0
            EE5l = _mm_sub_epi32(EE2l, E02l);
4188
0
            EE4l = _mm_sub_epi32(EE3l, E03l);
4189
4190
0
            EE7h = _mm_sub_epi32(EE0h, E00h);
4191
0
            EE6h = _mm_sub_epi32(EE1h, E01h);
4192
0
            EE5h = _mm_sub_epi32(EE2h, E02h);
4193
0
            EE4h = _mm_sub_epi32(EE3h, E03h);
4194
4195
0
            EE0l = _mm_add_epi32(EE0l, E00l);
4196
0
            EE1l = _mm_add_epi32(EE1l, E01l);
4197
0
            EE2l = _mm_add_epi32(EE2l, E02l);
4198
0
            EE3l = _mm_add_epi32(EE3l, E03l);
4199
4200
0
            EE0h = _mm_add_epi32(EE0h, E00h);
4201
0
            EE1h = _mm_add_epi32(EE1h, E01h);
4202
0
            EE2h = _mm_add_epi32(EE2h, E02h);
4203
0
            EE3h = _mm_add_epi32(EE3h, E03h);
4204
            /*      Compute E       */
4205
4206
0
            E15l = _mm_sub_epi32(EE0l, E0l);
4207
0
            E15l = _mm_add_epi32(E15l, m128iAdd);
4208
0
            E14l = _mm_sub_epi32(EE1l, E1l);
4209
0
            E14l = _mm_add_epi32(E14l, m128iAdd);
4210
0
            E13l = _mm_sub_epi32(EE2l, E2l);
4211
0
            E13l = _mm_add_epi32(E13l, m128iAdd);
4212
0
            E12l = _mm_sub_epi32(EE3l, E3l);
4213
0
            E12l = _mm_add_epi32(E12l, m128iAdd);
4214
0
            E11l = _mm_sub_epi32(EE4l, E4l);
4215
0
            E11l = _mm_add_epi32(E11l, m128iAdd);
4216
0
            E10l = _mm_sub_epi32(EE5l, E5l);
4217
0
            E10l = _mm_add_epi32(E10l, m128iAdd);
4218
0
            E9l = _mm_sub_epi32(EE6l, E6l);
4219
0
            E9l = _mm_add_epi32(E9l, m128iAdd);
4220
0
            E8l = _mm_sub_epi32(EE7l, E7l);
4221
0
            E8l = _mm_add_epi32(E8l, m128iAdd);
4222
4223
0
            E0l = _mm_add_epi32(EE0l, E0l);
4224
0
            E0l = _mm_add_epi32(E0l, m128iAdd);
4225
0
            E1l = _mm_add_epi32(EE1l, E1l);
4226
0
            E1l = _mm_add_epi32(E1l, m128iAdd);
4227
0
            E2l = _mm_add_epi32(EE2l, E2l);
4228
0
            E2l = _mm_add_epi32(E2l, m128iAdd);
4229
0
            E3l = _mm_add_epi32(EE3l, E3l);
4230
0
            E3l = _mm_add_epi32(E3l, m128iAdd);
4231
0
            E4l = _mm_add_epi32(EE4l, E4l);
4232
0
            E4l = _mm_add_epi32(E4l, m128iAdd);
4233
0
            E5l = _mm_add_epi32(EE5l, E5l);
4234
0
            E5l = _mm_add_epi32(E5l, m128iAdd);
4235
0
            E6l = _mm_add_epi32(EE6l, E6l);
4236
0
            E6l = _mm_add_epi32(E6l, m128iAdd);
4237
0
            E7l = _mm_add_epi32(EE7l, E7l);
4238
0
            E7l = _mm_add_epi32(E7l, m128iAdd);
4239
4240
0
            E15h = _mm_sub_epi32(EE0h, E0h);
4241
0
            E15h = _mm_add_epi32(E15h, m128iAdd);
4242
0
            E14h = _mm_sub_epi32(EE1h, E1h);
4243
0
            E14h = _mm_add_epi32(E14h, m128iAdd);
4244
0
            E13h = _mm_sub_epi32(EE2h, E2h);
4245
0
            E13h = _mm_add_epi32(E13h, m128iAdd);
4246
0
            E12h = _mm_sub_epi32(EE3h, E3h);
4247
0
            E12h = _mm_add_epi32(E12h, m128iAdd);
4248
0
            E11h = _mm_sub_epi32(EE4h, E4h);
4249
0
            E11h = _mm_add_epi32(E11h, m128iAdd);
4250
0
            E10h = _mm_sub_epi32(EE5h, E5h);
4251
0
            E10h = _mm_add_epi32(E10h, m128iAdd);
4252
0
            E9h = _mm_sub_epi32(EE6h, E6h);
4253
0
            E9h = _mm_add_epi32(E9h, m128iAdd);
4254
0
            E8h = _mm_sub_epi32(EE7h, E7h);
4255
0
            E8h = _mm_add_epi32(E8h, m128iAdd);
4256
4257
0
            E0h = _mm_add_epi32(EE0h, E0h);
4258
0
            E0h = _mm_add_epi32(E0h, m128iAdd);
4259
0
            E1h = _mm_add_epi32(EE1h, E1h);
4260
0
            E1h = _mm_add_epi32(E1h, m128iAdd);
4261
0
            E2h = _mm_add_epi32(EE2h, E2h);
4262
0
            E2h = _mm_add_epi32(E2h, m128iAdd);
4263
0
            E3h = _mm_add_epi32(EE3h, E3h);
4264
0
            E3h = _mm_add_epi32(E3h, m128iAdd);
4265
0
            E4h = _mm_add_epi32(EE4h, E4h);
4266
0
            E4h = _mm_add_epi32(E4h, m128iAdd);
4267
0
            E5h = _mm_add_epi32(EE5h, E5h);
4268
0
            E5h = _mm_add_epi32(E5h, m128iAdd);
4269
0
            E6h = _mm_add_epi32(EE6h, E6h);
4270
0
            E6h = _mm_add_epi32(E6h, m128iAdd);
4271
0
            E7h = _mm_add_epi32(EE7h, E7h);
4272
0
            E7h = _mm_add_epi32(E7h, m128iAdd);
4273
4274
0
            m128iS0 = _mm_packs_epi32(
4275
0
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
4276
0
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
4277
0
            m128iS1 = _mm_packs_epi32(
4278
0
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
4279
0
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
4280
0
            m128iS2 = _mm_packs_epi32(
4281
0
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
4282
0
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
4283
0
            m128iS3 = _mm_packs_epi32(
4284
0
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
4285
0
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
4286
0
            m128iS4 = _mm_packs_epi32(
4287
0
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
4288
0
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
4289
0
            m128iS5 = _mm_packs_epi32(
4290
0
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
4291
0
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
4292
0
            m128iS6 = _mm_packs_epi32(
4293
0
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
4294
0
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
4295
0
            m128iS7 = _mm_packs_epi32(
4296
0
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
4297
0
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
4298
0
            m128iS8 = _mm_packs_epi32(
4299
0
                    _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
4300
0
                    _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
4301
0
            m128iS9 = _mm_packs_epi32(
4302
0
                    _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
4303
0
                    _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
4304
0
            m128iS10 = _mm_packs_epi32(
4305
0
                    _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
4306
0
                    _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
4307
0
            m128iS11 = _mm_packs_epi32(
4308
0
                    _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
4309
0
                    _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
4310
0
            m128iS12 = _mm_packs_epi32(
4311
0
                    _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
4312
0
                    _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
4313
0
            m128iS13 = _mm_packs_epi32(
4314
0
                    _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
4315
0
                    _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
4316
0
            m128iS14 = _mm_packs_epi32(
4317
0
                    _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
4318
0
                    _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
4319
0
            m128iS15 = _mm_packs_epi32(
4320
0
                    _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
4321
0
                    _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
4322
4323
0
            m128iS31 = _mm_packs_epi32(
4324
0
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
4325
0
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
4326
0
            m128iS30 = _mm_packs_epi32(
4327
0
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
4328
0
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
4329
0
            m128iS29 = _mm_packs_epi32(
4330
0
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
4331
0
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
4332
0
            m128iS28 = _mm_packs_epi32(
4333
0
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
4334
0
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
4335
0
            m128iS27 = _mm_packs_epi32(
4336
0
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
4337
0
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
4338
0
            m128iS26 = _mm_packs_epi32(
4339
0
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
4340
0
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
4341
0
            m128iS25 = _mm_packs_epi32(
4342
0
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
4343
0
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
4344
0
            m128iS24 = _mm_packs_epi32(
4345
0
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
4346
0
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
4347
0
            m128iS23 = _mm_packs_epi32(
4348
0
                    _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
4349
0
                    _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
4350
0
            m128iS22 = _mm_packs_epi32(
4351
0
                    _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
4352
0
                    _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
4353
0
            m128iS21 = _mm_packs_epi32(
4354
0
                    _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
4355
0
                    _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
4356
0
            m128iS20 = _mm_packs_epi32(
4357
0
                    _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
4358
0
                    _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
4359
0
            m128iS19 = _mm_packs_epi32(
4360
0
                    _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
4361
0
                    _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
4362
0
            m128iS18 = _mm_packs_epi32(
4363
0
                    _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
4364
0
                    _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
4365
0
            m128iS17 = _mm_packs_epi32(
4366
0
                    _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
4367
0
                    _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
4368
0
            m128iS16 = _mm_packs_epi32(
4369
0
                    _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
4370
0
                    _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
4371
4372
0
            if (!j) {
4373
                /*      Transpose the matrix      */
4374
0
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
4375
0
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
4376
0
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
4377
0
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
4378
0
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
4379
0
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
4380
0
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
4381
0
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
4382
0
                E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
4383
0
                E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
4384
0
                E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
4385
0
                E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
4386
0
                E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
4387
0
                E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
4388
0
                E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
4389
0
                E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
4390
4391
0
                O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
4392
0
                O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
4393
0
                O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
4394
0
                O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
4395
0
                O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
4396
0
                O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
4397
0
                O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
4398
0
                O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
4399
0
                O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
4400
0
                O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
4401
0
                O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
4402
0
                O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
4403
0
                O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
4404
0
                O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
4405
0
                O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
4406
0
                O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
4407
4408
0
                E0h = _mm_unpacklo_epi16(E0l, E8l);
4409
0
                E1h = _mm_unpacklo_epi16(E1l, E9l);
4410
0
                E2h = _mm_unpacklo_epi16(E2l, E10l);
4411
0
                E3h = _mm_unpacklo_epi16(E3l, E11l);
4412
0
                E4h = _mm_unpacklo_epi16(E4l, E12l);
4413
0
                E5h = _mm_unpacklo_epi16(E5l, E13l);
4414
0
                E6h = _mm_unpacklo_epi16(E6l, E14l);
4415
0
                E7h = _mm_unpacklo_epi16(E7l, E15l);
4416
4417
0
                E8h = _mm_unpackhi_epi16(E0l, E8l);
4418
0
                E9h = _mm_unpackhi_epi16(E1l, E9l);
4419
0
                E10h = _mm_unpackhi_epi16(E2l, E10l);
4420
0
                E11h = _mm_unpackhi_epi16(E3l, E11l);
4421
0
                E12h = _mm_unpackhi_epi16(E4l, E12l);
4422
0
                E13h = _mm_unpackhi_epi16(E5l, E13l);
4423
0
                E14h = _mm_unpackhi_epi16(E6l, E14l);
4424
0
                E15h = _mm_unpackhi_epi16(E7l, E15l);
4425
4426
0
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4427
0
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4428
0
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4429
0
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4430
4431
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4432
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4433
0
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4434
0
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4435
4436
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4437
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4438
0
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4439
0
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4440
4441
0
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4442
0
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4443
0
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4444
0
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4445
4446
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4447
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4448
0
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4449
0
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4450
4451
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4452
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4453
0
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4454
0
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4455
4456
0
                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
4457
0
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
4458
0
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
4459
0
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
4460
4461
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4462
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4463
0
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4464
0
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4465
4466
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4467
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4468
0
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4469
0
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4470
4471
0
                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
4472
0
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
4473
0
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
4474
0
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
4475
4476
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4477
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4478
0
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4479
0
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4480
4481
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4482
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4483
0
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4484
0
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4485
4486
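                /* same 16-bit interleave sequence as above, now applied to O0l..O15l; results go to m128iS16..m128iS31 */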
4487
0
                E0h = _mm_unpacklo_epi16(O0l, O8l);
4488
0
                E1h = _mm_unpacklo_epi16(O1l, O9l);
4489
0
                E2h = _mm_unpacklo_epi16(O2l, O10l);
4490
0
                E3h = _mm_unpacklo_epi16(O3l, O11l);
4491
0
                E4h = _mm_unpacklo_epi16(O4l, O12l);
4492
0
                E5h = _mm_unpacklo_epi16(O5l, O13l);
4493
0
                E6h = _mm_unpacklo_epi16(O6l, O14l);
4494
0
                E7h = _mm_unpacklo_epi16(O7l, O15l);
4495
4496
0
                E8h = _mm_unpackhi_epi16(O0l, O8l);
4497
0
                E9h = _mm_unpackhi_epi16(O1l, O9l);
4498
0
                E10h = _mm_unpackhi_epi16(O2l, O10l);
4499
0
                E11h = _mm_unpackhi_epi16(O3l, O11l);
4500
0
                E12h = _mm_unpackhi_epi16(O4l, O12l);
4501
0
                E13h = _mm_unpackhi_epi16(O5l, O13l);
4502
0
                E14h = _mm_unpackhi_epi16(O6l, O14l);
4503
0
                E15h = _mm_unpackhi_epi16(O7l, O15l);
4504
4505
0
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4506
0
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4507
0
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4508
0
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4509
4510
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4511
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4512
0
                m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4513
0
                m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4514
4515
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4516
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4517
0
                m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4518
0
                m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4519
4520
0
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4521
0
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4522
0
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4523
0
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4524
4525
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4526
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4527
0
                m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4528
0
                m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4529
4530
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4531
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4532
0
                m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4533
0
                m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4534
4535
0
                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
4536
0
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
4537
0
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
4538
0
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
4539
4540
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4541
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4542
0
                m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4543
0
                m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4544
4545
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4546
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4547
0
                m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4548
0
                m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4549
4550
0
                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
4551
0
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
4552
0
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
4553
0
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
4554
4555
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4556
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4557
0
                m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4558
0
                m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4559
4560
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4561
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4562
0
                m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4563
0
                m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4564
4565
0
                if(i==0){
4566
0
                    int k = 8;
4567
0
                    r0=m128iS0;
4568
0
                    r1=m128iS1;
4569
0
                    r2=m128iS2;
4570
0
                    r3=m128iS3;
4571
0
                    r4=m128iS4;
4572
0
                    r5=m128iS5;
4573
0
                    r6=m128iS6;
4574
0
                    r7=m128iS7;
4575
0
                    r8=m128iS8;
4576
0
                    r9=m128iS9;
4577
0
                    r10=m128iS10;
4578
0
                    r11=m128iS11;
4579
0
                    r12=m128iS12;
4580
0
                    r13=m128iS13;
4581
0
                    r14=m128iS14;
4582
0
                    r15=m128iS15;
4583
0
                    r16=m128iS16;
4584
0
                    r17=m128iS17;
4585
0
                    r18=m128iS18;
4586
0
                    r19=m128iS19;
4587
0
                    r20=m128iS20;
4588
0
                    r21=m128iS21;
4589
0
                    r22=m128iS22;
4590
0
                    r23=m128iS23;
4591
0
                    r24=m128iS24;
4592
0
                    r25=m128iS25;
4593
0
                    r26=m128iS26;
4594
0
                    r27=m128iS27;
4595
0
                    r28=m128iS28;
4596
0
                    r29=m128iS29;
4597
0
                    r30=m128iS30;
4598
0
                    r31=m128iS31;
4599
0
                    m128iS0 = _mm_load_si128((__m128i *) (src + k));
4600
0
                    m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
4601
0
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
4602
0
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
4603
0
                    m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
4604
0
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
4605
0
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
4606
0
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
4607
0
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
4608
0
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
4609
0
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
4610
0
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
4611
0
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
4612
0
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
4613
0
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
4614
0
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));
4615
4616
0
                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
4617
0
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
4618
0
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
4619
0
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
4620
0
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
4621
0
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
4622
0
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
4623
0
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
4624
0
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
4625
0
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
4626
0
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
4627
0
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
4628
0
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
4629
0
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
4630
0
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
4631
0
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
4632
4633
0
                }else if(i ==8){
4634
4635
0
                    r32=m128iS0;
4636
0
                    r33=m128iS1;
4637
0
                    r34=m128iS2;
4638
0
                    r35=m128iS3;
4639
0
                    r36=m128iS4;
4640
0
                    r37=m128iS5;
4641
0
                    r38=m128iS6;
4642
0
                    r39=m128iS7;
4643
0
                    r40=m128iS8;
4644
0
                    r41=m128iS9;
4645
0
                    r42=m128iS10;
4646
0
                    r43=m128iS11;
4647
0
                    r44=m128iS12;
4648
0
                    r45=m128iS13;
4649
0
                    r46=m128iS14;
4650
0
                    r47=m128iS15;
4651
0
                    r48=m128iS16;
4652
0
                    r49=m128iS17;
4653
0
                    r50=m128iS18;
4654
0
                    r51=m128iS19;
4655
0
                    r52=m128iS20;
4656
0
                    r53=m128iS21;
4657
0
                    r54=m128iS22;
4658
0
                    r55=m128iS23;
4659
0
                    r56=m128iS24;
4660
0
                    r57=m128iS25;
4661
0
                    r58=m128iS26;
4662
0
                    r59=m128iS27;
4663
0
                    r60=m128iS28;
4664
0
                    r61=m128iS29;
4665
0
                    r62=m128iS30;
4666
0
                    r63=m128iS31;
4667
4668
0
                    m128iS0 = _mm_load_si128((__m128i *) (src + 16));
4669
0
                    m128iS1 = _mm_load_si128((__m128i *) (src + 48));
4670
0
                    m128iS2 = _mm_load_si128((__m128i *) (src + 80));
4671
0
                    m128iS3 = _mm_load_si128((__m128i *) (src + 112));
4672
0
                    m128iS4 = _mm_load_si128((__m128i *) (src + 144));
4673
0
                    m128iS5 = _mm_load_si128((__m128i *) (src + 176));
4674
0
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 16));
4675
0
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 16));
4676
0
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 16));
4677
0
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 16));
4678
0
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 16));
4679
0
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 16));
4680
0
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 16));
4681
0
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 16));
4682
0
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 16));
4683
0
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 16));
4684
4685
0
                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 16));
4686
0
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 16));
4687
0
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 16));
4688
0
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 16));
4689
0
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 16));
4690
0
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 16));
4691
0
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 16));
4692
0
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 16));
4693
0
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 16));
4694
0
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 16));
4695
0
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 16));
4696
0
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 16));
4697
0
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 16));
4698
0
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 16));
4699
0
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 16));
4700
0
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 16));
4701
4702
4703
0
                }else if(i ==16){
4704
4705
0
                    r64=m128iS0;
4706
0
                    r65=m128iS1;
4707
0
                    r66=m128iS2;
4708
0
                    r67=m128iS3;
4709
0
                    r68=m128iS4;
4710
0
                    r69=m128iS5;
4711
0
                    r70=m128iS6;
4712
0
                    r71=m128iS7;
4713
0
                    r72=m128iS8;
4714
0
                    r73=m128iS9;
4715
0
                    r74=m128iS10;
4716
0
                    r75=m128iS11;
4717
0
                    r76=m128iS12;
4718
0
                    r77=m128iS13;
4719
0
                    r78=m128iS14;
4720
0
                    r79=m128iS15;
4721
0
                    r80=m128iS16;
4722
0
                    r81=m128iS17;
4723
0
                    r82=m128iS18;
4724
0
                    r83=m128iS19;
4725
0
                    r84=m128iS20;
4726
0
                    r85=m128iS21;
4727
0
                    r86=m128iS22;
4728
0
                    r87=m128iS23;
4729
0
                    r88=m128iS24;
4730
0
                    r89=m128iS25;
4731
0
                    r90=m128iS26;
4732
0
                    r91=m128iS27;
4733
0
                    r92=m128iS28;
4734
0
                    r93=m128iS29;
4735
0
                    r94=m128iS30;
4736
0
                    r95=m128iS31;
4737
4738
0
                    m128iS0 = _mm_load_si128((__m128i *) (src + 24));
4739
0
                    m128iS1 = _mm_load_si128((__m128i *) (src + 56));
4740
0
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64 + 24));
4741
0
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96 + 24));
4742
0
                    m128iS4 = _mm_load_si128((__m128i *) (src + 128 + 24));
4743
0
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160 + 24));
4744
0
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 24));
4745
0
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 24));
4746
0
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 24));
4747
0
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 24));
4748
0
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 24));
4749
0
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 24));
4750
0
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 24));
4751
0
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 24));
4752
0
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 24));
4753
0
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 24));
4754
4755
0
                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 24));
4756
0
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 24));
4757
0
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 24));
4758
0
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 24));
4759
0
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 24));
4760
0
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 24));
4761
0
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 24));
4762
0
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 24));
4763
0
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 24));
4764
0
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 24));
4765
0
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 24));
4766
0
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 24));
4767
0
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 24));
4768
0
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 24));
4769
0
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 24));
4770
0
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 24));
4771
4772
0
                }else{
4773
0
                    r96=m128iS0;
4774
0
                    r97=m128iS1;
4775
0
                    r98=m128iS2;
4776
0
                    r99=m128iS3;
4777
0
                    r100=m128iS4;
4778
0
                    r101=m128iS5;
4779
0
                    r102=m128iS6;
4780
0
                    r103=m128iS7;
4781
0
                    r104=m128iS8;
4782
0
                    r105=m128iS9;
4783
0
                    r106=m128iS10;
4784
0
                    r107=m128iS11;
4785
0
                    r108=m128iS12;
4786
0
                    r109=m128iS13;
4787
0
                    r110=m128iS14;
4788
0
                    r111=m128iS15;
4789
0
                    r112=m128iS16;
4790
0
                    r113=m128iS17;
4791
0
                    r114=m128iS18;
4792
0
                    r115=m128iS19;
4793
0
                    r116=m128iS20;
4794
0
                    r117=m128iS21;
4795
0
                    r118=m128iS22;
4796
0
                    r119=m128iS23;
4797
0
                    r120=m128iS24;
4798
0
                    r121=m128iS25;
4799
0
                    r122=m128iS26;
4800
0
                    r123=m128iS27;
4801
0
                    r124=m128iS28;
4802
0
                    r125=m128iS29;
4803
0
                    r126=m128iS30;
4804
0
                    r127=m128iS31;
4805
4806
                    //load data for next j: reload every 4th saved register (r0, r4, ..., r124) and switch to shift_2nd / add_2nd
4807
0
                    m128iS0 =  r0;
4808
0
                    m128iS1 =  r4;
4809
0
                    m128iS2 =  r8;
4810
0
                    m128iS3 =  r12;
4811
0
                    m128iS4 =  r16;
4812
0
                    m128iS5 =  r20;
4813
0
                    m128iS6 =  r24;
4814
0
                    m128iS7 =  r28;
4815
0
                    m128iS8 =  r32;
4816
0
                    m128iS9 =  r36;
4817
0
                    m128iS10 = r40;
4818
0
                    m128iS11 = r44;
4819
0
                    m128iS12 = r48;
4820
0
                    m128iS13 = r52;
4821
0
                    m128iS14 = r56;
4822
0
                    m128iS15 = r60;
4823
0
                    m128iS16 = r64;
4824
0
                    m128iS17 = r68;
4825
0
                    m128iS18 = r72;
4826
0
                    m128iS19 = r76;
4827
0
                    m128iS20 = r80;
4828
0
                    m128iS21 = r84;
4829
0
                    m128iS22 = r88;
4830
0
                    m128iS23 = r92;
4831
0
                    m128iS24 = r96;
4832
0
                    m128iS25 = r100;
4833
0
                    m128iS26 = r104;
4834
0
                    m128iS27 = r108;
4835
0
                    m128iS28 = r112;
4836
0
                    m128iS29 = r116;
4837
0
                    m128iS30 = r120;
4838
0
                    m128iS31 =r124;
4839
0
                    shift = shift_2nd;
4840
0
                    m128iAdd = _mm_set1_epi32(add_2nd);
4841
4842
4843
0
                }
4844
4845
0
            } else {
4846
4847
                //Transpose Matrix
4848
4849
0
                E0l= _mm_unpacklo_epi16(m128iS0,m128iS1);
4850
0
                E1l= _mm_unpacklo_epi16(m128iS2,m128iS3);
4851
0
                E2l= _mm_unpacklo_epi16(m128iS4,m128iS5);
4852
0
                E3l= _mm_unpacklo_epi16(m128iS6,m128iS7);
4853
0
                E4l= _mm_unpacklo_epi16(m128iS8,m128iS9);
4854
0
                E5l= _mm_unpacklo_epi16(m128iS10,m128iS11);
4855
0
                E6l= _mm_unpacklo_epi16(m128iS12,m128iS13);
4856
0
                E7l= _mm_unpacklo_epi16(m128iS14,m128iS15);
4857
0
                E8l= _mm_unpacklo_epi16(m128iS16,m128iS17);
4858
0
                E9l= _mm_unpacklo_epi16(m128iS18,m128iS19);
4859
0
                E10l= _mm_unpacklo_epi16(m128iS20,m128iS21);
4860
0
                E11l= _mm_unpacklo_epi16(m128iS22,m128iS23);
4861
0
                E12l= _mm_unpacklo_epi16(m128iS24,m128iS25);
4862
0
                E13l= _mm_unpacklo_epi16(m128iS26,m128iS27);
4863
0
                E14l= _mm_unpacklo_epi16(m128iS28,m128iS29);
4864
0
                E15l= _mm_unpacklo_epi16(m128iS30,m128iS31);
4865
4866
4867
0
                E0h= _mm_unpackhi_epi16(m128iS0,m128iS1);
4868
0
                E1h= _mm_unpackhi_epi16(m128iS2,m128iS3);
4869
0
                E2h= _mm_unpackhi_epi16(m128iS4,m128iS5);
4870
0
                E3h= _mm_unpackhi_epi16(m128iS6,m128iS7);
4871
0
                E4h= _mm_unpackhi_epi16(m128iS8,m128iS9);
4872
0
                E5h= _mm_unpackhi_epi16(m128iS10,m128iS11);
4873
0
                E6h= _mm_unpackhi_epi16(m128iS12,m128iS13);
4874
0
                E7h= _mm_unpackhi_epi16(m128iS14,m128iS15);
4875
0
                E8h= _mm_unpackhi_epi16(m128iS16,m128iS17);
4876
0
                E9h= _mm_unpackhi_epi16(m128iS18,m128iS19);
4877
0
                E10h= _mm_unpackhi_epi16(m128iS20,m128iS21);
4878
0
                E11h= _mm_unpackhi_epi16(m128iS22,m128iS23);
4879
0
                E12h= _mm_unpackhi_epi16(m128iS24,m128iS25);
4880
0
                E13h= _mm_unpackhi_epi16(m128iS26,m128iS27);
4881
0
                E14h= _mm_unpackhi_epi16(m128iS28,m128iS29);
4882
0
                E15h= _mm_unpackhi_epi16(m128iS30,m128iS31);
4883
4884
0
                m128Tmp0= _mm_unpacklo_epi32(E0l,E1l);
4885
0
                m128Tmp1= _mm_unpacklo_epi32(E2l,E3l);
4886
0
                m128Tmp2= _mm_unpacklo_epi32(E4l,E5l);
4887
0
                m128Tmp3= _mm_unpacklo_epi32(E6l,E7l);
4888
0
                m128Tmp4= _mm_unpacklo_epi32(E8l,E9l);
4889
0
                m128Tmp5= _mm_unpacklo_epi32(E10l,E11l);
4890
0
                m128Tmp6= _mm_unpacklo_epi32(E12l,E13l);
4891
0
                m128Tmp7= _mm_unpacklo_epi32(E14l,E15l);
4892
4893
0
                m128iS0= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter 1st row
4894
0
                m128iS1= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter 1st row
4895
4896
4897
0
                m128iS2= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter 1st row
4898
0
                m128iS3= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter 1st row
4899
4900
                //second row
4901
4902
0
                m128iS4= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4903
0
                m128iS5= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4904
4905
0
                m128iS6= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4906
0
                m128iS7= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4907
4908
               //third row
4909
4910
0
                m128Tmp0= _mm_unpackhi_epi32(E0l,E1l);
4911
0
                m128Tmp1= _mm_unpackhi_epi32(E2l,E3l);
4912
0
                m128Tmp2= _mm_unpackhi_epi32(E4l,E5l);
4913
0
                m128Tmp3= _mm_unpackhi_epi32(E6l,E7l);
4914
0
                m128Tmp4= _mm_unpackhi_epi32(E8l,E9l);
4915
0
                m128Tmp5= _mm_unpackhi_epi32(E10l,E11l);
4916
0
                m128Tmp6= _mm_unpackhi_epi32(E12l,E13l);
4917
0
                m128Tmp7= _mm_unpackhi_epi32(E14l,E15l);
4918
4919
4920
0
                m128iS8= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4921
0
                m128iS9= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4922
4923
0
                m128iS10= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4924
0
                m128iS11= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter
4925
4926
                //fourth row
4927
4928
0
                m128iS12= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4929
0
                m128iS13= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4930
4931
0
                m128iS14= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4932
0
                m128iS15= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4933
4934
                //fifth row
4935
4936
0
                m128Tmp0= _mm_unpacklo_epi32(E0h,E1h);
4937
0
                m128Tmp1= _mm_unpacklo_epi32(E2h,E3h);
4938
0
                m128Tmp2= _mm_unpacklo_epi32(E4h,E5h);
4939
0
                m128Tmp3= _mm_unpacklo_epi32(E6h,E7h);
4940
0
                m128Tmp4= _mm_unpacklo_epi32(E8h,E9h);
4941
0
                m128Tmp5= _mm_unpacklo_epi32(E10h,E11h);
4942
0
                m128Tmp6= _mm_unpacklo_epi32(E12h,E13h);
4943
0
                m128Tmp7= _mm_unpacklo_epi32(E14h,E15h);
4944
4945
0
                m128iS16= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4946
0
                m128iS17= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4947
4948
4949
0
                m128iS18= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4950
0
                m128iS19= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7);
4951
4952
                //sixth row
4953
4954
0
                m128iS20= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4955
0
                m128iS21= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4956
4957
4958
0
                m128iS22= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4959
0
                m128iS23= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4960
4961
               //seventh row
4962
4963
0
                m128Tmp0= _mm_unpackhi_epi32(E0h,E1h);
4964
0
                m128Tmp1= _mm_unpackhi_epi32(E2h,E3h);
4965
0
                m128Tmp2= _mm_unpackhi_epi32(E4h,E5h);
4966
0
                m128Tmp3= _mm_unpackhi_epi32(E6h,E7h);
4967
0
                m128Tmp4= _mm_unpackhi_epi32(E8h,E9h);
4968
0
                m128Tmp5= _mm_unpackhi_epi32(E10h,E11h);
4969
0
                m128Tmp6= _mm_unpackhi_epi32(E12h,E13h);
4970
0
                m128Tmp7= _mm_unpackhi_epi32(E14h,E15h);
4971
4972
4973
0
                m128iS24= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4974
0
                m128iS25= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4975
4976
4977
0
                m128iS26= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4978
0
                m128iS27= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter
4979
4980
                //last row
4981
4982
4983
0
                m128iS28= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4984
0
                m128iS29= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4985
4986
0
                m128iS30= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4987
0
                m128iS31= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4988
4989
4990
0
                m128Tmp0=_mm_setzero_si128();
4991
4992
4993
                //store
4994
0
                dst = (uint8_t*) _dst + i*stride;
4995
4996
4997
0
                E0l= _mm_load_si128((__m128i*)dst); //16 values
4998
0
                E1l= _mm_load_si128((__m128i*)(dst+16));
4999
0
                E2l= _mm_load_si128((__m128i*)(dst+stride));
5000
0
                E3l= _mm_load_si128((__m128i*)(dst+stride+16));
5001
0
                E4l= _mm_load_si128((__m128i*)(dst+2*stride));
5002
0
                E5l= _mm_load_si128((__m128i*)(dst+2*stride+16));
5003
0
                E6l= _mm_load_si128((__m128i*)(dst+3*stride));
5004
0
                E7l= _mm_load_si128((__m128i*)(dst+3*stride+16));
5005
0
                E8l= _mm_load_si128((__m128i*)(dst+4*stride));
5006
0
                E9l= _mm_load_si128((__m128i*)(dst+4*stride+16));
5007
0
                E10l= _mm_load_si128((__m128i*)(dst+5*stride));
5008
0
                E11l= _mm_load_si128((__m128i*)(dst+5*stride+16));
5009
0
                E12l= _mm_load_si128((__m128i*)(dst+6*stride));
5010
0
                E13l= _mm_load_si128((__m128i*)(dst+6*stride+16));
5011
0
                E14l= _mm_load_si128((__m128i*)(dst+7*stride));
5012
0
                E15l= _mm_load_si128((__m128i*)(dst+7*stride+16));
5013
5014
0
                m128iS0= _mm_adds_epi16(m128iS0,_mm_unpacklo_epi8(E0l,m128Tmp0));
5015
0
                m128iS1= _mm_adds_epi16(m128iS1,_mm_unpackhi_epi8(E0l,m128Tmp0));
5016
0
                m128iS0= _mm_packus_epi16(m128iS0,m128iS1);
5017
5018
0
                m128iS2= _mm_adds_epi16(m128iS2,_mm_unpacklo_epi8(E1l,m128Tmp0));
5019
0
                m128iS3= _mm_adds_epi16(m128iS3,_mm_unpackhi_epi8(E1l,m128Tmp0));
5020
0
                m128iS2= _mm_packus_epi16(m128iS2,m128iS3);
5021
5022
0
                m128iS4= _mm_adds_epi16(m128iS4,_mm_unpacklo_epi8(E2l,m128Tmp0));
5023
0
                m128iS5= _mm_adds_epi16(m128iS5,_mm_unpackhi_epi8(E2l,m128Tmp0));
5024
0
                m128iS4= _mm_packus_epi16(m128iS4,m128iS5);
5025
5026
0
                m128iS6= _mm_adds_epi16(m128iS6,_mm_unpacklo_epi8(E3l,m128Tmp0));
5027
0
                m128iS7= _mm_adds_epi16(m128iS7,_mm_unpackhi_epi8(E3l,m128Tmp0));
5028
0
                m128iS6= _mm_packus_epi16(m128iS6,m128iS7);
5029
5030
0
                m128iS8= _mm_adds_epi16(m128iS8,_mm_unpacklo_epi8(E4l,m128Tmp0));
5031
0
                m128iS9= _mm_adds_epi16(m128iS9,_mm_unpackhi_epi8(E4l,m128Tmp0));
5032
0
                m128iS8= _mm_packus_epi16(m128iS8,m128iS9);
5033
5034
0
                m128iS10= _mm_adds_epi16(m128iS10,_mm_unpacklo_epi8(E5l,m128Tmp0));
5035
0
                m128iS11= _mm_adds_epi16(m128iS11,_mm_unpackhi_epi8(E5l,m128Tmp0));
5036
0
                m128iS10= _mm_packus_epi16(m128iS10,m128iS11);
5037
5038
0
                m128iS12= _mm_adds_epi16(m128iS12,_mm_unpacklo_epi8(E6l,m128Tmp0));
5039
0
                m128iS13= _mm_adds_epi16(m128iS13,_mm_unpackhi_epi8(E6l,m128Tmp0));
5040
0
                m128iS12= _mm_packus_epi16(m128iS12,m128iS13);
5041
5042
0
                m128iS14= _mm_adds_epi16(m128iS14,_mm_unpacklo_epi8(E7l,m128Tmp0));
5043
0
                m128iS15= _mm_adds_epi16(m128iS15,_mm_unpackhi_epi8(E7l,m128Tmp0));
5044
0
                m128iS14= _mm_packus_epi16(m128iS14,m128iS15);
5045
5046
0
                m128iS16= _mm_adds_epi16(m128iS16,_mm_unpacklo_epi8(E8l,m128Tmp0));
5047
0
                m128iS17= _mm_adds_epi16(m128iS17,_mm_unpackhi_epi8(E8l,m128Tmp0));
5048
0
                m128iS16= _mm_packus_epi16(m128iS16,m128iS17);
5049
5050
0
                m128iS18= _mm_adds_epi16(m128iS18,_mm_unpacklo_epi8(E9l,m128Tmp0));
5051
0
                m128iS19= _mm_adds_epi16(m128iS19,_mm_unpackhi_epi8(E9l,m128Tmp0));
5052
0
                m128iS18= _mm_packus_epi16(m128iS18,m128iS19);
5053
5054
0
                m128iS20= _mm_adds_epi16(m128iS20,_mm_unpacklo_epi8(E10l,m128Tmp0));
5055
0
                m128iS21= _mm_adds_epi16(m128iS21,_mm_unpackhi_epi8(E10l,m128Tmp0));
5056
0
                m128iS20= _mm_packus_epi16(m128iS20,m128iS21);
5057
5058
0
                m128iS22= _mm_adds_epi16(m128iS22,_mm_unpacklo_epi8(E11l,m128Tmp0));
5059
0
                m128iS23= _mm_adds_epi16(m128iS23,_mm_unpackhi_epi8(E11l,m128Tmp0));
5060
0
                m128iS22= _mm_packus_epi16(m128iS22,m128iS23);
5061
5062
0
                m128iS24= _mm_adds_epi16(m128iS24,_mm_unpacklo_epi8(E12l,m128Tmp0));
5063
0
                m128iS25= _mm_adds_epi16(m128iS25,_mm_unpackhi_epi8(E12l,m128Tmp0));
5064
0
                m128iS24= _mm_packus_epi16(m128iS24,m128iS25);
5065
5066
0
                m128iS26= _mm_adds_epi16(m128iS26,_mm_unpacklo_epi8(E13l,m128Tmp0));
5067
0
                m128iS27= _mm_adds_epi16(m128iS27,_mm_unpackhi_epi8(E13l,m128Tmp0));
5068
0
                m128iS26= _mm_packus_epi16(m128iS26,m128iS27);
5069
5070
0
                m128iS28= _mm_adds_epi16(m128iS28,_mm_unpacklo_epi8(E14l,m128Tmp0));
5071
0
                m128iS29= _mm_adds_epi16(m128iS29,_mm_unpackhi_epi8(E14l,m128Tmp0));
5072
0
                m128iS28= _mm_packus_epi16(m128iS28,m128iS29);
5073
5074
0
                m128iS30= _mm_adds_epi16(m128iS30,_mm_unpacklo_epi8(E15l,m128Tmp0));
5075
0
                m128iS31= _mm_adds_epi16(m128iS31,_mm_unpackhi_epi8(E15l,m128Tmp0));
5076
0
                m128iS30= _mm_packus_epi16(m128iS30,m128iS31);
5077
5078
5079
0
                _mm_store_si128((__m128i*)dst,m128iS0);
5080
0
                _mm_store_si128((__m128i*)(dst+16),m128iS2);
5081
0
                _mm_store_si128((__m128i*)(dst+stride),m128iS4);
5082
0
                _mm_store_si128((__m128i*)(dst+stride+16),m128iS6);
5083
0
                _mm_store_si128((__m128i*)(dst+2*stride),m128iS8);
5084
0
                _mm_store_si128((__m128i*)(dst+2*stride+16),m128iS10);
5085
0
                _mm_store_si128((__m128i*)(dst+3*stride),m128iS12);
5086
0
                _mm_store_si128((__m128i*)(dst+3*stride+16),m128iS14);
5087
0
                _mm_store_si128((__m128i*)(dst+4*stride),m128iS16);
5088
0
                _mm_store_si128((__m128i*)(dst+4*stride+16),m128iS18);
5089
0
                _mm_store_si128((__m128i*)(dst+5*stride),m128iS20);
5090
0
                _mm_store_si128((__m128i*)(dst+5*stride+16),m128iS22);
5091
0
                _mm_store_si128((__m128i*)(dst+6*stride),m128iS24);
5092
0
                _mm_store_si128((__m128i*)(dst+6*stride+16),m128iS26);
5093
0
                _mm_store_si128((__m128i*)(dst+7*stride),m128iS28);
5094
0
                _mm_store_si128((__m128i*)(dst+7*stride+16),m128iS30);
5095
5096
5097
0
                if(i==0){
5098
                    //load next values :
5099
0
                    m128iS0 =  r1;
5100
0
                    m128iS1 =  r5;
5101
0
                    m128iS2 =  r9;
5102
0
                    m128iS3 =  r13;
5103
0
                    m128iS4 =  r17;
5104
0
                    m128iS5 =  r21;
5105
0
                    m128iS6 =  r25;
5106
0
                    m128iS7 =  r29;
5107
0
                    m128iS8 =  r33;
5108
0
                    m128iS9 =  r37;
5109
0
                    m128iS10 = r41;
5110
0
                    m128iS11 = r45;
5111
0
                    m128iS12 = r49;
5112
0
                    m128iS13 = r53;
5113
0
                    m128iS14 = r57;
5114
0
                    m128iS15 = r61;
5115
0
                    m128iS16 = r65;
5116
0
                    m128iS17 = r69;
5117
0
                    m128iS18 = r73;
5118
0
                    m128iS19 = r77;
5119
0
                    m128iS20 = r81;
5120
0
                    m128iS21 = r85;
5121
0
                    m128iS22 = r89;
5122
0
                    m128iS23 = r93;
5123
0
                    m128iS24 = r97;
5124
0
                    m128iS25 = r101;
5125
0
                    m128iS26 = r105;
5126
0
                    m128iS27 = r109;
5127
0
                    m128iS28 = r113;
5128
0
                    m128iS29 = r117;
5129
0
                    m128iS30 = r121;
5130
0
                    m128iS31 =r125;
5131
5132
0
                }else if(i ==8){
5133
                    //load next values :
5134
0
                    m128iS0 =  r2;
5135
0
                    m128iS1 =  r6;
5136
0
                    m128iS2 =  r10;
5137
0
                    m128iS3 =  r14;
5138
0
                    m128iS4 =  r18;
5139
0
                    m128iS5 =  r22;
5140
0
                    m128iS6 =  r26;
5141
0
                    m128iS7 =  r30;
5142
0
                    m128iS8 =  r34;
5143
0
                    m128iS9 =  r38;
5144
0
                    m128iS10 = r42;
5145
0
                    m128iS11 = r46;
5146
0
                    m128iS12 = r50;
5147
0
                    m128iS13 = r54;
5148
0
                    m128iS14 = r58;
5149
0
                    m128iS15 = r62;
5150
0
                    m128iS16 = r66;
5151
0
                    m128iS17 = r70;
5152
0
                    m128iS18 = r74;
5153
0
                    m128iS19 = r78;
5154
0
                    m128iS20 = r82;
5155
0
                    m128iS21 = r86;
5156
0
                    m128iS22 = r90;
5157
0
                    m128iS23 = r94;
5158
0
                    m128iS24 = r98;
5159
0
                    m128iS25 = r102;
5160
0
                    m128iS26 = r106;
5161
0
                    m128iS27 = r110;
5162
0
                    m128iS28 = r114;
5163
0
                    m128iS29 = r118;
5164
0
                    m128iS30 = r122;
5165
0
                    m128iS31 =r126;
5166
5167
0
                }else if(i==16)
5168
0
                {
5169
                    //load next values :
5170
0
                    m128iS0 =  r3;
5171
0
                    m128iS1 =  r7;
5172
0
                    m128iS2 =  r11;
5173
0
                    m128iS3 =  r15;
5174
0
                    m128iS4 =  r19;
5175
0
                    m128iS5 =  r23;
5176
0
                    m128iS6 =  r27;
5177
0
                    m128iS7 =  r31;
5178
0
                    m128iS8 =  r35;
5179
0
                    m128iS9 =  r39;
5180
0
                    m128iS10 = r43;
5181
0
                    m128iS11 = r47;
5182
0
                    m128iS12 = r51;
5183
0
                    m128iS13 = r55;
5184
0
                    m128iS14 = r59;
5185
0
                    m128iS15 = r63;
5186
0
                    m128iS16 = r67;
5187
0
                    m128iS17 = r71;
5188
0
                    m128iS18 = r75;
5189
0
                    m128iS19 = r79;
5190
0
                    m128iS20 = r83;
5191
0
                    m128iS21 = r87;
5192
0
                    m128iS22 = r91;
5193
0
                    m128iS23 = r95;
5194
0
                    m128iS24 = r99;
5195
0
                    m128iS25 = r103;
5196
0
                    m128iS26 = r107;
5197
0
                    m128iS27 = r111;
5198
0
                    m128iS28 = r115;
5199
0
                    m128iS29 = r119;
5200
0
                    m128iS30 = r123;
5201
0
                    m128iS31 =r127;
5202
0
                }
5203
0
            }
5204
0
        }
5205
0
    }
5206
0
}
5207
#if defined(__GNUC__) && !defined(__clang__)
5208
#pragma GCC diagnostic pop
5209
#endif
5210
#endif
5211
5212
5213
#if 0
5214
void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
5215
        ptrdiff_t _stride) {
5216
    int i, j;
5217
    uint16_t *dst = (uint16_t*) _dst;
5218
    ptrdiff_t stride = _stride / 2;
5219
    int shift;
5220
    uint8_t shift_2nd = 10; //20 - bit depth
5221
    uint16_t add_2nd = 1<<9; //shift2 - 1
5222
    int16_t *src = coeffs;
5223
5224
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
5225
            m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
5226
            m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
5227
            m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
5228
            E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
5229
            O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
5230
            E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
5231
    __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
5232
    __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
5233
            EEE0l, EEE1l, EEE0h, EEE1h;
5234
    __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
5235
            m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
5236
            m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
5237
            m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
5238
            O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
5239
            O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
5240
            EE4l, EE7h, EE6h, EE5h, EE4h;
5241
    m128iS0 = _mm_load_si128((__m128i *) (src));
5242
    m128iS1 = _mm_load_si128((__m128i *) (src + 32));
5243
    m128iS2 = _mm_load_si128((__m128i *) (src + 64));
5244
    m128iS3 = _mm_load_si128((__m128i *) (src + 96));
5245
    m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
5246
    m128iS5 = _mm_load_si128((__m128i *) (src + 160));
5247
    m128iS6 = _mm_load_si128((__m128i *) (src + 192));
5248
    m128iS7 = _mm_load_si128((__m128i *) (src + 224));
5249
    m128iS8 = _mm_load_si128((__m128i *) (src + 256));
5250
    m128iS9 = _mm_load_si128((__m128i *) (src + 288));
5251
    m128iS10 = _mm_load_si128((__m128i *) (src + 320));
5252
    m128iS11 = _mm_load_si128((__m128i *) (src + 352));
5253
    m128iS12 = _mm_loadu_si128((__m128i *) (src + 384));
5254
    m128iS13 = _mm_load_si128((__m128i *) (src + 416));
5255
    m128iS14 = _mm_load_si128((__m128i *) (src + 448));
5256
    m128iS15 = _mm_load_si128((__m128i *) (src + 480));
5257
    m128iS16 = _mm_load_si128((__m128i *) (src + 512));
5258
    m128iS17 = _mm_load_si128((__m128i *) (src + 544));
5259
    m128iS18 = _mm_load_si128((__m128i *) (src + 576));
5260
    m128iS19 = _mm_load_si128((__m128i *) (src + 608));
5261
    m128iS20 = _mm_load_si128((__m128i *) (src + 640));
5262
    m128iS21 = _mm_load_si128((__m128i *) (src + 672));
5263
    m128iS22 = _mm_load_si128((__m128i *) (src + 704));
5264
    m128iS23 = _mm_load_si128((__m128i *) (src + 736));
5265
    m128iS24 = _mm_load_si128((__m128i *) (src + 768));
5266
    m128iS25 = _mm_load_si128((__m128i *) (src + 800));
5267
    m128iS26 = _mm_load_si128((__m128i *) (src + 832));
5268
    m128iS27 = _mm_load_si128((__m128i *) (src + 864));
5269
    m128iS28 = _mm_load_si128((__m128i *) (src + 896));
5270
    m128iS29 = _mm_load_si128((__m128i *) (src + 928));
5271
    m128iS30 = _mm_load_si128((__m128i *) (src + 960));
5272
    m128iS31 = _mm_load_si128((__m128i *) (src + 992));
5273
5274
    shift = shift_1st;
5275
    m128iAdd = _mm_set1_epi32(add_1st);
5276
5277
    for (j = 0; j < 2; j++) {
5278
        for (i = 0; i < 32; i += 8) {
5279
            m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
5280
            E0l = _mm_madd_epi16(m128Tmp0,
5281
                    _mm_load_si128((__m128i *) (transform32x32[0][0])));
5282
            m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
5283
            E0h = _mm_madd_epi16(m128Tmp1,
5284
                    _mm_load_si128((__m128i *) (transform32x32[0][0])));
5285
5286
            m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
5287
            E1l = _mm_madd_epi16(m128Tmp2,
5288
                    _mm_load_si128((__m128i *) (transform32x32[1][0])));
5289
            m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
5290
            E1h = _mm_madd_epi16(m128Tmp3,
5291
                    _mm_load_si128((__m128i *) (transform32x32[1][0])));
5292
5293
            m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
5294
            E2l = _mm_madd_epi16(m128Tmp4,
5295
                    _mm_load_si128((__m128i *) (transform32x32[2][0])));
5296
            m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
5297
            E2h = _mm_madd_epi16(m128Tmp5,
5298
                    _mm_load_si128((__m128i *) (transform32x32[2][0])));
5299
5300
            m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
5301
            E3l = _mm_madd_epi16(m128Tmp6,
5302
                    _mm_load_si128((__m128i *) (transform32x32[3][0])));
5303
            m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
5304
            E3h = _mm_madd_epi16(m128Tmp7,
5305
                    _mm_load_si128((__m128i *) (transform32x32[3][0])));
5306
5307
            m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
5308
            E4l = _mm_madd_epi16(m128Tmp8,
5309
                    _mm_load_si128((__m128i *) (transform32x32[4][0])));
5310
            m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
5311
            E4h = _mm_madd_epi16(m128Tmp9,
5312
                    _mm_load_si128((__m128i *) (transform32x32[4][0])));
5313
5314
            m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
5315
            E5l = _mm_madd_epi16(m128Tmp10,
5316
                    _mm_load_si128((__m128i *) (transform32x32[5][0])));
5317
            m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
5318
            E5h = _mm_madd_epi16(m128Tmp11,
5319
                    _mm_load_si128((__m128i *) (transform32x32[5][0])));
5320
5321
            m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
5322
            E6l = _mm_madd_epi16(m128Tmp12,
5323
                    _mm_load_si128((__m128i *) (transform32x32[6][0])));
5324
            m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
5325
            E6h = _mm_madd_epi16(m128Tmp13,
5326
                    _mm_load_si128((__m128i *) (transform32x32[6][0])));
5327
5328
            m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
5329
            E7l = _mm_madd_epi16(m128Tmp14,
5330
                    _mm_load_si128((__m128i *) (transform32x32[7][0])));
5331
            m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
5332
            E7h = _mm_madd_epi16(m128Tmp15,
5333
                    _mm_load_si128((__m128i *) (transform32x32[7][0])));
5334
5335
            O0l = _mm_add_epi32(E0l, E1l);
5336
            O0l = _mm_add_epi32(O0l, E2l);
5337
            O0l = _mm_add_epi32(O0l, E3l);
5338
            O0l = _mm_add_epi32(O0l, E4l);
5339
            O0l = _mm_add_epi32(O0l, E5l);
5340
            O0l = _mm_add_epi32(O0l, E6l);
5341
            O0l = _mm_add_epi32(O0l, E7l);
5342
5343
            O0h = _mm_add_epi32(E0h, E1h);
5344
            O0h = _mm_add_epi32(O0h, E2h);
5345
            O0h = _mm_add_epi32(O0h, E3h);
5346
            O0h = _mm_add_epi32(O0h, E4h);
5347
            O0h = _mm_add_epi32(O0h, E5h);
5348
            O0h = _mm_add_epi32(O0h, E6h);
5349
            O0h = _mm_add_epi32(O0h, E7h);
5350
5351
            /* Compute O1*/
5352
            E0l = _mm_madd_epi16(m128Tmp0,
5353
                    _mm_load_si128((__m128i *) (transform32x32[0][1])));
5354
            E0h = _mm_madd_epi16(m128Tmp1,
5355
                    _mm_load_si128((__m128i *) (transform32x32[0][1])));
5356
            E1l = _mm_madd_epi16(m128Tmp2,
5357
                    _mm_load_si128((__m128i *) (transform32x32[1][1])));
5358
            E1h = _mm_madd_epi16(m128Tmp3,
5359
                    _mm_load_si128((__m128i *) (transform32x32[1][1])));
5360
            E2l = _mm_madd_epi16(m128Tmp4,
5361
                    _mm_load_si128((__m128i *) (transform32x32[2][1])));
5362
            E2h = _mm_madd_epi16(m128Tmp5,
5363
                    _mm_load_si128((__m128i *) (transform32x32[2][1])));
5364
            E3l = _mm_madd_epi16(m128Tmp6,
5365
                    _mm_load_si128((__m128i *) (transform32x32[3][1])));
5366
            E3h = _mm_madd_epi16(m128Tmp7,
5367
                    _mm_load_si128((__m128i *) (transform32x32[3][1])));
5368
5369
            E4l = _mm_madd_epi16(m128Tmp8,
5370
                    _mm_load_si128((__m128i *) (transform32x32[4][1])));
5371
            E4h = _mm_madd_epi16(m128Tmp9,
5372
                    _mm_load_si128((__m128i *) (transform32x32[4][1])));
5373
            E5l = _mm_madd_epi16(m128Tmp10,
5374
                    _mm_load_si128((__m128i *) (transform32x32[5][1])));
5375
            E5h = _mm_madd_epi16(m128Tmp11,
5376
                    _mm_load_si128((__m128i *) (transform32x32[5][1])));
5377
            E6l = _mm_madd_epi16(m128Tmp12,
5378
                    _mm_load_si128((__m128i *) (transform32x32[6][1])));
5379
            E6h = _mm_madd_epi16(m128Tmp13,
5380
                    _mm_load_si128((__m128i *) (transform32x32[6][1])));
5381
            E7l = _mm_madd_epi16(m128Tmp14,
5382
                    _mm_load_si128((__m128i *) (transform32x32[7][1])));
5383
            E7h = _mm_madd_epi16(m128Tmp15,
5384
                    _mm_load_si128((__m128i *) (transform32x32[7][1])));
5385
5386
            O1l = _mm_add_epi32(E0l, E1l);
5387
            O1l = _mm_add_epi32(O1l, E2l);
5388
            O1l = _mm_add_epi32(O1l, E3l);
5389
            O1l = _mm_add_epi32(O1l, E4l);
5390
            O1l = _mm_add_epi32(O1l, E5l);
5391
            O1l = _mm_add_epi32(O1l, E6l);
5392
            O1l = _mm_add_epi32(O1l, E7l);
5393
5394
            O1h = _mm_add_epi32(E0h, E1h);
5395
            O1h = _mm_add_epi32(O1h, E2h);
5396
            O1h = _mm_add_epi32(O1h, E3h);
5397
            O1h = _mm_add_epi32(O1h, E4h);
5398
            O1h = _mm_add_epi32(O1h, E5h);
5399
            O1h = _mm_add_epi32(O1h, E6h);
5400
            O1h = _mm_add_epi32(O1h, E7h);
5401
            /* Compute O2*/
5402
            E0l = _mm_madd_epi16(m128Tmp0,
5403
                    _mm_load_si128((__m128i *) (transform32x32[0][2])));
5404
            E0h = _mm_madd_epi16(m128Tmp1,
5405
                    _mm_load_si128((__m128i *) (transform32x32[0][2])));
5406
            E1l = _mm_madd_epi16(m128Tmp2,
5407
                    _mm_load_si128((__m128i *) (transform32x32[1][2])));
5408
            E1h = _mm_madd_epi16(m128Tmp3,
5409
                    _mm_load_si128((__m128i *) (transform32x32[1][2])));
5410
            E2l = _mm_madd_epi16(m128Tmp4,
5411
                    _mm_load_si128((__m128i *) (transform32x32[2][2])));
5412
            E2h = _mm_madd_epi16(m128Tmp5,
5413
                    _mm_load_si128((__m128i *) (transform32x32[2][2])));
5414
            E3l = _mm_madd_epi16(m128Tmp6,
5415
                    _mm_load_si128((__m128i *) (transform32x32[3][2])));
5416
            E3h = _mm_madd_epi16(m128Tmp7,
5417
                    _mm_load_si128((__m128i *) (transform32x32[3][2])));
5418
5419
            E4l = _mm_madd_epi16(m128Tmp8,
5420
                    _mm_load_si128((__m128i *) (transform32x32[4][2])));
5421
            E4h = _mm_madd_epi16(m128Tmp9,
5422
                    _mm_load_si128((__m128i *) (transform32x32[4][2])));
5423
            E5l = _mm_madd_epi16(m128Tmp10,
5424
                    _mm_load_si128((__m128i *) (transform32x32[5][2])));
5425
            E5h = _mm_madd_epi16(m128Tmp11,
5426
                    _mm_load_si128((__m128i *) (transform32x32[5][2])));
5427
            E6l = _mm_madd_epi16(m128Tmp12,
5428
                    _mm_load_si128((__m128i *) (transform32x32[6][2])));
5429
            E6h = _mm_madd_epi16(m128Tmp13,
5430
                    _mm_load_si128((__m128i *) (transform32x32[6][2])));
5431
            E7l = _mm_madd_epi16(m128Tmp14,
5432
                    _mm_load_si128((__m128i *) (transform32x32[7][2])));
5433
            E7h = _mm_madd_epi16(m128Tmp15,
5434
                    _mm_load_si128((__m128i *) (transform32x32[7][2])));
5435
5436
            O2l = _mm_add_epi32(E0l, E1l);
5437
            O2l = _mm_add_epi32(O2l, E2l);
5438
            O2l = _mm_add_epi32(O2l, E3l);
5439
            O2l = _mm_add_epi32(O2l, E4l);
5440
            O2l = _mm_add_epi32(O2l, E5l);
5441
            O2l = _mm_add_epi32(O2l, E6l);
5442
            O2l = _mm_add_epi32(O2l, E7l);
5443
5444
            O2h = _mm_add_epi32(E0h, E1h);
5445
            O2h = _mm_add_epi32(O2h, E2h);
5446
            O2h = _mm_add_epi32(O2h, E3h);
5447
            O2h = _mm_add_epi32(O2h, E4h);
5448
            O2h = _mm_add_epi32(O2h, E5h);
5449
            O2h = _mm_add_epi32(O2h, E6h);
5450
            O2h = _mm_add_epi32(O2h, E7h);
5451
            /* Compute O3*/
5452
            E0l = _mm_madd_epi16(m128Tmp0,
5453
                    _mm_load_si128((__m128i *) (transform32x32[0][3])));
5454
            E0h = _mm_madd_epi16(m128Tmp1,
5455
                    _mm_load_si128((__m128i *) (transform32x32[0][3])));
5456
            E1l = _mm_madd_epi16(m128Tmp2,
5457
                    _mm_load_si128((__m128i *) (transform32x32[1][3])));
5458
            E1h = _mm_madd_epi16(m128Tmp3,
5459
                    _mm_load_si128((__m128i *) (transform32x32[1][3])));
5460
            E2l = _mm_madd_epi16(m128Tmp4,
5461
                    _mm_load_si128((__m128i *) (transform32x32[2][3])));
5462
            E2h = _mm_madd_epi16(m128Tmp5,
5463
                    _mm_load_si128((__m128i *) (transform32x32[2][3])));
5464
            E3l = _mm_madd_epi16(m128Tmp6,
5465
                    _mm_load_si128((__m128i *) (transform32x32[3][3])));
5466
            E3h = _mm_madd_epi16(m128Tmp7,
5467
                    _mm_load_si128((__m128i *) (transform32x32[3][3])));
5468
5469
            E4l = _mm_madd_epi16(m128Tmp8,
5470
                    _mm_load_si128((__m128i *) (transform32x32[4][3])));
5471
            E4h = _mm_madd_epi16(m128Tmp9,
5472
                    _mm_load_si128((__m128i *) (transform32x32[4][3])));
5473
            E5l = _mm_madd_epi16(m128Tmp10,
5474
                    _mm_load_si128((__m128i *) (transform32x32[5][3])));
5475
            E5h = _mm_madd_epi16(m128Tmp11,
5476
                    _mm_load_si128((__m128i *) (transform32x32[5][3])));
5477
            E6l = _mm_madd_epi16(m128Tmp12,
5478
                    _mm_load_si128((__m128i *) (transform32x32[6][3])));
5479
            E6h = _mm_madd_epi16(m128Tmp13,
5480
                    _mm_load_si128((__m128i *) (transform32x32[6][3])));
5481
            E7l = _mm_madd_epi16(m128Tmp14,
5482
                    _mm_load_si128((__m128i *) (transform32x32[7][3])));
5483
            E7h = _mm_madd_epi16(m128Tmp15,
5484
                    _mm_load_si128((__m128i *) (transform32x32[7][3])));
5485
5486
            O3l = _mm_add_epi32(E0l, E1l);
5487
            O3l = _mm_add_epi32(O3l, E2l);
5488
            O3l = _mm_add_epi32(O3l, E3l);
5489
            O3l = _mm_add_epi32(O3l, E4l);
5490
            O3l = _mm_add_epi32(O3l, E5l);
5491
            O3l = _mm_add_epi32(O3l, E6l);
5492
            O3l = _mm_add_epi32(O3l, E7l);
5493
5494
            O3h = _mm_add_epi32(E0h, E1h);
5495
            O3h = _mm_add_epi32(O3h, E2h);
5496
            O3h = _mm_add_epi32(O3h, E3h);
5497
            O3h = _mm_add_epi32(O3h, E4h);
5498
            O3h = _mm_add_epi32(O3h, E5h);
5499
            O3h = _mm_add_epi32(O3h, E6h);
5500
            O3h = _mm_add_epi32(O3h, E7h);
5501
            /* Compute O4*/
5502
5503
            E0l = _mm_madd_epi16(m128Tmp0,
5504
                    _mm_load_si128((__m128i *) (transform32x32[0][4])));
5505
            E0h = _mm_madd_epi16(m128Tmp1,
5506
                    _mm_load_si128((__m128i *) (transform32x32[0][4])));
5507
            E1l = _mm_madd_epi16(m128Tmp2,
5508
                    _mm_load_si128((__m128i *) (transform32x32[1][4])));
5509
            E1h = _mm_madd_epi16(m128Tmp3,
5510
                    _mm_load_si128((__m128i *) (transform32x32[1][4])));
5511
            E2l = _mm_madd_epi16(m128Tmp4,
5512
                    _mm_load_si128((__m128i *) (transform32x32[2][4])));
5513
            E2h = _mm_madd_epi16(m128Tmp5,
5514
                    _mm_load_si128((__m128i *) (transform32x32[2][4])));
5515
            E3l = _mm_madd_epi16(m128Tmp6,
5516
                    _mm_load_si128((__m128i *) (transform32x32[3][4])));
5517
            E3h = _mm_madd_epi16(m128Tmp7,
5518
                    _mm_load_si128((__m128i *) (transform32x32[3][4])));
5519
5520
            E4l = _mm_madd_epi16(m128Tmp8,
5521
                    _mm_load_si128((__m128i *) (transform32x32[4][4])));
5522
            E4h = _mm_madd_epi16(m128Tmp9,
5523
                    _mm_load_si128((__m128i *) (transform32x32[4][4])));
5524
            E5l = _mm_madd_epi16(m128Tmp10,
5525
                    _mm_load_si128((__m128i *) (transform32x32[5][4])));
5526
            E5h = _mm_madd_epi16(m128Tmp11,
5527
                    _mm_load_si128((__m128i *) (transform32x32[5][4])));
5528
            E6l = _mm_madd_epi16(m128Tmp12,
5529
                    _mm_load_si128((__m128i *) (transform32x32[6][4])));
5530
            E6h = _mm_madd_epi16(m128Tmp13,
5531
                    _mm_load_si128((__m128i *) (transform32x32[6][4])));
5532
            E7l = _mm_madd_epi16(m128Tmp14,
5533
                    _mm_load_si128((__m128i *) (transform32x32[7][4])));
5534
            E7h = _mm_madd_epi16(m128Tmp15,
5535
                    _mm_load_si128((__m128i *) (transform32x32[7][4])));
5536
5537
            O4l = _mm_add_epi32(E0l, E1l);
5538
            O4l = _mm_add_epi32(O4l, E2l);
5539
            O4l = _mm_add_epi32(O4l, E3l);
5540
            O4l = _mm_add_epi32(O4l, E4l);
5541
            O4l = _mm_add_epi32(O4l, E5l);
5542
            O4l = _mm_add_epi32(O4l, E6l);
5543
            O4l = _mm_add_epi32(O4l, E7l);
5544
5545
            O4h = _mm_add_epi32(E0h, E1h);
5546
            O4h = _mm_add_epi32(O4h, E2h);
5547
            O4h = _mm_add_epi32(O4h, E3h);
5548
            O4h = _mm_add_epi32(O4h, E4h);
5549
            O4h = _mm_add_epi32(O4h, E5h);
5550
            O4h = _mm_add_epi32(O4h, E6h);
5551
            O4h = _mm_add_epi32(O4h, E7h);
5552
5553
            /* Compute O5*/
5554
            E0l = _mm_madd_epi16(m128Tmp0,
5555
                    _mm_load_si128((__m128i *) (transform32x32[0][5])));
5556
            E0h = _mm_madd_epi16(m128Tmp1,
5557
                    _mm_load_si128((__m128i *) (transform32x32[0][5])));
5558
            E1l = _mm_madd_epi16(m128Tmp2,
5559
                    _mm_load_si128((__m128i *) (transform32x32[1][5])));
5560
            E1h = _mm_madd_epi16(m128Tmp3,
5561
                    _mm_load_si128((__m128i *) (transform32x32[1][5])));
5562
            E2l = _mm_madd_epi16(m128Tmp4,
5563
                    _mm_load_si128((__m128i *) (transform32x32[2][5])));
5564
            E2h = _mm_madd_epi16(m128Tmp5,
5565
                    _mm_load_si128((__m128i *) (transform32x32[2][5])));
5566
            E3l = _mm_madd_epi16(m128Tmp6,
5567
                    _mm_load_si128((__m128i *) (transform32x32[3][5])));
5568
            E3h = _mm_madd_epi16(m128Tmp7,
5569
                    _mm_load_si128((__m128i *) (transform32x32[3][5])));
5570
5571
            E4l = _mm_madd_epi16(m128Tmp8,
5572
                    _mm_load_si128((__m128i *) (transform32x32[4][5])));
5573
            E4h = _mm_madd_epi16(m128Tmp9,
5574
                    _mm_load_si128((__m128i *) (transform32x32[4][5])));
5575
            E5l = _mm_madd_epi16(m128Tmp10,
5576
                    _mm_load_si128((__m128i *) (transform32x32[5][5])));
5577
            E5h = _mm_madd_epi16(m128Tmp11,
5578
                    _mm_load_si128((__m128i *) (transform32x32[5][5])));
5579
            E6l = _mm_madd_epi16(m128Tmp12,
5580
                    _mm_load_si128((__m128i *) (transform32x32[6][5])));
5581
            E6h = _mm_madd_epi16(m128Tmp13,
5582
                    _mm_load_si128((__m128i *) (transform32x32[6][5])));
5583
            E7l = _mm_madd_epi16(m128Tmp14,
5584
                    _mm_load_si128((__m128i *) (transform32x32[7][5])));
5585
            E7h = _mm_madd_epi16(m128Tmp15,
5586
                    _mm_load_si128((__m128i *) (transform32x32[7][5])));
5587
5588
            O5l = _mm_add_epi32(E0l, E1l);
5589
            O5l = _mm_add_epi32(O5l, E2l);
5590
            O5l = _mm_add_epi32(O5l, E3l);
5591
            O5l = _mm_add_epi32(O5l, E4l);
5592
            O5l = _mm_add_epi32(O5l, E5l);
5593
            O5l = _mm_add_epi32(O5l, E6l);
5594
            O5l = _mm_add_epi32(O5l, E7l);
5595
5596
            O5h = _mm_add_epi32(E0h, E1h);
5597
            O5h = _mm_add_epi32(O5h, E2h);
5598
            O5h = _mm_add_epi32(O5h, E3h);
5599
            O5h = _mm_add_epi32(O5h, E4h);
5600
            O5h = _mm_add_epi32(O5h, E5h);
5601
            O5h = _mm_add_epi32(O5h, E6h);
5602
            O5h = _mm_add_epi32(O5h, E7h);
5603
5604
            /* Compute O6*/
5605
5606
            E0l = _mm_madd_epi16(m128Tmp0,
5607
                    _mm_load_si128((__m128i *) (transform32x32[0][6])));
5608
            E0h = _mm_madd_epi16(m128Tmp1,
5609
                    _mm_load_si128((__m128i *) (transform32x32[0][6])));
5610
            E1l = _mm_madd_epi16(m128Tmp2,
5611
                    _mm_load_si128((__m128i *) (transform32x32[1][6])));
5612
            E1h = _mm_madd_epi16(m128Tmp3,
5613
                    _mm_load_si128((__m128i *) (transform32x32[1][6])));
5614
            E2l = _mm_madd_epi16(m128Tmp4,
5615
                    _mm_load_si128((__m128i *) (transform32x32[2][6])));
5616
            E2h = _mm_madd_epi16(m128Tmp5,
5617
                    _mm_load_si128((__m128i *) (transform32x32[2][6])));
5618
            E3l = _mm_madd_epi16(m128Tmp6,
5619
                    _mm_load_si128((__m128i *) (transform32x32[3][6])));
5620
            E3h = _mm_madd_epi16(m128Tmp7,
5621
                    _mm_load_si128((__m128i *) (transform32x32[3][6])));
5622
5623
            E4l = _mm_madd_epi16(m128Tmp8,
5624
                    _mm_load_si128((__m128i *) (transform32x32[4][6])));
5625
            E4h = _mm_madd_epi16(m128Tmp9,
5626
                    _mm_load_si128((__m128i *) (transform32x32[4][6])));
5627
            E5l = _mm_madd_epi16(m128Tmp10,
5628
                    _mm_load_si128((__m128i *) (transform32x32[5][6])));
5629
            E5h = _mm_madd_epi16(m128Tmp11,
5630
                    _mm_load_si128((__m128i *) (transform32x32[5][6])));
5631
            E6l = _mm_madd_epi16(m128Tmp12,
5632
                    _mm_load_si128((__m128i *) (transform32x32[6][6])));
5633
            E6h = _mm_madd_epi16(m128Tmp13,
5634
                    _mm_load_si128((__m128i *) (transform32x32[6][6])));
5635
            E7l = _mm_madd_epi16(m128Tmp14,
5636
                    _mm_load_si128((__m128i *) (transform32x32[7][6])));
5637
            E7h = _mm_madd_epi16(m128Tmp15,
5638
                    _mm_load_si128((__m128i *) (transform32x32[7][6])));
5639
5640
            O6l = _mm_add_epi32(E0l, E1l);
5641
            O6l = _mm_add_epi32(O6l, E2l);
5642
            O6l = _mm_add_epi32(O6l, E3l);
5643
            O6l = _mm_add_epi32(O6l, E4l);
5644
            O6l = _mm_add_epi32(O6l, E5l);
5645
            O6l = _mm_add_epi32(O6l, E6l);
5646
            O6l = _mm_add_epi32(O6l, E7l);
5647
5648
            O6h = _mm_add_epi32(E0h, E1h);
5649
            O6h = _mm_add_epi32(O6h, E2h);
5650
            O6h = _mm_add_epi32(O6h, E3h);
5651
            O6h = _mm_add_epi32(O6h, E4h);
5652
            O6h = _mm_add_epi32(O6h, E5h);
5653
            O6h = _mm_add_epi32(O6h, E6h);
5654
            O6h = _mm_add_epi32(O6h, E7h);
5655
5656
            /* Compute O7*/
5657
5658
            E0l = _mm_madd_epi16(m128Tmp0,
5659
                    _mm_load_si128((__m128i *) (transform32x32[0][7])));
5660
            E0h = _mm_madd_epi16(m128Tmp1,
5661
                    _mm_load_si128((__m128i *) (transform32x32[0][7])));
5662
            E1l = _mm_madd_epi16(m128Tmp2,
5663
                    _mm_load_si128((__m128i *) (transform32x32[1][7])));
5664
            E1h = _mm_madd_epi16(m128Tmp3,
5665
                    _mm_load_si128((__m128i *) (transform32x32[1][7])));
5666
            E2l = _mm_madd_epi16(m128Tmp4,
5667
                    _mm_load_si128((__m128i *) (transform32x32[2][7])));
5668
            E2h = _mm_madd_epi16(m128Tmp5,
5669
                    _mm_load_si128((__m128i *) (transform32x32[2][7])));
5670
            E3l = _mm_madd_epi16(m128Tmp6,
5671
                    _mm_load_si128((__m128i *) (transform32x32[3][7])));
5672
            E3h = _mm_madd_epi16(m128Tmp7,
5673
                    _mm_load_si128((__m128i *) (transform32x32[3][7])));
5674
5675
            E4l = _mm_madd_epi16(m128Tmp8,
5676
                    _mm_load_si128((__m128i *) (transform32x32[4][7])));
5677
            E4h = _mm_madd_epi16(m128Tmp9,
5678
                    _mm_load_si128((__m128i *) (transform32x32[4][7])));
5679
            E5l = _mm_madd_epi16(m128Tmp10,
5680
                    _mm_load_si128((__m128i *) (transform32x32[5][7])));
5681
            E5h = _mm_madd_epi16(m128Tmp11,
5682
                    _mm_load_si128((__m128i *) (transform32x32[5][7])));
5683
            E6l = _mm_madd_epi16(m128Tmp12,
5684
                    _mm_load_si128((__m128i *) (transform32x32[6][7])));
5685
            E6h = _mm_madd_epi16(m128Tmp13,
5686
                    _mm_load_si128((__m128i *) (transform32x32[6][7])));
5687
            E7l = _mm_madd_epi16(m128Tmp14,
5688
                    _mm_load_si128((__m128i *) (transform32x32[7][7])));
5689
            E7h = _mm_madd_epi16(m128Tmp15,
5690
                    _mm_load_si128((__m128i *) (transform32x32[7][7])));
5691
5692
            O7l = _mm_add_epi32(E0l, E1l);
5693
            O7l = _mm_add_epi32(O7l, E2l);
5694
            O7l = _mm_add_epi32(O7l, E3l);
5695
            O7l = _mm_add_epi32(O7l, E4l);
5696
            O7l = _mm_add_epi32(O7l, E5l);
5697
            O7l = _mm_add_epi32(O7l, E6l);
5698
            O7l = _mm_add_epi32(O7l, E7l);
5699
5700
            O7h = _mm_add_epi32(E0h, E1h);
5701
            O7h = _mm_add_epi32(O7h, E2h);
5702
            O7h = _mm_add_epi32(O7h, E3h);
5703
            O7h = _mm_add_epi32(O7h, E4h);
5704
            O7h = _mm_add_epi32(O7h, E5h);
5705
            O7h = _mm_add_epi32(O7h, E6h);
5706
            O7h = _mm_add_epi32(O7h, E7h);
5707
5708
            /* Compute O8*/
5709
5710
            E0l = _mm_madd_epi16(m128Tmp0,
5711
                    _mm_load_si128((__m128i *) (transform32x32[0][8])));
5712
            E0h = _mm_madd_epi16(m128Tmp1,
5713
                    _mm_load_si128((__m128i *) (transform32x32[0][8])));
5714
            E1l = _mm_madd_epi16(m128Tmp2,
5715
                    _mm_load_si128((__m128i *) (transform32x32[1][8])));
5716
            E1h = _mm_madd_epi16(m128Tmp3,
5717
                    _mm_load_si128((__m128i *) (transform32x32[1][8])));
5718
            E2l = _mm_madd_epi16(m128Tmp4,
5719
                    _mm_load_si128((__m128i *) (transform32x32[2][8])));
5720
            E2h = _mm_madd_epi16(m128Tmp5,
5721
                    _mm_load_si128((__m128i *) (transform32x32[2][8])));
5722
            E3l = _mm_madd_epi16(m128Tmp6,
5723
                    _mm_load_si128((__m128i *) (transform32x32[3][8])));
5724
            E3h = _mm_madd_epi16(m128Tmp7,
5725
                    _mm_load_si128((__m128i *) (transform32x32[3][8])));
5726
5727
            E4l = _mm_madd_epi16(m128Tmp8,
5728
                    _mm_load_si128((__m128i *) (transform32x32[4][8])));
5729
            E4h = _mm_madd_epi16(m128Tmp9,
5730
                    _mm_load_si128((__m128i *) (transform32x32[4][8])));
5731
            E5l = _mm_madd_epi16(m128Tmp10,
5732
                    _mm_load_si128((__m128i *) (transform32x32[5][8])));
5733
            E5h = _mm_madd_epi16(m128Tmp11,
5734
                    _mm_load_si128((__m128i *) (transform32x32[5][8])));
5735
            E6l = _mm_madd_epi16(m128Tmp12,
5736
                    _mm_load_si128((__m128i *) (transform32x32[6][8])));
5737
            E6h = _mm_madd_epi16(m128Tmp13,
5738
                    _mm_load_si128((__m128i *) (transform32x32[6][8])));
5739
            E7l = _mm_madd_epi16(m128Tmp14,
5740
                    _mm_load_si128((__m128i *) (transform32x32[7][8])));
5741
            E7h = _mm_madd_epi16(m128Tmp15,
5742
                    _mm_load_si128((__m128i *) (transform32x32[7][8])));
5743
5744
            O8l = _mm_add_epi32(E0l, E1l);
5745
            O8l = _mm_add_epi32(O8l, E2l);
5746
            O8l = _mm_add_epi32(O8l, E3l);
5747
            O8l = _mm_add_epi32(O8l, E4l);
5748
            O8l = _mm_add_epi32(O8l, E5l);
5749
            O8l = _mm_add_epi32(O8l, E6l);
5750
            O8l = _mm_add_epi32(O8l, E7l);
5751
5752
            O8h = _mm_add_epi32(E0h, E1h);
5753
            O8h = _mm_add_epi32(O8h, E2h);
5754
            O8h = _mm_add_epi32(O8h, E3h);
5755
            O8h = _mm_add_epi32(O8h, E4h);
5756
            O8h = _mm_add_epi32(O8h, E5h);
5757
            O8h = _mm_add_epi32(O8h, E6h);
5758
            O8h = _mm_add_epi32(O8h, E7h);
5759
5760
            /* Compute O9*/
5761
5762
            E0l = _mm_madd_epi16(m128Tmp0,
5763
                    _mm_load_si128((__m128i *) (transform32x32[0][9])));
5764
            E0h = _mm_madd_epi16(m128Tmp1,
5765
                    _mm_load_si128((__m128i *) (transform32x32[0][9])));
5766
            E1l = _mm_madd_epi16(m128Tmp2,
5767
                    _mm_load_si128((__m128i *) (transform32x32[1][9])));
5768
            E1h = _mm_madd_epi16(m128Tmp3,
5769
                    _mm_load_si128((__m128i *) (transform32x32[1][9])));
5770
            E2l = _mm_madd_epi16(m128Tmp4,
5771
                    _mm_load_si128((__m128i *) (transform32x32[2][9])));
5772
            E2h = _mm_madd_epi16(m128Tmp5,
5773
                    _mm_load_si128((__m128i *) (transform32x32[2][9])));
5774
            E3l = _mm_madd_epi16(m128Tmp6,
5775
                    _mm_load_si128((__m128i *) (transform32x32[3][9])));
5776
            E3h = _mm_madd_epi16(m128Tmp7,
5777
                    _mm_load_si128((__m128i *) (transform32x32[3][9])));
5778
5779
            E4l = _mm_madd_epi16(m128Tmp8,
5780
                    _mm_load_si128((__m128i *) (transform32x32[4][9])));
5781
            E4h = _mm_madd_epi16(m128Tmp9,
5782
                    _mm_load_si128((__m128i *) (transform32x32[4][9])));
5783
            E5l = _mm_madd_epi16(m128Tmp10,
5784
                    _mm_load_si128((__m128i *) (transform32x32[5][9])));
5785
            E5h = _mm_madd_epi16(m128Tmp11,
5786
                    _mm_load_si128((__m128i *) (transform32x32[5][9])));
5787
            E6l = _mm_madd_epi16(m128Tmp12,
5788
                    _mm_load_si128((__m128i *) (transform32x32[6][9])));
5789
            E6h = _mm_madd_epi16(m128Tmp13,
5790
                    _mm_load_si128((__m128i *) (transform32x32[6][9])));
5791
            E7l = _mm_madd_epi16(m128Tmp14,
5792
                    _mm_load_si128((__m128i *) (transform32x32[7][9])));
5793
            E7h = _mm_madd_epi16(m128Tmp15,
5794
                    _mm_load_si128((__m128i *) (transform32x32[7][9])));
5795
5796
            O9l = _mm_add_epi32(E0l, E1l);
5797
            O9l = _mm_add_epi32(O9l, E2l);
5798
            O9l = _mm_add_epi32(O9l, E3l);
5799
            O9l = _mm_add_epi32(O9l, E4l);
5800
            O9l = _mm_add_epi32(O9l, E5l);
5801
            O9l = _mm_add_epi32(O9l, E6l);
5802
            O9l = _mm_add_epi32(O9l, E7l);
5803
5804
            O9h = _mm_add_epi32(E0h, E1h);
5805
            O9h = _mm_add_epi32(O9h, E2h);
5806
            O9h = _mm_add_epi32(O9h, E3h);
5807
            O9h = _mm_add_epi32(O9h, E4h);
5808
            O9h = _mm_add_epi32(O9h, E5h);
5809
            O9h = _mm_add_epi32(O9h, E6h);
5810
            O9h = _mm_add_epi32(O9h, E7h);
5811
5812
            /* Compute O10*/
5813
5814
            E0l = _mm_madd_epi16(m128Tmp0,
5815
                    _mm_load_si128((__m128i *) (transform32x32[0][10])));
5816
            E0h = _mm_madd_epi16(m128Tmp1,
5817
                    _mm_load_si128((__m128i *) (transform32x32[0][10])));
5818
            E1l = _mm_madd_epi16(m128Tmp2,
5819
                    _mm_load_si128((__m128i *) (transform32x32[1][10])));
5820
            E1h = _mm_madd_epi16(m128Tmp3,
5821
                    _mm_load_si128((__m128i *) (transform32x32[1][10])));
5822
            E2l = _mm_madd_epi16(m128Tmp4,
5823
                    _mm_load_si128((__m128i *) (transform32x32[2][10])));
5824
            E2h = _mm_madd_epi16(m128Tmp5,
5825
                    _mm_load_si128((__m128i *) (transform32x32[2][10])));
5826
            E3l = _mm_madd_epi16(m128Tmp6,
5827
                    _mm_load_si128((__m128i *) (transform32x32[3][10])));
5828
            E3h = _mm_madd_epi16(m128Tmp7,
5829
                    _mm_load_si128((__m128i *) (transform32x32[3][10])));
5830
5831
            E4l = _mm_madd_epi16(m128Tmp8,
5832
                    _mm_load_si128((__m128i *) (transform32x32[4][10])));
5833
            E4h = _mm_madd_epi16(m128Tmp9,
5834
                    _mm_load_si128((__m128i *) (transform32x32[4][10])));
5835
            E5l = _mm_madd_epi16(m128Tmp10,
5836
                    _mm_load_si128((__m128i *) (transform32x32[5][10])));
5837
            E5h = _mm_madd_epi16(m128Tmp11,
5838
                    _mm_load_si128((__m128i *) (transform32x32[5][10])));
5839
            E6l = _mm_madd_epi16(m128Tmp12,
5840
                    _mm_load_si128((__m128i *) (transform32x32[6][10])));
5841
            E6h = _mm_madd_epi16(m128Tmp13,
5842
                    _mm_load_si128((__m128i *) (transform32x32[6][10])));
5843
            E7l = _mm_madd_epi16(m128Tmp14,
5844
                    _mm_load_si128((__m128i *) (transform32x32[7][10])));
5845
            E7h = _mm_madd_epi16(m128Tmp15,
5846
                    _mm_load_si128((__m128i *) (transform32x32[7][10])));
5847
5848
            O10l = _mm_add_epi32(E0l, E1l);
5849
            O10l = _mm_add_epi32(O10l, E2l);
5850
            O10l = _mm_add_epi32(O10l, E3l);
5851
            O10l = _mm_add_epi32(O10l, E4l);
5852
            O10l = _mm_add_epi32(O10l, E5l);
5853
            O10l = _mm_add_epi32(O10l, E6l);
5854
            O10l = _mm_add_epi32(O10l, E7l);
5855
5856
            O10h = _mm_add_epi32(E0h, E1h);
5857
            O10h = _mm_add_epi32(O10h, E2h);
5858
            O10h = _mm_add_epi32(O10h, E3h);
5859
            O10h = _mm_add_epi32(O10h, E4h);
5860
            O10h = _mm_add_epi32(O10h, E5h);
5861
            O10h = _mm_add_epi32(O10h, E6h);
5862
            O10h = _mm_add_epi32(O10h, E7h);
5863
5864
            /* Compute O11*/
5865
5866
            E0l = _mm_madd_epi16(m128Tmp0,
5867
                    _mm_load_si128((__m128i *) (transform32x32[0][11])));
5868
            E0h = _mm_madd_epi16(m128Tmp1,
5869
                    _mm_load_si128((__m128i *) (transform32x32[0][11])));
5870
            E1l = _mm_madd_epi16(m128Tmp2,
5871
                    _mm_load_si128((__m128i *) (transform32x32[1][11])));
5872
            E1h = _mm_madd_epi16(m128Tmp3,
5873
                    _mm_load_si128((__m128i *) (transform32x32[1][11])));
5874
            E2l = _mm_madd_epi16(m128Tmp4,
5875
                    _mm_load_si128((__m128i *) (transform32x32[2][11])));
5876
            E2h = _mm_madd_epi16(m128Tmp5,
5877
                    _mm_load_si128((__m128i *) (transform32x32[2][11])));
5878
            E3l = _mm_madd_epi16(m128Tmp6,
5879
                    _mm_load_si128((__m128i *) (transform32x32[3][11])));
5880
            E3h = _mm_madd_epi16(m128Tmp7,
5881
                    _mm_load_si128((__m128i *) (transform32x32[3][11])));
5882
5883
            E4l = _mm_madd_epi16(m128Tmp8,
5884
                    _mm_load_si128((__m128i *) (transform32x32[4][11])));
5885
            E4h = _mm_madd_epi16(m128Tmp9,
5886
                    _mm_load_si128((__m128i *) (transform32x32[4][11])));
5887
            E5l = _mm_madd_epi16(m128Tmp10,
5888
                    _mm_load_si128((__m128i *) (transform32x32[5][11])));
5889
            E5h = _mm_madd_epi16(m128Tmp11,
5890
                    _mm_load_si128((__m128i *) (transform32x32[5][11])));
5891
            E6l = _mm_madd_epi16(m128Tmp12,
5892
                    _mm_load_si128((__m128i *) (transform32x32[6][11])));
5893
            E6h = _mm_madd_epi16(m128Tmp13,
5894
                    _mm_load_si128((__m128i *) (transform32x32[6][11])));
5895
            E7l = _mm_madd_epi16(m128Tmp14,
5896
                    _mm_load_si128((__m128i *) (transform32x32[7][11])));
5897
            E7h = _mm_madd_epi16(m128Tmp15,
5898
                    _mm_load_si128((__m128i *) (transform32x32[7][11])));
5899
5900
            O11l = _mm_add_epi32(E0l, E1l);
5901
            O11l = _mm_add_epi32(O11l, E2l);
5902
            O11l = _mm_add_epi32(O11l, E3l);
5903
            O11l = _mm_add_epi32(O11l, E4l);
5904
            O11l = _mm_add_epi32(O11l, E5l);
5905
            O11l = _mm_add_epi32(O11l, E6l);
5906
            O11l = _mm_add_epi32(O11l, E7l);
5907
5908
            O11h = _mm_add_epi32(E0h, E1h);
5909
            O11h = _mm_add_epi32(O11h, E2h);
5910
            O11h = _mm_add_epi32(O11h, E3h);
5911
            O11h = _mm_add_epi32(O11h, E4h);
5912
            O11h = _mm_add_epi32(O11h, E5h);
5913
            O11h = _mm_add_epi32(O11h, E6h);
5914
            O11h = _mm_add_epi32(O11h, E7h);
5915
5916
            /* Compute O12*/
5917
5918
            E0l = _mm_madd_epi16(m128Tmp0,
5919
                    _mm_load_si128((__m128i *) (transform32x32[0][12])));
5920
            E0h = _mm_madd_epi16(m128Tmp1,
5921
                    _mm_load_si128((__m128i *) (transform32x32[0][12])));
5922
            E1l = _mm_madd_epi16(m128Tmp2,
5923
                    _mm_load_si128((__m128i *) (transform32x32[1][12])));
5924
            E1h = _mm_madd_epi16(m128Tmp3,
5925
                    _mm_load_si128((__m128i *) (transform32x32[1][12])));
5926
            E2l = _mm_madd_epi16(m128Tmp4,
5927
                    _mm_load_si128((__m128i *) (transform32x32[2][12])));
5928
            E2h = _mm_madd_epi16(m128Tmp5,
5929
                    _mm_load_si128((__m128i *) (transform32x32[2][12])));
5930
            E3l = _mm_madd_epi16(m128Tmp6,
5931
                    _mm_load_si128((__m128i *) (transform32x32[3][12])));
5932
            E3h = _mm_madd_epi16(m128Tmp7,
5933
                    _mm_load_si128((__m128i *) (transform32x32[3][12])));
5934
5935
            E4l = _mm_madd_epi16(m128Tmp8,
5936
                    _mm_load_si128((__m128i *) (transform32x32[4][12])));
5937
            E4h = _mm_madd_epi16(m128Tmp9,
5938
                    _mm_load_si128((__m128i *) (transform32x32[4][12])));
5939
            E5l = _mm_madd_epi16(m128Tmp10,
5940
                    _mm_load_si128((__m128i *) (transform32x32[5][12])));
5941
            E5h = _mm_madd_epi16(m128Tmp11,
5942
                    _mm_load_si128((__m128i *) (transform32x32[5][12])));
5943
            E6l = _mm_madd_epi16(m128Tmp12,
5944
                    _mm_load_si128((__m128i *) (transform32x32[6][12])));
5945
            E6h = _mm_madd_epi16(m128Tmp13,
5946
                    _mm_load_si128((__m128i *) (transform32x32[6][12])));
5947
            E7l = _mm_madd_epi16(m128Tmp14,
5948
                    _mm_load_si128((__m128i *) (transform32x32[7][12])));
5949
            E7h = _mm_madd_epi16(m128Tmp15,
5950
                    _mm_load_si128((__m128i *) (transform32x32[7][12])));
5951
5952
            O12l = _mm_add_epi32(E0l, E1l);
5953
            O12l = _mm_add_epi32(O12l, E2l);
5954
            O12l = _mm_add_epi32(O12l, E3l);
5955
            O12l = _mm_add_epi32(O12l, E4l);
5956
            O12l = _mm_add_epi32(O12l, E5l);
5957
            O12l = _mm_add_epi32(O12l, E6l);
5958
            O12l = _mm_add_epi32(O12l, E7l);
5959
5960
            O12h = _mm_add_epi32(E0h, E1h);
5961
            O12h = _mm_add_epi32(O12h, E2h);
5962
            O12h = _mm_add_epi32(O12h, E3h);
5963
            O12h = _mm_add_epi32(O12h, E4h);
5964
            O12h = _mm_add_epi32(O12h, E5h);
5965
            O12h = _mm_add_epi32(O12h, E6h);
5966
            O12h = _mm_add_epi32(O12h, E7h);
5967
5968
            /* Compute O13*/
5969
5970
            E0l = _mm_madd_epi16(m128Tmp0,
5971
                    _mm_load_si128((__m128i *) (transform32x32[0][13])));
5972
            E0h = _mm_madd_epi16(m128Tmp1,
5973
                    _mm_load_si128((__m128i *) (transform32x32[0][13])));
5974
            E1l = _mm_madd_epi16(m128Tmp2,
5975
                    _mm_load_si128((__m128i *) (transform32x32[1][13])));
5976
            E1h = _mm_madd_epi16(m128Tmp3,
5977
                    _mm_load_si128((__m128i *) (transform32x32[1][13])));
5978
            E2l = _mm_madd_epi16(m128Tmp4,
5979
                    _mm_load_si128((__m128i *) (transform32x32[2][13])));
5980
            E2h = _mm_madd_epi16(m128Tmp5,
5981
                    _mm_load_si128((__m128i *) (transform32x32[2][13])));
5982
            E3l = _mm_madd_epi16(m128Tmp6,
5983
                    _mm_load_si128((__m128i *) (transform32x32[3][13])));
5984
            E3h = _mm_madd_epi16(m128Tmp7,
5985
                    _mm_load_si128((__m128i *) (transform32x32[3][13])));
5986
5987
            E4l = _mm_madd_epi16(m128Tmp8,
5988
                    _mm_load_si128((__m128i *) (transform32x32[4][13])));
5989
            E4h = _mm_madd_epi16(m128Tmp9,
5990
                    _mm_load_si128((__m128i *) (transform32x32[4][13])));
5991
            E5l = _mm_madd_epi16(m128Tmp10,
5992
                    _mm_load_si128((__m128i *) (transform32x32[5][13])));
5993
            E5h = _mm_madd_epi16(m128Tmp11,
5994
                    _mm_load_si128((__m128i *) (transform32x32[5][13])));
5995
            E6l = _mm_madd_epi16(m128Tmp12,
5996
                    _mm_load_si128((__m128i *) (transform32x32[6][13])));
5997
            E6h = _mm_madd_epi16(m128Tmp13,
5998
                    _mm_load_si128((__m128i *) (transform32x32[6][13])));
5999
            E7l = _mm_madd_epi16(m128Tmp14,
6000
                    _mm_load_si128((__m128i *) (transform32x32[7][13])));
6001
            E7h = _mm_madd_epi16(m128Tmp15,
6002
                    _mm_load_si128((__m128i *) (transform32x32[7][13])));
6003
6004
            O13l = _mm_add_epi32(E0l, E1l);
6005
            O13l = _mm_add_epi32(O13l, E2l);
6006
            O13l = _mm_add_epi32(O13l, E3l);
6007
            O13l = _mm_add_epi32(O13l, E4l);
6008
            O13l = _mm_add_epi32(O13l, E5l);
6009
            O13l = _mm_add_epi32(O13l, E6l);
6010
            O13l = _mm_add_epi32(O13l, E7l);
6011
6012
            O13h = _mm_add_epi32(E0h, E1h);
6013
            O13h = _mm_add_epi32(O13h, E2h);
6014
            O13h = _mm_add_epi32(O13h, E3h);
6015
            O13h = _mm_add_epi32(O13h, E4h);
6016
            O13h = _mm_add_epi32(O13h, E5h);
6017
            O13h = _mm_add_epi32(O13h, E6h);
6018
            O13h = _mm_add_epi32(O13h, E7h);
6019
6020
            /* Compute O14  */
6021
6022
            E0l = _mm_madd_epi16(m128Tmp0,
6023
                    _mm_load_si128((__m128i *) (transform32x32[0][14])));
6024
            E0h = _mm_madd_epi16(m128Tmp1,
6025
                    _mm_load_si128((__m128i *) (transform32x32[0][14])));
6026
            E1l = _mm_madd_epi16(m128Tmp2,
6027
                    _mm_load_si128((__m128i *) (transform32x32[1][14])));
6028
            E1h = _mm_madd_epi16(m128Tmp3,
6029
                    _mm_load_si128((__m128i *) (transform32x32[1][14])));
6030
            E2l = _mm_madd_epi16(m128Tmp4,
6031
                    _mm_load_si128((__m128i *) (transform32x32[2][14])));
6032
            E2h = _mm_madd_epi16(m128Tmp5,
6033
                    _mm_load_si128((__m128i *) (transform32x32[2][14])));
6034
            E3l = _mm_madd_epi16(m128Tmp6,
6035
                    _mm_load_si128((__m128i *) (transform32x32[3][14])));
6036
            E3h = _mm_madd_epi16(m128Tmp7,
6037
                    _mm_load_si128((__m128i *) (transform32x32[3][14])));
6038
6039
            E4l = _mm_madd_epi16(m128Tmp8,
6040
                    _mm_load_si128((__m128i *) (transform32x32[4][14])));
6041
            E4h = _mm_madd_epi16(m128Tmp9,
6042
                    _mm_load_si128((__m128i *) (transform32x32[4][14])));
6043
            E5l = _mm_madd_epi16(m128Tmp10,
6044
                    _mm_load_si128((__m128i *) (transform32x32[5][14])));
6045
            E5h = _mm_madd_epi16(m128Tmp11,
6046
                    _mm_load_si128((__m128i *) (transform32x32[5][14])));
6047
            E6l = _mm_madd_epi16(m128Tmp12,
6048
                    _mm_load_si128((__m128i *) (transform32x32[6][14])));
6049
            E6h = _mm_madd_epi16(m128Tmp13,
6050
                    _mm_load_si128((__m128i *) (transform32x32[6][14])));
6051
            E7l = _mm_madd_epi16(m128Tmp14,
6052
                    _mm_load_si128((__m128i *) (transform32x32[7][14])));
6053
            E7h = _mm_madd_epi16(m128Tmp15,
6054
                    _mm_load_si128((__m128i *) (transform32x32[7][14])));
6055
6056
            O14l = _mm_add_epi32(E0l, E1l);
6057
            O14l = _mm_add_epi32(O14l, E2l);
6058
            O14l = _mm_add_epi32(O14l, E3l);
6059
            O14l = _mm_add_epi32(O14l, E4l);
6060
            O14l = _mm_add_epi32(O14l, E5l);
6061
            O14l = _mm_add_epi32(O14l, E6l);
6062
            O14l = _mm_add_epi32(O14l, E7l);
6063
6064
            O14h = _mm_add_epi32(E0h, E1h);
6065
            O14h = _mm_add_epi32(O14h, E2h);
6066
            O14h = _mm_add_epi32(O14h, E3h);
6067
            O14h = _mm_add_epi32(O14h, E4h);
6068
            O14h = _mm_add_epi32(O14h, E5h);
6069
            O14h = _mm_add_epi32(O14h, E6h);
6070
            O14h = _mm_add_epi32(O14h, E7h);
6071
6072
            /* Compute O15*/
6073
6074
            E0l = _mm_madd_epi16(m128Tmp0,
6075
                    _mm_load_si128((__m128i *) (transform32x32[0][15])));
6076
            E0h = _mm_madd_epi16(m128Tmp1,
6077
                    _mm_load_si128((__m128i *) (transform32x32[0][15])));
6078
            E1l = _mm_madd_epi16(m128Tmp2,
6079
                    _mm_load_si128((__m128i *) (transform32x32[1][15])));
6080
            E1h = _mm_madd_epi16(m128Tmp3,
6081
                    _mm_load_si128((__m128i *) (transform32x32[1][15])));
6082
            E2l = _mm_madd_epi16(m128Tmp4,
6083
                    _mm_load_si128((__m128i *) (transform32x32[2][15])));
6084
            E2h = _mm_madd_epi16(m128Tmp5,
6085
                    _mm_load_si128((__m128i *) (transform32x32[2][15])));
6086
            E3l = _mm_madd_epi16(m128Tmp6,
6087
                    _mm_load_si128((__m128i *) (transform32x32[3][15])));
6088
            E3h = _mm_madd_epi16(m128Tmp7,
6089
                    _mm_load_si128((__m128i *) (transform32x32[3][15])));
6090
6091
            E4l = _mm_madd_epi16(m128Tmp8,
6092
                    _mm_load_si128((__m128i *) (transform32x32[4][15])));
6093
            E4h = _mm_madd_epi16(m128Tmp9,
6094
                    _mm_load_si128((__m128i *) (transform32x32[4][15])));
6095
            E5l = _mm_madd_epi16(m128Tmp10,
6096
                    _mm_load_si128((__m128i *) (transform32x32[5][15])));
6097
            E5h = _mm_madd_epi16(m128Tmp11,
6098
                    _mm_load_si128((__m128i *) (transform32x32[5][15])));
6099
            E6l = _mm_madd_epi16(m128Tmp12,
6100
                    _mm_load_si128((__m128i *) (transform32x32[6][15])));
6101
            E6h = _mm_madd_epi16(m128Tmp13,
6102
                    _mm_load_si128((__m128i *) (transform32x32[6][15])));
6103
            E7l = _mm_madd_epi16(m128Tmp14,
6104
                    _mm_load_si128((__m128i *) (transform32x32[7][15])));
6105
            E7h = _mm_madd_epi16(m128Tmp15,
6106
                    _mm_load_si128((__m128i *) (transform32x32[7][15])));
6107
6108
            O15l = _mm_add_epi32(E0l, E1l);
6109
            O15l = _mm_add_epi32(O15l, E2l);
6110
            O15l = _mm_add_epi32(O15l, E3l);
6111
            O15l = _mm_add_epi32(O15l, E4l);
6112
            O15l = _mm_add_epi32(O15l, E5l);
6113
            O15l = _mm_add_epi32(O15l, E6l);
6114
            O15l = _mm_add_epi32(O15l, E7l);
6115
6116
            O15h = _mm_add_epi32(E0h, E1h);
6117
            O15h = _mm_add_epi32(O15h, E2h);
6118
            O15h = _mm_add_epi32(O15h, E3h);
6119
            O15h = _mm_add_epi32(O15h, E4h);
6120
            O15h = _mm_add_epi32(O15h, E5h);
6121
            O15h = _mm_add_epi32(O15h, E6h);
6122
            O15h = _mm_add_epi32(O15h, E7h);
6123
            /*  Compute E0  */
6124
6125
            m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
6126
            E0l = _mm_madd_epi16(m128Tmp0,
6127
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6128
            m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
6129
            E0h = _mm_madd_epi16(m128Tmp1,
6130
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6131
6132
            m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
6133
            E0l = _mm_add_epi32(E0l,
6134
                    _mm_madd_epi16(m128Tmp2,
6135
                            _mm_load_si128(
6136
                                    (__m128i *) (transform16x16_1[1][0]))));
6137
            m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
6138
            E0h = _mm_add_epi32(E0h,
6139
                    _mm_madd_epi16(m128Tmp3,
6140
                            _mm_load_si128(
6141
                                    (__m128i *) (transform16x16_1[1][0]))));
6142
6143
            m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
6144
            E0l = _mm_add_epi32(E0l,
6145
                    _mm_madd_epi16(m128Tmp4,
6146
                            _mm_load_si128(
6147
                                    (__m128i *) (transform16x16_1[2][0]))));
6148
            m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
6149
            E0h = _mm_add_epi32(E0h,
6150
                    _mm_madd_epi16(m128Tmp5,
6151
                            _mm_load_si128(
6152
                                    (__m128i *) (transform16x16_1[2][0]))));
6153
6154
            m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
6155
            E0l = _mm_add_epi32(E0l,
6156
                    _mm_madd_epi16(m128Tmp6,
6157
                            _mm_load_si128(
6158
                                    (__m128i *) (transform16x16_1[3][0]))));
6159
            m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
6160
            E0h = _mm_add_epi32(E0h,
6161
                    _mm_madd_epi16(m128Tmp7,
6162
                            _mm_load_si128(
6163
                                    (__m128i *) (transform16x16_1[3][0]))));
6164
6165
            /*  Compute E1  */
6166
            E1l = _mm_madd_epi16(m128Tmp0,
6167
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6168
            E1h = _mm_madd_epi16(m128Tmp1,
6169
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6170
            E1l = _mm_add_epi32(E1l,
6171
                    _mm_madd_epi16(m128Tmp2,
6172
                            _mm_load_si128(
6173
                                    (__m128i *) (transform16x16_1[1][1]))));
6174
            E1h = _mm_add_epi32(E1h,
6175
                    _mm_madd_epi16(m128Tmp3,
6176
                            _mm_load_si128(
6177
                                    (__m128i *) (transform16x16_1[1][1]))));
6178
            E1l = _mm_add_epi32(E1l,
6179
                    _mm_madd_epi16(m128Tmp4,
6180
                            _mm_load_si128(
6181
                                    (__m128i *) (transform16x16_1[2][1]))));
6182
            E1h = _mm_add_epi32(E1h,
6183
                    _mm_madd_epi16(m128Tmp5,
6184
                            _mm_load_si128(
6185
                                    (__m128i *) (transform16x16_1[2][1]))));
6186
            E1l = _mm_add_epi32(E1l,
6187
                    _mm_madd_epi16(m128Tmp6,
6188
                            _mm_load_si128(
6189
                                    (__m128i *) (transform16x16_1[3][1]))));
6190
            E1h = _mm_add_epi32(E1h,
6191
                    _mm_madd_epi16(m128Tmp7,
6192
                            _mm_load_si128(
6193
                                    (__m128i *) (transform16x16_1[3][1]))));
6194
6195
            /*  Compute E2  */
6196
            E2l = _mm_madd_epi16(m128Tmp0,
6197
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6198
            E2h = _mm_madd_epi16(m128Tmp1,
6199
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6200
            E2l = _mm_add_epi32(E2l,
6201
                    _mm_madd_epi16(m128Tmp2,
6202
                            _mm_load_si128(
6203
                                    (__m128i *) (transform16x16_1[1][2]))));
6204
            E2h = _mm_add_epi32(E2h,
6205
                    _mm_madd_epi16(m128Tmp3,
6206
                            _mm_load_si128(
6207
                                    (__m128i *) (transform16x16_1[1][2]))));
6208
            E2l = _mm_add_epi32(E2l,
6209
                    _mm_madd_epi16(m128Tmp4,
6210
                            _mm_load_si128(
6211
                                    (__m128i *) (transform16x16_1[2][2]))));
6212
            E2h = _mm_add_epi32(E2h,
6213
                    _mm_madd_epi16(m128Tmp5,
6214
                            _mm_load_si128(
6215
                                    (__m128i *) (transform16x16_1[2][2]))));
6216
            E2l = _mm_add_epi32(E2l,
6217
                    _mm_madd_epi16(m128Tmp6,
6218
                            _mm_load_si128(
6219
                                    (__m128i *) (transform16x16_1[3][2]))));
6220
            E2h = _mm_add_epi32(E2h,
6221
                    _mm_madd_epi16(m128Tmp7,
6222
                            _mm_load_si128(
6223
                                    (__m128i *) (transform16x16_1[3][2]))));
6224
6225
            /*  Compute E3  */
6226
            E3l = _mm_madd_epi16(m128Tmp0,
6227
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6228
            E3h = _mm_madd_epi16(m128Tmp1,
6229
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6230
            E3l = _mm_add_epi32(E3l,
6231
                    _mm_madd_epi16(m128Tmp2,
6232
                            _mm_load_si128(
6233
                                    (__m128i *) (transform16x16_1[1][3]))));
6234
            E3h = _mm_add_epi32(E3h,
6235
                    _mm_madd_epi16(m128Tmp3,
6236
                            _mm_load_si128(
6237
                                    (__m128i *) (transform16x16_1[1][3]))));
6238
            E3l = _mm_add_epi32(E3l,
6239
                    _mm_madd_epi16(m128Tmp4,
6240
                            _mm_load_si128(
6241
                                    (__m128i *) (transform16x16_1[2][3]))));
6242
            E3h = _mm_add_epi32(E3h,
6243
                    _mm_madd_epi16(m128Tmp5,
6244
                            _mm_load_si128(
6245
                                    (__m128i *) (transform16x16_1[2][3]))));
6246
            E3l = _mm_add_epi32(E3l,
6247
                    _mm_madd_epi16(m128Tmp6,
6248
                            _mm_load_si128(
6249
                                    (__m128i *) (transform16x16_1[3][3]))));
6250
            E3h = _mm_add_epi32(E3h,
6251
                    _mm_madd_epi16(m128Tmp7,
6252
                            _mm_load_si128(
6253
                                    (__m128i *) (transform16x16_1[3][3]))));
6254
6255
            /*  Compute E4  */
6256
            E4l = _mm_madd_epi16(m128Tmp0,
6257
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6258
            E4h = _mm_madd_epi16(m128Tmp1,
6259
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6260
            E4l = _mm_add_epi32(E4l,
6261
                    _mm_madd_epi16(m128Tmp2,
6262
                            _mm_load_si128(
6263
                                    (__m128i *) (transform16x16_1[1][4]))));
6264
            E4h = _mm_add_epi32(E4h,
6265
                    _mm_madd_epi16(m128Tmp3,
6266
                            _mm_load_si128(
6267
                                    (__m128i *) (transform16x16_1[1][4]))));
6268
            E4l = _mm_add_epi32(E4l,
6269
                    _mm_madd_epi16(m128Tmp4,
6270
                            _mm_load_si128(
6271
                                    (__m128i *) (transform16x16_1[2][4]))));
6272
            E4h = _mm_add_epi32(E4h,
6273
                    _mm_madd_epi16(m128Tmp5,
6274
                            _mm_load_si128(
6275
                                    (__m128i *) (transform16x16_1[2][4]))));
6276
            E4l = _mm_add_epi32(E4l,
6277
                    _mm_madd_epi16(m128Tmp6,
6278
                            _mm_load_si128(
6279
                                    (__m128i *) (transform16x16_1[3][4]))));
6280
            E4h = _mm_add_epi32(E4h,
6281
                    _mm_madd_epi16(m128Tmp7,
6282
                            _mm_load_si128(
6283
                                    (__m128i *) (transform16x16_1[3][4]))));
6284
6285
            /*  Compute E5  */
6286
            E5l = _mm_madd_epi16(m128Tmp0,
6287
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6288
            E5h = _mm_madd_epi16(m128Tmp1,
6289
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6290
            E5l = _mm_add_epi32(E5l,
6291
                    _mm_madd_epi16(m128Tmp2,
6292
                            _mm_load_si128(
6293
                                    (__m128i *) (transform16x16_1[1][5]))));
6294
            E5h = _mm_add_epi32(E5h,
6295
                    _mm_madd_epi16(m128Tmp3,
6296
                            _mm_load_si128(
6297
                                    (__m128i *) (transform16x16_1[1][5]))));
6298
            E5l = _mm_add_epi32(E5l,
6299
                    _mm_madd_epi16(m128Tmp4,
6300
                            _mm_load_si128(
6301
                                    (__m128i *) (transform16x16_1[2][5]))));
6302
            E5h = _mm_add_epi32(E5h,
6303
                    _mm_madd_epi16(m128Tmp5,
6304
                            _mm_load_si128(
6305
                                    (__m128i *) (transform16x16_1[2][5]))));
6306
            E5l = _mm_add_epi32(E5l,
6307
                    _mm_madd_epi16(m128Tmp6,
6308
                            _mm_load_si128(
6309
                                    (__m128i *) (transform16x16_1[3][5]))));
6310
            E5h = _mm_add_epi32(E5h,
6311
                    _mm_madd_epi16(m128Tmp7,
6312
                            _mm_load_si128(
6313
                                    (__m128i *) (transform16x16_1[3][5]))));
6314
6315
            /*  Compute E6  */
6316
            E6l = _mm_madd_epi16(m128Tmp0,
6317
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
6318
            E6h = _mm_madd_epi16(m128Tmp1,
6319
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
6320
            E6l = _mm_add_epi32(E6l,
6321
                    _mm_madd_epi16(m128Tmp2,
6322
                            _mm_load_si128(
6323
                                    (__m128i *) (transform16x16_1[1][6]))));
6324
            E6h = _mm_add_epi32(E6h,
6325
                    _mm_madd_epi16(m128Tmp3,
6326
                            _mm_load_si128(
6327
                                    (__m128i *) (transform16x16_1[1][6]))));
6328
            E6l = _mm_add_epi32(E6l,
6329
                    _mm_madd_epi16(m128Tmp4,
6330
                            _mm_load_si128(
6331
                                    (__m128i *) (transform16x16_1[2][6]))));
6332
            E6h = _mm_add_epi32(E6h,
6333
                    _mm_madd_epi16(m128Tmp5,
6334
                            _mm_load_si128(
6335
                                    (__m128i *) (transform16x16_1[2][6]))));
6336
            E6l = _mm_add_epi32(E6l,
6337
                    _mm_madd_epi16(m128Tmp6,
6338
                            _mm_load_si128(
6339
                                    (__m128i *) (transform16x16_1[3][6]))));
6340
            E6h = _mm_add_epi32(E6h,
6341
                    _mm_madd_epi16(m128Tmp7,
6342
                            _mm_load_si128(
6343
                                    (__m128i *) (transform16x16_1[3][6]))));
6344
6345
            /*  Compute E7  */
6346
            E7l = _mm_madd_epi16(m128Tmp0,
6347
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
6348
            E7h = _mm_madd_epi16(m128Tmp1,
6349
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
6350
            E7l = _mm_add_epi32(E7l,
6351
                    _mm_madd_epi16(m128Tmp2,
6352
                            _mm_load_si128(
6353
                                    (__m128i *) (transform16x16_1[1][7]))));
6354
            E7h = _mm_add_epi32(E7h,
6355
                    _mm_madd_epi16(m128Tmp3,
6356
                            _mm_load_si128(
6357
                                    (__m128i *) (transform16x16_1[1][7]))));
6358
            E7l = _mm_add_epi32(E7l,
6359
                    _mm_madd_epi16(m128Tmp4,
6360
                            _mm_load_si128(
6361
                                    (__m128i *) (transform16x16_1[2][7]))));
6362
            E7h = _mm_add_epi32(E7h,
6363
                    _mm_madd_epi16(m128Tmp5,
6364
                            _mm_load_si128(
6365
                                    (__m128i *) (transform16x16_1[2][7]))));
6366
            E7l = _mm_add_epi32(E7l,
6367
                    _mm_madd_epi16(m128Tmp6,
6368
                            _mm_load_si128(
6369
                                    (__m128i *) (transform16x16_1[3][7]))));
6370
            E7h = _mm_add_epi32(E7h,
6371
                    _mm_madd_epi16(m128Tmp7,
6372
                            _mm_load_si128(
6373
                                    (__m128i *) (transform16x16_1[3][7]))));
6374
6375
            /*  Compute E00, E01, E02 and E03 */
6376
6377
            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
6378
            E00l = _mm_madd_epi16(m128Tmp0,
6379
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
6380
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
6381
            E00h = _mm_madd_epi16(m128Tmp1,
6382
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
6383
6384
            m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
6385
            E00l = _mm_add_epi32(E00l,
6386
                    _mm_madd_epi16(m128Tmp2,
6387
                            _mm_load_si128(
6388
                                    (__m128i *) (transform16x16_2[1][0]))));
6389
            m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
6390
            E00h = _mm_add_epi32(E00h,
6391
                    _mm_madd_epi16(m128Tmp3,
6392
                            _mm_load_si128(
6393
                                    (__m128i *) (transform16x16_2[1][0]))));
6394
6395
            E01l = _mm_madd_epi16(m128Tmp0,
6396
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
6397
            E01h = _mm_madd_epi16(m128Tmp1,
6398
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
6399
            E01l = _mm_add_epi32(E01l,
6400
                    _mm_madd_epi16(m128Tmp2,
6401
                            _mm_load_si128(
6402
                                    (__m128i *) (transform16x16_2[1][1]))));
6403
            E01h = _mm_add_epi32(E01h,
6404
                    _mm_madd_epi16(m128Tmp3,
6405
                            _mm_load_si128(
6406
                                    (__m128i *) (transform16x16_2[1][1]))));
6407
6408
            E02l = _mm_madd_epi16(m128Tmp0,
6409
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
6410
            E02h = _mm_madd_epi16(m128Tmp1,
6411
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
6412
            E02l = _mm_add_epi32(E02l,
6413
                    _mm_madd_epi16(m128Tmp2,
6414
                            _mm_load_si128(
6415
                                    (__m128i *) (transform16x16_2[1][2]))));
6416
            E02h = _mm_add_epi32(E02h,
6417
                    _mm_madd_epi16(m128Tmp3,
6418
                            _mm_load_si128(
6419
                                    (__m128i *) (transform16x16_2[1][2]))));
6420
6421
            E03l = _mm_madd_epi16(m128Tmp0,
6422
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
6423
            E03h = _mm_madd_epi16(m128Tmp1,
6424
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
6425
            E03l = _mm_add_epi32(E03l,
6426
                    _mm_madd_epi16(m128Tmp2,
6427
                            _mm_load_si128(
6428
                                    (__m128i *) (transform16x16_2[1][3]))));
6429
            E03h = _mm_add_epi32(E03h,
6430
                    _mm_madd_epi16(m128Tmp3,
6431
                            _mm_load_si128(
6432
                                    (__m128i *) (transform16x16_2[1][3]))));
6433
6434
            /*  Compute EE0 and EEE */
6435
6436
            m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
6437
            EE0l = _mm_madd_epi16(m128Tmp0,
6438
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
6439
            m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
6440
            EE0h = _mm_madd_epi16(m128Tmp1,
6441
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
6442
6443
            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
6444
            EEE0l = _mm_madd_epi16(m128Tmp2,
6445
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
6446
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
6447
            EEE0h = _mm_madd_epi16(m128Tmp3,
6448
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
6449
6450
            EE1l = _mm_madd_epi16(m128Tmp0,
6451
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
6452
            EE1h = _mm_madd_epi16(m128Tmp1,
6453
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
6454
6455
            EEE1l = _mm_madd_epi16(m128Tmp2,
6456
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
6457
            EEE1h = _mm_madd_epi16(m128Tmp3,
6458
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
6459
6460
            /*  Compute EE    */
6461
6462
            EE2l = _mm_sub_epi32(EEE1l, EE1l);
6463
            EE3l = _mm_sub_epi32(EEE0l, EE0l);
6464
            EE2h = _mm_sub_epi32(EEE1h, EE1h);
6465
            EE3h = _mm_sub_epi32(EEE0h, EE0h);
6466
6467
            EE0l = _mm_add_epi32(EEE0l, EE0l);
6468
            EE1l = _mm_add_epi32(EEE1l, EE1l);
6469
            EE0h = _mm_add_epi32(EEE0h, EE0h);
6470
            EE1h = _mm_add_epi32(EEE1h, EE1h);
6471
            /**/
6472
6473
            EE7l = _mm_sub_epi32(EE0l, E00l);
6474
            EE6l = _mm_sub_epi32(EE1l, E01l);
6475
            EE5l = _mm_sub_epi32(EE2l, E02l);
6476
            EE4l = _mm_sub_epi32(EE3l, E03l);
6477
6478
            EE7h = _mm_sub_epi32(EE0h, E00h);
6479
            EE6h = _mm_sub_epi32(EE1h, E01h);
6480
            EE5h = _mm_sub_epi32(EE2h, E02h);
6481
            EE4h = _mm_sub_epi32(EE3h, E03h);
6482
6483
            EE0l = _mm_add_epi32(EE0l, E00l);
6484
            EE1l = _mm_add_epi32(EE1l, E01l);
6485
            EE2l = _mm_add_epi32(EE2l, E02l);
6486
            EE3l = _mm_add_epi32(EE3l, E03l);
6487
6488
            EE0h = _mm_add_epi32(EE0h, E00h);
6489
            EE1h = _mm_add_epi32(EE1h, E01h);
6490
            EE2h = _mm_add_epi32(EE2h, E02h);
6491
            EE3h = _mm_add_epi32(EE3h, E03h);
6492
            /*      Compute E       */
6493
6494
            E15l = _mm_sub_epi32(EE0l, E0l);
6495
            E15l = _mm_add_epi32(E15l, m128iAdd);
6496
            E14l = _mm_sub_epi32(EE1l, E1l);
6497
            E14l = _mm_add_epi32(E14l, m128iAdd);
6498
            E13l = _mm_sub_epi32(EE2l, E2l);
6499
            E13l = _mm_add_epi32(E13l, m128iAdd);
6500
            E12l = _mm_sub_epi32(EE3l, E3l);
6501
            E12l = _mm_add_epi32(E12l, m128iAdd);
6502
            E11l = _mm_sub_epi32(EE4l, E4l);
6503
            E11l = _mm_add_epi32(E11l, m128iAdd);
6504
            E10l = _mm_sub_epi32(EE5l, E5l);
6505
            E10l = _mm_add_epi32(E10l, m128iAdd);
6506
            E9l = _mm_sub_epi32(EE6l, E6l);
6507
            E9l = _mm_add_epi32(E9l, m128iAdd);
6508
            E8l = _mm_sub_epi32(EE7l, E7l);
6509
            E8l = _mm_add_epi32(E8l, m128iAdd);
6510
6511
            E0l = _mm_add_epi32(EE0l, E0l);
6512
            E0l = _mm_add_epi32(E0l, m128iAdd);
6513
            E1l = _mm_add_epi32(EE1l, E1l);
6514
            E1l = _mm_add_epi32(E1l, m128iAdd);
6515
            E2l = _mm_add_epi32(EE2l, E2l);
6516
            E2l = _mm_add_epi32(E2l, m128iAdd);
6517
            E3l = _mm_add_epi32(EE3l, E3l);
6518
            E3l = _mm_add_epi32(E3l, m128iAdd);
6519
            E4l = _mm_add_epi32(EE4l, E4l);
6520
            E4l = _mm_add_epi32(E4l, m128iAdd);
6521
            E5l = _mm_add_epi32(EE5l, E5l);
6522
            E5l = _mm_add_epi32(E5l, m128iAdd);
6523
            E6l = _mm_add_epi32(EE6l, E6l);
6524
            E6l = _mm_add_epi32(E6l, m128iAdd);
6525
            E7l = _mm_add_epi32(EE7l, E7l);
6526
            E7l = _mm_add_epi32(E7l, m128iAdd);
6527
6528
            E15h = _mm_sub_epi32(EE0h, E0h);
6529
            E15h = _mm_add_epi32(E15h, m128iAdd);
6530
            E14h = _mm_sub_epi32(EE1h, E1h);
6531
            E14h = _mm_add_epi32(E14h, m128iAdd);
6532
            E13h = _mm_sub_epi32(EE2h, E2h);
6533
            E13h = _mm_add_epi32(E13h, m128iAdd);
6534
            E12h = _mm_sub_epi32(EE3h, E3h);
6535
            E12h = _mm_add_epi32(E12h, m128iAdd);
6536
            E11h = _mm_sub_epi32(EE4h, E4h);
6537
            E11h = _mm_add_epi32(E11h, m128iAdd);
6538
            E10h = _mm_sub_epi32(EE5h, E5h);
6539
            E10h = _mm_add_epi32(E10h, m128iAdd);
6540
            E9h = _mm_sub_epi32(EE6h, E6h);
6541
            E9h = _mm_add_epi32(E9h, m128iAdd);
6542
            E8h = _mm_sub_epi32(EE7h, E7h);
6543
            E8h = _mm_add_epi32(E8h, m128iAdd);
6544
6545
            E0h = _mm_add_epi32(EE0h, E0h);
6546
            E0h = _mm_add_epi32(E0h, m128iAdd);
6547
            E1h = _mm_add_epi32(EE1h, E1h);
6548
            E1h = _mm_add_epi32(E1h, m128iAdd);
6549
            E2h = _mm_add_epi32(EE2h, E2h);
6550
            E2h = _mm_add_epi32(E2h, m128iAdd);
6551
            E3h = _mm_add_epi32(EE3h, E3h);
6552
            E3h = _mm_add_epi32(E3h, m128iAdd);
6553
            E4h = _mm_add_epi32(EE4h, E4h);
6554
            E4h = _mm_add_epi32(E4h, m128iAdd);
6555
            E5h = _mm_add_epi32(EE5h, E5h);
6556
            E5h = _mm_add_epi32(E5h, m128iAdd);
6557
            E6h = _mm_add_epi32(EE6h, E6h);
6558
            E6h = _mm_add_epi32(E6h, m128iAdd);
6559
            E7h = _mm_add_epi32(EE7h, E7h);
6560
            E7h = _mm_add_epi32(E7h, m128iAdd);
6561
6562
            m128iS0 = _mm_packs_epi32(
6563
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
6564
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
6565
            m128iS1 = _mm_packs_epi32(
6566
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
6567
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
6568
            m128iS2 = _mm_packs_epi32(
6569
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
6570
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
6571
            m128iS3 = _mm_packs_epi32(
6572
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
6573
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
6574
            m128iS4 = _mm_packs_epi32(
6575
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
6576
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
6577
            m128iS5 = _mm_packs_epi32(
6578
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
6579
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
6580
            m128iS6 = _mm_packs_epi32(
6581
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
6582
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
6583
            m128iS7 = _mm_packs_epi32(
6584
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
6585
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
6586
            m128iS8 = _mm_packs_epi32(
6587
                    _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
6588
                    _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
6589
            m128iS9 = _mm_packs_epi32(
6590
                    _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
6591
                    _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
6592
            m128iS10 = _mm_packs_epi32(
6593
                    _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
6594
                    _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
6595
            m128iS11 = _mm_packs_epi32(
6596
                    _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
6597
                    _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
6598
            m128iS12 = _mm_packs_epi32(
6599
                    _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
6600
                    _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
6601
            m128iS13 = _mm_packs_epi32(
6602
                    _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
6603
                    _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
6604
            m128iS14 = _mm_packs_epi32(
6605
                    _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
6606
                    _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
6607
            m128iS15 = _mm_packs_epi32(
6608
                    _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
6609
                    _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
6610
6611
            m128iS31 = _mm_packs_epi32(
6612
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
6613
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
6614
            m128iS30 = _mm_packs_epi32(
6615
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
6616
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
6617
            m128iS29 = _mm_packs_epi32(
6618
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
6619
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
6620
            m128iS28 = _mm_packs_epi32(
6621
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
6622
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
6623
            m128iS27 = _mm_packs_epi32(
6624
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
6625
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
6626
            m128iS26 = _mm_packs_epi32(
6627
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
6628
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
6629
            m128iS25 = _mm_packs_epi32(
6630
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
6631
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
6632
            m128iS24 = _mm_packs_epi32(
6633
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
6634
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
6635
            m128iS23 = _mm_packs_epi32(
6636
                    _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
6637
                    _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
6638
            m128iS22 = _mm_packs_epi32(
6639
                    _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
6640
                    _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
6641
            m128iS21 = _mm_packs_epi32(
6642
                    _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
6643
                    _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
6644
            m128iS20 = _mm_packs_epi32(
6645
                    _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
6646
                    _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
6647
            m128iS19 = _mm_packs_epi32(
6648
                    _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
6649
                    _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
6650
            m128iS18 = _mm_packs_epi32(
6651
                    _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
6652
                    _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
6653
            m128iS17 = _mm_packs_epi32(
6654
                    _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
6655
                    _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
6656
            m128iS16 = _mm_packs_epi32(
6657
                    _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
6658
                    _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
6659
6660
            if (!j) {
6661
                /*      Transpose the matrix      */
6662
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
6663
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
6664
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
6665
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
6666
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
6667
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
6668
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
6669
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
6670
                E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
6671
                E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
6672
                E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
6673
                E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
6674
                E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
6675
                E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
6676
                E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
6677
                E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
6678
6679
                O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
6680
                O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
6681
                O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
6682
                O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
6683
                O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
6684
                O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
6685
                O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
6686
                O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
6687
                O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
6688
                O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
6689
                O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
6690
                O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
6691
                O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
6692
                O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
6693
                O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
6694
                O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
6695
6696
                E0h = _mm_unpacklo_epi16(E0l, E8l);
6697
                E1h = _mm_unpacklo_epi16(E1l, E9l);
6698
                E2h = _mm_unpacklo_epi16(E2l, E10l);
6699
                E3h = _mm_unpacklo_epi16(E3l, E11l);
6700
                E4h = _mm_unpacklo_epi16(E4l, E12l);
6701
                E5h = _mm_unpacklo_epi16(E5l, E13l);
6702
                E6h = _mm_unpacklo_epi16(E6l, E14l);
6703
                E7h = _mm_unpacklo_epi16(E7l, E15l);
6704
6705
                E8h = _mm_unpackhi_epi16(E0l, E8l);
6706
                E9h = _mm_unpackhi_epi16(E1l, E9l);
6707
                E10h = _mm_unpackhi_epi16(E2l, E10l);
6708
                E11h = _mm_unpackhi_epi16(E3l, E11l);
6709
                E12h = _mm_unpackhi_epi16(E4l, E12l);
6710
                E13h = _mm_unpackhi_epi16(E5l, E13l);
6711
                E14h = _mm_unpackhi_epi16(E6l, E14l);
6712
                E15h = _mm_unpackhi_epi16(E7l, E15l);
6713
6714
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
6715
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
6716
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
6717
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
6718
6719
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6720
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6721
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6722
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6723
6724
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6725
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6726
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6727
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6728
6729
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
6730
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
6731
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
6732
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
6733
6734
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6735
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6736
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6737
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6738
6739
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6740
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6741
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6742
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6743
6744
                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
6745
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
6746
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
6747
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
6748
6749
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6750
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6751
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6752
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6753
6754
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6755
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6756
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6757
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6758
6759
                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
6760
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
6761
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
6762
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
6763
6764
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6765
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6766
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6767
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6768
6769
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6770
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6771
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6772
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6773
6774
                /*  */
6775
                E0h = _mm_unpacklo_epi16(O0l, O8l);
6776
                E1h = _mm_unpacklo_epi16(O1l, O9l);
6777
                E2h = _mm_unpacklo_epi16(O2l, O10l);
6778
                E3h = _mm_unpacklo_epi16(O3l, O11l);
6779
                E4h = _mm_unpacklo_epi16(O4l, O12l);
6780
                E5h = _mm_unpacklo_epi16(O5l, O13l);
6781
                E6h = _mm_unpacklo_epi16(O6l, O14l);
6782
                E7h = _mm_unpacklo_epi16(O7l, O15l);
6783
6784
                E8h = _mm_unpackhi_epi16(O0l, O8l);
6785
                E9h = _mm_unpackhi_epi16(O1l, O9l);
6786
                E10h = _mm_unpackhi_epi16(O2l, O10l);
6787
                E11h = _mm_unpackhi_epi16(O3l, O11l);
6788
                E12h = _mm_unpackhi_epi16(O4l, O12l);
6789
                E13h = _mm_unpackhi_epi16(O5l, O13l);
6790
                E14h = _mm_unpackhi_epi16(O6l, O14l);
6791
                E15h = _mm_unpackhi_epi16(O7l, O15l);
6792
6793
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
6794
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
6795
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
6796
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
6797
6798
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6799
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6800
                m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6801
                m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6802
6803
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6804
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6805
                m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6806
                m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6807
6808
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
6809
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
6810
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
6811
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
6812
6813
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6814
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6815
                m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6816
                m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6817
6818
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6819
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6820
                m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6821
                m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6822
6823
                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
6824
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
6825
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
6826
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
6827
6828
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6829
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6830
                m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6831
                m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6832
6833
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6834
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6835
                m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6836
                m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6837
6838
                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
6839
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
6840
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
6841
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
6842
6843
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6844
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6845
                m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6846
                m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6847
6848
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6849
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6850
                m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6851
                m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6852
                /*  */
6853
                _mm_store_si128((__m128i *) (src + i), m128iS0);
6854
                _mm_store_si128((__m128i *) (src + 32 + i), m128iS1);
6855
                _mm_store_si128((__m128i *) (src + 64 + i), m128iS2);
6856
                _mm_store_si128((__m128i *) (src + 96 + i), m128iS3);
6857
                _mm_store_si128((__m128i *) (src + 128 + i), m128iS4);
6858
                _mm_store_si128((__m128i *) (src + 160 + i), m128iS5);
6859
                _mm_store_si128((__m128i *) (src + 192 + i), m128iS6);
6860
                _mm_store_si128((__m128i *) (src + 224 + i), m128iS7);
6861
                _mm_store_si128((__m128i *) (src + 256 + i), m128iS8);
6862
                _mm_store_si128((__m128i *) (src + 288 + i), m128iS9);
6863
                _mm_store_si128((__m128i *) (src + 320 + i), m128iS10);
6864
                _mm_store_si128((__m128i *) (src + 352 + i), m128iS11);
6865
                _mm_store_si128((__m128i *) (src + 384 + i), m128iS12);
6866
                _mm_store_si128((__m128i *) (src + 416 + i), m128iS13);
6867
                _mm_store_si128((__m128i *) (src + 448 + i), m128iS14);
6868
                _mm_store_si128((__m128i *) (src + 480 + i), m128iS15);
6869
                _mm_store_si128((__m128i *) (src + 512 + i), m128iS16);
6870
                _mm_store_si128((__m128i *) (src + 544 + i), m128iS17);
6871
                _mm_store_si128((__m128i *) (src + 576 + i), m128iS18);
6872
                _mm_store_si128((__m128i *) (src + 608 + i), m128iS19);
6873
                _mm_store_si128((__m128i *) (src + 640 + i), m128iS20);
6874
                _mm_store_si128((__m128i *) (src + 672 + i), m128iS21);
6875
                _mm_store_si128((__m128i *) (src + 704 + i), m128iS22);
6876
                _mm_store_si128((__m128i *) (src + 736 + i), m128iS23);
6877
                _mm_store_si128((__m128i *) (src + 768 + i), m128iS24);
6878
                _mm_store_si128((__m128i *) (src + 800 + i), m128iS25);
6879
                _mm_store_si128((__m128i *) (src + 832 + i), m128iS26);
6880
                _mm_store_si128((__m128i *) (src + 864 + i), m128iS27);
6881
                _mm_store_si128((__m128i *) (src + 896 + i), m128iS28);
6882
                _mm_store_si128((__m128i *) (src + 928 + i), m128iS29);
6883
                _mm_store_si128((__m128i *) (src + 960 + i), m128iS30);
6884
                _mm_store_si128((__m128i *) (src + 992 + i), m128iS31);
6885
6886
                if (i <= 16) {
6887
                    int k = i + 8;
6888
                    m128iS0 = _mm_load_si128((__m128i *) (src + k));
6889
                    m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
6890
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
6891
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
6892
                    m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
6893
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
6894
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
6895
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
6896
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
6897
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
6898
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
6899
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
6900
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
6901
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
6902
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
6903
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));
6904
6905
                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
6906
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
6907
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
6908
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
6909
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
6910
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
6911
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
6912
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
6913
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
6914
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
6915
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
6916
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
6917
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
6918
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
6919
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
6920
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
6921
                } else {
6922
                    m128iS0 = _mm_load_si128((__m128i *) (src));
6923
                    m128iS1 = _mm_load_si128((__m128i *) (src + 128));
6924
                    m128iS2 = _mm_load_si128((__m128i *) (src + 256));
6925
                    m128iS3 = _mm_load_si128((__m128i *) (src + 384));
6926
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 512));
6927
                    m128iS5 = _mm_load_si128((__m128i *) (src + 640));
6928
                    m128iS6 = _mm_load_si128((__m128i *) (src + 768));
6929
                    m128iS7 = _mm_load_si128((__m128i *) (src + 896));
6930
                    m128iS8 = _mm_load_si128((__m128i *) (src + 8));
6931
                    m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8));
6932
                    m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8));
6933
                    m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8));
6934
                    m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8));
6935
                    m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8));
6936
                    m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8));
6937
                    m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8));
6938
                    m128iS16 = _mm_load_si128((__m128i *) (src + 16));
6939
                    m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16));
6940
                    m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16));
6941
                    m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16));
6942
                    m128iS20 = _mm_loadu_si128((__m128i *) (src + 512 + 16));
6943
                    m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16));
6944
                    m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16));
6945
                    m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16));
6946
                    m128iS24 = _mm_load_si128((__m128i *) (src + 24));
6947
                    m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24));
6948
                    m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24));
6949
                    m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24));
6950
                    m128iS28 = _mm_loadu_si128((__m128i *) (src + 512 + 24));
6951
                    m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24));
6952
                    m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24));
6953
                    m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24));
6954
                    shift = shift_2nd;
6955
                    m128iAdd = _mm_set1_epi32(add_2nd);
6956
                }
6957
6958
            } else {
6959
                int k, m = 0;
6960
                _mm_storeu_si128((__m128i *) (src), m128iS0);
6961
                _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
6962
                _mm_storeu_si128((__m128i *) (src + 16), m128iS2);
6963
                _mm_storeu_si128((__m128i *) (src + 24), m128iS3);
6964
                _mm_storeu_si128((__m128i *) (src + 128), m128iS4);
6965
                _mm_storeu_si128((__m128i *) (src + 128 + 8), m128iS5);
6966
                _mm_storeu_si128((__m128i *) (src + 128 + 16), m128iS6);
6967
                _mm_storeu_si128((__m128i *) (src + 128 + 24), m128iS7);
6968
                _mm_storeu_si128((__m128i *) (src + 256), m128iS8);
6969
                _mm_storeu_si128((__m128i *) (src + 256 + 8), m128iS9);
6970
                _mm_storeu_si128((__m128i *) (src + 256 + 16), m128iS10);
6971
                _mm_storeu_si128((__m128i *) (src + 256 + 24), m128iS11);
6972
                _mm_storeu_si128((__m128i *) (src + 384), m128iS12);
6973
                _mm_storeu_si128((__m128i *) (src + 384 + 8), m128iS13);
6974
                _mm_storeu_si128((__m128i *) (src + 384 + 16), m128iS14);
6975
                _mm_storeu_si128((__m128i *) (src + 384 + 24), m128iS15);
6976
6977
                _mm_storeu_si128((__m128i *) (src + 512), m128iS16);
6978
                _mm_storeu_si128((__m128i *) (src + 512 + 8), m128iS17);
6979
                _mm_storeu_si128((__m128i *) (src + 512 + 16), m128iS18);
6980
                _mm_storeu_si128((__m128i *) (src + 512 + 24), m128iS19);
6981
                _mm_storeu_si128((__m128i *) (src + 640), m128iS20);
6982
                _mm_storeu_si128((__m128i *) (src + 640 + 8), m128iS21);
6983
                _mm_storeu_si128((__m128i *) (src + 640 + 16), m128iS22);
6984
                _mm_storeu_si128((__m128i *) (src + 640 + 24), m128iS23);
6985
                _mm_storeu_si128((__m128i *) (src + 768), m128iS24);
6986
                _mm_storeu_si128((__m128i *) (src + 768 + 8), m128iS25);
6987
                _mm_storeu_si128((__m128i *) (src + 768 + 16), m128iS26);
6988
                _mm_storeu_si128((__m128i *) (src + 768 + 24), m128iS27);
6989
                _mm_storeu_si128((__m128i *) (src + 896), m128iS28);
6990
                _mm_storeu_si128((__m128i *) (src + 896 + 8), m128iS29);
6991
                _mm_storeu_si128((__m128i *) (src + 896 + 16), m128iS30);
6992
                _mm_storeu_si128((__m128i *) (src + 896 + 24), m128iS31);
6993
                dst = (uint16_t*) _dst + (i * stride);
6994
                for (k = 0; k < 8; k++) {
6995
                    dst[0] = av_clip_uintp2(dst[0] + src[m],10);
6996
                    dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
6997
                    dst[2] = av_clip_uintp2(dst[2] + src[m + 16],10);
6998
                    dst[3] = av_clip_uintp2(dst[3] + src[m + 24],10);
6999
                    dst[4] = av_clip_uintp2(
7000
                            dst[4] + src[m + 128],10);
7001
                    dst[5] = av_clip_uintp2(
7002
                            dst[5] + src[m + 128 + 8],10);
7003
                    dst[6] = av_clip_uintp2(
7004
                            dst[6] + src[m + 128 + 16],10);
7005
                    dst[7] = av_clip_uintp2(
7006
                            dst[7] + src[m + 128 + 24],10);
7007
7008
                    dst[8] = av_clip_uintp2(
7009
                            dst[8] + src[m + 256],10);
7010
                    dst[9] = av_clip_uintp2(
7011
                            dst[9] + src[m + 256 + 8],10);
7012
                    dst[10] = av_clip_uintp2(
7013
                            dst[10] + src[m + 256 + 16],10);
7014
                    dst[11] = av_clip_uintp2(
7015
                            dst[11] + src[m + 256 + 24],10);
7016
                    dst[12] = av_clip_uintp2(
7017
                            dst[12] + src[m + 384],10);
7018
                    dst[13] = av_clip_uintp2(
7019
                            dst[13] + src[m + 384 + 8],10);
7020
                    dst[14] = av_clip_uintp2(
7021
                            dst[14] + src[m + 384 + 16],10);
7022
                    dst[15] = av_clip_uintp2(
7023
                            dst[15] + src[m + 384 + 24],10);
7024
7025
                    dst[16] = av_clip_uintp2(
7026
                            dst[16] + src[m + 512],10);
7027
                    dst[17] = av_clip_uintp2(
7028
                            dst[17] + src[m + 512 + 8],10);
7029
                    dst[18] = av_clip_uintp2(
7030
                            dst[18] + src[m + 512 + 16],10);
7031
                    dst[19] = av_clip_uintp2(
7032
                            dst[19] + src[m + 512 + 24],10);
7033
                    dst[20] = av_clip_uintp2(
7034
                            dst[20] + src[m + 640],10);
7035
                    dst[21] = av_clip_uintp2(
7036
                            dst[21] + src[m + 640 + 8],10);
7037
                    dst[22] = av_clip_uintp2(
7038
                            dst[22] + src[m + 640 + 16],10);
7039
                    dst[23] = av_clip_uintp2(
7040
                            dst[23] + src[m + 640 + 24],10);
7041
7042
                    dst[24] = av_clip_uintp2(
7043
                            dst[24] + src[m + 768],10);
7044
                    dst[25] = av_clip_uintp2(
7045
                            dst[25] + src[m + 768 + 8],10);
7046
                    dst[26] = av_clip_uintp2(
7047
                            dst[26] + src[m + 768 + 16],10);
7048
                    dst[27] = av_clip_uintp2(
7049
                            dst[27] + src[m + 768 + 24],10);
7050
                    dst[28] = av_clip_uintp2(
7051
                            dst[28] + src[m + 896],10);
7052
                    dst[29] = av_clip_uintp2(
7053
                            dst[29] + src[m + 896 + 8],10);
7054
                    dst[30] = av_clip_uintp2(
7055
                            dst[30] + src[m + 896 + 16],10);
7056
                    dst[31] = av_clip_uintp2(
7057
                            dst[31] + src[m + 896 + 24],10);
7058
7059
                    m += 1;
7060
                    dst += stride;
7061
                }
7062
                if (i <= 16) {
7063
                    int k = (i + 8) * 4;
7064
                    m128iS0 = _mm_load_si128((__m128i *) (src + k));
7065
                    m128iS1 = _mm_load_si128((__m128i *) (src + 128 + k));
7066
                    m128iS2 = _mm_load_si128((__m128i *) (src + 256 + k));
7067
                    m128iS3 = _mm_load_si128((__m128i *) (src + 384 + k));
7068
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 512 + k));
7069
                    m128iS5 = _mm_load_si128((__m128i *) (src + 640 + k));
7070
                    m128iS6 = _mm_load_si128((__m128i *) (src + 768 + k));
7071
                    m128iS7 = _mm_load_si128((__m128i *) (src + 896 + k));
7072
                    m128iS8 = _mm_load_si128((__m128i *) (src + 8 + k));
7073
                    m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8 + k));
7074
                    m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8 + k));
7075
                    m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8 + k));
7076
                    m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8 + k));
7077
                    m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8 + k));
7078
                    m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8 + k));
7079
                    m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8 + k));
7080
                    m128iS16 = _mm_load_si128((__m128i *) (src + 16 + k));
7081
                    m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16 + k));
7082
                    m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16 + k));
7083
                    m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16 + k));
7084
                    m128iS20 = _mm_loadu_si128(
7085
                            (__m128i *) (src + 512 + 16 + k));
7086
                    m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16 + k));
7087
                    m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16 + k));
7088
                    m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16 + k));
7089
                    m128iS24 = _mm_load_si128((__m128i *) (src + 24 + k));
7090
                    m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24 + k));
7091
                    m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24 + k));
7092
                    m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24 + k));
7093
                    m128iS28 = _mm_loadu_si128(
7094
                            (__m128i *) (src + 512 + 24 + k));
7095
                    m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24 + k));
7096
                    m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24 + k));
7097
                    m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24 + k));
7098
                }
7099
            }
7100
        }
7101
    }
7102
}
7103
#endif
7104
7105
7106
#if HAVE_SSE4_1
7107
// Add the int32 residual block 'r' (nT x nT, row-major, nT values per row) to
// the 8-bit prediction samples in 'dst' and clip every sum to 0..255.
// SIMD counterpart of add_residual_fallback<uint8_t>.
//
//   dst       - top-left sample of the target block; updated in place
//   stride    - distance between two consecutive rows of 'dst', in samples
//   r         - the nT*nT residual values
//   nT        - block size; 4 takes a dedicated path, any other value is
//               processed 8 columns at a time and must be a multiple of 8
//   bit_depth - not used: the clipping range is fixed to 8 bits
//
// Neither 'dst' nor 'r' needs any particular alignment.
void add_residual_8_sse4(uint8_t *dst, ptrdiff_t stride,
                         const int32_t* r, int nT, int bit_depth)
{
  (void)bit_depth;

  if (nT==4) {
    for (int y=0;y<4;y++) {
      uint8_t* drow = dst + y*stride;

      // Collect the four samples byte by byte. Dereferencing 'drow' through an
      // int32_t* would break the strict-aliasing rule and would be a
      // misaligned access whenever the row does not start on a 4-byte
      // boundary. The byte order matches a little-endian 32-bit load.
      const uint32_t in = (uint32_t)drow[0]
                        | ((uint32_t)drow[1] << 8)
                        | ((uint32_t)drow[2] << 16)
                        | ((uint32_t)drow[3] << 24);

      __m128i res = _mm_loadu_si128((const __m128i*)(r + y*4));     // 4 x int32 residual
      __m128i pix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int)in));  // 4 x uint8 -> 4 x int32

      __m128i sum = _mm_add_epi32(res, pix);
      sum = _mm_packs_epi32(sum, sum);                              // -> int16 (saturate)
      sum = _mm_packus_epi16(sum, sum);                             // -> uint8 (clip 0..255)

      // Write the low four result bytes back, again without type punning.
      const uint32_t out = (uint32_t)_mm_cvtsi128_si32(sum);
      drow[0] = (uint8_t)(out);
      drow[1] = (uint8_t)(out >> 8);
      drow[2] = (uint8_t)(out >> 16);
      drow[3] = (uint8_t)(out >> 24);
    }
  }
  else {
    // nT is 8, 16 or 32 -> always a multiple of 8
    for (int y=0;y<nT;y++) {
      const int32_t* rrow = r + y*nT;
      uint8_t*       drow = dst + y*stride;

      for (int x=0;x<nT;x+=8) {
        __m128i r0  = _mm_loadu_si128((const __m128i*)(rrow + x));    // 4 x int32
        __m128i r1  = _mm_loadu_si128((const __m128i*)(rrow + x+4));  // 4 x int32
        __m128i pix = _mm_loadl_epi64((const __m128i*)(drow + x));    // 8 x uint8

        __m128i p0 = _mm_cvtepu8_epi32(pix);                         // samples 0..3 as int32
        __m128i p1 = _mm_cvtepu8_epi32(_mm_srli_si128(pix,4));       // samples 4..7 as int32

        __m128i s0 = _mm_add_epi32(r0, p0);
        __m128i s1 = _mm_add_epi32(r1, p1);

        // Two saturating packs together clamp each int32 sum to 0..255.
        __m128i p16 = _mm_packs_epi32(s0, s1);                       // 8 x int16 (saturate)
        __m128i p8  = _mm_packus_epi16(p16, p16);                    // 8 x uint8 (clip 0..255)

        _mm_storel_epi64((__m128i*)(drow + x), p8);
      }
    }
  }
}
7153
7154
7155
// Residual add for 16-bit sample storage (high bit depths); SIMD counterpart
// of add_residual_fallback<uint16_t>.
//
// Adds the nT x nT int32 residual block 'r' (row-major, nT values per row) to
// the samples at 'dst' (rows 'stride' samples apart) and clamps every sum to
// 0 .. (1<<bit_depth)-1 before writing it back.
// nT==4 has its own path; all other sizes are handled 8 columns at a time.
void add_residual_16_sse4(uint16_t *dst, ptrdiff_t stride,
                          const int32_t* r, int nT, int bit_depth)
{
  const __m128i lower_bound = _mm_setzero_si128();
  const __m128i upper_bound = _mm_set1_epi32((1<<bit_depth)-1);

  if (nT!=4) {
    // nT is 8, 16 or 32 -> every row splits into groups of 8 samples
    for (int row=0; row<nT; row++) {
      const int32_t* res_line = r + row*nT;
      uint16_t*      pix_line = dst + row*stride;

      for (int col=0; col<nT; col+=8) {
        const __m128i res_lo  = _mm_loadu_si128((const __m128i*)(res_line + col));
        const __m128i res_hi  = _mm_loadu_si128((const __m128i*)(res_line + col + 4));
        const __m128i samples = _mm_loadu_si128((const __m128i*)(pix_line + col));

        // widen the 8 samples to two vectors of 4 x int32 and add the residual
        __m128i sum_lo = _mm_add_epi32(res_lo, _mm_cvtepu16_epi32(samples));
        __m128i sum_hi = _mm_add_epi32(res_hi,
                                       _mm_cvtepu16_epi32(_mm_srli_si128(samples, 8)));

        // clamp into the valid sample range
        sum_lo = _mm_max_epi32(sum_lo, lower_bound);
        sum_hi = _mm_max_epi32(sum_hi, lower_bound);
        sum_lo = _mm_min_epi32(sum_lo, upper_bound);
        sum_hi = _mm_min_epi32(sum_hi, upper_bound);

        // narrow back to 8 x uint16 and store
        _mm_storeu_si128((__m128i*)(pix_line + col),
                         _mm_packus_epi32(sum_lo, sum_hi));
      }
    }
    return;
  }

  // 4x4 block: a single vector of 4 x int32 covers a complete row
  for (int row=0; row<4; row++) {
    uint16_t* pix_line = dst + row*stride;

    const __m128i residual = _mm_loadu_si128((const __m128i*)(r + row*4));
    const __m128i samples  = _mm_cvtepu16_epi32(
                                 _mm_loadl_epi64((const __m128i*)pix_line));

    __m128i sum = _mm_add_epi32(residual, samples);
    sum = _mm_max_epi32(sum, lower_bound);
    sum = _mm_min_epi32(sum, upper_bound);

    // only the low 64 bits (4 x uint16) are written back
    _mm_storel_epi64((__m128i*)pix_line, _mm_packus_epi32(sum, sum));
  }
}
7204
7205
7206
// Inverse quantization without scaling list, int32 fast path (see acceleration.h).
//
// For each of the nCoeff entries this computes
//     (coeffList[i]*fact + offset) >> bdShift
// saturates the result to the int16 range and writes it to
// coeffBuf[ coeffPos[i] ].
//
// The multiply/round/shift/saturate step runs on 8 coefficients at once; the
// write into coeffBuf stays scalar because the destinations are arbitrary
// (no 16-bit SIMD scatter exists). Leftover coefficients (nCoeff not a
// multiple of 8) are handled by a scalar loop.
void dequant_coeff_block_sse4(int16_t* coeffBuf, const int16_t* coeffList,
                              const int16_t* coeffPos, int nCoeff,
                              int32_t fact, int32_t offset, int32_t bdShift)
{
  const __m128i scale       = _mm_set1_epi32(fact);
  const __m128i rounding    = _mm_set1_epi32(offset);
  const __m128i shift_count = _mm_cvtsi32_si128(bdShift);

  alignas(16) int16_t lanes[8];

  const int16_t* in  = coeffList;
  const int16_t* pos = coeffPos;
  int remaining = nCoeff;

  // vector part: full groups of 8 coefficients
  while (remaining >= 8) {
    const __m128i packed = _mm_loadu_si128((const __m128i*)in);           // 8 x int16

    // sign-extend both halves to 4 x int32 each
    __m128i low4  = _mm_cvtepi16_epi32(packed);
    __m128i high4 = _mm_cvtepi16_epi32(_mm_srli_si128(packed, 8));

    low4  = _mm_mullo_epi32(low4,  scale);
    high4 = _mm_mullo_epi32(high4, scale);
    low4  = _mm_add_epi32(low4,  rounding);
    high4 = _mm_add_epi32(high4, rounding);
    low4  = _mm_sra_epi32(low4,  shift_count);
    high4 = _mm_sra_epi32(high4, shift_count);

    // the signed saturating pack performs the clip to -32768..32767
    _mm_store_si128((__m128i*)lanes, _mm_packs_epi32(low4, high4));

    // scatter, in ascending coefficient order
    for (int lane=0; lane<8; lane++) {
      coeffBuf[ pos[lane] ] = lanes[lane];
    }

    in        += 8;
    pos       += 8;
    remaining -= 8;
  }

  // scalar part: the last (nCoeff % 8) coefficients
  while (remaining > 0) {
    int32_t value = ((*in)*fact + offset) >> bdShift;

    if (value < -32768) {
      value = -32768;
    }
    else if (value > 32767) {
      value = 32767;
    }

    coeffBuf[ *pos ] = (int16_t)value;

    in++;
    pos++;
    remaining--;
  }
}
7235
#endif
7236