Coverage Report

Created: 2026-02-26 06:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/libde265/libde265/x86/sse-dct.cc
Line
Count
Source
1
/*
2
 * H.265 video codec.
3
 * Copyright (c) 2013 openHEVC contributors
4
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
5
 *
6
 * This file is part of libde265.
7
 *
8
 * libde265 is free software: you can redistribute it and/or modify
9
 * it under the terms of the GNU Lesser General Public License as
10
 * published by the Free Software Foundation, either version 3 of
11
 * the License, or (at your option) any later version.
12
 *
13
 * libde265 is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public License
19
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
20
 */
21
22
#include "x86/sse-dct.h"
23
#include "libde265/util.h"
24
25
#ifdef HAVE_CONFIG_H
26
#include "config.h"
27
#endif
28
29
#include <emmintrin.h> // SSE2
30
#include <tmmintrin.h> // SSSE3
31
32
#if HAVE_SSE4_1
33
#include <smmintrin.h> // SSE4.1
34
#endif
35
36
37
ALIGNED_16(static const int16_t) transform4x4_luma[8][8] =
38
{
39
    {   29, +84, 29,  +84,  29, +84,  29, +84 },
40
    {  +74, +55, +74, +55, +74, +55, +74, +55 },
41
    {   55, -29,  55, -29,  55, -29,  55, -29 },
42
    {  +74, -84, +74, -84, +74, -84, +74, -84 },
43
    {   74, -74,  74, -74,  74, -74,  74, -74 },
44
    {    0, +74,   0, +74,   0, +74,   0, +74 },
45
    {   84, +55,  84, +55,  84, +55,  84, +55 },
46
    {  -74, -29, -74, -29, -74, -29, -74, -29 }
47
};
48
49
ALIGNED_16(static const int16_t) transform4x4[4][8] = {
50
    { 64,  64, 64,  64, 64,  64, 64,  64 },
51
    { 64, -64, 64, -64, 64, -64, 64, -64 },
52
    { 83,  36, 83,  36, 83,  36, 83,  36 },
53
    { 36, -83, 36, -83, 36, -83, 36, -83 }
54
};
55
56
ALIGNED_16(static const int16_t) transform8x8[12][8] =
57
{
58
    {  89,  75,  89,  75, 89,  75, 89,  75 },
59
    {  50,  18,  50,  18, 50,  18, 50,  18 },
60
    {  75, -18,  75, -18, 75, -18, 75, -18 },
61
    { -89, -50, -89, -50,-89, -50,-89, -50 },
62
    {  50, -89,  50, -89, 50, -89, 50, -89 },
63
    {  18,  75,  18,  75, 18,  75, 18,  75 },
64
    {  18, -50,  18, -50, 18, -50, 18, -50 },
65
    {  75, -89,  75, -89, 75, -89, 75, -89 },
66
    {  64,  64,  64,  64, 64,  64, 64,  64 },
67
    {  64, -64,  64, -64, 64, -64, 64, -64 },
68
    {  83,  36,  83,  36, 83,  36, 83,  36 },
69
    {  36, -83,  36, -83, 36, -83, 36, -83 }
70
};
71
72
ALIGNED_16(static const int16_t) transform16x16_1[4][8][8] =
73
{
74
    {/*1-3*/ /*2-6*/
75
        { 90,  87,  90,  87,  90,  87,  90,  87 },
76
        { 87,  57,  87,  57,  87,  57,  87,  57 },
77
        { 80,   9,  80,   9,  80,   9,  80,   9 },
78
        { 70, -43,  70, -43,  70, -43,  70, -43 },
79
        { 57, -80,  57, -80,  57, -80,  57, -80 },
80
        { 43, -90,  43, -90,  43, -90,  43, -90 },
81
        { 25, -70,  25, -70,  25, -70,  25, -70 },
82
        { 9,  -25,   9, -25,   9, -25,   9, -25 },
83
    },{ /*5-7*/ /*10-14*/
84
        {  80,  70,  80,  70,  80,  70,  80,  70 },
85
        {   9, -43,   9, -43,   9, -43,   9, -43 },
86
        { -70, -87, -70, -87, -70, -87, -70, -87 },
87
        { -87,   9, -87,   9, -87,   9, -87,   9 },
88
        { -25,  90, -25,  90, -25,  90, -25,  90 },
89
        {  57,  25,  57,  25,  57,  25,  57,  25 },
90
        {  90, -80,  90, -80,  90, -80,  90, -80 },
91
        {  43, -57,  43, -57,  43, -57,  43, -57 },
92
    },{ /*9-11*/ /*18-22*/
93
        {  57,  43,  57,  43,  57,  43,  57,  43 },
94
        { -80, -90, -80, -90, -80, -90, -80, -90 },
95
        { -25,  57, -25,  57, -25,  57, -25,  57 },
96
        {  90,  25,  90,  25,  90,  25,  90,  25 },
97
        {  -9,  -87, -9,  -87, -9,  -87, -9, -87 },
98
        { -87,  70, -87,  70, -87,  70, -87,  70 },
99
        {  43,   9,  43,   9,  43,   9,  43,   9 },
100
        {  70, -80,  70, -80,  70, -80,  70, -80 },
101
    },{/*13-15*/ /*  26-30   */
102
        {  25,   9,  25,   9,  25,   9,  25,   9 },
103
        { -70, -25, -70, -25, -70, -25, -70, -25 },
104
        {  90,  43,  90,  43,  90,  43,  90,  43 },
105
        { -80, -57, -80, -57, -80, -57, -80, -57 },
106
        {  43,  70,  43,  70,  43,  70,  43,  70 },
107
        {  9,  -80,   9, -80,   9, -80,   9, -80 },
108
        { -57,  87, -57,  87, -57,  87, -57,  87 },
109
        {  87, -90,  87, -90,  87, -90,  87, -90 },
110
    }
111
};
112
113
ALIGNED_16(static const int16_t) transform16x16_2[2][4][8] =
114
{
115
    { /*2-6*/ /*4-12*/
116
        { 89,  75,  89,  75, 89,  75, 89,  75 },
117
        { 75, -18,  75, -18, 75, -18, 75, -18 },
118
        { 50, -89,  50, -89, 50, -89, 50, -89 },
119
        { 18, -50,  18, -50, 18, -50, 18, -50 },
120
    },{ /*10-14*/  /*20-28*/
121
        {  50,  18,  50,  18,  50,  18,  50,  18 },
122
        { -89, -50, -89, -50, -89, -50, -89, -50 },
123
        {  18,  75,  18,  75,  18,  75,  18,  75 },
124
        {  75, -89,  75, -89,  75, -89,  75, -89 },
125
    }
126
};
127
128
ALIGNED_16(static const int16_t) transform16x16_3[2][2][8] =
129
{
130
    {/*4-12*/ /*8-24*/
131
        {  83,  36,  83,  36,  83,  36,  83,  36 },
132
        {  36, -83,  36, -83,  36, -83,  36, -83 },
133
    },{ /*0-8*/  /*0-16*/
134
        { 64,  64, 64,  64, 64,  64, 64,  64 },
135
        { 64, -64, 64, -64, 64, -64, 64, -64 },
136
    }
137
};
138
139
140
ALIGNED_16(static const int16_t) transform32x32[8][16][8] =
141
{
142
    { /*   1-3     */
143
        { 90,  90, 90,  90, 90,  90, 90,  90 },
144
        { 90,  82, 90,  82, 90,  82, 90,  82 },
145
        { 88,  67, 88,  67, 88,  67, 88,  67 },
146
        { 85,  46, 85,  46, 85,  46, 85,  46 },
147
        { 82,  22, 82,  22, 82,  22, 82,  22 },
148
        { 78,  -4, 78,  -4, 78,  -4, 78,  -4 },
149
        { 73, -31, 73, -31, 73, -31, 73, -31 },
150
        { 67, -54, 67, -54, 67, -54, 67, -54 },
151
        { 61, -73, 61, -73, 61, -73, 61, -73 },
152
        { 54, -85, 54, -85, 54, -85, 54, -85 },
153
        { 46, -90, 46, -90, 46, -90, 46, -90 },
154
        { 38, -88, 38, -88, 38, -88, 38, -88 },
155
        { 31, -78, 31, -78, 31, -78, 31, -78 },
156
        { 22, -61, 22, -61, 22, -61, 22, -61 },
157
        { 13, -38, 13, -38, 13, -38, 13, -38 },
158
        { 4,  -13,  4, -13,  4, -13,  4, -13 },
159
    },{/*  5-7 */
160
        {  88,  85,  88,  85,  88,  85,  88,  85 },
161
        {  67,  46,  67,  46,  67,  46,  67,  46 },
162
        {  31, -13,  31, -13,  31, -13,  31, -13 },
163
        { -13, -67, -13, -67, -13, -67, -13, -67 },
164
        { -54, -90, -54, -90, -54, -90, -54, -90 },
165
        { -82, -73, -82, -73, -82, -73, -82, -73 },
166
        { -90, -22, -90, -22, -90, -22, -90, -22 },
167
        { -78,  38, -78,  38, -78,  38, -78,  38 },
168
        { -46,  82, -46,  82, -46,  82, -46,  82 },
169
        {  -4,  88,  -4,  88,  -4,  88,  -4,  88 },
170
        {  38,  54,  38,  54,  38,  54,  38,  54 },
171
        {  73,  -4,  73,  -4,  73,  -4,  73,  -4 },
172
        {  90, -61,  90, -61,  90, -61,  90, -61 },
173
        {  85, -90,  85, -90,  85, -90,  85, -90 },
174
        {  61, -78,  61, -78,  61, -78,  61, -78 },
175
        {  22, -31,  22, -31,  22, -31,  22, -31 },
176
    },{/*  9-11   */
177
        {  82,  78,  82,  78,  82,  78,  82,  78 },
178
        {  22,  -4,  22,  -4,  22,  -4,  22,  -4 },
179
        { -54, -82, -54, -82, -54, -82, -54, -82 },
180
        { -90, -73, -90, -73, -90, -73, -90, -73 },
181
        { -61,  13, -61,  13, -61,  13, -61,  13 },
182
        {  13,  85,  13,  85,  13,  85,  13,  85 },
183
        {  78,  67,  78,  67,  78,  67,  78,  67 },
184
        {  85, -22,  85, -22,  85, -22,  85, -22 },
185
        {  31, -88,  31, -88,  31, -88,  31, -88 },
186
        { -46, -61, -46, -61, -46, -61, -46, -61 },
187
        { -90,  31, -90,  31, -90,  31, -90,  31 },
188
        { -67,  90, -67,  90, -67,  90, -67,  90 },
189
        {   4,  54,   4,  54,   4,  54,   4,  54 },
190
        {  73, -38,  73, -38,  73, -38,  73, -38 },
191
        {  88, -90,  88, -90,  88, -90,  88, -90 },
192
        {  38, -46,  38, -46,  38, -46,  38, -46 },
193
    },{/*  13-15   */
194
        {  73,  67,  73,  67,  73,  67,  73,  67 },
195
        { -31, -54, -31, -54, -31, -54, -31, -54 },
196
        { -90, -78, -90, -78, -90, -78, -90, -78 },
197
        { -22,  38, -22,  38, -22,  38, -22,  38 },
198
        {  78,  85,  78,  85,  78,  85,  78,  85 },
199
        {  67, -22,  67, -22,  67, -22,  67, -22 },
200
        { -38, -90, -38, -90, -38, -90, -38, -90 },
201
        { -90,   4, -90,   4, -90,   4, -90,   4 },
202
        { -13,  90, -13,  90, -13,  90, -13,  90 },
203
        {  82,  13,  82,  13,  82,  13,  82,  13 },
204
        {  61, -88,  61, -88,  61, -88,  61, -88 },
205
        { -46, -31, -46, -31, -46, -31, -46, -31 },
206
        { -88,  82, -88,  82, -88,  82, -88,  82 },
207
        { -4,   46, -4,   46, -4,   46, -4,   46 },
208
        {  85, -73,  85, -73,  85, -73,  85, -73 },
209
        {  54, -61,  54, -61,  54, -61,  54, -61 },
210
    },{/*  17-19   */
211
        {  61,  54,  61,  54,  61,  54,  61,  54 },
212
        { -73, -85, -73, -85, -73, -85, -73, -85 },
213
        { -46,  -4, -46,  -4, -46,  -4, -46,  -4 },
214
        {  82,  88,  82,  88,  82,  88,  82,  88 },
215
        {  31, -46,  31, -46,  31, -46,  31, -46 },
216
        { -88, -61, -88, -61, -88, -61, -88, -61 },
217
        { -13,  82, -13,  82, -13,  82, -13,  82 },
218
        {  90,  13,  90,  13,  90,  13,  90,  13 },
219
        { -4, -90,  -4, -90,  -4, -90,  -4, -90 },
220
        { -90,  38, -90,  38, -90,  38, -90,  38 },
221
        {  22,  67,  22,  67,  22,  67,  22,  67 },
222
        {  85, -78,  85, -78,  85, -78,  85, -78 },
223
        { -38, -22, -38, -22, -38, -22, -38, -22 },
224
        { -78,  90, -78,  90, -78,  90, -78,  90 },
225
        {  54, -31,  54, -31,  54, -31,  54, -31 },
226
        {  67, -73,  67, -73,  67, -73,  67, -73 },
227
    },{ /*  21-23   */
228
        {  46,  38,  46,  38,  46,  38,  46,  38 },
229
        { -90, -88, -90, -88, -90, -88, -90, -88 },
230
        {  38,  73,  38,  73,  38,  73,  38,  73 },
231
        {  54,  -4,  54,  -4,  54,  -4,  54,  -4 },
232
        { -90, -67, -90, -67, -90, -67, -90, -67 },
233
        {  31,  90,  31,  90,  31,  90,  31,  90 },
234
        {  61, -46,  61, -46,  61, -46,  61, -46 },
235
        { -88, -31, -88, -31, -88, -31, -88, -31 },
236
        {  22,  85,  22,  85,  22,  85,  22,  85 },
237
        {  67, -78,  67, -78,  67, -78,  67, -78 },
238
        { -85,  13, -85,  13, -85,  13, -85,  13 },
239
        {  13,  61,  13,  61,  13,  61,  13,  61 },
240
        {  73, -90,  73, -90,  73, -90,  73, -90 },
241
        { -82,  54, -82,  54, -82,  54, -82,  54 },
242
        {   4,  22,   4,  22,   4,  22,   4,  22 },
243
        {  78, -82,  78, -82,  78, -82,  78, -82 },
244
    },{ /*  25-27   */
245
        {  31,  22,  31,  22,  31,  22,  31,  22 },
246
        { -78, -61, -78, -61, -78, -61, -78, -61 },
247
        {  90,  85,  90,  85,  90,  85,  90,  85 },
248
        { -61, -90, -61, -90, -61, -90, -61, -90 },
249
        {   4,  73,   4,  73,   4,  73,   4,  73 },
250
        {  54, -38,  54, -38,  54, -38,  54, -38 },
251
        { -88,  -4, -88,  -4, -88,  -4, -88,  -4 },
252
        {  82,  46,  82,  46,  82,  46,  82,  46 },
253
        { -38, -78, -38, -78, -38, -78, -38, -78 },
254
        { -22,  90, -22,  90, -22,  90, -22,  90 },
255
        {  73, -82,  73, -82,  73, -82,  73, -82 },
256
        { -90,  54, -90,  54, -90,  54, -90,  54 },
257
        {  67, -13,  67, -13,  67, -13,  67, -13 },
258
        { -13, -31, -13, -31, -13, -31, -13, -31 },
259
        { -46,  67, -46,  67, -46,  67, -46,  67 },
260
        {  85, -88,  85, -88,  85, -88,  85, -88 },
261
    },{/*  29-31   */
262
        {  13,   4,  13,   4,  13,   4,  13,   4 },
263
        { -38, -13, -38, -13, -38, -13, -38, -13 },
264
        {  61,  22,  61,  22,  61,  22,  61,  22 },
265
        { -78, -31, -78, -31, -78, -31, -78, -31 },
266
        {  88,  38,  88,  38,  88,  38,  88,  38 },
267
        { -90, -46, -90, -46, -90, -46, -90, -46 },
268
        {  85,  54,  85,  54,  85,  54,  85,  54 },
269
        { -73, -61, -73, -61, -73, -61, -73, -61 },
270
        {  54,  67,  54,  67,  54,  67,  54,  67 },
271
        { -31, -73, -31, -73, -31, -73, -31, -73 },
272
        {   4,  78,   4,  78,   4,  78,   4,  78 },
273
        {  22, -82,  22, -82,  22, -82,  22, -82 },
274
        { -46,  85, -46,  85, -46,  85, -46,  85 },
275
        {  67, -88,  67, -88,  67, -88,  67, -88 },
276
        { -82,  90, -82,  90, -82,  90, -82,  90 },
277
        {  90, -90,  90, -90,  90, -90,  90, -90 },
278
    }
279
};
280
281
0
#define shift_1st 7
282
0
#define add_1st (1 << (shift_1st - 1))
283
284
285
void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride)
286
0
{
287
0
    uint8_t *dst = (uint8_t*)_dst;
288
0
    ptrdiff_t stride = _stride;
289
0
    int shift = 5;
290
0
    int offset = 16;
291
0
    __m128i r0,r1,r2,r3,r4,r5,r6,r9;
292
293
0
    r9= _mm_setzero_si128();
294
    //r8= _mm_set_epi32(0,0,0,-1);
295
0
    r2= _mm_set1_epi16(offset);
296
297
0
    r0= _mm_load_si128((__m128i*)(coeffs));
298
0
    r1= _mm_load_si128((__m128i*)(coeffs+8));
299
300
301
0
    r0= _mm_adds_epi16(r0,r2);
302
0
    r1= _mm_adds_epi16(r1,r2);
303
304
0
    r0= _mm_srai_epi16(r0,shift);
305
0
    r1= _mm_srai_epi16(r1,shift);
306
307
0
    r3= _mm_loadl_epi64((__m128i*)(dst));
308
0
    r4= _mm_loadl_epi64((__m128i*)(dst + stride));
309
0
    r5= _mm_loadl_epi64((__m128i*)(dst + 2*stride));
310
0
    r6= _mm_loadl_epi64((__m128i*)(dst + 3*stride));
311
312
0
    r3= _mm_unpacklo_epi8(r3,r9);
313
0
    r4= _mm_unpacklo_epi8(r4,r9);
314
0
    r5= _mm_unpacklo_epi8(r5,r9);
315
0
    r6= _mm_unpacklo_epi8(r6,r9);
316
0
    r3= _mm_unpacklo_epi64(r3,r4);
317
0
    r4= _mm_unpacklo_epi64(r5,r6);
318
319
320
0
    r3= _mm_adds_epi16(r3,r0);
321
0
    r4= _mm_adds_epi16(r4,r1);
322
323
0
    r3= _mm_packus_epi16(r3,r4);
324
    //r8= _mm_set_epi32(0,0,0,-1);
325
326
    //_mm_maskmoveu_si128(r3,r8,(char *) (dst));
327
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(r3);
328
329
0
    r3= _mm_srli_si128(r3,4);
330
    //_mm_maskmoveu_si128(r3,r8,(char *) (dst+stride));
331
0
    *((uint32_t*)(dst+stride)) = _mm_cvtsi128_si32(r3);
332
333
0
    r3= _mm_srli_si128(r3,4);
334
    //_mm_maskmoveu_si128(r3,r8,(char *) (dst+2*stride));
335
0
    *((uint32_t*)(dst+2*stride)) = _mm_cvtsi128_si32(r3);
336
337
0
    r3= _mm_srli_si128(r3,4);
338
    //_mm_maskmoveu_si128(r3,r8,(char *) (dst+3*stride));
339
0
    *((uint32_t*)(dst+3*stride)) = _mm_cvtsi128_si32(r3);
340
0
}
341
342
343
344
#if HAVE_SSE4_1
345
void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
346
0
                                           ptrdiff_t _stride) {
347
348
0
    uint8_t shift_2nd = 12; // 20 - Bit depth
349
0
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
350
351
0
    uint8_t *dst = (uint8_t*) _dst;
352
0
    ptrdiff_t stride = _stride;
353
0
    const int16_t *src = coeffs;
354
0
    __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
355
0
            m128iD;
356
0
    m128iAdd = _mm_set1_epi32(64);
357
358
0
    S0 = _mm_load_si128((__m128i *) (src));
359
0
    S8 = _mm_load_si128((__m128i *) (src + 8));
360
361
0
    m128iAC = _mm_unpacklo_epi16(S0, S8);
362
0
    m128iBD = _mm_unpackhi_epi16(S0, S8);
363
364
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
365
0
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
366
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
367
0
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
368
0
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
369
0
    S0 = _mm_add_epi32(S0, m128iAdd);
370
0
    S0 = _mm_srai_epi32(S0, shift_1st);
371
372
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
373
0
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
374
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
375
0
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
376
0
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
377
0
    S8 = _mm_add_epi32(S8, m128iAdd);
378
0
    S8 = _mm_srai_epi32(S8, shift_1st);
379
380
0
    m128iA = _mm_packs_epi32(S0, S8);
381
382
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
383
0
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
384
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
385
0
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
386
0
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
387
0
    S0 = _mm_add_epi32(S0, m128iAdd);
388
0
    S0 = _mm_srai_epi32(S0, shift_1st);
389
390
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
391
0
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
392
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
393
0
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
394
0
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
395
0
    S8 = _mm_add_epi32(S8, m128iAdd);
396
0
    S8 = _mm_srai_epi32(S8, shift_1st);
397
398
0
    m128iD = _mm_packs_epi32(S0, S8);
399
400
0
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
401
0
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);
402
403
0
    m128iA = _mm_unpacklo_epi16(S0, S8);
404
0
    m128iD = _mm_unpackhi_epi16(S0, S8);
405
406
    /*   ###################    */
407
0
    m128iAdd = _mm_set1_epi32(add_2nd);
408
409
0
    m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
410
0
    m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
411
412
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
413
0
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
414
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
415
0
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
416
0
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
417
0
    S0 = _mm_add_epi32(S0, m128iAdd);
418
0
    S0 = _mm_srai_epi32(S0, shift_2nd);
419
420
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
421
0
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
422
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
423
0
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
424
0
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
425
0
    S8 = _mm_add_epi32(S8, m128iAdd);
426
0
    S8 = _mm_srai_epi32(S8, shift_2nd);
427
428
0
    m128iA = _mm_packs_epi32(S0, S8);
429
430
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
431
0
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
432
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
433
0
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
434
0
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
435
0
    S0 = _mm_add_epi32(S0, m128iAdd);
436
0
    S0 = _mm_srai_epi32(S0, shift_2nd);
437
438
0
    m128iTmp1 = _mm_madd_epi16(m128iAC,
439
0
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
440
0
    m128iTmp2 = _mm_madd_epi16(m128iBD,
441
0
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
442
0
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
443
0
    S8 = _mm_add_epi32(S8, m128iAdd);
444
0
    S8 = _mm_srai_epi32(S8, shift_2nd);
445
446
0
    m128iD = _mm_packs_epi32(S0, S8);
447
448
//    _mm_storeu_si128((__m128i *) (src), m128iA);
449
//    _mm_storeu_si128((__m128i *) (src + 8), m128iD);
450
451
0
    S0 = _mm_move_epi64(m128iA); //contains row 0
452
0
    S8 = _mm_move_epi64(m128iD); //row 2
453
0
    m128iA = _mm_srli_si128(m128iA, 8); // row 1
454
0
    m128iD = _mm_srli_si128(m128iD, 8); // row 3
455
0
    m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
456
0
    m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
457
0
    S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
458
0
    S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
459
460
    //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1);   //mask to store 4 * 8bit data
461
462
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
463
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
464
0
    m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
465
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
466
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
467
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
468
469
0
    dst += stride;
470
471
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
472
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
473
0
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
474
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
475
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
476
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
477
478
0
    dst += stride;
479
480
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
481
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
482
0
    m128iTmp1 = _mm_adds_epi16(S8, m128iA);
483
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
484
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
485
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
486
487
0
    dst += stride;
488
489
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
490
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
491
0
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
492
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
493
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
494
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
495
0
}
496
#endif // SSE4.1
497
498
#if 0
499
void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
500
        ptrdiff_t _stride) {
501
    int i,j;
502
    uint8_t shift_2nd = 10; // 20 - Bit depth
503
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
504
505
    uint16_t *dst = (uint16_t*) _dst;
506
    ptrdiff_t stride = _stride/(sizeof(uint16_t));
507
    int16_t *src = coeffs;
508
    __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
509
            m128iD;
510
511
    m128iAdd = _mm_set1_epi32(64);
512
513
    S0 = _mm_loadu_si128((__m128i *) (src));
514
    S8 = _mm_loadu_si128((__m128i *) (src + 8));
515
516
    m128iAC = _mm_unpacklo_epi16(S0, S8);
517
    m128iBD = _mm_unpackhi_epi16(S0, S8);
518
519
    m128iTmp1 = _mm_madd_epi16(m128iAC,
520
            _mm_loadu_si128((__m128i *) (transform4x4_luma[0])));
521
    m128iTmp2 = _mm_madd_epi16(m128iBD,
522
            _mm_loadu_si128((__m128i *) (transform4x4_luma[1])));
523
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
524
    S0 = _mm_add_epi32(S0, m128iAdd);
525
    S0 = _mm_srai_epi32(S0, shift_1st);
526
527
    m128iTmp1 = _mm_madd_epi16(m128iAC,
528
            _mm_loadu_si128((__m128i *) (transform4x4_luma[2])));
529
    m128iTmp2 = _mm_madd_epi16(m128iBD,
530
            _mm_loadu_si128((__m128i *) (transform4x4_luma[3])));
531
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
532
    S8 = _mm_add_epi32(S8, m128iAdd);
533
    S8 = _mm_srai_epi32(S8, shift_1st);
534
535
    m128iA = _mm_packs_epi32(S0, S8);
536
537
    m128iTmp1 = _mm_madd_epi16(m128iAC,
538
            _mm_loadu_si128((__m128i *) (transform4x4_luma[4])));
539
    m128iTmp2 = _mm_madd_epi16(m128iBD,
540
            _mm_loadu_si128((__m128i *) (transform4x4_luma[5])));
541
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
542
    S0 = _mm_add_epi32(S0, m128iAdd);
543
    S0 = _mm_srai_epi32(S0, shift_1st);
544
545
    m128iTmp1 = _mm_madd_epi16(m128iAC,
546
            _mm_loadu_si128((__m128i *) (transform4x4_luma[6])));
547
    m128iTmp2 = _mm_madd_epi16(m128iBD,
548
            _mm_loadu_si128((__m128i *) (transform4x4_luma[7])));
549
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
550
    S8 = _mm_add_epi32(S8, m128iAdd);
551
    S8 = _mm_srai_epi32(S8, shift_1st);
552
553
    m128iD = _mm_packs_epi32(S0, S8);
554
555
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
556
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);
557
558
    m128iA = _mm_unpacklo_epi16(S0, S8);
559
    m128iD = _mm_unpackhi_epi16(S0, S8);
560
561
    /*   ###################    */
562
    m128iAdd = _mm_set1_epi32(add_2nd);
563
564
    m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
565
    m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
566
567
    m128iTmp1 = _mm_madd_epi16(m128iAC,
568
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
569
    m128iTmp2 = _mm_madd_epi16(m128iBD,
570
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
571
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
572
    S0 = _mm_add_epi32(S0, m128iAdd);
573
    S0 = _mm_srai_epi32(S0, shift_2nd);
574
575
    m128iTmp1 = _mm_madd_epi16(m128iAC,
576
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
577
    m128iTmp2 = _mm_madd_epi16(m128iBD,
578
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
579
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
580
    S8 = _mm_add_epi32(S8, m128iAdd);
581
    S8 = _mm_srai_epi32(S8, shift_2nd);
582
583
    m128iA = _mm_packs_epi32(S0, S8);
584
585
    m128iTmp1 = _mm_madd_epi16(m128iAC,
586
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
587
    m128iTmp2 = _mm_madd_epi16(m128iBD,
588
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
589
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
590
    S0 = _mm_add_epi32(S0, m128iAdd);
591
    S0 = _mm_srai_epi32(S0, shift_2nd);
592
593
    m128iTmp1 = _mm_madd_epi16(m128iAC,
594
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
595
    m128iTmp2 = _mm_madd_epi16(m128iBD,
596
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
597
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
598
    S8 = _mm_add_epi32(S8, m128iAdd);
599
    S8 = _mm_srai_epi32(S8, shift_2nd);
600
601
    m128iD = _mm_packs_epi32(S0, S8);
602
603
    _mm_storeu_si128((__m128i *) (src), m128iA);
604
    _mm_storeu_si128((__m128i *) (src + 8), m128iD);
605
    j = 0;
606
    for (i = 0; i < 2; i++) {
607
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
608
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
609
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
610
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
611
        j += 1;
612
        dst += stride;
613
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
614
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
615
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
616
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
617
        j += 1;
618
        dst += stride;
619
    }
620
621
}
622
#endif
623
624
625
#if HAVE_SSE4_1
626
void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
627
0
        ptrdiff_t _stride) {
628
0
    uint8_t shift_2nd = 12; // 20 - Bit depth
629
0
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
630
631
0
    uint8_t *dst = (uint8_t*) _dst;
632
0
    ptrdiff_t stride = _stride;
633
0
    const int16_t *src = coeffs;
634
635
0
    __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD, m128iTmp1,m128iTmp2;
636
0
    S0 = _mm_load_si128((__m128i *) (src));
637
0
    S8 = _mm_load_si128((__m128i *) (src + 8));
638
0
    m128iAdd = _mm_set1_epi32(add_1st);
639
640
0
    m128Tmp = _mm_unpacklo_epi16(S0, S8);
641
0
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
642
0
    E1 = _mm_add_epi32(E1, m128iAdd);
643
644
0
    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
645
0
    E2 = _mm_add_epi32(E2, m128iAdd);
646
647
0
    m128Tmp = _mm_unpackhi_epi16(S0, S8);
648
0
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
649
0
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
650
651
0
    m128iA = _mm_add_epi32(E1, O1);
652
0
    m128iA = _mm_srai_epi32(m128iA, shift_1st);        // Sum = Sum >> iShiftNum
653
0
    m128Tmp = _mm_add_epi32(E2, O2);
654
0
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
655
0
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);
656
657
0
    m128iD = _mm_sub_epi32(E2, O2);
658
0
    m128iD = _mm_srai_epi32(m128iD, shift_1st);        // Sum = Sum >> iShiftNum
659
660
0
    m128Tmp = _mm_sub_epi32(E1, O1);
661
0
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
662
663
0
    m128iD = _mm_packs_epi32(m128iD, m128Tmp);
664
665
0
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
666
0
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);
667
668
0
    m128iA = _mm_unpacklo_epi16(S0, S8);
669
0
    m128iD = _mm_unpackhi_epi16(S0, S8);
670
671
    /*  ##########################  */
672
673
0
    m128iAdd = _mm_set1_epi32(add_2nd);
674
0
    m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
675
0
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
676
0
    E1 = _mm_add_epi32(E1, m128iAdd);
677
678
0
    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
679
0
    E2 = _mm_add_epi32(E2, m128iAdd);
680
681
0
    m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
682
0
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
683
0
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
684
685
0
    m128iA = _mm_add_epi32(E1, O1);
686
0
    m128iA = _mm_srai_epi32(m128iA, shift_2nd);
687
0
    m128Tmp = _mm_add_epi32(E2, O2);
688
0
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
689
0
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);
690
691
0
    m128iD = _mm_sub_epi32(E2, O2);
692
0
    m128iD = _mm_srai_epi32(m128iD, shift_2nd);
693
694
0
    m128Tmp = _mm_sub_epi32(E1, O1);
695
0
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
696
697
0
    m128iD = _mm_packs_epi32(m128iD, m128Tmp);
698
699
0
    S0 = _mm_move_epi64(m128iA); //contains row 0
700
0
    S8 = _mm_move_epi64(m128iD); //row 2
701
0
    m128iA = _mm_srli_si128(m128iA, 8); // row 1
702
0
    m128iD = _mm_srli_si128(m128iD, 8); // row 3
703
0
    m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
704
0
    m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
705
0
    S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
706
0
    S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
707
708
    //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1);   //mask to store 4 * 8bit data
709
710
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
711
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
712
0
    m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
713
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
714
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
715
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
716
717
0
    dst += stride;
718
719
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
720
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
721
0
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
722
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
723
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
724
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
725
726
0
    dst += stride;
727
728
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
729
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
730
0
    m128iTmp1 = _mm_adds_epi16(S8, m128iA);
731
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
732
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
733
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
734
735
0
    dst += stride;
736
737
0
    m128iA = _mm_loadl_epi64((__m128i *) dst);
738
0
    m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
739
0
    m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
740
0
    m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
741
    //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
742
0
    *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
743
0
}
744
#endif
745
746
#if 0
747
void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
748
        ptrdiff_t _stride) {
749
    int i;
750
    uint8_t shift_2nd = 10; // 20 - Bit depth
751
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
752
753
    uint16_t *dst = (uint16_t*) _dst;
754
    ptrdiff_t stride = _stride/2;
755
    int16_t *src = coeffs;
756
757
    int j;
758
        __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD;
759
        S0 = _mm_load_si128((__m128i *) (src));
760
        S8 = _mm_load_si128((__m128i *) (src + 8));
761
        m128iAdd = _mm_set1_epi32(add_1st);
762
763
        m128Tmp = _mm_unpacklo_epi16(S0, S8);
764
        E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
765
        E1 = _mm_add_epi32(E1, m128iAdd);
766
767
        E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
768
        E2 = _mm_add_epi32(E2, m128iAdd);
769
770
        m128Tmp = _mm_unpackhi_epi16(S0, S8);
771
        O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
772
        O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
773
774
        m128iA = _mm_add_epi32(E1, O1);
775
        m128iA = _mm_srai_epi32(m128iA, shift_1st);        // Sum = Sum >> iShiftNum
776
        m128Tmp = _mm_add_epi32(E2, O2);
777
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
778
        m128iA = _mm_packs_epi32(m128iA, m128Tmp);
779
780
        m128iD = _mm_sub_epi32(E2, O2);
781
        m128iD = _mm_srai_epi32(m128iD, shift_1st);        // Sum = Sum >> iShiftNum
782
783
        m128Tmp = _mm_sub_epi32(E1, O1);
784
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
785
786
        m128iD = _mm_packs_epi32(m128iD, m128Tmp);
787
788
        S0 = _mm_unpacklo_epi16(m128iA, m128iD);
789
        S8 = _mm_unpackhi_epi16(m128iA, m128iD);
790
791
        m128iA = _mm_unpacklo_epi16(S0, S8);
792
        m128iD = _mm_unpackhi_epi16(S0, S8);
793
794
        /*  ##########################  */
795
796
        m128iAdd = _mm_set1_epi32(add_2nd);
797
        m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
798
        E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
799
        E1 = _mm_add_epi32(E1, m128iAdd);
800
801
        E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
802
        E2 = _mm_add_epi32(E2, m128iAdd);
803
804
        m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
805
        O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
806
        O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
807
808
        m128iA = _mm_add_epi32(E1, O1);
809
        m128iA = _mm_srai_epi32(m128iA, shift_2nd);
810
        m128Tmp = _mm_add_epi32(E2, O2);
811
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
812
        m128iA = _mm_packs_epi32(m128iA, m128Tmp);
813
814
        m128iD = _mm_sub_epi32(E2, O2);
815
        m128iD = _mm_srai_epi32(m128iD, shift_2nd);
816
817
        m128Tmp = _mm_sub_epi32(E1, O1);
818
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
819
820
        m128iD = _mm_packs_epi32(m128iD, m128Tmp);
821
        _mm_storeu_si128((__m128i *) (src), m128iA);
822
        _mm_storeu_si128((__m128i *) (src + 8), m128iD);
823
        j = 0;
824
        for (i = 0; i < 2; i++) {
825
            dst[0] = av_clip_uintp2(dst[0] + src[j],10);
826
            dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
827
            dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
828
            dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
829
            j += 1;
830
            dst += stride;
831
            dst[0] = av_clip_uintp2(dst[0] + src[j],10);
832
            dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
833
            dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
834
            dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
835
            j += 1;
836
            dst += stride;
837
        }
838
}
839
#endif
840
841
#if HAVE_SSE4_1

/* Add one row of eight 16-bit residuals to eight 8-bit destination pixels,
 * saturating the sum to [0,255], and store the row back.  Factored out of
 * ff_hevc_transform_8x8_add_8_sse4(), where this stanza was duplicated
 * once per output row (8 times). */
static inline void add_residual_row_8(uint8_t *dst, __m128i residual)
{
    __m128i pix = _mm_loadl_epi64((__m128i *) dst);       // load 8 pixels (low 64 bits)
    pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());    // widen u8 -> s16
    pix = _mm_adds_epi16(pix, residual);                  // saturating add of residual
    pix = _mm_packus_epi16(pix, _mm_setzero_si128());     // clamp back to u8
    _mm_storel_epi64((__m128i *) dst, pix);
}

/* 8-bit 8x8 inverse DCT + reconstruction add (HEVC).
 *
 * Performs the two-pass 8x8 inverse transform of 'coeffs' — a column pass
 * rounded with add_1st and shifted by shift_1st, a transpose, then a row
 * pass rounded with add_2nd (1<<11) and shifted by shift_2nd (12), another
 * transpose — and adds the result to the 8x8 pixel block at _dst with
 * unsigned-8-bit saturation.
 *
 * _dst    : top-left pixel of the destination block (8-bit samples)
 * coeffs  : 64 transform coefficients, 16-byte aligned, row-major
 * _stride : destination stride in bytes
 *
 * NOTE(review): add_1st / shift_1st and the transform8x8[] factor table are
 * file-scope definitions earlier in this file (not visible in this chunk).
 */
void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
        ptrdiff_t _stride) {
    uint8_t shift_2nd = 12; // 20 - Bit depth
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))

    uint8_t *dst = (uint8_t*) _dst;
    ptrdiff_t stride = _stride / sizeof(uint8_t);
    const int16_t *src = coeffs;
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
            m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
            E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
            O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h,
            T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11;

    /* Hoist the twelve 8x8 factor rows into registers; both passes use them. */
    T0= _mm_load_si128((__m128i *) (transform8x8[0]));
    T1= _mm_load_si128((__m128i *) (transform8x8[1]));
    T2= _mm_load_si128((__m128i *) (transform8x8[2]));
    T3= _mm_load_si128((__m128i *) (transform8x8[3]));
    T4= _mm_load_si128((__m128i *) (transform8x8[4]));
    T5= _mm_load_si128((__m128i *) (transform8x8[5]));
    T6= _mm_load_si128((__m128i *) (transform8x8[6]));
    T7= _mm_load_si128((__m128i *) (transform8x8[7]));
    T8= _mm_load_si128((__m128i *) (transform8x8[8]));
    T9= _mm_load_si128((__m128i *) (transform8x8[9]));
    T10= _mm_load_si128((__m128i *) (transform8x8[10]));
    T11= _mm_load_si128((__m128i *) (transform8x8[11]));

    m128iAdd = _mm_set1_epi32(add_1st);

    /* ---- First pass (columns): odd part from coefficient rows 1,3,5,7. ---- */
    m128iS1 = _mm_load_si128((__m128i *) (src + 8));
    m128iS3 = _mm_load_si128((__m128i *) (src + 24));
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
    E1l = _mm_madd_epi16(m128Tmp0, T0);
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
    E1h = _mm_madd_epi16(m128Tmp1, T0);
    m128iS5 = _mm_load_si128((__m128i *) (src + 40));
    m128iS7 = _mm_load_si128((__m128i *) (src + 56));
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
    E2l = _mm_madd_epi16(m128Tmp2, T1);
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
    E2h = _mm_madd_epi16(m128Tmp3, T1);
    O0l = _mm_add_epi32(E1l, E2l);
    O0h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0, T2);
    E1h = _mm_madd_epi16(m128Tmp1, T2);
    E2l = _mm_madd_epi16(m128Tmp2, T3);
    E2h = _mm_madd_epi16(m128Tmp3, T3);
    O1l = _mm_add_epi32(E1l, E2l);
    O1h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0, T4);
    E1h = _mm_madd_epi16(m128Tmp1, T4);
    E2l = _mm_madd_epi16(m128Tmp2, T5);
    E2h = _mm_madd_epi16(m128Tmp3, T5);
    O2l = _mm_add_epi32(E1l, E2l);
    O2h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0, T6);
    E1h = _mm_madd_epi16(m128Tmp1, T6);
    E2l = _mm_madd_epi16(m128Tmp2, T7);
    E2h = _mm_madd_epi16(m128Tmp3, T7);
    O3h = _mm_add_epi32(E1h, E2h);
    O3l = _mm_add_epi32(E1l, E2l);

    /* ---- First pass: even-even part from rows 0 and 4. ---- */
    m128iS0 = _mm_load_si128((__m128i *) (src + 0));
    m128iS4 = _mm_load_si128((__m128i *) (src + 32));
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
    EE0l = _mm_madd_epi16(m128Tmp0, T8);
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
    EE0h = _mm_madd_epi16(m128Tmp1, T8);

    EE1l = _mm_madd_epi16(m128Tmp0, T9);
    EE1h = _mm_madd_epi16(m128Tmp1, T9);

    /* ---- First pass: even-odd part from rows 2 and 6, then combine. ---- */
    m128iS2 = _mm_load_si128((__m128i *) (src + 16));
    m128iS6 = _mm_load_si128((__m128i *) (src + 48));
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E00l = _mm_madd_epi16(m128Tmp0, T10);
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
    E00h = _mm_madd_epi16(m128Tmp1, T10);
    E01l = _mm_madd_epi16(m128Tmp0, T11);
    E01h = _mm_madd_epi16(m128Tmp1, T11);
    E0l = _mm_add_epi32(EE0l, E00l);
    E0l = _mm_add_epi32(E0l, m128iAdd);          // fold in rounding term
    E0h = _mm_add_epi32(EE0h, E00h);
    E0h = _mm_add_epi32(E0h, m128iAdd);
    E3l = _mm_sub_epi32(EE0l, E00l);
    E3l = _mm_add_epi32(E3l, m128iAdd);
    E3h = _mm_sub_epi32(EE0h, E00h);
    E3h = _mm_add_epi32(E3h, m128iAdd);

    E1l = _mm_add_epi32(EE1l, E01l);
    E1l = _mm_add_epi32(E1l, m128iAdd);
    E1h = _mm_add_epi32(EE1h, E01h);
    E1h = _mm_add_epi32(E1h, m128iAdd);
    E2l = _mm_sub_epi32(EE1l, E01l);
    E2l = _mm_add_epi32(E2l, m128iAdd);
    E2h = _mm_sub_epi32(EE1h, E01h);
    E2h = _mm_add_epi32(E2h, m128iAdd);

    /* Butterfly E +/- O, shift, and pack each output row to 16 bit. */
    m128iS0 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
    m128iS1 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
    m128iS2 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
    m128iS3 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
    m128iS4 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
    m128iS5 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
    m128iS6 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
    m128iS7 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));

    /* ---- Transpose the 8x8 intermediate (16-bit unpack network). ---- */
    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);

    /* ---- Second pass (rows): identical structure, 8-bit rounding term. ---- */
    m128iAdd = _mm_set1_epi32(add_2nd);

    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
    E1l = _mm_madd_epi16(m128Tmp0, T0);
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
    E1h = _mm_madd_epi16(m128Tmp1, T0);
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
    E2l = _mm_madd_epi16(m128Tmp2, T1);
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
    E2h = _mm_madd_epi16(m128Tmp3, T1);
    O0l = _mm_add_epi32(E1l, E2l);
    O0h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0, T2);
    E1h = _mm_madd_epi16(m128Tmp1, T2);
    E2l = _mm_madd_epi16(m128Tmp2, T3);
    E2h = _mm_madd_epi16(m128Tmp3, T3);
    O1l = _mm_add_epi32(E1l, E2l);
    O1h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0, T4);
    E1h = _mm_madd_epi16(m128Tmp1, T4);
    E2l = _mm_madd_epi16(m128Tmp2, T5);
    E2h = _mm_madd_epi16(m128Tmp3, T5);
    O2l = _mm_add_epi32(E1l, E2l);
    O2h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0, T6);
    E1h = _mm_madd_epi16(m128Tmp1, T6);
    E2l = _mm_madd_epi16(m128Tmp2, T7);
    E2h = _mm_madd_epi16(m128Tmp3, T7);
    O3h = _mm_add_epi32(E1h, E2h);
    O3l = _mm_add_epi32(E1l, E2l);

    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
    EE0l = _mm_madd_epi16(m128Tmp0, T8);
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
    EE0h = _mm_madd_epi16(m128Tmp1, T8);
    EE1l = _mm_madd_epi16(m128Tmp0, T9);
    EE1h = _mm_madd_epi16(m128Tmp1, T9);

    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E00l = _mm_madd_epi16(m128Tmp0, T10);
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
    E00h = _mm_madd_epi16(m128Tmp1, T10);
    E01l = _mm_madd_epi16(m128Tmp0, T11);
    E01h = _mm_madd_epi16(m128Tmp1, T11);
    E0l = _mm_add_epi32(EE0l, E00l);
    E0l = _mm_add_epi32(E0l, m128iAdd);
    E0h = _mm_add_epi32(EE0h, E00h);
    E0h = _mm_add_epi32(E0h, m128iAdd);
    E3l = _mm_sub_epi32(EE0l, E00l);
    E3l = _mm_add_epi32(E3l, m128iAdd);
    E3h = _mm_sub_epi32(EE0h, E00h);
    E3h = _mm_add_epi32(E3h, m128iAdd);
    E1l = _mm_add_epi32(EE1l, E01l);
    E1l = _mm_add_epi32(E1l, m128iAdd);
    E1h = _mm_add_epi32(EE1h, E01h);
    E1h = _mm_add_epi32(E1h, m128iAdd);
    E2l = _mm_sub_epi32(EE1l, E01l);
    E2l = _mm_add_epi32(E2l, m128iAdd);
    E2h = _mm_sub_epi32(EE1h, E01h);
    E2h = _mm_add_epi32(E2h, m128iAdd);

    m128iS0 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
    m128iS1 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
    m128iS2 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
    m128iS3 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
    m128iS4 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
    m128iS5 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
    m128iS6 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
    m128iS7 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));

    /* ---- Final transpose back to pixel order. ---- */
    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);

    /* ---- Add the residual rows to the destination with u8 saturation. ---- */
    add_residual_row_8(dst, m128iS0); dst += stride;
    add_residual_row_8(dst, m128iS1); dst += stride;
    add_residual_row_8(dst, m128iS2); dst += stride;
    add_residual_row_8(dst, m128iS3); dst += stride;
    add_residual_row_8(dst, m128iS4); dst += stride;
    add_residual_row_8(dst, m128iS5); dst += stride;
    add_residual_row_8(dst, m128iS6); dst += stride;
    add_residual_row_8(dst, m128iS7);   // last row: no trailing stride bump needed
}
#endif
1175
1176
#if 0
1177
void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
1178
        ptrdiff_t _stride) {
1179
    int i;
1180
    uint16_t *dst = (uint16_t*) _dst;
1181
    ptrdiff_t stride = _stride / sizeof(uint16_t);
1182
    int16_t *src = coeffs;
1183
    uint8_t shift_2nd = 10; // 20 - Bit depth
1184
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
1185
1186
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1187
            m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
1188
            E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
1189
            O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
1190
    int j;
1191
    m128iAdd = _mm_set1_epi32(add_1st);
1192
1193
    m128iS1 = _mm_load_si128((__m128i *) (src + 8));
1194
    m128iS3 = _mm_load_si128((__m128i *) (src + 24));
1195
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1196
    E1l = _mm_madd_epi16(m128Tmp0,
1197
            _mm_load_si128((__m128i *) (transform8x8[0])));
1198
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1199
    E1h = _mm_madd_epi16(m128Tmp1,
1200
            _mm_load_si128((__m128i *) (transform8x8[0])));
1201
    m128iS5 = _mm_load_si128((__m128i *) (src + 40));
1202
    m128iS7 = _mm_load_si128((__m128i *) (src + 56));
1203
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1204
    E2l = _mm_madd_epi16(m128Tmp2,
1205
            _mm_load_si128((__m128i *) (transform8x8[1])));
1206
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1207
    E2h = _mm_madd_epi16(m128Tmp3,
1208
            _mm_load_si128((__m128i *) (transform8x8[1])));
1209
    O0l = _mm_add_epi32(E1l, E2l);
1210
    O0h = _mm_add_epi32(E1h, E2h);
1211
1212
    E1l = _mm_madd_epi16(m128Tmp0,
1213
            _mm_load_si128((__m128i *) (transform8x8[2])));
1214
    E1h = _mm_madd_epi16(m128Tmp1,
1215
            _mm_load_si128((__m128i *) (transform8x8[2])));
1216
    E2l = _mm_madd_epi16(m128Tmp2,
1217
            _mm_load_si128((__m128i *) (transform8x8[3])));
1218
    E2h = _mm_madd_epi16(m128Tmp3,
1219
            _mm_load_si128((__m128i *) (transform8x8[3])));
1220
1221
    O1l = _mm_add_epi32(E1l, E2l);
1222
    O1h = _mm_add_epi32(E1h, E2h);
1223
1224
    E1l = _mm_madd_epi16(m128Tmp0,
1225
            _mm_load_si128((__m128i *) (transform8x8[4])));
1226
    E1h = _mm_madd_epi16(m128Tmp1,
1227
            _mm_load_si128((__m128i *) (transform8x8[4])));
1228
    E2l = _mm_madd_epi16(m128Tmp2,
1229
            _mm_load_si128((__m128i *) (transform8x8[5])));
1230
    E2h = _mm_madd_epi16(m128Tmp3,
1231
            _mm_load_si128((__m128i *) (transform8x8[5])));
1232
    O2l = _mm_add_epi32(E1l, E2l);
1233
    O2h = _mm_add_epi32(E1h, E2h);
1234
1235
    E1l = _mm_madd_epi16(m128Tmp0,
1236
            _mm_load_si128((__m128i *) (transform8x8[6])));
1237
    E1h = _mm_madd_epi16(m128Tmp1,
1238
            _mm_load_si128((__m128i *) (transform8x8[6])));
1239
    E2l = _mm_madd_epi16(m128Tmp2,
1240
            _mm_load_si128((__m128i *) (transform8x8[7])));
1241
    E2h = _mm_madd_epi16(m128Tmp3,
1242
            _mm_load_si128((__m128i *) (transform8x8[7])));
1243
    O3h = _mm_add_epi32(E1h, E2h);
1244
    O3l = _mm_add_epi32(E1l, E2l);
1245
1246
    /*    -------     */
1247
1248
    m128iS0 = _mm_load_si128((__m128i *) (src + 0));
1249
    m128iS4 = _mm_load_si128((__m128i *) (src + 32));
1250
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1251
    EE0l = _mm_madd_epi16(m128Tmp0,
1252
            _mm_load_si128((__m128i *) (transform8x8[8])));
1253
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1254
    EE0h = _mm_madd_epi16(m128Tmp1,
1255
            _mm_load_si128((__m128i *) (transform8x8[8])));
1256
1257
    EE1l = _mm_madd_epi16(m128Tmp0,
1258
            _mm_load_si128((__m128i *) (transform8x8[9])));
1259
    EE1h = _mm_madd_epi16(m128Tmp1,
1260
            _mm_load_si128((__m128i *) (transform8x8[9])));
1261
1262
    /*    -------     */
1263
1264
    m128iS2 = _mm_load_si128((__m128i *) (src + 16));
1265
    m128iS6 = _mm_load_si128((__m128i *) (src + 48));
1266
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1267
    E00l = _mm_madd_epi16(m128Tmp0,
1268
            _mm_load_si128((__m128i *) (transform8x8[10])));
1269
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1270
    E00h = _mm_madd_epi16(m128Tmp1,
1271
            _mm_load_si128((__m128i *) (transform8x8[10])));
1272
    E01l = _mm_madd_epi16(m128Tmp0,
1273
            _mm_load_si128((__m128i *) (transform8x8[11])));
1274
    E01h = _mm_madd_epi16(m128Tmp1,
1275
            _mm_load_si128((__m128i *) (transform8x8[11])));
1276
    E0l = _mm_add_epi32(EE0l, E00l);
1277
    E0l = _mm_add_epi32(E0l, m128iAdd);
1278
    E0h = _mm_add_epi32(EE0h, E00h);
1279
    E0h = _mm_add_epi32(E0h, m128iAdd);
1280
    E3l = _mm_sub_epi32(EE0l, E00l);
1281
    E3l = _mm_add_epi32(E3l, m128iAdd);
1282
    E3h = _mm_sub_epi32(EE0h, E00h);
1283
    E3h = _mm_add_epi32(E3h, m128iAdd);
1284
1285
    E1l = _mm_add_epi32(EE1l, E01l);
1286
    E1l = _mm_add_epi32(E1l, m128iAdd);
1287
    E1h = _mm_add_epi32(EE1h, E01h);
1288
    E1h = _mm_add_epi32(E1h, m128iAdd);
1289
    E2l = _mm_sub_epi32(EE1l, E01l);
1290
    E2l = _mm_add_epi32(E2l, m128iAdd);
1291
    E2h = _mm_sub_epi32(EE1h, E01h);
1292
    E2h = _mm_add_epi32(E2h, m128iAdd);
1293
    m128iS0 = _mm_packs_epi32(
1294
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
1295
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
1296
    m128iS1 = _mm_packs_epi32(
1297
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
1298
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
1299
    m128iS2 = _mm_packs_epi32(
1300
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
1301
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
1302
    m128iS3 = _mm_packs_epi32(
1303
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
1304
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
1305
    m128iS4 = _mm_packs_epi32(
1306
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
1307
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
1308
    m128iS5 = _mm_packs_epi32(
1309
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
1310
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
1311
    m128iS6 = _mm_packs_epi32(
1312
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
1313
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
1314
    m128iS7 = _mm_packs_epi32(
1315
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
1316
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
1317
    /*  Invers matrix   */
1318
1319
    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
1320
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
1321
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
1322
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
1323
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
1324
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
1325
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
1326
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
1327
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
1328
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
1329
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1330
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1331
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
1332
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
1333
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1334
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1335
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
1336
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
1337
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1338
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1339
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
1340
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
1341
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1342
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1343
1344
    m128iAdd = _mm_set1_epi32(add_2nd);
1345
1346
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1347
    E1l = _mm_madd_epi16(m128Tmp0,
1348
            _mm_load_si128((__m128i *) (transform8x8[0])));
1349
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1350
    E1h = _mm_madd_epi16(m128Tmp1,
1351
            _mm_load_si128((__m128i *) (transform8x8[0])));
1352
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1353
    E2l = _mm_madd_epi16(m128Tmp2,
1354
            _mm_load_si128((__m128i *) (transform8x8[1])));
1355
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1356
    E2h = _mm_madd_epi16(m128Tmp3,
1357
            _mm_load_si128((__m128i *) (transform8x8[1])));
1358
    O0l = _mm_add_epi32(E1l, E2l);
1359
    O0h = _mm_add_epi32(E1h, E2h);
1360
    E1l = _mm_madd_epi16(m128Tmp0,
1361
            _mm_load_si128((__m128i *) (transform8x8[2])));
1362
    E1h = _mm_madd_epi16(m128Tmp1,
1363
            _mm_load_si128((__m128i *) (transform8x8[2])));
1364
    E2l = _mm_madd_epi16(m128Tmp2,
1365
            _mm_load_si128((__m128i *) (transform8x8[3])));
1366
    E2h = _mm_madd_epi16(m128Tmp3,
1367
            _mm_load_si128((__m128i *) (transform8x8[3])));
1368
    O1l = _mm_add_epi32(E1l, E2l);
1369
    O1h = _mm_add_epi32(E1h, E2h);
1370
    E1l = _mm_madd_epi16(m128Tmp0,
1371
            _mm_load_si128((__m128i *) (transform8x8[4])));
1372
    E1h = _mm_madd_epi16(m128Tmp1,
1373
            _mm_load_si128((__m128i *) (transform8x8[4])));
1374
    E2l = _mm_madd_epi16(m128Tmp2,
1375
            _mm_load_si128((__m128i *) (transform8x8[5])));
1376
    E2h = _mm_madd_epi16(m128Tmp3,
1377
            _mm_load_si128((__m128i *) (transform8x8[5])));
1378
    O2l = _mm_add_epi32(E1l, E2l);
1379
    O2h = _mm_add_epi32(E1h, E2h);
1380
    E1l = _mm_madd_epi16(m128Tmp0,
1381
            _mm_load_si128((__m128i *) (transform8x8[6])));
1382
    E1h = _mm_madd_epi16(m128Tmp1,
1383
            _mm_load_si128((__m128i *) (transform8x8[6])));
1384
    E2l = _mm_madd_epi16(m128Tmp2,
1385
            _mm_load_si128((__m128i *) (transform8x8[7])));
1386
    E2h = _mm_madd_epi16(m128Tmp3,
1387
            _mm_load_si128((__m128i *) (transform8x8[7])));
1388
    O3h = _mm_add_epi32(E1h, E2h);
1389
    O3l = _mm_add_epi32(E1l, E2l);
1390
1391
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1392
    EE0l = _mm_madd_epi16(m128Tmp0,
1393
            _mm_load_si128((__m128i *) (transform8x8[8])));
1394
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1395
    EE0h = _mm_madd_epi16(m128Tmp1,
1396
            _mm_load_si128((__m128i *) (transform8x8[8])));
1397
    EE1l = _mm_madd_epi16(m128Tmp0,
1398
            _mm_load_si128((__m128i *) (transform8x8[9])));
1399
    EE1h = _mm_madd_epi16(m128Tmp1,
1400
            _mm_load_si128((__m128i *) (transform8x8[9])));
1401
1402
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1403
    E00l = _mm_madd_epi16(m128Tmp0,
1404
            _mm_load_si128((__m128i *) (transform8x8[10])));
1405
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1406
    E00h = _mm_madd_epi16(m128Tmp1,
1407
            _mm_load_si128((__m128i *) (transform8x8[10])));
1408
    E01l = _mm_madd_epi16(m128Tmp0,
1409
            _mm_load_si128((__m128i *) (transform8x8[11])));
1410
    E01h = _mm_madd_epi16(m128Tmp1,
1411
            _mm_load_si128((__m128i *) (transform8x8[11])));
1412
    E0l = _mm_add_epi32(EE0l, E00l);
1413
    E0l = _mm_add_epi32(E0l, m128iAdd);
1414
    E0h = _mm_add_epi32(EE0h, E00h);
1415
    E0h = _mm_add_epi32(E0h, m128iAdd);
1416
    E3l = _mm_sub_epi32(EE0l, E00l);
1417
    E3l = _mm_add_epi32(E3l, m128iAdd);
1418
    E3h = _mm_sub_epi32(EE0h, E00h);
1419
    E3h = _mm_add_epi32(E3h, m128iAdd);
1420
    E1l = _mm_add_epi32(EE1l, E01l);
1421
    E1l = _mm_add_epi32(E1l, m128iAdd);
1422
    E1h = _mm_add_epi32(EE1h, E01h);
1423
    E1h = _mm_add_epi32(E1h, m128iAdd);
1424
    E2l = _mm_sub_epi32(EE1l, E01l);
1425
    E2l = _mm_add_epi32(E2l, m128iAdd);
1426
    E2h = _mm_sub_epi32(EE1h, E01h);
1427
    E2h = _mm_add_epi32(E2h, m128iAdd);
1428
1429
    m128iS0 = _mm_packs_epi32(
1430
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
1431
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
1432
    m128iS1 = _mm_packs_epi32(
1433
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
1434
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
1435
    m128iS2 = _mm_packs_epi32(
1436
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
1437
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
1438
    m128iS3 = _mm_packs_epi32(
1439
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
1440
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
1441
    m128iS4 = _mm_packs_epi32(
1442
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
1443
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
1444
    m128iS5 = _mm_packs_epi32(
1445
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
1446
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
1447
    m128iS6 = _mm_packs_epi32(
1448
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
1449
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
1450
    m128iS7 = _mm_packs_epi32(
1451
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
1452
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));
1453
1454
    _mm_store_si128((__m128i *) (src), m128iS0);
1455
    _mm_store_si128((__m128i *) (src + 8), m128iS1);
1456
    _mm_store_si128((__m128i *) (src + 16), m128iS2);
1457
    _mm_store_si128((__m128i *) (src + 24), m128iS3);
1458
    _mm_store_si128((__m128i *) (src + 32), m128iS4);
1459
    _mm_store_si128((__m128i *) (src + 40), m128iS5);
1460
    _mm_store_si128((__m128i *) (src + 48), m128iS6);
1461
    _mm_store_si128((__m128i *) (src + 56), m128iS7);
1462
1463
    j = 0;
1464
    for (i = 0; i < 4; i++) {
1465
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
1466
        dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
1467
        dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
1468
        dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
1469
        dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
1470
        dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
1471
        dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
1472
        dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
1473
        j += 1;
1474
        dst += stride;
1475
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
1476
        dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
1477
        dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
1478
        dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
1479
        dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
1480
        dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
1481
        dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
1482
        dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
1483
        j += 1;
1484
        dst += stride;
1485
    }
1486
1487
}
1488
#endif
1489
1490
1491
#if HAVE_SSE4_1
1492
void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
1493
0
        ptrdiff_t _stride) {
1494
0
    uint8_t shift_2nd = 12; // 20 - Bit depth
1495
0
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
1496
0
    int i;
1497
0
    uint8_t *dst = (uint8_t*) _dst;
1498
0
    ptrdiff_t stride = _stride / sizeof(uint8_t);
1499
0
    const int16_t *src = coeffs;
1500
0
    int32_t shift;
1501
0
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1502
0
            m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
1503
0
            m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
1504
0
            m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
1505
0
            E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
1506
0
            O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
1507
0
            E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
1508
0
    __m128i E4l, E5l, E6l, E7l;
1509
0
    __m128i E4h, E5h, E6h, E7h;
1510
0
    __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;
1511
0
    __m128i r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
1512
1513
1514
    /*__m128i T00,T01, T02, T03, T04, T05, T06, T07;
1515
    __m128i T10,T11, T12, T13, T14, T15, T16, T17;
1516
    __m128i T20,T21, T22, T23, T24, T25, T26, T27;
1517
    __m128i T30,T31, T32, T33, T34, T35, T36, T37;
1518
1519
    __m128i U00,U01, U02, U03, U10, U11, U12, U13;
1520
1521
    __m128i V00,V01, V10, V11;*/
1522
1523
1524
0
    const __m128i T00 = _mm_load_si128((__m128i *) (transform16x16_1[0][0]));
1525
0
    const __m128i T01 = _mm_load_si128((__m128i *) (transform16x16_1[0][1]));
1526
0
    const __m128i T02 = _mm_load_si128((__m128i *) (transform16x16_1[0][2]));
1527
0
    const __m128i T03 = _mm_load_si128((__m128i *) (transform16x16_1[0][3]));
1528
0
    const __m128i T04 = _mm_load_si128((__m128i *) (transform16x16_1[0][4]));
1529
0
    const __m128i T05 = _mm_load_si128((__m128i *) (transform16x16_1[0][5]));
1530
0
    const __m128i T06 = _mm_load_si128((__m128i *) (transform16x16_1[0][6]));
1531
0
    const __m128i T07 = _mm_load_si128((__m128i *) (transform16x16_1[0][7]));
1532
0
    const __m128i T10 = _mm_load_si128((__m128i *) (transform16x16_1[1][0]));
1533
0
    const __m128i T11 = _mm_load_si128((__m128i *) (transform16x16_1[1][1]));
1534
0
    const __m128i T12 = _mm_load_si128((__m128i *) (transform16x16_1[1][2]));
1535
0
    const __m128i T13 = _mm_load_si128((__m128i *) (transform16x16_1[1][3]));
1536
0
    const __m128i T14 = _mm_load_si128((__m128i *) (transform16x16_1[1][4]));
1537
0
    const __m128i T15 = _mm_load_si128((__m128i *) (transform16x16_1[1][5]));
1538
0
    const __m128i T16 = _mm_load_si128((__m128i *) (transform16x16_1[1][6]));
1539
0
    const __m128i T17 = _mm_load_si128((__m128i *) (transform16x16_1[1][7]));
1540
0
    const __m128i T20 = _mm_load_si128((__m128i *) (transform16x16_1[2][0]));
1541
0
    const __m128i T21 = _mm_load_si128((__m128i *) (transform16x16_1[2][1]));
1542
0
    const __m128i T22 = _mm_load_si128((__m128i *) (transform16x16_1[2][2]));
1543
0
    const __m128i T23 = _mm_load_si128((__m128i *) (transform16x16_1[2][3]));
1544
0
    const __m128i T24 = _mm_load_si128((__m128i *) (transform16x16_1[2][4]));
1545
0
    const __m128i T25 = _mm_load_si128((__m128i *) (transform16x16_1[2][5]));
1546
0
    const __m128i T26 = _mm_load_si128((__m128i *) (transform16x16_1[2][6]));
1547
0
    const __m128i T27 = _mm_load_si128((__m128i *) (transform16x16_1[2][7]));
1548
0
    const __m128i T30 = _mm_load_si128((__m128i *) (transform16x16_1[3][0]));
1549
0
    const __m128i T31 = _mm_load_si128((__m128i *) (transform16x16_1[3][1]));
1550
0
    const __m128i T32 = _mm_load_si128((__m128i *) (transform16x16_1[3][2]));
1551
0
    const __m128i T33 = _mm_load_si128((__m128i *) (transform16x16_1[3][3]));
1552
0
    const __m128i T34 = _mm_load_si128((__m128i *) (transform16x16_1[3][4]));
1553
0
    const __m128i T35 = _mm_load_si128((__m128i *) (transform16x16_1[3][5]));
1554
0
    const __m128i T36 = _mm_load_si128((__m128i *) (transform16x16_1[3][6]));
1555
0
    const __m128i T37 = _mm_load_si128((__m128i *) (transform16x16_1[3][7]));
1556
1557
0
    const __m128i U00 = _mm_load_si128((__m128i *) (transform16x16_2[0][0]));
1558
0
    const __m128i U01 = _mm_load_si128((__m128i *) (transform16x16_2[0][1]));
1559
0
    const __m128i U02 = _mm_load_si128((__m128i *) (transform16x16_2[0][2]));
1560
0
    const __m128i U03 = _mm_load_si128((__m128i *) (transform16x16_2[0][3]));
1561
0
    const __m128i U10 = _mm_load_si128((__m128i *) (transform16x16_2[1][0]));
1562
0
    const __m128i U11 = _mm_load_si128((__m128i *) (transform16x16_2[1][1]));
1563
0
    const __m128i U12 = _mm_load_si128((__m128i *) (transform16x16_2[1][2]));
1564
0
    const __m128i U13 = _mm_load_si128((__m128i *) (transform16x16_2[1][3]));
1565
1566
0
    const __m128i V00 = _mm_load_si128((__m128i *) (transform16x16_3[0][0]));
1567
0
    const __m128i V01 = _mm_load_si128((__m128i *) (transform16x16_3[0][1]));
1568
0
    const __m128i V10 = _mm_load_si128((__m128i *) (transform16x16_3[1][0]));
1569
0
    const __m128i V11 = _mm_load_si128((__m128i *) (transform16x16_3[1][1]));
1570
1571
1572
1573
0
    int j;
1574
0
    m128iS0 = _mm_load_si128((__m128i *) (src));
1575
0
    m128iS1 = _mm_load_si128((__m128i *) (src + 16));
1576
0
    m128iS2 = _mm_load_si128((__m128i *) (src + 32));
1577
0
    m128iS3 = _mm_load_si128((__m128i *) (src + 48));
1578
0
    m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
1579
0
    m128iS5 = _mm_load_si128((__m128i *) (src + 80));
1580
0
    m128iS6 = _mm_load_si128((__m128i *) (src + 96));
1581
0
    m128iS7 = _mm_load_si128((__m128i *) (src + 112));
1582
0
    m128iS8 = _mm_load_si128((__m128i *) (src + 128));
1583
0
    m128iS9 = _mm_load_si128((__m128i *) (src + 144));
1584
0
    m128iS10 = _mm_load_si128((__m128i *) (src + 160));
1585
0
    m128iS11 = _mm_load_si128((__m128i *) (src + 176));
1586
0
    m128iS12 = _mm_load_si128((__m128i *) (src + 192));
1587
0
    m128iS13 = _mm_load_si128((__m128i *) (src + 208));
1588
0
    m128iS14 = _mm_load_si128((__m128i *) (src + 224));
1589
0
    m128iS15 = _mm_load_si128((__m128i *) (src + 240));
1590
0
    shift = shift_1st;
1591
0
    m128iAdd = _mm_set1_epi32(add_1st);
1592
1593
0
    for (j = 0; j < 2; j++) {
1594
0
        for (i = 0; i < 16; i += 8) {
1595
1596
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1597
0
            E0l = _mm_madd_epi16(m128Tmp0,T00);
1598
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1599
0
            E0h = _mm_madd_epi16(m128Tmp1,T00);
1600
1601
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1602
0
            E1l = _mm_madd_epi16(m128Tmp2,T10);
1603
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1604
0
            E1h = _mm_madd_epi16(m128Tmp3,T10);
1605
1606
0
            m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
1607
0
            E2l = _mm_madd_epi16(m128Tmp4,T20);
1608
0
            m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
1609
0
            E2h = _mm_madd_epi16(m128Tmp5,T20);
1610
1611
0
            m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
1612
0
            E3l = _mm_madd_epi16(m128Tmp6,T30);
1613
0
            m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
1614
0
            E3h = _mm_madd_epi16(m128Tmp7,T30);
1615
1616
0
            O0l = _mm_add_epi32(E0l, E1l);
1617
0
            O0l = _mm_add_epi32(O0l, E2l);
1618
0
            O0l = _mm_add_epi32(O0l, E3l);
1619
1620
0
            O0h = _mm_add_epi32(E0h, E1h);
1621
0
            O0h = _mm_add_epi32(O0h, E2h);
1622
0
            O0h = _mm_add_epi32(O0h, E3h);
1623
1624
            /* Compute O1*/
1625
0
            E0l = _mm_madd_epi16(m128Tmp0,T01);
1626
0
            E0h = _mm_madd_epi16(m128Tmp1,T01);
1627
0
            E1l = _mm_madd_epi16(m128Tmp2,T11);
1628
0
            E1h = _mm_madd_epi16(m128Tmp3,T11);
1629
0
            E2l = _mm_madd_epi16(m128Tmp4,T21);
1630
0
            E2h = _mm_madd_epi16(m128Tmp5,T21);
1631
0
            E3l = _mm_madd_epi16(m128Tmp6,T31);
1632
0
            E3h = _mm_madd_epi16(m128Tmp7,T31);
1633
0
            O1l = _mm_add_epi32(E0l, E1l);
1634
0
            O1l = _mm_add_epi32(O1l, E2l);
1635
0
            O1l = _mm_add_epi32(O1l, E3l);
1636
0
            O1h = _mm_add_epi32(E0h, E1h);
1637
0
            O1h = _mm_add_epi32(O1h, E2h);
1638
0
            O1h = _mm_add_epi32(O1h, E3h);
1639
1640
            /* Compute O2*/
1641
0
            E0l = _mm_madd_epi16(m128Tmp0,T02);
1642
0
            E0h = _mm_madd_epi16(m128Tmp1,T02);
1643
0
            E1l = _mm_madd_epi16(m128Tmp2,T12);
1644
0
            E1h = _mm_madd_epi16(m128Tmp3,T12);
1645
0
            E2l = _mm_madd_epi16(m128Tmp4,T22);
1646
0
            E2h = _mm_madd_epi16(m128Tmp5,T22);
1647
0
            E3l = _mm_madd_epi16(m128Tmp6,T32);
1648
0
            E3h = _mm_madd_epi16(m128Tmp7,T32);
1649
0
            O2l = _mm_add_epi32(E0l, E1l);
1650
0
            O2l = _mm_add_epi32(O2l, E2l);
1651
0
            O2l = _mm_add_epi32(O2l, E3l);
1652
1653
0
            O2h = _mm_add_epi32(E0h, E1h);
1654
0
            O2h = _mm_add_epi32(O2h, E2h);
1655
0
            O2h = _mm_add_epi32(O2h, E3h);
1656
1657
            /* Compute O3*/
1658
0
            E0l = _mm_madd_epi16(m128Tmp0,T03);
1659
0
            E0h = _mm_madd_epi16(m128Tmp1,T03);
1660
0
            E1l = _mm_madd_epi16(m128Tmp2,T13);
1661
0
            E1h = _mm_madd_epi16(m128Tmp3,T13);
1662
0
            E2l = _mm_madd_epi16(m128Tmp4,T23);
1663
0
            E2h = _mm_madd_epi16(m128Tmp5,T23);
1664
0
            E3l = _mm_madd_epi16(m128Tmp6,T33);
1665
0
            E3h = _mm_madd_epi16(m128Tmp7,T33);
1666
1667
0
            O3l = _mm_add_epi32(E0l, E1l);
1668
0
            O3l = _mm_add_epi32(O3l, E2l);
1669
0
            O3l = _mm_add_epi32(O3l, E3l);
1670
1671
0
            O3h = _mm_add_epi32(E0h, E1h);
1672
0
            O3h = _mm_add_epi32(O3h, E2h);
1673
0
            O3h = _mm_add_epi32(O3h, E3h);
1674
1675
            /* Compute O4*/
1676
1677
0
            E0l = _mm_madd_epi16(m128Tmp0,T04);
1678
0
            E0h = _mm_madd_epi16(m128Tmp1,T04);
1679
0
            E1l = _mm_madd_epi16(m128Tmp2,T14);
1680
0
            E1h = _mm_madd_epi16(m128Tmp3,T14);
1681
0
            E2l = _mm_madd_epi16(m128Tmp4,T24);
1682
0
            E2h = _mm_madd_epi16(m128Tmp5,T24);
1683
0
            E3l = _mm_madd_epi16(m128Tmp6,T34);
1684
0
            E3h = _mm_madd_epi16(m128Tmp7,T34);
1685
1686
0
            O4l = _mm_add_epi32(E0l, E1l);
1687
0
            O4l = _mm_add_epi32(O4l, E2l);
1688
0
            O4l = _mm_add_epi32(O4l, E3l);
1689
1690
0
            O4h = _mm_add_epi32(E0h, E1h);
1691
0
            O4h = _mm_add_epi32(O4h, E2h);
1692
0
            O4h = _mm_add_epi32(O4h, E3h);
1693
1694
            /* Compute O5*/
1695
0
            E0l = _mm_madd_epi16(m128Tmp0,T05);
1696
0
            E0h = _mm_madd_epi16(m128Tmp1,T05);
1697
0
            E1l = _mm_madd_epi16(m128Tmp2,T15);
1698
0
            E1h = _mm_madd_epi16(m128Tmp3,T15);
1699
0
            E2l = _mm_madd_epi16(m128Tmp4,T25);
1700
0
            E2h = _mm_madd_epi16(m128Tmp5,T25);
1701
0
            E3l = _mm_madd_epi16(m128Tmp6,T35);
1702
0
            E3h = _mm_madd_epi16(m128Tmp7,T35);
1703
1704
0
            O5l = _mm_add_epi32(E0l, E1l);
1705
0
            O5l = _mm_add_epi32(O5l, E2l);
1706
0
            O5l = _mm_add_epi32(O5l, E3l);
1707
1708
0
            O5h = _mm_add_epi32(E0h, E1h);
1709
0
            O5h = _mm_add_epi32(O5h, E2h);
1710
0
            O5h = _mm_add_epi32(O5h, E3h);
1711
1712
            /* Compute O6*/
1713
1714
0
            E0l = _mm_madd_epi16(m128Tmp0,T06);
1715
0
            E0h = _mm_madd_epi16(m128Tmp1,T06);
1716
0
            E1l = _mm_madd_epi16(m128Tmp2,T16);
1717
0
            E1h = _mm_madd_epi16(m128Tmp3,T16);
1718
0
            E2l = _mm_madd_epi16(m128Tmp4,T26);
1719
0
            E2h = _mm_madd_epi16(m128Tmp5,T26);
1720
0
            E3l = _mm_madd_epi16(m128Tmp6,T36);
1721
0
            E3h = _mm_madd_epi16(m128Tmp7,T36);
1722
1723
0
            O6l = _mm_add_epi32(E0l, E1l);
1724
0
            O6l = _mm_add_epi32(O6l, E2l);
1725
0
            O6l = _mm_add_epi32(O6l, E3l);
1726
1727
0
            O6h = _mm_add_epi32(E0h, E1h);
1728
0
            O6h = _mm_add_epi32(O6h, E2h);
1729
0
            O6h = _mm_add_epi32(O6h, E3h);
1730
1731
            /* Compute O7*/
1732
1733
0
            E0l = _mm_madd_epi16(m128Tmp0,T07);
1734
0
            E0h = _mm_madd_epi16(m128Tmp1,T07);
1735
0
            E1l = _mm_madd_epi16(m128Tmp2,T17);
1736
0
            E1h = _mm_madd_epi16(m128Tmp3,T17);
1737
0
            E2l = _mm_madd_epi16(m128Tmp4,T27);
1738
0
            E2h = _mm_madd_epi16(m128Tmp5,T27);
1739
0
            E3l = _mm_madd_epi16(m128Tmp6,T37);
1740
0
            E3h = _mm_madd_epi16(m128Tmp7,T37);
1741
1742
0
            O7l = _mm_add_epi32(E0l, E1l);
1743
0
            O7l = _mm_add_epi32(O7l, E2l);
1744
0
            O7l = _mm_add_epi32(O7l, E3l);
1745
1746
0
            O7h = _mm_add_epi32(E0h, E1h);
1747
0
            O7h = _mm_add_epi32(O7h, E2h);
1748
0
            O7h = _mm_add_epi32(O7h, E3h);
1749
1750
            /*  Compute E0  */
1751
1752
1753
1754
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1755
0
            E0l = _mm_madd_epi16(m128Tmp0,U00);
1756
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1757
0
            E0h = _mm_madd_epi16(m128Tmp1,U00);
1758
1759
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
1760
0
            E0l = _mm_add_epi32(E0l,
1761
0
                    _mm_madd_epi16(m128Tmp2,U10));
1762
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
1763
0
            E0h = _mm_add_epi32(E0h,
1764
0
                    _mm_madd_epi16(m128Tmp3,U10));
1765
1766
            /*  Compute E1  */
1767
0
            E1l = _mm_madd_epi16(m128Tmp0,U01);
1768
0
            E1h = _mm_madd_epi16(m128Tmp1,U01);
1769
0
            E1l = _mm_add_epi32(E1l,
1770
0
                    _mm_madd_epi16(m128Tmp2,U11));
1771
0
            E1h = _mm_add_epi32(E1h,
1772
0
                    _mm_madd_epi16(m128Tmp3,U11));
1773
1774
            /*  Compute E2  */
1775
0
            E2l = _mm_madd_epi16(m128Tmp0,U02);
1776
0
            E2h = _mm_madd_epi16(m128Tmp1,U02);
1777
0
            E2l = _mm_add_epi32(E2l,
1778
0
                    _mm_madd_epi16(m128Tmp2,U12));
1779
0
            E2h = _mm_add_epi32(E2h,
1780
0
                    _mm_madd_epi16(m128Tmp3,U12));
1781
            /*  Compute E3  */
1782
0
            E3l = _mm_madd_epi16(m128Tmp0,U03);
1783
0
            E3h = _mm_madd_epi16(m128Tmp1,U03);
1784
0
            E3l = _mm_add_epi32(E3l,
1785
0
                    _mm_madd_epi16(m128Tmp2,U13));
1786
0
            E3h = _mm_add_epi32(E3h,
1787
0
                    _mm_madd_epi16(m128Tmp3,U13));
1788
1789
            /*  Compute EE0 and EEE */
1790
1791
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
1792
0
            E00l = _mm_madd_epi16(m128Tmp0,V00);
1793
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
1794
0
            E00h = _mm_madd_epi16(m128Tmp1,V00);
1795
1796
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
1797
0
            EE0l = _mm_madd_epi16(m128Tmp2,V10);
1798
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
1799
0
            EE0h = _mm_madd_epi16(m128Tmp3,V10);
1800
1801
0
            E01l = _mm_madd_epi16(m128Tmp0,V01);
1802
0
            E01h = _mm_madd_epi16(m128Tmp1,V01);
1803
1804
0
            EE1l = _mm_madd_epi16(m128Tmp2,V11);
1805
0
            EE1h = _mm_madd_epi16(m128Tmp3,V11);
1806
1807
            /*  Compute EE    */
1808
0
            EE2l = _mm_sub_epi32(EE1l, E01l);
1809
0
            EE3l = _mm_sub_epi32(EE0l, E00l);
1810
0
            EE2h = _mm_sub_epi32(EE1h, E01h);
1811
0
            EE3h = _mm_sub_epi32(EE0h, E00h);
1812
1813
0
            EE0l = _mm_add_epi32(EE0l, E00l);
1814
0
            EE1l = _mm_add_epi32(EE1l, E01l);
1815
0
            EE0h = _mm_add_epi32(EE0h, E00h);
1816
0
            EE1h = _mm_add_epi32(EE1h, E01h);
1817
1818
            /*      Compute E       */
1819
1820
0
            E4l = _mm_sub_epi32(EE3l, E3l);
1821
0
            E4l = _mm_add_epi32(E4l, m128iAdd);
1822
1823
0
            E5l = _mm_sub_epi32(EE2l, E2l);
1824
0
            E5l = _mm_add_epi32(E5l, m128iAdd);
1825
1826
0
            E6l = _mm_sub_epi32(EE1l, E1l);
1827
0
            E6l = _mm_add_epi32(E6l, m128iAdd);
1828
1829
0
            E7l = _mm_sub_epi32(EE0l, E0l);
1830
0
            E7l = _mm_add_epi32(E7l, m128iAdd);
1831
1832
0
            E4h = _mm_sub_epi32(EE3h, E3h);
1833
0
            E4h = _mm_add_epi32(E4h, m128iAdd);
1834
1835
0
            E5h = _mm_sub_epi32(EE2h, E2h);
1836
0
            E5h = _mm_add_epi32(E5h, m128iAdd);
1837
1838
0
            E6h = _mm_sub_epi32(EE1h, E1h);
1839
0
            E6h = _mm_add_epi32(E6h, m128iAdd);
1840
1841
0
            E7h = _mm_sub_epi32(EE0h, E0h);
1842
0
            E7h = _mm_add_epi32(E7h, m128iAdd);
1843
1844
0
            E0l = _mm_add_epi32(EE0l, E0l);
1845
0
            E0l = _mm_add_epi32(E0l, m128iAdd);
1846
1847
0
            E1l = _mm_add_epi32(EE1l, E1l);
1848
0
            E1l = _mm_add_epi32(E1l, m128iAdd);
1849
1850
0
            E2l = _mm_add_epi32(EE2l, E2l);
1851
0
            E2l = _mm_add_epi32(E2l, m128iAdd);
1852
1853
0
            E3l = _mm_add_epi32(EE3l, E3l);
1854
0
            E3l = _mm_add_epi32(E3l, m128iAdd);
1855
1856
0
            E0h = _mm_add_epi32(EE0h, E0h);
1857
0
            E0h = _mm_add_epi32(E0h, m128iAdd);
1858
1859
0
            E1h = _mm_add_epi32(EE1h, E1h);
1860
0
            E1h = _mm_add_epi32(E1h, m128iAdd);
1861
1862
0
            E2h = _mm_add_epi32(EE2h, E2h);
1863
0
            E2h = _mm_add_epi32(E2h, m128iAdd);
1864
1865
0
            E3h = _mm_add_epi32(EE3h, E3h);
1866
0
            E3h = _mm_add_epi32(E3h, m128iAdd);
1867
1868
0
            m128iS0 = _mm_packs_epi32(
1869
0
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
1870
0
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
1871
0
            m128iS1 = _mm_packs_epi32(
1872
0
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
1873
0
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
1874
0
            m128iS2 = _mm_packs_epi32(
1875
0
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
1876
0
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
1877
0
            m128iS3 = _mm_packs_epi32(
1878
0
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
1879
0
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
1880
1881
0
            m128iS4 = _mm_packs_epi32(
1882
0
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
1883
0
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
1884
0
            m128iS5 = _mm_packs_epi32(
1885
0
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
1886
0
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
1887
0
            m128iS6 = _mm_packs_epi32(
1888
0
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
1889
0
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
1890
0
            m128iS7 = _mm_packs_epi32(
1891
0
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
1892
0
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
1893
1894
0
            m128iS15 = _mm_packs_epi32(
1895
0
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
1896
0
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
1897
0
            m128iS14 = _mm_packs_epi32(
1898
0
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
1899
0
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
1900
0
            m128iS13 = _mm_packs_epi32(
1901
0
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
1902
0
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
1903
0
            m128iS12 = _mm_packs_epi32(
1904
0
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
1905
0
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
1906
1907
0
            m128iS11 = _mm_packs_epi32(
1908
0
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
1909
0
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
1910
0
            m128iS10 = _mm_packs_epi32(
1911
0
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
1912
0
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
1913
0
            m128iS9 = _mm_packs_epi32(
1914
0
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
1915
0
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
1916
0
            m128iS8 = _mm_packs_epi32(
1917
0
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
1918
0
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
1919
1920
1921
1922
0
            if (!j) { //first pass
1923
1924
                /*      Inverse the matrix      */
1925
0
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
1926
0
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
1927
0
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
1928
0
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
1929
0
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
1930
0
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
1931
0
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
1932
0
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
1933
1934
0
                E0h = _mm_unpackhi_epi16(m128iS0, m128iS8);
1935
0
                E1h = _mm_unpackhi_epi16(m128iS1, m128iS9);
1936
0
                E2h = _mm_unpackhi_epi16(m128iS2, m128iS10);
1937
0
                E3h = _mm_unpackhi_epi16(m128iS3, m128iS11);
1938
0
                E4h = _mm_unpackhi_epi16(m128iS4, m128iS12);
1939
0
                E5h = _mm_unpackhi_epi16(m128iS5, m128iS13);
1940
0
                E6h = _mm_unpackhi_epi16(m128iS6, m128iS14);
1941
0
                E7h = _mm_unpackhi_epi16(m128iS7, m128iS15);
1942
1943
0
                m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
1944
0
                m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
1945
0
                m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
1946
0
                m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
1947
1948
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1949
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1950
0
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1951
0
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1952
1953
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1954
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1955
0
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1956
0
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1957
1958
0
                m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
1959
0
                m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
1960
0
                m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
1961
0
                m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
1962
1963
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1964
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1965
0
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1966
0
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1967
1968
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1969
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1970
0
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1971
0
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1972
1973
0
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
1974
0
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
1975
0
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
1976
0
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
1977
1978
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1979
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1980
0
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1981
0
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1982
1983
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1984
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1985
0
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1986
0
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1987
1988
0
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
1989
0
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
1990
0
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
1991
0
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
1992
1993
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1994
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1995
0
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1996
0
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1997
1998
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1999
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2000
0
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2001
0
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2002
2003
0
                if (!i) {
2004
2005
0
                    r0= m128iS0;    //0
2006
0
                    r1= m128iS1;    //16
2007
0
                    r2= m128iS2;    //32
2008
0
                    r3= m128iS3;    //48
2009
0
                    r4= m128iS4;    //64
2010
0
                    r5= m128iS5;    //80
2011
0
                    r6= m128iS6;    //96
2012
0
                    r7= m128iS7;    //112
2013
0
                    r8= m128iS8;    //128
2014
0
                    r9= m128iS9;    //144
2015
0
                    r10= m128iS10;  //160
2016
0
                    r11= m128iS11;  //176
2017
0
                    r12= m128iS12;  //192
2018
0
                    r13= m128iS13;  //208
2019
0
                    r14= m128iS14;  //224
2020
0
                    r15= m128iS15;  //240
2021
2022
2023
2024
0
                    m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2025
0
                    m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2026
0
                    m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2027
0
                    m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2028
0
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2029
0
                    m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2030
0
                    m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2031
0
                    m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2032
0
                    m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2033
0
                    m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2034
0
                    m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2035
0
                    m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2036
0
                    m128iS12 = _mm_load_si128((__m128i *) (src + 200));
2037
0
                    m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2038
0
                    m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2039
0
                    m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2040
0
                } else {
2041
2042
0
                    r16= m128iS0;    //8
2043
0
                    r17= m128iS1;    //24
2044
0
                    r18= m128iS2;    //40
2045
0
                    r19= m128iS3;    //56
2046
0
                    r20= m128iS4;    //72
2047
0
                    r21= m128iS5;    //88
2048
0
                    r22= m128iS6;    //104
2049
0
                    r23= m128iS7;    //120
2050
0
                    r24= m128iS8;    //136
2051
0
                    r25= m128iS9;    //152
2052
0
                    r26= m128iS10;  //168
2053
0
                    r27= m128iS11;  //184
2054
0
                    r28= m128iS12;  //200
2055
0
                    r29= m128iS13;  //216
2056
0
                    r30= m128iS14;  //232
2057
0
                    r31= m128iS15;  //248
2058
2059
                    //prepare next iteration :
2060
2061
0
                    m128iS0= r0;
2062
0
                    m128iS1= r2;
2063
0
                    m128iS2= r4;
2064
0
                    m128iS3= r6;
2065
0
                    m128iS4= r8;
2066
0
                    m128iS5= r10;
2067
0
                    m128iS6= r12;
2068
0
                    m128iS7= r14;
2069
0
                    m128iS8= r16;
2070
0
                    m128iS9= r18;
2071
0
                    m128iS10=r20;
2072
0
                    m128iS11=r22;
2073
0
                    m128iS12=r24;
2074
0
                    m128iS13=r26;
2075
0
                    m128iS14=r28;
2076
0
                    m128iS15=r30;
2077
2078
0
                    shift = shift_2nd;
2079
0
                    m128iAdd = _mm_set1_epi32(add_2nd);
2080
0
                }
2081
2082
0
            } else {
2083
2084
                //transpose half matrix :
2085
                //instead of having 1 register = 1 half-column,
2086
                //1 register = 1 half-row.
2087
0
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS1);
2088
0
                E1l = _mm_unpacklo_epi16(m128iS2, m128iS3);
2089
0
                E2l = _mm_unpacklo_epi16(m128iS4, m128iS5);
2090
0
                E3l = _mm_unpacklo_epi16(m128iS6, m128iS7);
2091
0
                E4l = _mm_unpacklo_epi16(m128iS8, m128iS9);
2092
0
                E5l = _mm_unpacklo_epi16(m128iS10, m128iS11);
2093
0
                E6l = _mm_unpacklo_epi16(m128iS12, m128iS13);
2094
0
                E7l = _mm_unpacklo_epi16(m128iS14, m128iS15);
2095
2096
0
                O0l = _mm_unpackhi_epi16(m128iS0, m128iS1);
2097
0
                O1l = _mm_unpackhi_epi16(m128iS2, m128iS3);
2098
0
                O2l = _mm_unpackhi_epi16(m128iS4, m128iS5);
2099
0
                O3l = _mm_unpackhi_epi16(m128iS6, m128iS7);
2100
0
                O4l = _mm_unpackhi_epi16(m128iS8, m128iS9);
2101
0
                O5l = _mm_unpackhi_epi16(m128iS10, m128iS11);
2102
0
                O6l = _mm_unpackhi_epi16(m128iS12, m128iS13);
2103
0
                O7l = _mm_unpackhi_epi16(m128iS14, m128iS15);
2104
2105
2106
0
                m128Tmp0 = _mm_unpacklo_epi32(E0l, E1l);
2107
0
                m128Tmp1 = _mm_unpacklo_epi32(E2l, E3l);
2108
2109
0
                m128Tmp2 = _mm_unpacklo_epi32(E4l, E5l);
2110
0
                m128Tmp3 = _mm_unpacklo_epi32(E6l, E7l);
2111
2112
0
                r0 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);    //1st half 1st row
2113
0
                r2 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);    //2nd half 1st row
2114
2115
2116
0
                r4 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);    //1st half 2nd row
2117
0
                r6 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);    //2nd half 2nd row
2118
2119
0
                m128Tmp0 = _mm_unpackhi_epi32(E0l, E1l);
2120
0
                m128Tmp1 = _mm_unpackhi_epi32(E2l, E3l);
2121
0
                m128Tmp2 = _mm_unpackhi_epi32(E4l, E5l);
2122
0
                m128Tmp3 = _mm_unpackhi_epi32(E6l, E7l);
2123
2124
2125
0
                r8 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2126
0
                r10 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2127
2128
0
                r12 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2129
0
                r14 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2130
2131
0
                m128Tmp0 = _mm_unpacklo_epi32(O0l, O1l);
2132
0
                m128Tmp1 = _mm_unpacklo_epi32(O2l, O3l);
2133
0
                m128Tmp2 = _mm_unpacklo_epi32(O4l, O5l);
2134
0
                m128Tmp3 = _mm_unpacklo_epi32(O6l, O7l);
2135
2136
0
                r16 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2137
0
                r18 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2138
2139
2140
0
                r20 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2141
0
                r22 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2142
2143
0
                m128Tmp0 = _mm_unpackhi_epi32(O0l, O1l);
2144
0
                m128Tmp1 = _mm_unpackhi_epi32(O2l, O3l);
2145
0
                m128Tmp2 = _mm_unpackhi_epi32(O4l, O5l);
2146
0
                m128Tmp3 = _mm_unpackhi_epi32(O6l, O7l);
2147
2148
0
                r24 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2149
0
                r26 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2150
2151
2152
0
                r28 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2153
0
                r30 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2154
2155
0
                dst = (uint8_t*) (_dst + (i*stride));
2156
0
                m128Tmp0= _mm_setzero_si128();
2157
0
                m128Tmp1= _mm_load_si128((__m128i*)dst);
2158
0
                m128Tmp2= _mm_load_si128((__m128i*)(dst+stride));
2159
0
                m128Tmp3= _mm_load_si128((__m128i*)(dst+2*stride));
2160
0
                m128Tmp4= _mm_load_si128((__m128i*)(dst+3*stride));
2161
0
                m128Tmp5= _mm_load_si128((__m128i*)(dst+4*stride));
2162
0
                m128Tmp6= _mm_load_si128((__m128i*)(dst+5*stride));
2163
0
                m128Tmp7= _mm_load_si128((__m128i*)(dst+6*stride));
2164
0
                E0l= _mm_load_si128((__m128i*)(dst+7*stride));
2165
2166
2167
0
                r0= _mm_adds_epi16(r0,_mm_unpacklo_epi8(m128Tmp1,m128Tmp0));
2168
0
                r2= _mm_adds_epi16(r2,_mm_unpackhi_epi8(m128Tmp1,m128Tmp0));
2169
0
                r0= _mm_packus_epi16(r0,r2);
2170
2171
2172
2173
2174
0
                r4= _mm_adds_epi16(r4,_mm_unpacklo_epi8(m128Tmp2,m128Tmp0));
2175
0
                r6= _mm_adds_epi16(r6,_mm_unpackhi_epi8(m128Tmp2,m128Tmp0));
2176
0
                r4= _mm_packus_epi16(r4,r6);
2177
2178
2179
0
                r8= _mm_adds_epi16(r8,_mm_unpacklo_epi8(m128Tmp3,m128Tmp0));
2180
0
                r10= _mm_adds_epi16(r10,_mm_unpackhi_epi8(m128Tmp3,m128Tmp0));
2181
0
                r8= _mm_packus_epi16(r8,r10);
2182
2183
2184
0
                r12= _mm_adds_epi16(r12,_mm_unpacklo_epi8(m128Tmp4,m128Tmp0));
2185
0
                r14= _mm_adds_epi16(r14,_mm_unpackhi_epi8(m128Tmp4,m128Tmp0));
2186
0
                r12= _mm_packus_epi16(r12,r14);
2187
2188
2189
0
                r16= _mm_adds_epi16(r16,_mm_unpacklo_epi8(m128Tmp5,m128Tmp0));
2190
0
                r18= _mm_adds_epi16(r18,_mm_unpackhi_epi8(m128Tmp5,m128Tmp0));
2191
0
                r16= _mm_packus_epi16(r16,r18);
2192
2193
2194
0
                r20= _mm_adds_epi16(r20,_mm_unpacklo_epi8(m128Tmp6,m128Tmp0));
2195
0
                r22= _mm_adds_epi16(r22,_mm_unpackhi_epi8(m128Tmp6,m128Tmp0));
2196
0
                r20= _mm_packus_epi16(r20,r22);
2197
2198
2199
0
                r24= _mm_adds_epi16(r24,_mm_unpacklo_epi8(m128Tmp7,m128Tmp0));
2200
0
                r26= _mm_adds_epi16(r26,_mm_unpackhi_epi8(m128Tmp7,m128Tmp0));
2201
0
                r24= _mm_packus_epi16(r24,r26);
2202
2203
2204
2205
0
                r28= _mm_adds_epi16(r28,_mm_unpacklo_epi8(E0l,m128Tmp0));
2206
0
                r30= _mm_adds_epi16(r30,_mm_unpackhi_epi8(E0l,m128Tmp0));
2207
0
                r28= _mm_packus_epi16(r28,r30);
2208
2209
0
                _mm_store_si128((__m128i*)dst,r0);
2210
0
                _mm_store_si128((__m128i*)(dst+stride),r4);
2211
0
                _mm_store_si128((__m128i*)(dst+2*stride),r8);
2212
0
                _mm_store_si128((__m128i*)(dst+3*stride),r12);
2213
0
                _mm_store_si128((__m128i*)(dst+4*stride),r16);
2214
0
                _mm_store_si128((__m128i*)(dst+5*stride),r20);
2215
0
                _mm_store_si128((__m128i*)(dst+6*stride),r24);
2216
0
                _mm_store_si128((__m128i*)(dst+7*stride),r28);
2217
2218
2219
2220
0
                if (!i) {
2221
                    //first half done, can store !
2222
2223
2224
0
                    m128iS0= r1;
2225
0
                    m128iS1= r3;
2226
0
                    m128iS2= r5;
2227
0
                    m128iS3= r7;
2228
0
                    m128iS4= r9;
2229
0
                    m128iS5= r11;
2230
0
                    m128iS6= r13;
2231
0
                    m128iS7= r15;
2232
0
                    m128iS8= r17;
2233
0
                    m128iS9= r19;
2234
0
                    m128iS10=r21;
2235
0
                    m128iS11=r23;
2236
0
                    m128iS12=r25;
2237
0
                    m128iS13=r27;
2238
0
                    m128iS14=r29;
2239
0
                    m128iS15=r31;
2240
0
                }
2241
0
            }
2242
0
        }
2243
0
    }
2244
0
}
2245
#endif
2246
2247
2248
#if 0
2249
void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
2250
        ptrdiff_t _stride) {
2251
    int i;
2252
    uint16_t *dst = (uint16_t*) _dst;
2253
    ptrdiff_t stride = _stride / 2;
2254
    int16_t *src = coeffs;
2255
    int32_t shift;
2256
    uint8_t shift_2nd = 10; //20 - bit depth
2257
    uint16_t add_2nd = 1 << 9; //shift - 1;
2258
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2259
            m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2260
            m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2261
            m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2262
            E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2263
            O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2264
            E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2265
    __m128i E4l, E5l, E6l, E7l;
2266
    __m128i E4h, E5h, E6h, E7h;
2267
    int j;
2268
    m128iS0 = _mm_load_si128((__m128i *) (src));
2269
    m128iS1 = _mm_load_si128((__m128i *) (src + 16));
2270
    m128iS2 = _mm_load_si128((__m128i *) (src + 32));
2271
    m128iS3 = _mm_load_si128((__m128i *) (src + 48));
2272
    m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
2273
    m128iS5 = _mm_load_si128((__m128i *) (src + 80));
2274
    m128iS6 = _mm_load_si128((__m128i *) (src + 96));
2275
    m128iS7 = _mm_load_si128((__m128i *) (src + 112));
2276
    m128iS8 = _mm_load_si128((__m128i *) (src + 128));
2277
    m128iS9 = _mm_load_si128((__m128i *) (src + 144));
2278
    m128iS10 = _mm_load_si128((__m128i *) (src + 160));
2279
    m128iS11 = _mm_load_si128((__m128i *) (src + 176));
2280
    m128iS12 = _mm_loadu_si128((__m128i *) (src + 192));
2281
    m128iS13 = _mm_load_si128((__m128i *) (src + 208));
2282
    m128iS14 = _mm_load_si128((__m128i *) (src + 224));
2283
    m128iS15 = _mm_load_si128((__m128i *) (src + 240));
2284
    shift = shift_1st;
2285
    m128iAdd = _mm_set1_epi32(add_1st);
2286
2287
    for (j = 0; j < 2; j++) {
2288
        for (i = 0; i < 16; i += 8) {
2289
2290
            m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2291
            E0l = _mm_madd_epi16(m128Tmp0,
2292
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2293
            m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2294
            E0h = _mm_madd_epi16(m128Tmp1,
2295
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2296
2297
            m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2298
            E1l = _mm_madd_epi16(m128Tmp2,
2299
                    _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2300
            m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2301
            E1h = _mm_madd_epi16(m128Tmp3,
2302
                    _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2303
2304
            m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2305
            E2l = _mm_madd_epi16(m128Tmp4,
2306
                    _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2307
            m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
2308
            E2h = _mm_madd_epi16(m128Tmp5,
2309
                    _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2310
2311
            m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
2312
            E3l = _mm_madd_epi16(m128Tmp6,
2313
                    _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2314
            m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
2315
            E3h = _mm_madd_epi16(m128Tmp7,
2316
                    _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2317
2318
            O0l = _mm_add_epi32(E0l, E1l);
2319
            O0l = _mm_add_epi32(O0l, E2l);
2320
            O0l = _mm_add_epi32(O0l, E3l);
2321
2322
            O0h = _mm_add_epi32(E0h, E1h);
2323
            O0h = _mm_add_epi32(O0h, E2h);
2324
            O0h = _mm_add_epi32(O0h, E3h);
2325
2326
            /* Compute O1*/
2327
            E0l = _mm_madd_epi16(m128Tmp0,
2328
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2329
            E0h = _mm_madd_epi16(m128Tmp1,
2330
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2331
            E1l = _mm_madd_epi16(m128Tmp2,
2332
                    _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2333
            E1h = _mm_madd_epi16(m128Tmp3,
2334
                    _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2335
            E2l = _mm_madd_epi16(m128Tmp4,
2336
                    _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2337
            E2h = _mm_madd_epi16(m128Tmp5,
2338
                    _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2339
            E3l = _mm_madd_epi16(m128Tmp6,
2340
                    _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2341
            E3h = _mm_madd_epi16(m128Tmp7,
2342
                    _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2343
            O1l = _mm_add_epi32(E0l, E1l);
2344
            O1l = _mm_add_epi32(O1l, E2l);
2345
            O1l = _mm_add_epi32(O1l, E3l);
2346
            O1h = _mm_add_epi32(E0h, E1h);
2347
            O1h = _mm_add_epi32(O1h, E2h);
2348
            O1h = _mm_add_epi32(O1h, E3h);
2349
2350
            /* Compute O2*/
2351
            E0l = _mm_madd_epi16(m128Tmp0,
2352
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2353
            E0h = _mm_madd_epi16(m128Tmp1,
2354
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2355
            E1l = _mm_madd_epi16(m128Tmp2,
2356
                    _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2357
            E1h = _mm_madd_epi16(m128Tmp3,
2358
                    _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2359
            E2l = _mm_madd_epi16(m128Tmp4,
2360
                    _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2361
            E2h = _mm_madd_epi16(m128Tmp5,
2362
                    _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2363
            E3l = _mm_madd_epi16(m128Tmp6,
2364
                    _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2365
            E3h = _mm_madd_epi16(m128Tmp7,
2366
                    _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2367
            O2l = _mm_add_epi32(E0l, E1l);
2368
            O2l = _mm_add_epi32(O2l, E2l);
2369
            O2l = _mm_add_epi32(O2l, E3l);
2370
2371
            O2h = _mm_add_epi32(E0h, E1h);
2372
            O2h = _mm_add_epi32(O2h, E2h);
2373
            O2h = _mm_add_epi32(O2h, E3h);
2374
2375
            /* Compute O3*/
2376
            E0l = _mm_madd_epi16(m128Tmp0,
2377
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2378
            E0h = _mm_madd_epi16(m128Tmp1,
2379
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2380
            E1l = _mm_madd_epi16(m128Tmp2,
2381
                    _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2382
            E1h = _mm_madd_epi16(m128Tmp3,
2383
                    _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2384
            E2l = _mm_madd_epi16(m128Tmp4,
2385
                    _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2386
            E2h = _mm_madd_epi16(m128Tmp5,
2387
                    _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2388
            E3l = _mm_madd_epi16(m128Tmp6,
2389
                    _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2390
            E3h = _mm_madd_epi16(m128Tmp7,
2391
                    _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2392
2393
            O3l = _mm_add_epi32(E0l, E1l);
2394
            O3l = _mm_add_epi32(O3l, E2l);
2395
            O3l = _mm_add_epi32(O3l, E3l);
2396
2397
            O3h = _mm_add_epi32(E0h, E1h);
2398
            O3h = _mm_add_epi32(O3h, E2h);
2399
            O3h = _mm_add_epi32(O3h, E3h);
2400
2401
            /* Compute O4*/
2402
2403
            E0l = _mm_madd_epi16(m128Tmp0,
2404
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2405
            E0h = _mm_madd_epi16(m128Tmp1,
2406
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2407
            E1l = _mm_madd_epi16(m128Tmp2,
2408
                    _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2409
            E1h = _mm_madd_epi16(m128Tmp3,
2410
                    _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2411
            E2l = _mm_madd_epi16(m128Tmp4,
2412
                    _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2413
            E2h = _mm_madd_epi16(m128Tmp5,
2414
                    _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2415
            E3l = _mm_madd_epi16(m128Tmp6,
2416
                    _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2417
            E3h = _mm_madd_epi16(m128Tmp7,
2418
                    _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2419
2420
            O4l = _mm_add_epi32(E0l, E1l);
2421
            O4l = _mm_add_epi32(O4l, E2l);
2422
            O4l = _mm_add_epi32(O4l, E3l);
2423
2424
            O4h = _mm_add_epi32(E0h, E1h);
2425
            O4h = _mm_add_epi32(O4h, E2h);
2426
            O4h = _mm_add_epi32(O4h, E3h);
2427
2428
            /* Compute O5*/
2429
            E0l = _mm_madd_epi16(m128Tmp0,
2430
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2431
            E0h = _mm_madd_epi16(m128Tmp1,
2432
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2433
            E1l = _mm_madd_epi16(m128Tmp2,
2434
                    _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2435
            E1h = _mm_madd_epi16(m128Tmp3,
2436
                    _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2437
            E2l = _mm_madd_epi16(m128Tmp4,
2438
                    _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2439
            E2h = _mm_madd_epi16(m128Tmp5,
2440
                    _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2441
            E3l = _mm_madd_epi16(m128Tmp6,
2442
                    _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2443
            E3h = _mm_madd_epi16(m128Tmp7,
2444
                    _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2445
2446
            O5l = _mm_add_epi32(E0l, E1l);
2447
            O5l = _mm_add_epi32(O5l, E2l);
2448
            O5l = _mm_add_epi32(O5l, E3l);
2449
2450
            O5h = _mm_add_epi32(E0h, E1h);
2451
            O5h = _mm_add_epi32(O5h, E2h);
2452
            O5h = _mm_add_epi32(O5h, E3h);
2453
2454
            /* Compute O6*/
2455
2456
            E0l = _mm_madd_epi16(m128Tmp0,
2457
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2458
            E0h = _mm_madd_epi16(m128Tmp1,
2459
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2460
            E1l = _mm_madd_epi16(m128Tmp2,
2461
                    _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2462
            E1h = _mm_madd_epi16(m128Tmp3,
2463
                    _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2464
            E2l = _mm_madd_epi16(m128Tmp4,
2465
                    _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2466
            E2h = _mm_madd_epi16(m128Tmp5,
2467
                    _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2468
            E3l = _mm_madd_epi16(m128Tmp6,
2469
                    _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2470
            E3h = _mm_madd_epi16(m128Tmp7,
2471
                    _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2472
2473
            O6l = _mm_add_epi32(E0l, E1l);
2474
            O6l = _mm_add_epi32(O6l, E2l);
2475
            O6l = _mm_add_epi32(O6l, E3l);
2476
2477
            O6h = _mm_add_epi32(E0h, E1h);
2478
            O6h = _mm_add_epi32(O6h, E2h);
2479
            O6h = _mm_add_epi32(O6h, E3h);
2480
2481
            /* Compute O7*/
2482
2483
            E0l = _mm_madd_epi16(m128Tmp0,
2484
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2485
            E0h = _mm_madd_epi16(m128Tmp1,
2486
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2487
            E1l = _mm_madd_epi16(m128Tmp2,
2488
                    _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2489
            E1h = _mm_madd_epi16(m128Tmp3,
2490
                    _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2491
            E2l = _mm_madd_epi16(m128Tmp4,
2492
                    _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2493
            E2h = _mm_madd_epi16(m128Tmp5,
2494
                    _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2495
            E3l = _mm_madd_epi16(m128Tmp6,
2496
                    _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2497
            E3h = _mm_madd_epi16(m128Tmp7,
2498
                    _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2499
2500
            O7l = _mm_add_epi32(E0l, E1l);
2501
            O7l = _mm_add_epi32(O7l, E2l);
2502
            O7l = _mm_add_epi32(O7l, E3l);
2503
2504
            O7h = _mm_add_epi32(E0h, E1h);
2505
            O7h = _mm_add_epi32(O7h, E2h);
2506
            O7h = _mm_add_epi32(O7h, E3h);
2507
2508
            /*  Compute E0  */
2509
2510
            m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
2511
            E0l = _mm_madd_epi16(m128Tmp0,
2512
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2513
            m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
2514
            E0h = _mm_madd_epi16(m128Tmp1,
2515
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2516
2517
            m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
2518
            E0l = _mm_add_epi32(E0l,
2519
                    _mm_madd_epi16(m128Tmp2,
2520
                            _mm_load_si128(
2521
                                    (__m128i *) (transform16x16_2[1][0]))));
2522
            m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
2523
            E0h = _mm_add_epi32(E0h,
2524
                    _mm_madd_epi16(m128Tmp3,
2525
                            _mm_load_si128(
2526
                                    (__m128i *) (transform16x16_2[1][0]))));
2527
2528
            /*  Compute E1  */
2529
            E1l = _mm_madd_epi16(m128Tmp0,
2530
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2531
            E1h = _mm_madd_epi16(m128Tmp1,
2532
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2533
            E1l = _mm_add_epi32(E1l,
2534
                    _mm_madd_epi16(m128Tmp2,
2535
                            _mm_load_si128(
2536
                                    (__m128i *) (transform16x16_2[1][1]))));
2537
            E1h = _mm_add_epi32(E1h,
2538
                    _mm_madd_epi16(m128Tmp3,
2539
                            _mm_load_si128(
2540
                                    (__m128i *) (transform16x16_2[1][1]))));
2541
2542
            /*  Compute E2  */
2543
            E2l = _mm_madd_epi16(m128Tmp0,
2544
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2545
            E2h = _mm_madd_epi16(m128Tmp1,
2546
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2547
            E2l = _mm_add_epi32(E2l,
2548
                    _mm_madd_epi16(m128Tmp2,
2549
                            _mm_load_si128(
2550
                                    (__m128i *) (transform16x16_2[1][2]))));
2551
            E2h = _mm_add_epi32(E2h,
2552
                    _mm_madd_epi16(m128Tmp3,
2553
                            _mm_load_si128(
2554
                                    (__m128i *) (transform16x16_2[1][2]))));
2555
            /*  Compute E3  */
2556
            E3l = _mm_madd_epi16(m128Tmp0,
2557
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2558
            E3h = _mm_madd_epi16(m128Tmp1,
2559
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2560
            E3l = _mm_add_epi32(E3l,
2561
                    _mm_madd_epi16(m128Tmp2,
2562
                            _mm_load_si128(
2563
                                    (__m128i *) (transform16x16_2[1][3]))));
2564
            E3h = _mm_add_epi32(E3h,
2565
                    _mm_madd_epi16(m128Tmp3,
2566
                            _mm_load_si128(
2567
                                    (__m128i *) (transform16x16_2[1][3]))));
2568
2569
            /*  Compute EE0 and EEE */
2570
2571
            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
2572
            E00l = _mm_madd_epi16(m128Tmp0,
2573
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2574
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
2575
            E00h = _mm_madd_epi16(m128Tmp1,
2576
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2577
2578
            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
2579
            EE0l = _mm_madd_epi16(m128Tmp2,
2580
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2581
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
2582
            EE0h = _mm_madd_epi16(m128Tmp3,
2583
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2584
2585
            E01l = _mm_madd_epi16(m128Tmp0,
2586
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2587
            E01h = _mm_madd_epi16(m128Tmp1,
2588
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2589
2590
            EE1l = _mm_madd_epi16(m128Tmp2,
2591
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2592
            EE1h = _mm_madd_epi16(m128Tmp3,
2593
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2594
2595
            /*  Compute EE    */
2596
            EE2l = _mm_sub_epi32(EE1l, E01l);
2597
            EE3l = _mm_sub_epi32(EE0l, E00l);
2598
            EE2h = _mm_sub_epi32(EE1h, E01h);
2599
            EE3h = _mm_sub_epi32(EE0h, E00h);
2600
2601
            EE0l = _mm_add_epi32(EE0l, E00l);
2602
            EE1l = _mm_add_epi32(EE1l, E01l);
2603
            EE0h = _mm_add_epi32(EE0h, E00h);
2604
            EE1h = _mm_add_epi32(EE1h, E01h);
2605
2606
            /*      Compute E       */
2607
2608
            E4l = _mm_sub_epi32(EE3l, E3l);
2609
            E4l = _mm_add_epi32(E4l, m128iAdd);
2610
2611
            E5l = _mm_sub_epi32(EE2l, E2l);
2612
            E5l = _mm_add_epi32(E5l, m128iAdd);
2613
2614
            E6l = _mm_sub_epi32(EE1l, E1l);
2615
            E6l = _mm_add_epi32(E6l, m128iAdd);
2616
2617
            E7l = _mm_sub_epi32(EE0l, E0l);
2618
            E7l = _mm_add_epi32(E7l, m128iAdd);
2619
2620
            E4h = _mm_sub_epi32(EE3h, E3h);
2621
            E4h = _mm_add_epi32(E4h, m128iAdd);
2622
2623
            E5h = _mm_sub_epi32(EE2h, E2h);
2624
            E5h = _mm_add_epi32(E5h, m128iAdd);
2625
2626
            E6h = _mm_sub_epi32(EE1h, E1h);
2627
            E6h = _mm_add_epi32(E6h, m128iAdd);
2628
2629
            E7h = _mm_sub_epi32(EE0h, E0h);
2630
            E7h = _mm_add_epi32(E7h, m128iAdd);
2631
2632
            E0l = _mm_add_epi32(EE0l, E0l);
2633
            E0l = _mm_add_epi32(E0l, m128iAdd);
2634
2635
            E1l = _mm_add_epi32(EE1l, E1l);
2636
            E1l = _mm_add_epi32(E1l, m128iAdd);
2637
2638
            E2l = _mm_add_epi32(EE2l, E2l);
2639
            E2l = _mm_add_epi32(E2l, m128iAdd);
2640
2641
            E3l = _mm_add_epi32(EE3l, E3l);
2642
            E3l = _mm_add_epi32(E3l, m128iAdd);
2643
2644
            E0h = _mm_add_epi32(EE0h, E0h);
2645
            E0h = _mm_add_epi32(E0h, m128iAdd);
2646
2647
            E1h = _mm_add_epi32(EE1h, E1h);
2648
            E1h = _mm_add_epi32(E1h, m128iAdd);
2649
2650
            E2h = _mm_add_epi32(EE2h, E2h);
2651
            E2h = _mm_add_epi32(E2h, m128iAdd);
2652
2653
            E3h = _mm_add_epi32(EE3h, E3h);
2654
            E3h = _mm_add_epi32(E3h, m128iAdd);
2655
2656
            m128iS0 = _mm_packs_epi32(
2657
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
2658
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
2659
            m128iS1 = _mm_packs_epi32(
2660
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
2661
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
2662
            m128iS2 = _mm_packs_epi32(
2663
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
2664
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
2665
            m128iS3 = _mm_packs_epi32(
2666
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
2667
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
2668
2669
            m128iS4 = _mm_packs_epi32(
2670
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
2671
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
2672
            m128iS5 = _mm_packs_epi32(
2673
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
2674
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
2675
            m128iS6 = _mm_packs_epi32(
2676
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
2677
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
2678
            m128iS7 = _mm_packs_epi32(
2679
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
2680
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
2681
2682
            m128iS15 = _mm_packs_epi32(
2683
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
2684
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
2685
            m128iS14 = _mm_packs_epi32(
2686
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
2687
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
2688
            m128iS13 = _mm_packs_epi32(
2689
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
2690
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
2691
            m128iS12 = _mm_packs_epi32(
2692
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
2693
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
2694
2695
            m128iS11 = _mm_packs_epi32(
2696
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
2697
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
2698
            m128iS10 = _mm_packs_epi32(
2699
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
2700
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
2701
            m128iS9 = _mm_packs_epi32(
2702
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
2703
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
2704
            m128iS8 = _mm_packs_epi32(
2705
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
2706
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
2707
2708
            if (!j) {
2709
                /*      Inverse the matrix      */
2710
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
2711
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
2712
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
2713
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
2714
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
2715
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
2716
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
2717
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
2718
2719
                O0l = _mm_unpackhi_epi16(m128iS0, m128iS8);
2720
                O1l = _mm_unpackhi_epi16(m128iS1, m128iS9);
2721
                O2l = _mm_unpackhi_epi16(m128iS2, m128iS10);
2722
                O3l = _mm_unpackhi_epi16(m128iS3, m128iS11);
2723
                O4l = _mm_unpackhi_epi16(m128iS4, m128iS12);
2724
                O5l = _mm_unpackhi_epi16(m128iS5, m128iS13);
2725
                O6l = _mm_unpackhi_epi16(m128iS6, m128iS14);
2726
                O7l = _mm_unpackhi_epi16(m128iS7, m128iS15);
2727
2728
                m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
2729
                m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
2730
                m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
2731
                m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
2732
2733
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2734
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2735
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2736
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2737
2738
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2739
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2740
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2741
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2742
2743
                m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
2744
                m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
2745
                m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
2746
                m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
2747
2748
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2749
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2750
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2751
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2752
2753
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2754
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2755
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2756
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2757
2758
                m128Tmp0 = _mm_unpacklo_epi16(O0l, O4l);
2759
                m128Tmp1 = _mm_unpacklo_epi16(O1l, O5l);
2760
                m128Tmp2 = _mm_unpacklo_epi16(O2l, O6l);
2761
                m128Tmp3 = _mm_unpacklo_epi16(O3l, O7l);
2762
2763
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2764
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2765
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2766
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2767
2768
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2769
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2770
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2771
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2772
2773
                m128Tmp0 = _mm_unpackhi_epi16(O0l, O4l);
2774
                m128Tmp1 = _mm_unpackhi_epi16(O1l, O5l);
2775
                m128Tmp2 = _mm_unpackhi_epi16(O2l, O6l);
2776
                m128Tmp3 = _mm_unpackhi_epi16(O3l, O7l);
2777
2778
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2779
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2780
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2781
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2782
2783
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2784
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2785
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2786
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2787
2788
                /*  */
2789
                _mm_store_si128((__m128i *) (src + i), m128iS0);
2790
                _mm_store_si128((__m128i *) (src + 16 + i), m128iS1);
2791
                _mm_store_si128((__m128i *) (src + 32 + i), m128iS2);
2792
                _mm_store_si128((__m128i *) (src + 48 + i), m128iS3);
2793
                _mm_store_si128((__m128i *) (src + 64 + i), m128iS4);
2794
                _mm_store_si128((__m128i *) (src + 80 + i), m128iS5);
2795
                _mm_store_si128((__m128i *) (src + 96 + i), m128iS6);
2796
                _mm_store_si128((__m128i *) (src + 112 + i), m128iS7);
2797
                _mm_store_si128((__m128i *) (src + 128 + i), m128iS8);
2798
                _mm_store_si128((__m128i *) (src + 144 + i), m128iS9);
2799
                _mm_store_si128((__m128i *) (src + 160 + i), m128iS10);
2800
                _mm_store_si128((__m128i *) (src + 176 + i), m128iS11);
2801
                _mm_store_si128((__m128i *) (src + 192 + i), m128iS12);
2802
                _mm_store_si128((__m128i *) (src + 208 + i), m128iS13);
2803
                _mm_store_si128((__m128i *) (src + 224 + i), m128iS14);
2804
                _mm_store_si128((__m128i *) (src + 240 + i), m128iS15);
2805
2806
                if (!i) {
2807
                    m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2808
                    m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2809
                    m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2810
                    m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2811
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2812
                    m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2813
                    m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2814
                    m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2815
                    m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2816
                    m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2817
                    m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2818
                    m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2819
                    m128iS12 = _mm_loadu_si128((__m128i *) (src + 200));
2820
                    m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2821
                    m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2822
                    m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2823
                } else {
2824
                    m128iS0 = _mm_load_si128((__m128i *) (src));
2825
                    m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2826
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2827
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2828
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2829
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2830
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2831
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2832
                    m128iS8 = _mm_load_si128((__m128i *) (src + 8));
2833
                    m128iS9 = _mm_load_si128((__m128i *) (src + 32 + 8));
2834
                    m128iS10 = _mm_load_si128((__m128i *) (src + 64 + 8));
2835
                    m128iS11 = _mm_load_si128((__m128i *) (src + 96 + 8));
2836
                    m128iS12 = _mm_loadu_si128((__m128i *) (src + 128 + 8));
2837
                    m128iS13 = _mm_load_si128((__m128i *) (src + 160 + 8));
2838
                    m128iS14 = _mm_load_si128((__m128i *) (src + 192 + 8));
2839
                    m128iS15 = _mm_load_si128((__m128i *) (src + 224 + 8));
2840
                    shift = shift_2nd;
2841
                    m128iAdd = _mm_set1_epi32(add_2nd);
2842
                }
2843
2844
            } else {
2845
                int k, m = 0;
2846
                _mm_storeu_si128((__m128i *) (src), m128iS0);
2847
                _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
2848
                _mm_storeu_si128((__m128i *) (src + 32), m128iS2);
2849
                _mm_storeu_si128((__m128i *) (src + 40), m128iS3);
2850
                _mm_storeu_si128((__m128i *) (src + 64), m128iS4);
2851
                _mm_storeu_si128((__m128i *) (src + 72), m128iS5);
2852
                _mm_storeu_si128((__m128i *) (src + 96), m128iS6);
2853
                _mm_storeu_si128((__m128i *) (src + 104), m128iS7);
2854
                _mm_storeu_si128((__m128i *) (src + 128), m128iS8);
2855
                _mm_storeu_si128((__m128i *) (src + 136), m128iS9);
2856
                _mm_storeu_si128((__m128i *) (src + 160), m128iS10);
2857
                _mm_storeu_si128((__m128i *) (src + 168), m128iS11);
2858
                _mm_storeu_si128((__m128i *) (src + 192), m128iS12);
2859
                _mm_storeu_si128((__m128i *) (src + 200), m128iS13);
2860
                _mm_storeu_si128((__m128i *) (src + 224), m128iS14);
2861
                _mm_storeu_si128((__m128i *) (src + 232), m128iS15);
2862
                dst = (uint16_t*) _dst + (i * stride);
2863
2864
                for (k = 0; k < 8; k++) {
2865
                    dst[0] = av_clip_uintp2(dst[0] + src[m],10);
2866
                    dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
2867
                    dst[2] = av_clip_uintp2(dst[2] + src[m + 32],10);
2868
                    dst[3] = av_clip_uintp2(dst[3] + src[m + 40],10);
2869
                    dst[4] = av_clip_uintp2(dst[4] + src[m + 64],10);
2870
                    dst[5] = av_clip_uintp2(dst[5] + src[m + 72],10);
2871
                    dst[6] = av_clip_uintp2(dst[6] + src[m + 96],10);
2872
                    dst[7] = av_clip_uintp2(dst[7] + src[m + 104],10);
2873
2874
                    dst[8] = av_clip_uintp2(dst[8] + src[m + 128],10);
2875
                    dst[9] = av_clip_uintp2(dst[9] + src[m + 136],10);
2876
                    dst[10] = av_clip_uintp2(dst[10] + src[m + 160],10);
2877
                    dst[11] = av_clip_uintp2(dst[11] + src[m + 168],10);
2878
                    dst[12] = av_clip_uintp2(dst[12] + src[m + 192],10);
2879
                    dst[13] = av_clip_uintp2(dst[13] + src[m + 200],10);
2880
                    dst[14] = av_clip_uintp2(dst[14] + src[m + 224],10);
2881
                    dst[15] = av_clip_uintp2(dst[15] + src[m + 232],10);
2882
                    m += 1;
2883
                    dst += stride;
2884
                }
2885
                if (!i) {
2886
                    m128iS0 = _mm_load_si128((__m128i *) (src + 16));
2887
                    m128iS1 = _mm_load_si128((__m128i *) (src + 48));
2888
                    m128iS2 = _mm_load_si128((__m128i *) (src + 80));
2889
                    m128iS3 = _mm_loadu_si128((__m128i *) (src + 112));
2890
                    m128iS4 = _mm_load_si128((__m128i *) (src + 144));
2891
                    m128iS5 = _mm_load_si128((__m128i *) (src + 176));
2892
                    m128iS6 = _mm_load_si128((__m128i *) (src + 208));
2893
                    m128iS7 = _mm_load_si128((__m128i *) (src + 240));
2894
                    m128iS8 = _mm_load_si128((__m128i *) (src + 24));
2895
                    m128iS9 = _mm_load_si128((__m128i *) (src + 56));
2896
                    m128iS10 = _mm_load_si128((__m128i *) (src + 88));
2897
                    m128iS11 = _mm_loadu_si128((__m128i *) (src + 120));
2898
                    m128iS12 = _mm_load_si128((__m128i *) (src + 152));
2899
                    m128iS13 = _mm_load_si128((__m128i *) (src + 184));
2900
                    m128iS14 = _mm_load_si128((__m128i *) (src + 216));
2901
                    m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2902
                }
2903
            }
2904
        }
2905
    }
2906
2907
}
2908
#endif
2909
2910
2911
#if HAVE_SSE4_1
2912
void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
2913
0
        ptrdiff_t _stride) {
2914
0
    uint8_t shift_2nd = 12; // 20 - Bit depth
2915
0
    uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
2916
0
    int i, j;
2917
0
    uint8_t *dst = (uint8_t*) _dst;
2918
0
    ptrdiff_t stride = _stride / sizeof(uint8_t);
2919
0
    int shift;
2920
0
    const int16_t *src = coeffs;
2921
2922
0
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2923
0
            m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2924
0
            m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2925
0
            m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2926
0
            E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2927
0
            O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2928
0
            E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2929
0
    __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
2930
0
    __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
2931
0
            EEE0l, EEE1l, EEE0h, EEE1h;
2932
0
    __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
2933
0
            m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
2934
0
            m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
2935
0
            m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
2936
0
            O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
2937
0
            O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
2938
0
            EE4l, EE7h, EE6h, EE5h, EE4h;
2939
2940
0
    __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
2941
0
    __m128i r32,r33,r34,r35,r36,r37,r38,r39,r40,r41,r42,r43,r44,r45,r46,r47,r48,r49,r50,r51,r52,r53,r54,r55,r56,r57,r58,r59,r60,r61,r62,r63;
2942
0
    __m128i r64,r65,r66,r67,r68,r69,r70,r71,r72,r73,r74,r75,r76,r77,r78,r79,r80,r81,r82,r83,r84,r85,r86,r87,r88,r89,r90,r91,r92,r93,r94,r95;
2943
0
    __m128i r96,r97,r98,r99,r100,r101,r102,r103,r104,r105,r106,r107,r108,r109,r110,r111,r112,r113,r114,r115,r116,r117,r118,r119,r120,r121,r122,r123,r124,r125,r126,r127;
2944
2945
2946
0
    m128iS0 = _mm_load_si128((__m128i *) (src));
2947
0
    m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2948
0
    m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2949
0
    m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2950
0
    m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2951
0
    m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2952
0
    m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2953
0
    m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2954
0
    m128iS8 = _mm_load_si128((__m128i *) (src + 256));
2955
0
    m128iS9 = _mm_load_si128((__m128i *) (src + 288));
2956
0
    m128iS10 = _mm_load_si128((__m128i *) (src + 320));
2957
0
    m128iS11 = _mm_load_si128((__m128i *) (src + 352));
2958
0
    m128iS12 = _mm_load_si128((__m128i *) (src + 384));
2959
0
    m128iS13 = _mm_load_si128((__m128i *) (src + 416));
2960
0
    m128iS14 = _mm_load_si128((__m128i *) (src + 448));
2961
0
    m128iS15 = _mm_load_si128((__m128i *) (src + 480));
2962
0
    m128iS16 = _mm_load_si128((__m128i *) (src + 512));
2963
0
    m128iS17 = _mm_load_si128((__m128i *) (src + 544));
2964
0
    m128iS18 = _mm_load_si128((__m128i *) (src + 576));
2965
0
    m128iS19 = _mm_load_si128((__m128i *) (src + 608));
2966
0
    m128iS20 = _mm_load_si128((__m128i *) (src + 640));
2967
0
    m128iS21 = _mm_load_si128((__m128i *) (src + 672));
2968
0
    m128iS22 = _mm_load_si128((__m128i *) (src + 704));
2969
0
    m128iS23 = _mm_load_si128((__m128i *) (src + 736));
2970
0
    m128iS24 = _mm_load_si128((__m128i *) (src + 768));
2971
0
    m128iS25 = _mm_load_si128((__m128i *) (src + 800));
2972
0
    m128iS26 = _mm_load_si128((__m128i *) (src + 832));
2973
0
    m128iS27 = _mm_load_si128((__m128i *) (src + 864));
2974
0
    m128iS28 = _mm_load_si128((__m128i *) (src + 896));
2975
0
    m128iS29 = _mm_load_si128((__m128i *) (src + 928));
2976
0
    m128iS30 = _mm_load_si128((__m128i *) (src + 960));
2977
0
    m128iS31 = _mm_load_si128((__m128i *) (src + 992));
2978
2979
0
    shift = shift_1st;
2980
0
    m128iAdd = _mm_set1_epi32(add_1st);
2981
2982
0
    for (j = 0; j < 2; j++) {
2983
0
        for (i = 0; i < 32; i += 8) {
2984
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2985
0
            E0l = _mm_madd_epi16(m128Tmp0,
2986
0
                    _mm_load_si128((__m128i *) (transform32x32[0][0])));
2987
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2988
0
            E0h = _mm_madd_epi16(m128Tmp1,
2989
0
                    _mm_load_si128((__m128i *) (transform32x32[0][0])));
2990
2991
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2992
0
            E1l = _mm_madd_epi16(m128Tmp2,
2993
0
                    _mm_load_si128((__m128i *) (transform32x32[1][0])));
2994
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2995
0
            E1h = _mm_madd_epi16(m128Tmp3,
2996
0
                    _mm_load_si128((__m128i *) (transform32x32[1][0])));
2997
2998
0
            m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2999
0
            E2l = _mm_madd_epi16(m128Tmp4,
3000
0
                    _mm_load_si128((__m128i *) (transform32x32[2][0])));
3001
0
            m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
3002
0
            E2h = _mm_madd_epi16(m128Tmp5,
3003
0
                    _mm_load_si128((__m128i *) (transform32x32[2][0])));
3004
3005
0
            m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
3006
0
            E3l = _mm_madd_epi16(m128Tmp6,
3007
0
                    _mm_load_si128((__m128i *) (transform32x32[3][0])));
3008
0
            m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
3009
0
            E3h = _mm_madd_epi16(m128Tmp7,
3010
0
                    _mm_load_si128((__m128i *) (transform32x32[3][0])));
3011
3012
0
            m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
3013
0
            E4l = _mm_madd_epi16(m128Tmp8,
3014
0
                    _mm_load_si128((__m128i *) (transform32x32[4][0])));
3015
0
            m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
3016
0
            E4h = _mm_madd_epi16(m128Tmp9,
3017
0
                    _mm_load_si128((__m128i *) (transform32x32[4][0])));
3018
3019
0
            m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
3020
0
            E5l = _mm_madd_epi16(m128Tmp10,
3021
0
                    _mm_load_si128((__m128i *) (transform32x32[5][0])));
3022
0
            m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
3023
0
            E5h = _mm_madd_epi16(m128Tmp11,
3024
0
                    _mm_load_si128((__m128i *) (transform32x32[5][0])));
3025
3026
0
            m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
3027
0
            E6l = _mm_madd_epi16(m128Tmp12,
3028
0
                    _mm_load_si128((__m128i *) (transform32x32[6][0])));
3029
0
            m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
3030
0
            E6h = _mm_madd_epi16(m128Tmp13,
3031
0
                    _mm_load_si128((__m128i *) (transform32x32[6][0])));
3032
3033
0
            m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
3034
0
            E7l = _mm_madd_epi16(m128Tmp14,
3035
0
                    _mm_load_si128((__m128i *) (transform32x32[7][0])));
3036
0
            m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
3037
0
            E7h = _mm_madd_epi16(m128Tmp15,
3038
0
                    _mm_load_si128((__m128i *) (transform32x32[7][0])));
3039
3040
0
            O0l = _mm_add_epi32(E0l, E1l);
3041
0
            O0l = _mm_add_epi32(O0l, E2l);
3042
0
            O0l = _mm_add_epi32(O0l, E3l);
3043
0
            O0l = _mm_add_epi32(O0l, E4l);
3044
0
            O0l = _mm_add_epi32(O0l, E5l);
3045
0
            O0l = _mm_add_epi32(O0l, E6l);
3046
0
            O0l = _mm_add_epi32(O0l, E7l);
3047
3048
0
            O0h = _mm_add_epi32(E0h, E1h);
3049
0
            O0h = _mm_add_epi32(O0h, E2h);
3050
0
            O0h = _mm_add_epi32(O0h, E3h);
3051
0
            O0h = _mm_add_epi32(O0h, E4h);
3052
0
            O0h = _mm_add_epi32(O0h, E5h);
3053
0
            O0h = _mm_add_epi32(O0h, E6h);
3054
0
            O0h = _mm_add_epi32(O0h, E7h);
3055
3056
            /* Compute O1*/
3057
0
            E0l = _mm_madd_epi16(m128Tmp0,
3058
0
                    _mm_load_si128((__m128i *) (transform32x32[0][1])));
3059
0
            E0h = _mm_madd_epi16(m128Tmp1,
3060
0
                    _mm_load_si128((__m128i *) (transform32x32[0][1])));
3061
0
            E1l = _mm_madd_epi16(m128Tmp2,
3062
0
                    _mm_load_si128((__m128i *) (transform32x32[1][1])));
3063
0
            E1h = _mm_madd_epi16(m128Tmp3,
3064
0
                    _mm_load_si128((__m128i *) (transform32x32[1][1])));
3065
0
            E2l = _mm_madd_epi16(m128Tmp4,
3066
0
                    _mm_load_si128((__m128i *) (transform32x32[2][1])));
3067
0
            E2h = _mm_madd_epi16(m128Tmp5,
3068
0
                    _mm_load_si128((__m128i *) (transform32x32[2][1])));
3069
0
            E3l = _mm_madd_epi16(m128Tmp6,
3070
0
                    _mm_load_si128((__m128i *) (transform32x32[3][1])));
3071
0
            E3h = _mm_madd_epi16(m128Tmp7,
3072
0
                    _mm_load_si128((__m128i *) (transform32x32[3][1])));
3073
3074
0
            E4l = _mm_madd_epi16(m128Tmp8,
3075
0
                    _mm_load_si128((__m128i *) (transform32x32[4][1])));
3076
0
            E4h = _mm_madd_epi16(m128Tmp9,
3077
0
                    _mm_load_si128((__m128i *) (transform32x32[4][1])));
3078
0
            E5l = _mm_madd_epi16(m128Tmp10,
3079
0
                    _mm_load_si128((__m128i *) (transform32x32[5][1])));
3080
0
            E5h = _mm_madd_epi16(m128Tmp11,
3081
0
                    _mm_load_si128((__m128i *) (transform32x32[5][1])));
3082
0
            E6l = _mm_madd_epi16(m128Tmp12,
3083
0
                    _mm_load_si128((__m128i *) (transform32x32[6][1])));
3084
0
            E6h = _mm_madd_epi16(m128Tmp13,
3085
0
                    _mm_load_si128((__m128i *) (transform32x32[6][1])));
3086
0
            E7l = _mm_madd_epi16(m128Tmp14,
3087
0
                    _mm_load_si128((__m128i *) (transform32x32[7][1])));
3088
0
            E7h = _mm_madd_epi16(m128Tmp15,
3089
0
                    _mm_load_si128((__m128i *) (transform32x32[7][1])));
3090
3091
0
            O1l = _mm_add_epi32(E0l, E1l);
3092
0
            O1l = _mm_add_epi32(O1l, E2l);
3093
0
            O1l = _mm_add_epi32(O1l, E3l);
3094
0
            O1l = _mm_add_epi32(O1l, E4l);
3095
0
            O1l = _mm_add_epi32(O1l, E5l);
3096
0
            O1l = _mm_add_epi32(O1l, E6l);
3097
0
            O1l = _mm_add_epi32(O1l, E7l);
3098
3099
0
            O1h = _mm_add_epi32(E0h, E1h);
3100
0
            O1h = _mm_add_epi32(O1h, E2h);
3101
0
            O1h = _mm_add_epi32(O1h, E3h);
3102
0
            O1h = _mm_add_epi32(O1h, E4h);
3103
0
            O1h = _mm_add_epi32(O1h, E5h);
3104
0
            O1h = _mm_add_epi32(O1h, E6h);
3105
0
            O1h = _mm_add_epi32(O1h, E7h);
3106
            /* Compute O2*/
3107
0
            E0l = _mm_madd_epi16(m128Tmp0,
3108
0
                    _mm_load_si128((__m128i *) (transform32x32[0][2])));
3109
0
            E0h = _mm_madd_epi16(m128Tmp1,
3110
0
                    _mm_load_si128((__m128i *) (transform32x32[0][2])));
3111
0
            E1l = _mm_madd_epi16(m128Tmp2,
3112
0
                    _mm_load_si128((__m128i *) (transform32x32[1][2])));
3113
0
            E1h = _mm_madd_epi16(m128Tmp3,
3114
0
                    _mm_load_si128((__m128i *) (transform32x32[1][2])));
3115
0
            E2l = _mm_madd_epi16(m128Tmp4,
3116
0
                    _mm_load_si128((__m128i *) (transform32x32[2][2])));
3117
0
            E2h = _mm_madd_epi16(m128Tmp5,
3118
0
                    _mm_load_si128((__m128i *) (transform32x32[2][2])));
3119
0
            E3l = _mm_madd_epi16(m128Tmp6,
3120
0
                    _mm_load_si128((__m128i *) (transform32x32[3][2])));
3121
0
            E3h = _mm_madd_epi16(m128Tmp7,
3122
0
                    _mm_load_si128((__m128i *) (transform32x32[3][2])));
3123
3124
0
            E4l = _mm_madd_epi16(m128Tmp8,
3125
0
                    _mm_load_si128((__m128i *) (transform32x32[4][2])));
3126
0
            E4h = _mm_madd_epi16(m128Tmp9,
3127
0
                    _mm_load_si128((__m128i *) (transform32x32[4][2])));
3128
0
            E5l = _mm_madd_epi16(m128Tmp10,
3129
0
                    _mm_load_si128((__m128i *) (transform32x32[5][2])));
3130
0
            E5h = _mm_madd_epi16(m128Tmp11,
3131
0
                    _mm_load_si128((__m128i *) (transform32x32[5][2])));
3132
0
            E6l = _mm_madd_epi16(m128Tmp12,
3133
0
                    _mm_load_si128((__m128i *) (transform32x32[6][2])));
3134
0
            E6h = _mm_madd_epi16(m128Tmp13,
3135
0
                    _mm_load_si128((__m128i *) (transform32x32[6][2])));
3136
0
            E7l = _mm_madd_epi16(m128Tmp14,
3137
0
                    _mm_load_si128((__m128i *) (transform32x32[7][2])));
3138
0
            E7h = _mm_madd_epi16(m128Tmp15,
3139
0
                    _mm_load_si128((__m128i *) (transform32x32[7][2])));
3140
3141
0
            O2l = _mm_add_epi32(E0l, E1l);
3142
0
            O2l = _mm_add_epi32(O2l, E2l);
3143
0
            O2l = _mm_add_epi32(O2l, E3l);
3144
0
            O2l = _mm_add_epi32(O2l, E4l);
3145
0
            O2l = _mm_add_epi32(O2l, E5l);
3146
0
            O2l = _mm_add_epi32(O2l, E6l);
3147
0
            O2l = _mm_add_epi32(O2l, E7l);
3148
3149
0
            O2h = _mm_add_epi32(E0h, E1h);
3150
0
            O2h = _mm_add_epi32(O2h, E2h);
3151
0
            O2h = _mm_add_epi32(O2h, E3h);
3152
0
            O2h = _mm_add_epi32(O2h, E4h);
3153
0
            O2h = _mm_add_epi32(O2h, E5h);
3154
0
            O2h = _mm_add_epi32(O2h, E6h);
3155
0
            O2h = _mm_add_epi32(O2h, E7h);
3156
            /* Compute O3*/
3157
0
            E0l = _mm_madd_epi16(m128Tmp0,
3158
0
                    _mm_load_si128((__m128i *) (transform32x32[0][3])));
3159
0
            E0h = _mm_madd_epi16(m128Tmp1,
3160
0
                    _mm_load_si128((__m128i *) (transform32x32[0][3])));
3161
0
            E1l = _mm_madd_epi16(m128Tmp2,
3162
0
                    _mm_load_si128((__m128i *) (transform32x32[1][3])));
3163
0
            E1h = _mm_madd_epi16(m128Tmp3,
3164
0
                    _mm_load_si128((__m128i *) (transform32x32[1][3])));
3165
0
            E2l = _mm_madd_epi16(m128Tmp4,
3166
0
                    _mm_load_si128((__m128i *) (transform32x32[2][3])));
3167
0
            E2h = _mm_madd_epi16(m128Tmp5,
3168
0
                    _mm_load_si128((__m128i *) (transform32x32[2][3])));
3169
0
            E3l = _mm_madd_epi16(m128Tmp6,
3170
0
                    _mm_load_si128((__m128i *) (transform32x32[3][3])));
3171
0
            E3h = _mm_madd_epi16(m128Tmp7,
3172
0
                    _mm_load_si128((__m128i *) (transform32x32[3][3])));
3173
3174
0
            E4l = _mm_madd_epi16(m128Tmp8,
3175
0
                    _mm_load_si128((__m128i *) (transform32x32[4][3])));
3176
0
            E4h = _mm_madd_epi16(m128Tmp9,
3177
0
                    _mm_load_si128((__m128i *) (transform32x32[4][3])));
3178
0
            E5l = _mm_madd_epi16(m128Tmp10,
3179
0
                    _mm_load_si128((__m128i *) (transform32x32[5][3])));
3180
0
            E5h = _mm_madd_epi16(m128Tmp11,
3181
0
                    _mm_load_si128((__m128i *) (transform32x32[5][3])));
3182
0
            E6l = _mm_madd_epi16(m128Tmp12,
3183
0
                    _mm_load_si128((__m128i *) (transform32x32[6][3])));
3184
0
            E6h = _mm_madd_epi16(m128Tmp13,
3185
0
                    _mm_load_si128((__m128i *) (transform32x32[6][3])));
3186
0
            E7l = _mm_madd_epi16(m128Tmp14,
3187
0
                    _mm_load_si128((__m128i *) (transform32x32[7][3])));
3188
0
            E7h = _mm_madd_epi16(m128Tmp15,
3189
0
                    _mm_load_si128((__m128i *) (transform32x32[7][3])));
3190
3191
0
            O3l = _mm_add_epi32(E0l, E1l);
3192
0
            O3l = _mm_add_epi32(O3l, E2l);
3193
0
            O3l = _mm_add_epi32(O3l, E3l);
3194
0
            O3l = _mm_add_epi32(O3l, E4l);
3195
0
            O3l = _mm_add_epi32(O3l, E5l);
3196
0
            O3l = _mm_add_epi32(O3l, E6l);
3197
0
            O3l = _mm_add_epi32(O3l, E7l);
3198
3199
0
            O3h = _mm_add_epi32(E0h, E1h);
3200
0
            O3h = _mm_add_epi32(O3h, E2h);
3201
0
            O3h = _mm_add_epi32(O3h, E3h);
3202
0
            O3h = _mm_add_epi32(O3h, E4h);
3203
0
            O3h = _mm_add_epi32(O3h, E5h);
3204
0
            O3h = _mm_add_epi32(O3h, E6h);
3205
0
            O3h = _mm_add_epi32(O3h, E7h);
3206
            /* Compute O4*/
3207
3208
0
            E0l = _mm_madd_epi16(m128Tmp0,
3209
0
                    _mm_load_si128((__m128i *) (transform32x32[0][4])));
3210
0
            E0h = _mm_madd_epi16(m128Tmp1,
3211
0
                    _mm_load_si128((__m128i *) (transform32x32[0][4])));
3212
0
            E1l = _mm_madd_epi16(m128Tmp2,
3213
0
                    _mm_load_si128((__m128i *) (transform32x32[1][4])));
3214
0
            E1h = _mm_madd_epi16(m128Tmp3,
3215
0
                    _mm_load_si128((__m128i *) (transform32x32[1][4])));
3216
0
            E2l = _mm_madd_epi16(m128Tmp4,
3217
0
                    _mm_load_si128((__m128i *) (transform32x32[2][4])));
3218
0
            E2h = _mm_madd_epi16(m128Tmp5,
3219
0
                    _mm_load_si128((__m128i *) (transform32x32[2][4])));
3220
0
            E3l = _mm_madd_epi16(m128Tmp6,
3221
0
                    _mm_load_si128((__m128i *) (transform32x32[3][4])));
3222
0
            E3h = _mm_madd_epi16(m128Tmp7,
3223
0
                    _mm_load_si128((__m128i *) (transform32x32[3][4])));
3224
3225
0
            E4l = _mm_madd_epi16(m128Tmp8,
3226
0
                    _mm_load_si128((__m128i *) (transform32x32[4][4])));
3227
0
            E4h = _mm_madd_epi16(m128Tmp9,
3228
0
                    _mm_load_si128((__m128i *) (transform32x32[4][4])));
3229
0
            E5l = _mm_madd_epi16(m128Tmp10,
3230
0
                    _mm_load_si128((__m128i *) (transform32x32[5][4])));
3231
0
            E5h = _mm_madd_epi16(m128Tmp11,
3232
0
                    _mm_load_si128((__m128i *) (transform32x32[5][4])));
3233
0
            E6l = _mm_madd_epi16(m128Tmp12,
3234
0
                    _mm_load_si128((__m128i *) (transform32x32[6][4])));
3235
0
            E6h = _mm_madd_epi16(m128Tmp13,
3236
0
                    _mm_load_si128((__m128i *) (transform32x32[6][4])));
3237
0
            E7l = _mm_madd_epi16(m128Tmp14,
3238
0
                    _mm_load_si128((__m128i *) (transform32x32[7][4])));
3239
0
            E7h = _mm_madd_epi16(m128Tmp15,
3240
0
                    _mm_load_si128((__m128i *) (transform32x32[7][4])));
3241
3242
0
            O4l = _mm_add_epi32(E0l, E1l);
3243
0
            O4l = _mm_add_epi32(O4l, E2l);
3244
0
            O4l = _mm_add_epi32(O4l, E3l);
3245
0
            O4l = _mm_add_epi32(O4l, E4l);
3246
0
            O4l = _mm_add_epi32(O4l, E5l);
3247
0
            O4l = _mm_add_epi32(O4l, E6l);
3248
0
            O4l = _mm_add_epi32(O4l, E7l);
3249
3250
0
            O4h = _mm_add_epi32(E0h, E1h);
3251
0
            O4h = _mm_add_epi32(O4h, E2h);
3252
0
            O4h = _mm_add_epi32(O4h, E3h);
3253
0
            O4h = _mm_add_epi32(O4h, E4h);
3254
0
            O4h = _mm_add_epi32(O4h, E5h);
3255
0
            O4h = _mm_add_epi32(O4h, E6h);
3256
0
            O4h = _mm_add_epi32(O4h, E7h);
3257
3258
            /* Compute O5*/
3259
0
            E0l = _mm_madd_epi16(m128Tmp0,
3260
0
                    _mm_load_si128((__m128i *) (transform32x32[0][5])));
3261
0
            E0h = _mm_madd_epi16(m128Tmp1,
3262
0
                    _mm_load_si128((__m128i *) (transform32x32[0][5])));
3263
0
            E1l = _mm_madd_epi16(m128Tmp2,
3264
0
                    _mm_load_si128((__m128i *) (transform32x32[1][5])));
3265
0
            E1h = _mm_madd_epi16(m128Tmp3,
3266
0
                    _mm_load_si128((__m128i *) (transform32x32[1][5])));
3267
0
            E2l = _mm_madd_epi16(m128Tmp4,
3268
0
                    _mm_load_si128((__m128i *) (transform32x32[2][5])));
3269
0
            E2h = _mm_madd_epi16(m128Tmp5,
3270
0
                    _mm_load_si128((__m128i *) (transform32x32[2][5])));
3271
0
            E3l = _mm_madd_epi16(m128Tmp6,
3272
0
                    _mm_load_si128((__m128i *) (transform32x32[3][5])));
3273
0
            E3h = _mm_madd_epi16(m128Tmp7,
3274
0
                    _mm_load_si128((__m128i *) (transform32x32[3][5])));
3275
3276
0
            E4l = _mm_madd_epi16(m128Tmp8,
3277
0
                    _mm_load_si128((__m128i *) (transform32x32[4][5])));
3278
0
            E4h = _mm_madd_epi16(m128Tmp9,
3279
0
                    _mm_load_si128((__m128i *) (transform32x32[4][5])));
3280
0
            E5l = _mm_madd_epi16(m128Tmp10,
3281
0
                    _mm_load_si128((__m128i *) (transform32x32[5][5])));
3282
0
            E5h = _mm_madd_epi16(m128Tmp11,
3283
0
                    _mm_load_si128((__m128i *) (transform32x32[5][5])));
3284
0
            E6l = _mm_madd_epi16(m128Tmp12,
3285
0
                    _mm_load_si128((__m128i *) (transform32x32[6][5])));
3286
0
            E6h = _mm_madd_epi16(m128Tmp13,
3287
0
                    _mm_load_si128((__m128i *) (transform32x32[6][5])));
3288
0
            E7l = _mm_madd_epi16(m128Tmp14,
3289
0
                    _mm_load_si128((__m128i *) (transform32x32[7][5])));
3290
0
            E7h = _mm_madd_epi16(m128Tmp15,
3291
0
                    _mm_load_si128((__m128i *) (transform32x32[7][5])));
3292
3293
0
            O5l = _mm_add_epi32(E0l, E1l);
3294
0
            O5l = _mm_add_epi32(O5l, E2l);
3295
0
            O5l = _mm_add_epi32(O5l, E3l);
3296
0
            O5l = _mm_add_epi32(O5l, E4l);
3297
0
            O5l = _mm_add_epi32(O5l, E5l);
3298
0
            O5l = _mm_add_epi32(O5l, E6l);
3299
0
            O5l = _mm_add_epi32(O5l, E7l);
3300
3301
0
            O5h = _mm_add_epi32(E0h, E1h);
3302
0
            O5h = _mm_add_epi32(O5h, E2h);
3303
0
            O5h = _mm_add_epi32(O5h, E3h);
3304
0
            O5h = _mm_add_epi32(O5h, E4h);
3305
0
            O5h = _mm_add_epi32(O5h, E5h);
3306
0
            O5h = _mm_add_epi32(O5h, E6h);
3307
0
            O5h = _mm_add_epi32(O5h, E7h);
3308
3309
            /* Compute O6*/
3310
3311
0
            E0l = _mm_madd_epi16(m128Tmp0,
3312
0
                    _mm_load_si128((__m128i *) (transform32x32[0][6])));
3313
0
            E0h = _mm_madd_epi16(m128Tmp1,
3314
0
                    _mm_load_si128((__m128i *) (transform32x32[0][6])));
3315
0
            E1l = _mm_madd_epi16(m128Tmp2,
3316
0
                    _mm_load_si128((__m128i *) (transform32x32[1][6])));
3317
0
            E1h = _mm_madd_epi16(m128Tmp3,
3318
0
                    _mm_load_si128((__m128i *) (transform32x32[1][6])));
3319
0
            E2l = _mm_madd_epi16(m128Tmp4,
3320
0
                    _mm_load_si128((__m128i *) (transform32x32[2][6])));
3321
0
            E2h = _mm_madd_epi16(m128Tmp5,
3322
0
                    _mm_load_si128((__m128i *) (transform32x32[2][6])));
3323
0
            E3l = _mm_madd_epi16(m128Tmp6,
3324
0
                    _mm_load_si128((__m128i *) (transform32x32[3][6])));
3325
0
            E3h = _mm_madd_epi16(m128Tmp7,
3326
0
                    _mm_load_si128((__m128i *) (transform32x32[3][6])));
3327
3328
0
            E4l = _mm_madd_epi16(m128Tmp8,
3329
0
                    _mm_load_si128((__m128i *) (transform32x32[4][6])));
3330
0
            E4h = _mm_madd_epi16(m128Tmp9,
3331
0
                    _mm_load_si128((__m128i *) (transform32x32[4][6])));
3332
0
            E5l = _mm_madd_epi16(m128Tmp10,
3333
0
                    _mm_load_si128((__m128i *) (transform32x32[5][6])));
3334
0
            E5h = _mm_madd_epi16(m128Tmp11,
3335
0
                    _mm_load_si128((__m128i *) (transform32x32[5][6])));
3336
0
            E6l = _mm_madd_epi16(m128Tmp12,
3337
0
                    _mm_load_si128((__m128i *) (transform32x32[6][6])));
3338
0
            E6h = _mm_madd_epi16(m128Tmp13,
3339
0
                    _mm_load_si128((__m128i *) (transform32x32[6][6])));
3340
0
            E7l = _mm_madd_epi16(m128Tmp14,
3341
0
                    _mm_load_si128((__m128i *) (transform32x32[7][6])));
3342
0
            E7h = _mm_madd_epi16(m128Tmp15,
3343
0
                    _mm_load_si128((__m128i *) (transform32x32[7][6])));
3344
3345
0
            O6l = _mm_add_epi32(E0l, E1l);
3346
0
            O6l = _mm_add_epi32(O6l, E2l);
3347
0
            O6l = _mm_add_epi32(O6l, E3l);
3348
0
            O6l = _mm_add_epi32(O6l, E4l);
3349
0
            O6l = _mm_add_epi32(O6l, E5l);
3350
0
            O6l = _mm_add_epi32(O6l, E6l);
3351
0
            O6l = _mm_add_epi32(O6l, E7l);
3352
3353
0
            O6h = _mm_add_epi32(E0h, E1h);
3354
0
            O6h = _mm_add_epi32(O6h, E2h);
3355
0
            O6h = _mm_add_epi32(O6h, E3h);
3356
0
            O6h = _mm_add_epi32(O6h, E4h);
3357
0
            O6h = _mm_add_epi32(O6h, E5h);
3358
0
            O6h = _mm_add_epi32(O6h, E6h);
3359
0
            O6h = _mm_add_epi32(O6h, E7h);
3360
3361
            /* Compute O7*/
3362
3363
0
            E0l = _mm_madd_epi16(m128Tmp0,
3364
0
                    _mm_load_si128((__m128i *) (transform32x32[0][7])));
3365
0
            E0h = _mm_madd_epi16(m128Tmp1,
3366
0
                    _mm_load_si128((__m128i *) (transform32x32[0][7])));
3367
0
            E1l = _mm_madd_epi16(m128Tmp2,
3368
0
                    _mm_load_si128((__m128i *) (transform32x32[1][7])));
3369
0
            E1h = _mm_madd_epi16(m128Tmp3,
3370
0
                    _mm_load_si128((__m128i *) (transform32x32[1][7])));
3371
0
            E2l = _mm_madd_epi16(m128Tmp4,
3372
0
                    _mm_load_si128((__m128i *) (transform32x32[2][7])));
3373
0
            E2h = _mm_madd_epi16(m128Tmp5,
3374
0
                    _mm_load_si128((__m128i *) (transform32x32[2][7])));
3375
0
            E3l = _mm_madd_epi16(m128Tmp6,
3376
0
                    _mm_load_si128((__m128i *) (transform32x32[3][7])));
3377
0
            E3h = _mm_madd_epi16(m128Tmp7,
3378
0
                    _mm_load_si128((__m128i *) (transform32x32[3][7])));
3379
3380
0
            E4l = _mm_madd_epi16(m128Tmp8,
3381
0
                    _mm_load_si128((__m128i *) (transform32x32[4][7])));
3382
0
            E4h = _mm_madd_epi16(m128Tmp9,
3383
0
                    _mm_load_si128((__m128i *) (transform32x32[4][7])));
3384
0
            E5l = _mm_madd_epi16(m128Tmp10,
3385
0
                    _mm_load_si128((__m128i *) (transform32x32[5][7])));
3386
0
            E5h = _mm_madd_epi16(m128Tmp11,
3387
0
                    _mm_load_si128((__m128i *) (transform32x32[5][7])));
3388
0
            E6l = _mm_madd_epi16(m128Tmp12,
3389
0
                    _mm_load_si128((__m128i *) (transform32x32[6][7])));
3390
0
            E6h = _mm_madd_epi16(m128Tmp13,
3391
0
                    _mm_load_si128((__m128i *) (transform32x32[6][7])));
3392
0
            E7l = _mm_madd_epi16(m128Tmp14,
3393
0
                    _mm_load_si128((__m128i *) (transform32x32[7][7])));
3394
0
            E7h = _mm_madd_epi16(m128Tmp15,
3395
0
                    _mm_load_si128((__m128i *) (transform32x32[7][7])));
3396
3397
0
            O7l = _mm_add_epi32(E0l, E1l);
3398
0
            O7l = _mm_add_epi32(O7l, E2l);
3399
0
            O7l = _mm_add_epi32(O7l, E3l);
3400
0
            O7l = _mm_add_epi32(O7l, E4l);
3401
0
            O7l = _mm_add_epi32(O7l, E5l);
3402
0
            O7l = _mm_add_epi32(O7l, E6l);
3403
0
            O7l = _mm_add_epi32(O7l, E7l);
3404
3405
0
            O7h = _mm_add_epi32(E0h, E1h);
3406
0
            O7h = _mm_add_epi32(O7h, E2h);
3407
0
            O7h = _mm_add_epi32(O7h, E3h);
3408
0
            O7h = _mm_add_epi32(O7h, E4h);
3409
0
            O7h = _mm_add_epi32(O7h, E5h);
3410
0
            O7h = _mm_add_epi32(O7h, E6h);
3411
0
            O7h = _mm_add_epi32(O7h, E7h);
3412
3413
            /* Compute O8*/
3414
3415
0
            E0l = _mm_madd_epi16(m128Tmp0,
3416
0
                    _mm_load_si128((__m128i *) (transform32x32[0][8])));
3417
0
            E0h = _mm_madd_epi16(m128Tmp1,
3418
0
                    _mm_load_si128((__m128i *) (transform32x32[0][8])));
3419
0
            E1l = _mm_madd_epi16(m128Tmp2,
3420
0
                    _mm_load_si128((__m128i *) (transform32x32[1][8])));
3421
0
            E1h = _mm_madd_epi16(m128Tmp3,
3422
0
                    _mm_load_si128((__m128i *) (transform32x32[1][8])));
3423
0
            E2l = _mm_madd_epi16(m128Tmp4,
3424
0
                    _mm_load_si128((__m128i *) (transform32x32[2][8])));
3425
0
            E2h = _mm_madd_epi16(m128Tmp5,
3426
0
                    _mm_load_si128((__m128i *) (transform32x32[2][8])));
3427
0
            E3l = _mm_madd_epi16(m128Tmp6,
3428
0
                    _mm_load_si128((__m128i *) (transform32x32[3][8])));
3429
0
            E3h = _mm_madd_epi16(m128Tmp7,
3430
0
                    _mm_load_si128((__m128i *) (transform32x32[3][8])));
3431
3432
0
            E4l = _mm_madd_epi16(m128Tmp8,
3433
0
                    _mm_load_si128((__m128i *) (transform32x32[4][8])));
3434
0
            E4h = _mm_madd_epi16(m128Tmp9,
3435
0
                    _mm_load_si128((__m128i *) (transform32x32[4][8])));
3436
0
            E5l = _mm_madd_epi16(m128Tmp10,
3437
0
                    _mm_load_si128((__m128i *) (transform32x32[5][8])));
3438
0
            E5h = _mm_madd_epi16(m128Tmp11,
3439
0
                    _mm_load_si128((__m128i *) (transform32x32[5][8])));
3440
0
            E6l = _mm_madd_epi16(m128Tmp12,
3441
0
                    _mm_load_si128((__m128i *) (transform32x32[6][8])));
3442
0
            E6h = _mm_madd_epi16(m128Tmp13,
3443
0
                    _mm_load_si128((__m128i *) (transform32x32[6][8])));
3444
0
            E7l = _mm_madd_epi16(m128Tmp14,
3445
0
                    _mm_load_si128((__m128i *) (transform32x32[7][8])));
3446
0
            E7h = _mm_madd_epi16(m128Tmp15,
3447
0
                    _mm_load_si128((__m128i *) (transform32x32[7][8])));
3448
3449
0
            O8l = _mm_add_epi32(E0l, E1l);
3450
0
            O8l = _mm_add_epi32(O8l, E2l);
3451
0
            O8l = _mm_add_epi32(O8l, E3l);
3452
0
            O8l = _mm_add_epi32(O8l, E4l);
3453
0
            O8l = _mm_add_epi32(O8l, E5l);
3454
0
            O8l = _mm_add_epi32(O8l, E6l);
3455
0
            O8l = _mm_add_epi32(O8l, E7l);
3456
3457
0
            O8h = _mm_add_epi32(E0h, E1h);
3458
0
            O8h = _mm_add_epi32(O8h, E2h);
3459
0
            O8h = _mm_add_epi32(O8h, E3h);
3460
0
            O8h = _mm_add_epi32(O8h, E4h);
3461
0
            O8h = _mm_add_epi32(O8h, E5h);
3462
0
            O8h = _mm_add_epi32(O8h, E6h);
3463
0
            O8h = _mm_add_epi32(O8h, E7h);
3464
3465
            /* Compute O9*/
3466
3467
0
            E0l = _mm_madd_epi16(m128Tmp0,
3468
0
                    _mm_load_si128((__m128i *) (transform32x32[0][9])));
3469
0
            E0h = _mm_madd_epi16(m128Tmp1,
3470
0
                    _mm_load_si128((__m128i *) (transform32x32[0][9])));
3471
0
            E1l = _mm_madd_epi16(m128Tmp2,
3472
0
                    _mm_load_si128((__m128i *) (transform32x32[1][9])));
3473
0
            E1h = _mm_madd_epi16(m128Tmp3,
3474
0
                    _mm_load_si128((__m128i *) (transform32x32[1][9])));
3475
0
            E2l = _mm_madd_epi16(m128Tmp4,
3476
0
                    _mm_load_si128((__m128i *) (transform32x32[2][9])));
3477
0
            E2h = _mm_madd_epi16(m128Tmp5,
3478
0
                    _mm_load_si128((__m128i *) (transform32x32[2][9])));
3479
0
            E3l = _mm_madd_epi16(m128Tmp6,
3480
0
                    _mm_load_si128((__m128i *) (transform32x32[3][9])));
3481
0
            E3h = _mm_madd_epi16(m128Tmp7,
3482
0
                    _mm_load_si128((__m128i *) (transform32x32[3][9])));
3483
3484
0
            E4l = _mm_madd_epi16(m128Tmp8,
3485
0
                    _mm_load_si128((__m128i *) (transform32x32[4][9])));
3486
0
            E4h = _mm_madd_epi16(m128Tmp9,
3487
0
                    _mm_load_si128((__m128i *) (transform32x32[4][9])));
3488
0
            E5l = _mm_madd_epi16(m128Tmp10,
3489
0
                    _mm_load_si128((__m128i *) (transform32x32[5][9])));
3490
0
            E5h = _mm_madd_epi16(m128Tmp11,
3491
0
                    _mm_load_si128((__m128i *) (transform32x32[5][9])));
3492
0
            E6l = _mm_madd_epi16(m128Tmp12,
3493
0
                    _mm_load_si128((__m128i *) (transform32x32[6][9])));
3494
0
            E6h = _mm_madd_epi16(m128Tmp13,
3495
0
                    _mm_load_si128((__m128i *) (transform32x32[6][9])));
3496
0
            E7l = _mm_madd_epi16(m128Tmp14,
3497
0
                    _mm_load_si128((__m128i *) (transform32x32[7][9])));
3498
0
            E7h = _mm_madd_epi16(m128Tmp15,
3499
0
                    _mm_load_si128((__m128i *) (transform32x32[7][9])));
3500
3501
0
            O9l = _mm_add_epi32(E0l, E1l);
3502
0
            O9l = _mm_add_epi32(O9l, E2l);
3503
0
            O9l = _mm_add_epi32(O9l, E3l);
3504
0
            O9l = _mm_add_epi32(O9l, E4l);
3505
0
            O9l = _mm_add_epi32(O9l, E5l);
3506
0
            O9l = _mm_add_epi32(O9l, E6l);
3507
0
            O9l = _mm_add_epi32(O9l, E7l);
3508
3509
0
            O9h = _mm_add_epi32(E0h, E1h);
3510
0
            O9h = _mm_add_epi32(O9h, E2h);
3511
0
            O9h = _mm_add_epi32(O9h, E3h);
3512
0
            O9h = _mm_add_epi32(O9h, E4h);
3513
0
            O9h = _mm_add_epi32(O9h, E5h);
3514
0
            O9h = _mm_add_epi32(O9h, E6h);
3515
0
            O9h = _mm_add_epi32(O9h, E7h);
3516
3517
            /* Compute 10*/
3518
3519
0
            E0l = _mm_madd_epi16(m128Tmp0,
3520
0
                    _mm_load_si128((__m128i *) (transform32x32[0][10])));
3521
0
            E0h = _mm_madd_epi16(m128Tmp1,
3522
0
                    _mm_load_si128((__m128i *) (transform32x32[0][10])));
3523
0
            E1l = _mm_madd_epi16(m128Tmp2,
3524
0
                    _mm_load_si128((__m128i *) (transform32x32[1][10])));
3525
0
            E1h = _mm_madd_epi16(m128Tmp3,
3526
0
                    _mm_load_si128((__m128i *) (transform32x32[1][10])));
3527
0
            E2l = _mm_madd_epi16(m128Tmp4,
3528
0
                    _mm_load_si128((__m128i *) (transform32x32[2][10])));
3529
0
            E2h = _mm_madd_epi16(m128Tmp5,
3530
0
                    _mm_load_si128((__m128i *) (transform32x32[2][10])));
3531
0
            E3l = _mm_madd_epi16(m128Tmp6,
3532
0
                    _mm_load_si128((__m128i *) (transform32x32[3][10])));
3533
0
            E3h = _mm_madd_epi16(m128Tmp7,
3534
0
                    _mm_load_si128((__m128i *) (transform32x32[3][10])));
3535
3536
0
            E4l = _mm_madd_epi16(m128Tmp8,
3537
0
                    _mm_load_si128((__m128i *) (transform32x32[4][10])));
3538
0
            E4h = _mm_madd_epi16(m128Tmp9,
3539
0
                    _mm_load_si128((__m128i *) (transform32x32[4][10])));
3540
0
            E5l = _mm_madd_epi16(m128Tmp10,
3541
0
                    _mm_load_si128((__m128i *) (transform32x32[5][10])));
3542
0
            E5h = _mm_madd_epi16(m128Tmp11,
3543
0
                    _mm_load_si128((__m128i *) (transform32x32[5][10])));
3544
0
            E6l = _mm_madd_epi16(m128Tmp12,
3545
0
                    _mm_load_si128((__m128i *) (transform32x32[6][10])));
3546
0
            E6h = _mm_madd_epi16(m128Tmp13,
3547
0
                    _mm_load_si128((__m128i *) (transform32x32[6][10])));
3548
0
            E7l = _mm_madd_epi16(m128Tmp14,
3549
0
                    _mm_load_si128((__m128i *) (transform32x32[7][10])));
3550
0
            E7h = _mm_madd_epi16(m128Tmp15,
3551
0
                    _mm_load_si128((__m128i *) (transform32x32[7][10])));
3552
3553
0
            O10l = _mm_add_epi32(E0l, E1l);
3554
0
            O10l = _mm_add_epi32(O10l, E2l);
3555
0
            O10l = _mm_add_epi32(O10l, E3l);
3556
0
            O10l = _mm_add_epi32(O10l, E4l);
3557
0
            O10l = _mm_add_epi32(O10l, E5l);
3558
0
            O10l = _mm_add_epi32(O10l, E6l);
3559
0
            O10l = _mm_add_epi32(O10l, E7l);
3560
3561
0
            O10h = _mm_add_epi32(E0h, E1h);
3562
0
            O10h = _mm_add_epi32(O10h, E2h);
3563
0
            O10h = _mm_add_epi32(O10h, E3h);
3564
0
            O10h = _mm_add_epi32(O10h, E4h);
3565
0
            O10h = _mm_add_epi32(O10h, E5h);
3566
0
            O10h = _mm_add_epi32(O10h, E6h);
3567
0
            O10h = _mm_add_epi32(O10h, E7h);
3568
3569
            /* Compute 11*/
3570
3571
0
            E0l = _mm_madd_epi16(m128Tmp0,
3572
0
                    _mm_load_si128((__m128i *) (transform32x32[0][11])));
3573
0
            E0h = _mm_madd_epi16(m128Tmp1,
3574
0
                    _mm_load_si128((__m128i *) (transform32x32[0][11])));
3575
0
            E1l = _mm_madd_epi16(m128Tmp2,
3576
0
                    _mm_load_si128((__m128i *) (transform32x32[1][11])));
3577
0
            E1h = _mm_madd_epi16(m128Tmp3,
3578
0
                    _mm_load_si128((__m128i *) (transform32x32[1][11])));
3579
0
            E2l = _mm_madd_epi16(m128Tmp4,
3580
0
                    _mm_load_si128((__m128i *) (transform32x32[2][11])));
3581
0
            E2h = _mm_madd_epi16(m128Tmp5,
3582
0
                    _mm_load_si128((__m128i *) (transform32x32[2][11])));
3583
0
            E3l = _mm_madd_epi16(m128Tmp6,
3584
0
                    _mm_load_si128((__m128i *) (transform32x32[3][11])));
3585
0
            E3h = _mm_madd_epi16(m128Tmp7,
3586
0
                    _mm_load_si128((__m128i *) (transform32x32[3][11])));
3587
3588
0
            E4l = _mm_madd_epi16(m128Tmp8,
3589
0
                    _mm_load_si128((__m128i *) (transform32x32[4][11])));
3590
0
            E4h = _mm_madd_epi16(m128Tmp9,
3591
0
                    _mm_load_si128((__m128i *) (transform32x32[4][11])));
3592
0
            E5l = _mm_madd_epi16(m128Tmp10,
3593
0
                    _mm_load_si128((__m128i *) (transform32x32[5][11])));
3594
0
            E5h = _mm_madd_epi16(m128Tmp11,
3595
0
                    _mm_load_si128((__m128i *) (transform32x32[5][11])));
3596
0
            E6l = _mm_madd_epi16(m128Tmp12,
3597
0
                    _mm_load_si128((__m128i *) (transform32x32[6][11])));
3598
0
            E6h = _mm_madd_epi16(m128Tmp13,
3599
0
                    _mm_load_si128((__m128i *) (transform32x32[6][11])));
3600
0
            E7l = _mm_madd_epi16(m128Tmp14,
3601
0
                    _mm_load_si128((__m128i *) (transform32x32[7][11])));
3602
0
            E7h = _mm_madd_epi16(m128Tmp15,
3603
0
                    _mm_load_si128((__m128i *) (transform32x32[7][11])));
3604
3605
0
            O11l = _mm_add_epi32(E0l, E1l);
3606
0
            O11l = _mm_add_epi32(O11l, E2l);
3607
0
            O11l = _mm_add_epi32(O11l, E3l);
3608
0
            O11l = _mm_add_epi32(O11l, E4l);
3609
0
            O11l = _mm_add_epi32(O11l, E5l);
3610
0
            O11l = _mm_add_epi32(O11l, E6l);
3611
0
            O11l = _mm_add_epi32(O11l, E7l);
3612
3613
0
            O11h = _mm_add_epi32(E0h, E1h);
3614
0
            O11h = _mm_add_epi32(O11h, E2h);
3615
0
            O11h = _mm_add_epi32(O11h, E3h);
3616
0
            O11h = _mm_add_epi32(O11h, E4h);
3617
0
            O11h = _mm_add_epi32(O11h, E5h);
3618
0
            O11h = _mm_add_epi32(O11h, E6h);
3619
0
            O11h = _mm_add_epi32(O11h, E7h);
3620
3621
            /* Compute 12*/
3622
3623
0
            E0l = _mm_madd_epi16(m128Tmp0,
3624
0
                    _mm_load_si128((__m128i *) (transform32x32[0][12])));
3625
0
            E0h = _mm_madd_epi16(m128Tmp1,
3626
0
                    _mm_load_si128((__m128i *) (transform32x32[0][12])));
3627
0
            E1l = _mm_madd_epi16(m128Tmp2,
3628
0
                    _mm_load_si128((__m128i *) (transform32x32[1][12])));
3629
0
            E1h = _mm_madd_epi16(m128Tmp3,
3630
0
                    _mm_load_si128((__m128i *) (transform32x32[1][12])));
3631
0
            E2l = _mm_madd_epi16(m128Tmp4,
3632
0
                    _mm_load_si128((__m128i *) (transform32x32[2][12])));
3633
0
            E2h = _mm_madd_epi16(m128Tmp5,
3634
0
                    _mm_load_si128((__m128i *) (transform32x32[2][12])));
3635
0
            E3l = _mm_madd_epi16(m128Tmp6,
3636
0
                    _mm_load_si128((__m128i *) (transform32x32[3][12])));
3637
0
            E3h = _mm_madd_epi16(m128Tmp7,
3638
0
                    _mm_load_si128((__m128i *) (transform32x32[3][12])));
3639
3640
0
            E4l = _mm_madd_epi16(m128Tmp8,
3641
0
                    _mm_load_si128((__m128i *) (transform32x32[4][12])));
3642
0
            E4h = _mm_madd_epi16(m128Tmp9,
3643
0
                    _mm_load_si128((__m128i *) (transform32x32[4][12])));
3644
0
            E5l = _mm_madd_epi16(m128Tmp10,
3645
0
                    _mm_load_si128((__m128i *) (transform32x32[5][12])));
3646
0
            E5h = _mm_madd_epi16(m128Tmp11,
3647
0
                    _mm_load_si128((__m128i *) (transform32x32[5][12])));
3648
0
            E6l = _mm_madd_epi16(m128Tmp12,
3649
0
                    _mm_load_si128((__m128i *) (transform32x32[6][12])));
3650
0
            E6h = _mm_madd_epi16(m128Tmp13,
3651
0
                    _mm_load_si128((__m128i *) (transform32x32[6][12])));
3652
0
            E7l = _mm_madd_epi16(m128Tmp14,
3653
0
                    _mm_load_si128((__m128i *) (transform32x32[7][12])));
3654
0
            E7h = _mm_madd_epi16(m128Tmp15,
3655
0
                    _mm_load_si128((__m128i *) (transform32x32[7][12])));
3656
3657
0
            O12l = _mm_add_epi32(E0l, E1l);
3658
0
            O12l = _mm_add_epi32(O12l, E2l);
3659
0
            O12l = _mm_add_epi32(O12l, E3l);
3660
0
            O12l = _mm_add_epi32(O12l, E4l);
3661
0
            O12l = _mm_add_epi32(O12l, E5l);
3662
0
            O12l = _mm_add_epi32(O12l, E6l);
3663
0
            O12l = _mm_add_epi32(O12l, E7l);
3664
3665
0
            O12h = _mm_add_epi32(E0h, E1h);
3666
0
            O12h = _mm_add_epi32(O12h, E2h);
3667
0
            O12h = _mm_add_epi32(O12h, E3h);
3668
0
            O12h = _mm_add_epi32(O12h, E4h);
3669
0
            O12h = _mm_add_epi32(O12h, E5h);
3670
0
            O12h = _mm_add_epi32(O12h, E6h);
3671
0
            O12h = _mm_add_epi32(O12h, E7h);
3672
3673
            /* Compute 13*/
3674
3675
0
            E0l = _mm_madd_epi16(m128Tmp0,
3676
0
                    _mm_load_si128((__m128i *) (transform32x32[0][13])));
3677
0
            E0h = _mm_madd_epi16(m128Tmp1,
3678
0
                    _mm_load_si128((__m128i *) (transform32x32[0][13])));
3679
0
            E1l = _mm_madd_epi16(m128Tmp2,
3680
0
                    _mm_load_si128((__m128i *) (transform32x32[1][13])));
3681
0
            E1h = _mm_madd_epi16(m128Tmp3,
3682
0
                    _mm_load_si128((__m128i *) (transform32x32[1][13])));
3683
0
            E2l = _mm_madd_epi16(m128Tmp4,
3684
0
                    _mm_load_si128((__m128i *) (transform32x32[2][13])));
3685
0
            E2h = _mm_madd_epi16(m128Tmp5,
3686
0
                    _mm_load_si128((__m128i *) (transform32x32[2][13])));
3687
0
            E3l = _mm_madd_epi16(m128Tmp6,
3688
0
                    _mm_load_si128((__m128i *) (transform32x32[3][13])));
3689
0
            E3h = _mm_madd_epi16(m128Tmp7,
3690
0
                    _mm_load_si128((__m128i *) (transform32x32[3][13])));
3691
3692
0
            E4l = _mm_madd_epi16(m128Tmp8,
3693
0
                    _mm_load_si128((__m128i *) (transform32x32[4][13])));
3694
0
            E4h = _mm_madd_epi16(m128Tmp9,
3695
0
                    _mm_load_si128((__m128i *) (transform32x32[4][13])));
3696
0
            E5l = _mm_madd_epi16(m128Tmp10,
3697
0
                    _mm_load_si128((__m128i *) (transform32x32[5][13])));
3698
0
            E5h = _mm_madd_epi16(m128Tmp11,
3699
0
                    _mm_load_si128((__m128i *) (transform32x32[5][13])));
3700
0
            E6l = _mm_madd_epi16(m128Tmp12,
3701
0
                    _mm_load_si128((__m128i *) (transform32x32[6][13])));
3702
0
            E6h = _mm_madd_epi16(m128Tmp13,
3703
0
                    _mm_load_si128((__m128i *) (transform32x32[6][13])));
3704
0
            E7l = _mm_madd_epi16(m128Tmp14,
3705
0
                    _mm_load_si128((__m128i *) (transform32x32[7][13])));
3706
0
            E7h = _mm_madd_epi16(m128Tmp15,
3707
0
                    _mm_load_si128((__m128i *) (transform32x32[7][13])));
3708
3709
0
            O13l = _mm_add_epi32(E0l, E1l);
3710
0
            O13l = _mm_add_epi32(O13l, E2l);
3711
0
            O13l = _mm_add_epi32(O13l, E3l);
3712
0
            O13l = _mm_add_epi32(O13l, E4l);
3713
0
            O13l = _mm_add_epi32(O13l, E5l);
3714
0
            O13l = _mm_add_epi32(O13l, E6l);
3715
0
            O13l = _mm_add_epi32(O13l, E7l);
3716
3717
0
            O13h = _mm_add_epi32(E0h, E1h);
3718
0
            O13h = _mm_add_epi32(O13h, E2h);
3719
0
            O13h = _mm_add_epi32(O13h, E3h);
3720
0
            O13h = _mm_add_epi32(O13h, E4h);
3721
0
            O13h = _mm_add_epi32(O13h, E5h);
3722
0
            O13h = _mm_add_epi32(O13h, E6h);
3723
0
            O13h = _mm_add_epi32(O13h, E7h);
3724
3725
            /* Compute O14  */
3726
3727
0
            E0l = _mm_madd_epi16(m128Tmp0,
3728
0
                    _mm_load_si128((__m128i *) (transform32x32[0][14])));
3729
0
            E0h = _mm_madd_epi16(m128Tmp1,
3730
0
                    _mm_load_si128((__m128i *) (transform32x32[0][14])));
3731
0
            E1l = _mm_madd_epi16(m128Tmp2,
3732
0
                    _mm_load_si128((__m128i *) (transform32x32[1][14])));
3733
0
            E1h = _mm_madd_epi16(m128Tmp3,
3734
0
                    _mm_load_si128((__m128i *) (transform32x32[1][14])));
3735
0
            E2l = _mm_madd_epi16(m128Tmp4,
3736
0
                    _mm_load_si128((__m128i *) (transform32x32[2][14])));
3737
0
            E2h = _mm_madd_epi16(m128Tmp5,
3738
0
                    _mm_load_si128((__m128i *) (transform32x32[2][14])));
3739
0
            E3l = _mm_madd_epi16(m128Tmp6,
3740
0
                    _mm_load_si128((__m128i *) (transform32x32[3][14])));
3741
0
            E3h = _mm_madd_epi16(m128Tmp7,
3742
0
                    _mm_load_si128((__m128i *) (transform32x32[3][14])));
3743
3744
0
            E4l = _mm_madd_epi16(m128Tmp8,
3745
0
                    _mm_load_si128((__m128i *) (transform32x32[4][14])));
3746
0
            E4h = _mm_madd_epi16(m128Tmp9,
3747
0
                    _mm_load_si128((__m128i *) (transform32x32[4][14])));
3748
0
            E5l = _mm_madd_epi16(m128Tmp10,
3749
0
                    _mm_load_si128((__m128i *) (transform32x32[5][14])));
3750
0
            E5h = _mm_madd_epi16(m128Tmp11,
3751
0
                    _mm_load_si128((__m128i *) (transform32x32[5][14])));
3752
0
            E6l = _mm_madd_epi16(m128Tmp12,
3753
0
                    _mm_load_si128((__m128i *) (transform32x32[6][14])));
3754
0
            E6h = _mm_madd_epi16(m128Tmp13,
3755
0
                    _mm_load_si128((__m128i *) (transform32x32[6][14])));
3756
0
            E7l = _mm_madd_epi16(m128Tmp14,
3757
0
                    _mm_load_si128((__m128i *) (transform32x32[7][14])));
3758
0
            E7h = _mm_madd_epi16(m128Tmp15,
3759
0
                    _mm_load_si128((__m128i *) (transform32x32[7][14])));
3760
3761
0
            O14l = _mm_add_epi32(E0l, E1l);
3762
0
            O14l = _mm_add_epi32(O14l, E2l);
3763
0
            O14l = _mm_add_epi32(O14l, E3l);
3764
0
            O14l = _mm_add_epi32(O14l, E4l);
3765
0
            O14l = _mm_add_epi32(O14l, E5l);
3766
0
            O14l = _mm_add_epi32(O14l, E6l);
3767
0
            O14l = _mm_add_epi32(O14l, E7l);
3768
3769
0
            O14h = _mm_add_epi32(E0h, E1h);
3770
0
            O14h = _mm_add_epi32(O14h, E2h);
3771
0
            O14h = _mm_add_epi32(O14h, E3h);
3772
0
            O14h = _mm_add_epi32(O14h, E4h);
3773
0
            O14h = _mm_add_epi32(O14h, E5h);
3774
0
            O14h = _mm_add_epi32(O14h, E6h);
3775
0
            O14h = _mm_add_epi32(O14h, E7h);
3776
3777
            /* Compute O15*/
3778
3779
0
            E0l = _mm_madd_epi16(m128Tmp0,
3780
0
                    _mm_load_si128((__m128i *) (transform32x32[0][15])));
3781
0
            E0h = _mm_madd_epi16(m128Tmp1,
3782
0
                    _mm_load_si128((__m128i *) (transform32x32[0][15])));
3783
0
            E1l = _mm_madd_epi16(m128Tmp2,
3784
0
                    _mm_load_si128((__m128i *) (transform32x32[1][15])));
3785
0
            E1h = _mm_madd_epi16(m128Tmp3,
3786
0
                    _mm_load_si128((__m128i *) (transform32x32[1][15])));
3787
0
            E2l = _mm_madd_epi16(m128Tmp4,
3788
0
                    _mm_load_si128((__m128i *) (transform32x32[2][15])));
3789
0
            E2h = _mm_madd_epi16(m128Tmp5,
3790
0
                    _mm_load_si128((__m128i *) (transform32x32[2][15])));
3791
0
            E3l = _mm_madd_epi16(m128Tmp6,
3792
0
                    _mm_load_si128((__m128i *) (transform32x32[3][15])));
3793
0
            E3h = _mm_madd_epi16(m128Tmp7,
3794
0
                    _mm_load_si128((__m128i *) (transform32x32[3][15])));
3795
3796
0
            E4l = _mm_madd_epi16(m128Tmp8,
3797
0
                    _mm_load_si128((__m128i *) (transform32x32[4][15])));
3798
0
            E4h = _mm_madd_epi16(m128Tmp9,
3799
0
                    _mm_load_si128((__m128i *) (transform32x32[4][15])));
3800
0
            E5l = _mm_madd_epi16(m128Tmp10,
3801
0
                    _mm_load_si128((__m128i *) (transform32x32[5][15])));
3802
0
            E5h = _mm_madd_epi16(m128Tmp11,
3803
0
                    _mm_load_si128((__m128i *) (transform32x32[5][15])));
3804
0
            E6l = _mm_madd_epi16(m128Tmp12,
3805
0
                    _mm_load_si128((__m128i *) (transform32x32[6][15])));
3806
0
            E6h = _mm_madd_epi16(m128Tmp13,
3807
0
                    _mm_load_si128((__m128i *) (transform32x32[6][15])));
3808
0
            E7l = _mm_madd_epi16(m128Tmp14,
3809
0
                    _mm_load_si128((__m128i *) (transform32x32[7][15])));
3810
0
            E7h = _mm_madd_epi16(m128Tmp15,
3811
0
                    _mm_load_si128((__m128i *) (transform32x32[7][15])));
3812
3813
0
            O15l = _mm_add_epi32(E0l, E1l);
3814
0
            O15l = _mm_add_epi32(O15l, E2l);
3815
0
            O15l = _mm_add_epi32(O15l, E3l);
3816
0
            O15l = _mm_add_epi32(O15l, E4l);
3817
0
            O15l = _mm_add_epi32(O15l, E5l);
3818
0
            O15l = _mm_add_epi32(O15l, E6l);
3819
0
            O15l = _mm_add_epi32(O15l, E7l);
3820
3821
0
            O15h = _mm_add_epi32(E0h, E1h);
3822
0
            O15h = _mm_add_epi32(O15h, E2h);
3823
0
            O15h = _mm_add_epi32(O15h, E3h);
3824
0
            O15h = _mm_add_epi32(O15h, E4h);
3825
0
            O15h = _mm_add_epi32(O15h, E5h);
3826
0
            O15h = _mm_add_epi32(O15h, E6h);
3827
0
            O15h = _mm_add_epi32(O15h, E7h);
3828
            /*  Compute E0  */
3829
3830
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
3831
0
            E0l = _mm_madd_epi16(m128Tmp0,
3832
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3833
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
3834
0
            E0h = _mm_madd_epi16(m128Tmp1,
3835
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3836
3837
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
3838
0
            E0l = _mm_add_epi32(E0l,
3839
0
                    _mm_madd_epi16(m128Tmp2,
3840
0
                            _mm_load_si128(
3841
0
                                    (__m128i *) (transform16x16_1[1][0]))));
3842
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
3843
0
            E0h = _mm_add_epi32(E0h,
3844
0
                    _mm_madd_epi16(m128Tmp3,
3845
0
                            _mm_load_si128(
3846
0
                                    (__m128i *) (transform16x16_1[1][0]))));
3847
3848
0
            m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
3849
0
            E0l = _mm_add_epi32(E0l,
3850
0
                    _mm_madd_epi16(m128Tmp4,
3851
0
                            _mm_load_si128(
3852
0
                                    (__m128i *) (transform16x16_1[2][0]))));
3853
0
            m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
3854
0
            E0h = _mm_add_epi32(E0h,
3855
0
                    _mm_madd_epi16(m128Tmp5,
3856
0
                            _mm_load_si128(
3857
0
                                    (__m128i *) (transform16x16_1[2][0]))));
3858
3859
0
            m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
3860
0
            E0l = _mm_add_epi32(E0l,
3861
0
                    _mm_madd_epi16(m128Tmp6,
3862
0
                            _mm_load_si128(
3863
0
                                    (__m128i *) (transform16x16_1[3][0]))));
3864
0
            m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
3865
0
            E0h = _mm_add_epi32(E0h,
3866
0
                    _mm_madd_epi16(m128Tmp7,
3867
0
                            _mm_load_si128(
3868
0
                                    (__m128i *) (transform16x16_1[3][0]))));
3869
3870
            /*  Compute E1  */
3871
0
            E1l = _mm_madd_epi16(m128Tmp0,
3872
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3873
0
            E1h = _mm_madd_epi16(m128Tmp1,
3874
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3875
0
            E1l = _mm_add_epi32(E1l,
3876
0
                    _mm_madd_epi16(m128Tmp2,
3877
0
                            _mm_load_si128(
3878
0
                                    (__m128i *) (transform16x16_1[1][1]))));
3879
0
            E1h = _mm_add_epi32(E1h,
3880
0
                    _mm_madd_epi16(m128Tmp3,
3881
0
                            _mm_load_si128(
3882
0
                                    (__m128i *) (transform16x16_1[1][1]))));
3883
0
            E1l = _mm_add_epi32(E1l,
3884
0
                    _mm_madd_epi16(m128Tmp4,
3885
0
                            _mm_load_si128(
3886
0
                                    (__m128i *) (transform16x16_1[2][1]))));
3887
0
            E1h = _mm_add_epi32(E1h,
3888
0
                    _mm_madd_epi16(m128Tmp5,
3889
0
                            _mm_load_si128(
3890
0
                                    (__m128i *) (transform16x16_1[2][1]))));
3891
0
            E1l = _mm_add_epi32(E1l,
3892
0
                    _mm_madd_epi16(m128Tmp6,
3893
0
                            _mm_load_si128(
3894
0
                                    (__m128i *) (transform16x16_1[3][1]))));
3895
0
            E1h = _mm_add_epi32(E1h,
3896
0
                    _mm_madd_epi16(m128Tmp7,
3897
0
                            _mm_load_si128(
3898
0
                                    (__m128i *) (transform16x16_1[3][1]))));
3899
3900
            /*  Compute E2  */
3901
0
            E2l = _mm_madd_epi16(m128Tmp0,
3902
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3903
0
            E2h = _mm_madd_epi16(m128Tmp1,
3904
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3905
0
            E2l = _mm_add_epi32(E2l,
3906
0
                    _mm_madd_epi16(m128Tmp2,
3907
0
                            _mm_load_si128(
3908
0
                                    (__m128i *) (transform16x16_1[1][2]))));
3909
0
            E2h = _mm_add_epi32(E2h,
3910
0
                    _mm_madd_epi16(m128Tmp3,
3911
0
                            _mm_load_si128(
3912
0
                                    (__m128i *) (transform16x16_1[1][2]))));
3913
0
            E2l = _mm_add_epi32(E2l,
3914
0
                    _mm_madd_epi16(m128Tmp4,
3915
0
                            _mm_load_si128(
3916
0
                                    (__m128i *) (transform16x16_1[2][2]))));
3917
0
            E2h = _mm_add_epi32(E2h,
3918
0
                    _mm_madd_epi16(m128Tmp5,
3919
0
                            _mm_load_si128(
3920
0
                                    (__m128i *) (transform16x16_1[2][2]))));
3921
0
            E2l = _mm_add_epi32(E2l,
3922
0
                    _mm_madd_epi16(m128Tmp6,
3923
0
                            _mm_load_si128(
3924
0
                                    (__m128i *) (transform16x16_1[3][2]))));
3925
0
            E2h = _mm_add_epi32(E2h,
3926
0
                    _mm_madd_epi16(m128Tmp7,
3927
0
                            _mm_load_si128(
3928
0
                                    (__m128i *) (transform16x16_1[3][2]))));
3929
3930
            /*  Compute E3  */
3931
0
            E3l = _mm_madd_epi16(m128Tmp0,
3932
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3933
0
            E3h = _mm_madd_epi16(m128Tmp1,
3934
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3935
0
            E3l = _mm_add_epi32(E3l,
3936
0
                    _mm_madd_epi16(m128Tmp2,
3937
0
                            _mm_load_si128(
3938
0
                                    (__m128i *) (transform16x16_1[1][3]))));
3939
0
            E3h = _mm_add_epi32(E3h,
3940
0
                    _mm_madd_epi16(m128Tmp3,
3941
0
                            _mm_load_si128(
3942
0
                                    (__m128i *) (transform16x16_1[1][3]))));
3943
0
            E3l = _mm_add_epi32(E3l,
3944
0
                    _mm_madd_epi16(m128Tmp4,
3945
0
                            _mm_load_si128(
3946
0
                                    (__m128i *) (transform16x16_1[2][3]))));
3947
0
            E3h = _mm_add_epi32(E3h,
3948
0
                    _mm_madd_epi16(m128Tmp5,
3949
0
                            _mm_load_si128(
3950
0
                                    (__m128i *) (transform16x16_1[2][3]))));
3951
0
            E3l = _mm_add_epi32(E3l,
3952
0
                    _mm_madd_epi16(m128Tmp6,
3953
0
                            _mm_load_si128(
3954
0
                                    (__m128i *) (transform16x16_1[3][3]))));
3955
0
            E3h = _mm_add_epi32(E3h,
3956
0
                    _mm_madd_epi16(m128Tmp7,
3957
0
                            _mm_load_si128(
3958
0
                                    (__m128i *) (transform16x16_1[3][3]))));
3959
3960
            /*  Compute E4  */
3961
0
            E4l = _mm_madd_epi16(m128Tmp0,
3962
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3963
0
            E4h = _mm_madd_epi16(m128Tmp1,
3964
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3965
0
            E4l = _mm_add_epi32(E4l,
3966
0
                    _mm_madd_epi16(m128Tmp2,
3967
0
                            _mm_load_si128(
3968
0
                                    (__m128i *) (transform16x16_1[1][4]))));
3969
0
            E4h = _mm_add_epi32(E4h,
3970
0
                    _mm_madd_epi16(m128Tmp3,
3971
0
                            _mm_load_si128(
3972
0
                                    (__m128i *) (transform16x16_1[1][4]))));
3973
0
            E4l = _mm_add_epi32(E4l,
3974
0
                    _mm_madd_epi16(m128Tmp4,
3975
0
                            _mm_load_si128(
3976
0
                                    (__m128i *) (transform16x16_1[2][4]))));
3977
0
            E4h = _mm_add_epi32(E4h,
3978
0
                    _mm_madd_epi16(m128Tmp5,
3979
0
                            _mm_load_si128(
3980
0
                                    (__m128i *) (transform16x16_1[2][4]))));
3981
0
            E4l = _mm_add_epi32(E4l,
3982
0
                    _mm_madd_epi16(m128Tmp6,
3983
0
                            _mm_load_si128(
3984
0
                                    (__m128i *) (transform16x16_1[3][4]))));
3985
0
            E4h = _mm_add_epi32(E4h,
3986
0
                    _mm_madd_epi16(m128Tmp7,
3987
0
                            _mm_load_si128(
3988
0
                                    (__m128i *) (transform16x16_1[3][4]))));
3989
3990
            /*  Compute E5  */
3991
0
            E5l = _mm_madd_epi16(m128Tmp0,
3992
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
3993
0
            E5h = _mm_madd_epi16(m128Tmp1,
3994
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
3995
0
            E5l = _mm_add_epi32(E5l,
3996
0
                    _mm_madd_epi16(m128Tmp2,
3997
0
                            _mm_load_si128(
3998
0
                                    (__m128i *) (transform16x16_1[1][5]))));
3999
0
            E5h = _mm_add_epi32(E5h,
4000
0
                    _mm_madd_epi16(m128Tmp3,
4001
0
                            _mm_load_si128(
4002
0
                                    (__m128i *) (transform16x16_1[1][5]))));
4003
0
            E5l = _mm_add_epi32(E5l,
4004
0
                    _mm_madd_epi16(m128Tmp4,
4005
0
                            _mm_load_si128(
4006
0
                                    (__m128i *) (transform16x16_1[2][5]))));
4007
0
            E5h = _mm_add_epi32(E5h,
4008
0
                    _mm_madd_epi16(m128Tmp5,
4009
0
                            _mm_load_si128(
4010
0
                                    (__m128i *) (transform16x16_1[2][5]))));
4011
0
            E5l = _mm_add_epi32(E5l,
4012
0
                    _mm_madd_epi16(m128Tmp6,
4013
0
                            _mm_load_si128(
4014
0
                                    (__m128i *) (transform16x16_1[3][5]))));
4015
0
            E5h = _mm_add_epi32(E5h,
4016
0
                    _mm_madd_epi16(m128Tmp7,
4017
0
                            _mm_load_si128(
4018
0
                                    (__m128i *) (transform16x16_1[3][5]))));
4019
4020
            /*  Compute E6  */
4021
0
            E6l = _mm_madd_epi16(m128Tmp0,
4022
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4023
0
            E6h = _mm_madd_epi16(m128Tmp1,
4024
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4025
0
            E6l = _mm_add_epi32(E6l,
4026
0
                    _mm_madd_epi16(m128Tmp2,
4027
0
                            _mm_load_si128(
4028
0
                                    (__m128i *) (transform16x16_1[1][6]))));
4029
0
            E6h = _mm_add_epi32(E6h,
4030
0
                    _mm_madd_epi16(m128Tmp3,
4031
0
                            _mm_load_si128(
4032
0
                                    (__m128i *) (transform16x16_1[1][6]))));
4033
0
            E6l = _mm_add_epi32(E6l,
4034
0
                    _mm_madd_epi16(m128Tmp4,
4035
0
                            _mm_load_si128(
4036
0
                                    (__m128i *) (transform16x16_1[2][6]))));
4037
0
            E6h = _mm_add_epi32(E6h,
4038
0
                    _mm_madd_epi16(m128Tmp5,
4039
0
                            _mm_load_si128(
4040
0
                                    (__m128i *) (transform16x16_1[2][6]))));
4041
0
            E6l = _mm_add_epi32(E6l,
4042
0
                    _mm_madd_epi16(m128Tmp6,
4043
0
                            _mm_load_si128(
4044
0
                                    (__m128i *) (transform16x16_1[3][6]))));
4045
0
            E6h = _mm_add_epi32(E6h,
4046
0
                    _mm_madd_epi16(m128Tmp7,
4047
0
                            _mm_load_si128(
4048
0
                                    (__m128i *) (transform16x16_1[3][6]))));
4049
4050
            /*  Compute E7  */
4051
0
            E7l = _mm_madd_epi16(m128Tmp0,
4052
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4053
0
            E7h = _mm_madd_epi16(m128Tmp1,
4054
0
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4055
0
            E7l = _mm_add_epi32(E7l,
4056
0
                    _mm_madd_epi16(m128Tmp2,
4057
0
                            _mm_load_si128(
4058
0
                                    (__m128i *) (transform16x16_1[1][7]))));
4059
0
            E7h = _mm_add_epi32(E7h,
4060
0
                    _mm_madd_epi16(m128Tmp3,
4061
0
                            _mm_load_si128(
4062
0
                                    (__m128i *) (transform16x16_1[1][7]))));
4063
0
            E7l = _mm_add_epi32(E7l,
4064
0
                    _mm_madd_epi16(m128Tmp4,
4065
0
                            _mm_load_si128(
4066
0
                                    (__m128i *) (transform16x16_1[2][7]))));
4067
0
            E7h = _mm_add_epi32(E7h,
4068
0
                    _mm_madd_epi16(m128Tmp5,
4069
0
                            _mm_load_si128(
4070
0
                                    (__m128i *) (transform16x16_1[2][7]))));
4071
0
            E7l = _mm_add_epi32(E7l,
4072
0
                    _mm_madd_epi16(m128Tmp6,
4073
0
                            _mm_load_si128(
4074
0
                                    (__m128i *) (transform16x16_1[3][7]))));
4075
0
            E7h = _mm_add_epi32(E7h,
4076
0
                    _mm_madd_epi16(m128Tmp7,
4077
0
                            _mm_load_si128(
4078
0
                                    (__m128i *) (transform16x16_1[3][7]))));
4079
4080
            /*  Compute E00 .. E03  */
4081
4082
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
4083
0
            E00l = _mm_madd_epi16(m128Tmp0,
4084
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4085
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
4086
0
            E00h = _mm_madd_epi16(m128Tmp1,
4087
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4088
4089
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
4090
0
            E00l = _mm_add_epi32(E00l,
4091
0
                    _mm_madd_epi16(m128Tmp2,
4092
0
                            _mm_load_si128(
4093
0
                                    (__m128i *) (transform16x16_2[1][0]))));
4094
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
4095
0
            E00h = _mm_add_epi32(E00h,
4096
0
                    _mm_madd_epi16(m128Tmp3,
4097
0
                            _mm_load_si128(
4098
0
                                    (__m128i *) (transform16x16_2[1][0]))));
4099
4100
0
            E01l = _mm_madd_epi16(m128Tmp0,
4101
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4102
0
            E01h = _mm_madd_epi16(m128Tmp1,
4103
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4104
0
            E01l = _mm_add_epi32(E01l,
4105
0
                    _mm_madd_epi16(m128Tmp2,
4106
0
                            _mm_load_si128(
4107
0
                                    (__m128i *) (transform16x16_2[1][1]))));
4108
0
            E01h = _mm_add_epi32(E01h,
4109
0
                    _mm_madd_epi16(m128Tmp3,
4110
0
                            _mm_load_si128(
4111
0
                                    (__m128i *) (transform16x16_2[1][1]))));
4112
4113
0
            E02l = _mm_madd_epi16(m128Tmp0,
4114
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4115
0
            E02h = _mm_madd_epi16(m128Tmp1,
4116
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4117
0
            E02l = _mm_add_epi32(E02l,
4118
0
                    _mm_madd_epi16(m128Tmp2,
4119
0
                            _mm_load_si128(
4120
0
                                    (__m128i *) (transform16x16_2[1][2]))));
4121
0
            E02h = _mm_add_epi32(E02h,
4122
0
                    _mm_madd_epi16(m128Tmp3,
4123
0
                            _mm_load_si128(
4124
0
                                    (__m128i *) (transform16x16_2[1][2]))));
4125
4126
0
            E03l = _mm_madd_epi16(m128Tmp0,
4127
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4128
0
            E03h = _mm_madd_epi16(m128Tmp1,
4129
0
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4130
0
            E03l = _mm_add_epi32(E03l,
4131
0
                    _mm_madd_epi16(m128Tmp2,
4132
0
                            _mm_load_si128(
4133
0
                                    (__m128i *) (transform16x16_2[1][3]))));
4134
0
            E03h = _mm_add_epi32(E03h,
4135
0
                    _mm_madd_epi16(m128Tmp3,
4136
0
                            _mm_load_si128(
4137
0
                                    (__m128i *) (transform16x16_2[1][3]))));
4138
4139
            /*  Compute EE0 and EEE */
4140
4141
0
            m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
4142
0
            EE0l = _mm_madd_epi16(m128Tmp0,
4143
0
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4144
0
            m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
4145
0
            EE0h = _mm_madd_epi16(m128Tmp1,
4146
0
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4147
4148
0
            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
4149
0
            EEE0l = _mm_madd_epi16(m128Tmp2,
4150
0
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4151
0
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
4152
0
            EEE0h = _mm_madd_epi16(m128Tmp3,
4153
0
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4154
4155
0
            EE1l = _mm_madd_epi16(m128Tmp0,
4156
0
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4157
0
            EE1h = _mm_madd_epi16(m128Tmp1,
4158
0
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4159
4160
0
            EEE1l = _mm_madd_epi16(m128Tmp2,
4161
0
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4162
0
            EEE1h = _mm_madd_epi16(m128Tmp3,
4163
0
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4164
4165
            /*  Compute EE    */
4166
4167
0
            EE2l = _mm_sub_epi32(EEE1l, EE1l);
4168
0
            EE3l = _mm_sub_epi32(EEE0l, EE0l);
4169
0
            EE2h = _mm_sub_epi32(EEE1h, EE1h);
4170
0
            EE3h = _mm_sub_epi32(EEE0h, EE0h);
4171
4172
0
            EE0l = _mm_add_epi32(EEE0l, EE0l);
4173
0
            EE1l = _mm_add_epi32(EEE1l, EE1l);
4174
0
            EE0h = _mm_add_epi32(EEE0h, EE0h);
4175
0
            EE1h = _mm_add_epi32(EEE1h, EE1h);
4176
            /**/
4177
4178
0
            EE7l = _mm_sub_epi32(EE0l, E00l);
4179
0
            EE6l = _mm_sub_epi32(EE1l, E01l);
4180
0
            EE5l = _mm_sub_epi32(EE2l, E02l);
4181
0
            EE4l = _mm_sub_epi32(EE3l, E03l);
4182
4183
0
            EE7h = _mm_sub_epi32(EE0h, E00h);
4184
0
            EE6h = _mm_sub_epi32(EE1h, E01h);
4185
0
            EE5h = _mm_sub_epi32(EE2h, E02h);
4186
0
            EE4h = _mm_sub_epi32(EE3h, E03h);
4187
4188
0
            EE0l = _mm_add_epi32(EE0l, E00l);
4189
0
            EE1l = _mm_add_epi32(EE1l, E01l);
4190
0
            EE2l = _mm_add_epi32(EE2l, E02l);
4191
0
            EE3l = _mm_add_epi32(EE3l, E03l);
4192
4193
0
            EE0h = _mm_add_epi32(EE0h, E00h);
4194
0
            EE1h = _mm_add_epi32(EE1h, E01h);
4195
0
            EE2h = _mm_add_epi32(EE2h, E02h);
4196
0
            EE3h = _mm_add_epi32(EE3h, E03h);
4197
            /*      Compute E       */
4198
4199
0
            E15l = _mm_sub_epi32(EE0l, E0l);
4200
0
            E15l = _mm_add_epi32(E15l, m128iAdd);
4201
0
            E14l = _mm_sub_epi32(EE1l, E1l);
4202
0
            E14l = _mm_add_epi32(E14l, m128iAdd);
4203
0
            E13l = _mm_sub_epi32(EE2l, E2l);
4204
0
            E13l = _mm_add_epi32(E13l, m128iAdd);
4205
0
            E12l = _mm_sub_epi32(EE3l, E3l);
4206
0
            E12l = _mm_add_epi32(E12l, m128iAdd);
4207
0
            E11l = _mm_sub_epi32(EE4l, E4l);
4208
0
            E11l = _mm_add_epi32(E11l, m128iAdd);
4209
0
            E10l = _mm_sub_epi32(EE5l, E5l);
4210
0
            E10l = _mm_add_epi32(E10l, m128iAdd);
4211
0
            E9l = _mm_sub_epi32(EE6l, E6l);
4212
0
            E9l = _mm_add_epi32(E9l, m128iAdd);
4213
0
            E8l = _mm_sub_epi32(EE7l, E7l);
4214
0
            E8l = _mm_add_epi32(E8l, m128iAdd);
4215
4216
0
            E0l = _mm_add_epi32(EE0l, E0l);
4217
0
            E0l = _mm_add_epi32(E0l, m128iAdd);
4218
0
            E1l = _mm_add_epi32(EE1l, E1l);
4219
0
            E1l = _mm_add_epi32(E1l, m128iAdd);
4220
0
            E2l = _mm_add_epi32(EE2l, E2l);
4221
0
            E2l = _mm_add_epi32(E2l, m128iAdd);
4222
0
            E3l = _mm_add_epi32(EE3l, E3l);
4223
0
            E3l = _mm_add_epi32(E3l, m128iAdd);
4224
0
            E4l = _mm_add_epi32(EE4l, E4l);
4225
0
            E4l = _mm_add_epi32(E4l, m128iAdd);
4226
0
            E5l = _mm_add_epi32(EE5l, E5l);
4227
0
            E5l = _mm_add_epi32(E5l, m128iAdd);
4228
0
            E6l = _mm_add_epi32(EE6l, E6l);
4229
0
            E6l = _mm_add_epi32(E6l, m128iAdd);
4230
0
            E7l = _mm_add_epi32(EE7l, E7l);
4231
0
            E7l = _mm_add_epi32(E7l, m128iAdd);
4232
4233
0
            E15h = _mm_sub_epi32(EE0h, E0h);
4234
0
            E15h = _mm_add_epi32(E15h, m128iAdd);
4235
0
            E14h = _mm_sub_epi32(EE1h, E1h);
4236
0
            E14h = _mm_add_epi32(E14h, m128iAdd);
4237
0
            E13h = _mm_sub_epi32(EE2h, E2h);
4238
0
            E13h = _mm_add_epi32(E13h, m128iAdd);
4239
0
            E12h = _mm_sub_epi32(EE3h, E3h);
4240
0
            E12h = _mm_add_epi32(E12h, m128iAdd);
4241
0
            E11h = _mm_sub_epi32(EE4h, E4h);
4242
0
            E11h = _mm_add_epi32(E11h, m128iAdd);
4243
0
            E10h = _mm_sub_epi32(EE5h, E5h);
4244
0
            E10h = _mm_add_epi32(E10h, m128iAdd);
4245
0
            E9h = _mm_sub_epi32(EE6h, E6h);
4246
0
            E9h = _mm_add_epi32(E9h, m128iAdd);
4247
0
            E8h = _mm_sub_epi32(EE7h, E7h);
4248
0
            E8h = _mm_add_epi32(E8h, m128iAdd);
4249
4250
0
            E0h = _mm_add_epi32(EE0h, E0h);
4251
0
            E0h = _mm_add_epi32(E0h, m128iAdd);
4252
0
            E1h = _mm_add_epi32(EE1h, E1h);
4253
0
            E1h = _mm_add_epi32(E1h, m128iAdd);
4254
0
            E2h = _mm_add_epi32(EE2h, E2h);
4255
0
            E2h = _mm_add_epi32(E2h, m128iAdd);
4256
0
            E3h = _mm_add_epi32(EE3h, E3h);
4257
0
            E3h = _mm_add_epi32(E3h, m128iAdd);
4258
0
            E4h = _mm_add_epi32(EE4h, E4h);
4259
0
            E4h = _mm_add_epi32(E4h, m128iAdd);
4260
0
            E5h = _mm_add_epi32(EE5h, E5h);
4261
0
            E5h = _mm_add_epi32(E5h, m128iAdd);
4262
0
            E6h = _mm_add_epi32(EE6h, E6h);
4263
0
            E6h = _mm_add_epi32(E6h, m128iAdd);
4264
0
            E7h = _mm_add_epi32(EE7h, E7h);
4265
0
            E7h = _mm_add_epi32(E7h, m128iAdd);
4266
4267
0
            m128iS0 = _mm_packs_epi32(
4268
0
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
4269
0
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
4270
0
            m128iS1 = _mm_packs_epi32(
4271
0
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
4272
0
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
4273
0
            m128iS2 = _mm_packs_epi32(
4274
0
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
4275
0
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
4276
0
            m128iS3 = _mm_packs_epi32(
4277
0
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
4278
0
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
4279
0
            m128iS4 = _mm_packs_epi32(
4280
0
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
4281
0
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
4282
0
            m128iS5 = _mm_packs_epi32(
4283
0
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
4284
0
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
4285
0
            m128iS6 = _mm_packs_epi32(
4286
0
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
4287
0
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
4288
0
            m128iS7 = _mm_packs_epi32(
4289
0
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
4290
0
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
4291
0
            m128iS8 = _mm_packs_epi32(
4292
0
                    _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
4293
0
                    _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
4294
0
            m128iS9 = _mm_packs_epi32(
4295
0
                    _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
4296
0
                    _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
4297
0
            m128iS10 = _mm_packs_epi32(
4298
0
                    _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
4299
0
                    _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
4300
0
            m128iS11 = _mm_packs_epi32(
4301
0
                    _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
4302
0
                    _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
4303
0
            m128iS12 = _mm_packs_epi32(
4304
0
                    _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
4305
0
                    _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
4306
0
            m128iS13 = _mm_packs_epi32(
4307
0
                    _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
4308
0
                    _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
4309
0
            m128iS14 = _mm_packs_epi32(
4310
0
                    _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
4311
0
                    _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
4312
0
            m128iS15 = _mm_packs_epi32(
4313
0
                    _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
4314
0
                    _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
4315
4316
0
            m128iS31 = _mm_packs_epi32(
4317
0
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
4318
0
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
4319
0
            m128iS30 = _mm_packs_epi32(
4320
0
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
4321
0
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
4322
0
            m128iS29 = _mm_packs_epi32(
4323
0
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
4324
0
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
4325
0
            m128iS28 = _mm_packs_epi32(
4326
0
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
4327
0
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
4328
0
            m128iS27 = _mm_packs_epi32(
4329
0
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
4330
0
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
4331
0
            m128iS26 = _mm_packs_epi32(
4332
0
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
4333
0
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
4334
0
            m128iS25 = _mm_packs_epi32(
4335
0
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
4336
0
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
4337
0
            m128iS24 = _mm_packs_epi32(
4338
0
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
4339
0
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
4340
0
            m128iS23 = _mm_packs_epi32(
4341
0
                    _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
4342
0
                    _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
4343
0
            m128iS22 = _mm_packs_epi32(
4344
0
                    _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
4345
0
                    _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
4346
0
            m128iS21 = _mm_packs_epi32(
4347
0
                    _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
4348
0
                    _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
4349
0
            m128iS20 = _mm_packs_epi32(
4350
0
                    _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
4351
0
                    _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
4352
0
            m128iS19 = _mm_packs_epi32(
4353
0
                    _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
4354
0
                    _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
4355
0
            m128iS18 = _mm_packs_epi32(
4356
0
                    _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
4357
0
                    _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
4358
0
            m128iS17 = _mm_packs_epi32(
4359
0
                    _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
4360
0
                    _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
4361
0
            m128iS16 = _mm_packs_epi32(
4362
0
                    _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
4363
0
                    _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
4364
4365
0
            if (!j) {
4366
                /*      Inverse the matrix      */
4367
0
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
4368
0
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
4369
0
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
4370
0
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
4371
0
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
4372
0
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
4373
0
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
4374
0
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
4375
0
                E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
4376
0
                E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
4377
0
                E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
4378
0
                E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
4379
0
                E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
4380
0
                E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
4381
0
                E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
4382
0
                E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
4383
4384
0
                O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
4385
0
                O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
4386
0
                O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
4387
0
                O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
4388
0
                O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
4389
0
                O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
4390
0
                O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
4391
0
                O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
4392
0
                O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
4393
0
                O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
4394
0
                O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
4395
0
                O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
4396
0
                O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
4397
0
                O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
4398
0
                O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
4399
0
                O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
4400
4401
0
                E0h = _mm_unpacklo_epi16(E0l, E8l);
4402
0
                E1h = _mm_unpacklo_epi16(E1l, E9l);
4403
0
                E2h = _mm_unpacklo_epi16(E2l, E10l);
4404
0
                E3h = _mm_unpacklo_epi16(E3l, E11l);
4405
0
                E4h = _mm_unpacklo_epi16(E4l, E12l);
4406
0
                E5h = _mm_unpacklo_epi16(E5l, E13l);
4407
0
                E6h = _mm_unpacklo_epi16(E6l, E14l);
4408
0
                E7h = _mm_unpacklo_epi16(E7l, E15l);
4409
4410
0
                E8h = _mm_unpackhi_epi16(E0l, E8l);
4411
0
                E9h = _mm_unpackhi_epi16(E1l, E9l);
4412
0
                E10h = _mm_unpackhi_epi16(E2l, E10l);
4413
0
                E11h = _mm_unpackhi_epi16(E3l, E11l);
4414
0
                E12h = _mm_unpackhi_epi16(E4l, E12l);
4415
0
                E13h = _mm_unpackhi_epi16(E5l, E13l);
4416
0
                E14h = _mm_unpackhi_epi16(E6l, E14l);
4417
0
                E15h = _mm_unpackhi_epi16(E7l, E15l);
4418
4419
0
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4420
0
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4421
0
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4422
0
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4423
4424
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4425
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4426
0
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4427
0
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4428
4429
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4430
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4431
0
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4432
0
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4433
4434
0
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4435
0
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4436
0
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4437
0
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4438
4439
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4440
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4441
0
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4442
0
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4443
4444
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4445
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4446
0
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4447
0
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4448
4449
0
                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
4450
0
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
4451
0
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
4452
0
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
4453
4454
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4455
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4456
0
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4457
0
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4458
4459
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4460
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4461
0
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4462
0
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4463
4464
0
                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
4465
0
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
4466
0
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
4467
0
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
4468
4469
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4470
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4471
0
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4472
0
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4473
4474
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4475
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4476
0
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4477
0
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4478
4479
                /*  */
4480
0
                E0h = _mm_unpacklo_epi16(O0l, O8l);
4481
0
                E1h = _mm_unpacklo_epi16(O1l, O9l);
4482
0
                E2h = _mm_unpacklo_epi16(O2l, O10l);
4483
0
                E3h = _mm_unpacklo_epi16(O3l, O11l);
4484
0
                E4h = _mm_unpacklo_epi16(O4l, O12l);
4485
0
                E5h = _mm_unpacklo_epi16(O5l, O13l);
4486
0
                E6h = _mm_unpacklo_epi16(O6l, O14l);
4487
0
                E7h = _mm_unpacklo_epi16(O7l, O15l);
4488
4489
0
                E8h = _mm_unpackhi_epi16(O0l, O8l);
4490
0
                E9h = _mm_unpackhi_epi16(O1l, O9l);
4491
0
                E10h = _mm_unpackhi_epi16(O2l, O10l);
4492
0
                E11h = _mm_unpackhi_epi16(O3l, O11l);
4493
0
                E12h = _mm_unpackhi_epi16(O4l, O12l);
4494
0
                E13h = _mm_unpackhi_epi16(O5l, O13l);
4495
0
                E14h = _mm_unpackhi_epi16(O6l, O14l);
4496
0
                E15h = _mm_unpackhi_epi16(O7l, O15l);
4497
4498
0
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4499
0
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4500
0
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4501
0
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4502
4503
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4504
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4505
0
                m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4506
0
                m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4507
4508
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4509
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4510
0
                m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4511
0
                m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4512
4513
0
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4514
0
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4515
0
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4516
0
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4517
4518
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4519
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4520
0
                m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4521
0
                m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4522
4523
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4524
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4525
0
                m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4526
0
                m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4527
4528
0
                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
4529
0
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
4530
0
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
4531
0
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
4532
4533
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4534
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4535
0
                m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4536
0
                m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4537
4538
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4539
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4540
0
                m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4541
0
                m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4542
4543
0
                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
4544
0
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
4545
0
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
4546
0
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
4547
4548
0
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4549
0
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4550
0
                m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4551
0
                m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4552
4553
0
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4554
0
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4555
0
                m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4556
0
                m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4557
4558
0
                if(i==0){
4559
0
                    int k = 8;
4560
0
                    r0=m128iS0;
4561
0
                    r1=m128iS1;
4562
0
                    r2=m128iS2;
4563
0
                    r3=m128iS3;
4564
0
                    r4=m128iS4;
4565
0
                    r5=m128iS5;
4566
0
                    r6=m128iS6;
4567
0
                    r7=m128iS7;
4568
0
                    r8=m128iS8;
4569
0
                    r9=m128iS9;
4570
0
                    r10=m128iS10;
4571
0
                    r11=m128iS11;
4572
0
                    r12=m128iS12;
4573
0
                    r13=m128iS13;
4574
0
                    r14=m128iS14;
4575
0
                    r15=m128iS15;
4576
0
                    r16=m128iS16;
4577
0
                    r17=m128iS17;
4578
0
                    r18=m128iS18;
4579
0
                    r19=m128iS19;
4580
0
                    r20=m128iS20;
4581
0
                    r21=m128iS21;
4582
0
                    r22=m128iS22;
4583
0
                    r23=m128iS23;
4584
0
                    r24=m128iS24;
4585
0
                    r25=m128iS25;
4586
0
                    r26=m128iS26;
4587
0
                    r27=m128iS27;
4588
0
                    r28=m128iS28;
4589
0
                    r29=m128iS29;
4590
0
                    r30=m128iS30;
4591
0
                    r31=m128iS31;
4592
0
                    m128iS0 = _mm_load_si128((__m128i *) (src + k));
4593
0
                    m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
4594
0
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
4595
0
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
4596
0
                    m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
4597
0
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
4598
0
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
4599
0
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
4600
0
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
4601
0
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
4602
0
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
4603
0
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
4604
0
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
4605
0
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
4606
0
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
4607
0
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));
4608
4609
0
                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
4610
0
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
4611
0
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
4612
0
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
4613
0
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
4614
0
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
4615
0
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
4616
0
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
4617
0
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
4618
0
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
4619
0
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
4620
0
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
4621
0
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
4622
0
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
4623
0
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
4624
0
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
4625
4626
0
                }else if(i ==8){
4627
4628
0
                    r32=m128iS0;
4629
0
                    r33=m128iS1;
4630
0
                    r34=m128iS2;
4631
0
                    r35=m128iS3;
4632
0
                    r36=m128iS4;
4633
0
                    r37=m128iS5;
4634
0
                    r38=m128iS6;
4635
0
                    r39=m128iS7;
4636
0
                    r40=m128iS8;
4637
0
                    r41=m128iS9;
4638
0
                    r42=m128iS10;
4639
0
                    r43=m128iS11;
4640
0
                    r44=m128iS12;
4641
0
                    r45=m128iS13;
4642
0
                    r46=m128iS14;
4643
0
                    r47=m128iS15;
4644
0
                    r48=m128iS16;
4645
0
                    r49=m128iS17;
4646
0
                    r50=m128iS18;
4647
0
                    r51=m128iS19;
4648
0
                    r52=m128iS20;
4649
0
                    r53=m128iS21;
4650
0
                    r54=m128iS22;
4651
0
                    r55=m128iS23;
4652
0
                    r56=m128iS24;
4653
0
                    r57=m128iS25;
4654
0
                    r58=m128iS26;
4655
0
                    r59=m128iS27;
4656
0
                    r60=m128iS28;
4657
0
                    r61=m128iS29;
4658
0
                    r62=m128iS30;
4659
0
                    r63=m128iS31;
4660
4661
0
                    m128iS0 = _mm_load_si128((__m128i *) (src + 16));
4662
0
                    m128iS1 = _mm_load_si128((__m128i *) (src + 48));
4663
0
                    m128iS2 = _mm_load_si128((__m128i *) (src + 80));
4664
0
                    m128iS3 = _mm_load_si128((__m128i *) (src + 112));
4665
0
                    m128iS4 = _mm_load_si128((__m128i *) (src + 144));
4666
0
                    m128iS5 = _mm_load_si128((__m128i *) (src + 176));
4667
0
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 16));
4668
0
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 16));
4669
0
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 16));
4670
0
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 16));
4671
0
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 16));
4672
0
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 16));
4673
0
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 16));
4674
0
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 16));
4675
0
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 16));
4676
0
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 16));
4677
4678
0
                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 16));
4679
0
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 16));
4680
0
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 16));
4681
0
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 16));
4682
0
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 16));
4683
0
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 16));
4684
0
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 16));
4685
0
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 16));
4686
0
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 16));
4687
0
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 16));
4688
0
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 16));
4689
0
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 16));
4690
0
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 16));
4691
0
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 16));
4692
0
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 16));
4693
0
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 16));
4694
4695
4696
0
                }else if(i ==16){
4697
4698
0
                    r64=m128iS0;
4699
0
                    r65=m128iS1;
4700
0
                    r66=m128iS2;
4701
0
                    r67=m128iS3;
4702
0
                    r68=m128iS4;
4703
0
                    r69=m128iS5;
4704
0
                    r70=m128iS6;
4705
0
                    r71=m128iS7;
4706
0
                    r72=m128iS8;
4707
0
                    r73=m128iS9;
4708
0
                    r74=m128iS10;
4709
0
                    r75=m128iS11;
4710
0
                    r76=m128iS12;
4711
0
                    r77=m128iS13;
4712
0
                    r78=m128iS14;
4713
0
                    r79=m128iS15;
4714
0
                    r80=m128iS16;
4715
0
                    r81=m128iS17;
4716
0
                    r82=m128iS18;
4717
0
                    r83=m128iS19;
4718
0
                    r84=m128iS20;
4719
0
                    r85=m128iS21;
4720
0
                    r86=m128iS22;
4721
0
                    r87=m128iS23;
4722
0
                    r88=m128iS24;
4723
0
                    r89=m128iS25;
4724
0
                    r90=m128iS26;
4725
0
                    r91=m128iS27;
4726
0
                    r92=m128iS28;
4727
0
                    r93=m128iS29;
4728
0
                    r94=m128iS30;
4729
0
                    r95=m128iS31;
4730
4731
0
                    m128iS0 = _mm_load_si128((__m128i *) (src + 24));
4732
0
                    m128iS1 = _mm_load_si128((__m128i *) (src + 56));
4733
0
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64 + 24));
4734
0
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96 + 24));
4735
0
                    m128iS4 = _mm_load_si128((__m128i *) (src + 128 + 24));
4736
0
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160 + 24));
4737
0
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 24));
4738
0
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 24));
4739
0
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 24));
4740
0
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 24));
4741
0
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 24));
4742
0
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 24));
4743
0
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 24));
4744
0
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 24));
4745
0
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 24));
4746
0
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 24));
4747
4748
0
                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 24));
4749
0
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 24));
4750
0
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 24));
4751
0
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 24));
4752
0
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 24));
4753
0
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 24));
4754
0
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 24));
4755
0
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 24));
4756
0
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 24));
4757
0
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 24));
4758
0
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 24));
4759
0
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 24));
4760
0
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 24));
4761
0
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 24));
4762
0
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 24));
4763
0
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 24));
4764
4765
0
                }else{
4766
0
                    r96=m128iS0;
4767
0
                    r97=m128iS1;
4768
0
                    r98=m128iS2;
4769
0
                    r99=m128iS3;
4770
0
                    r100=m128iS4;
4771
0
                    r101=m128iS5;
4772
0
                    r102=m128iS6;
4773
0
                    r103=m128iS7;
4774
0
                    r104=m128iS8;
4775
0
                    r105=m128iS9;
4776
0
                    r106=m128iS10;
4777
0
                    r107=m128iS11;
4778
0
                    r108=m128iS12;
4779
0
                    r109=m128iS13;
4780
0
                    r110=m128iS14;
4781
0
                    r111=m128iS15;
4782
0
                    r112=m128iS16;
4783
0
                    r113=m128iS17;
4784
0
                    r114=m128iS18;
4785
0
                    r115=m128iS19;
4786
0
                    r116=m128iS20;
4787
0
                    r117=m128iS21;
4788
0
                    r118=m128iS22;
4789
0
                    r119=m128iS23;
4790
0
                    r120=m128iS24;
4791
0
                    r121=m128iS25;
4792
0
                    r122=m128iS26;
4793
0
                    r123=m128iS27;
4794
0
                    r124=m128iS28;
4795
0
                    r125=m128iS29;
4796
0
                    r126=m128iS30;
4797
0
                    r127=m128iS31;
4798
4799
                    //load data for next j :
4800
0
                    m128iS0 =  r0;
4801
0
                    m128iS1 =  r4;
4802
0
                    m128iS2 =  r8;
4803
0
                    m128iS3 =  r12;
4804
0
                    m128iS4 =  r16;
4805
0
                    m128iS5 =  r20;
4806
0
                    m128iS6 =  r24;
4807
0
                    m128iS7 =  r28;
4808
0
                    m128iS8 =  r32;
4809
0
                    m128iS9 =  r36;
4810
0
                    m128iS10 = r40;
4811
0
                    m128iS11 = r44;
4812
0
                    m128iS12 = r48;
4813
0
                    m128iS13 = r52;
4814
0
                    m128iS14 = r56;
4815
0
                    m128iS15 = r60;
4816
0
                    m128iS16 = r64;
4817
0
                    m128iS17 = r68;
4818
0
                    m128iS18 = r72;
4819
0
                    m128iS19 = r76;
4820
0
                    m128iS20 = r80;
4821
0
                    m128iS21 = r84;
4822
0
                    m128iS22 = r88;
4823
0
                    m128iS23 = r92;
4824
0
                    m128iS24 = r96;
4825
0
                    m128iS25 = r100;
4826
0
                    m128iS26 = r104;
4827
0
                    m128iS27 = r108;
4828
0
                    m128iS28 = r112;
4829
0
                    m128iS29 = r116;
4830
0
                    m128iS30 = r120;
4831
0
                    m128iS31 =r124;
4832
0
                    shift = shift_2nd;
4833
0
                    m128iAdd = _mm_set1_epi32(add_2nd);
4834
4835
4836
0
                }
4837
4838
0
            } else {
4839
4840
                //Transpose Matrix
4841
4842
0
                E0l= _mm_unpacklo_epi16(m128iS0,m128iS1);
4843
0
                E1l= _mm_unpacklo_epi16(m128iS2,m128iS3);
4844
0
                E2l= _mm_unpacklo_epi16(m128iS4,m128iS5);
4845
0
                E3l= _mm_unpacklo_epi16(m128iS6,m128iS7);
4846
0
                E4l= _mm_unpacklo_epi16(m128iS8,m128iS9);
4847
0
                E5l= _mm_unpacklo_epi16(m128iS10,m128iS11);
4848
0
                E6l= _mm_unpacklo_epi16(m128iS12,m128iS13);
4849
0
                E7l= _mm_unpacklo_epi16(m128iS14,m128iS15);
4850
0
                E8l= _mm_unpacklo_epi16(m128iS16,m128iS17);
4851
0
                E9l= _mm_unpacklo_epi16(m128iS18,m128iS19);
4852
0
                E10l= _mm_unpacklo_epi16(m128iS20,m128iS21);
4853
0
                E11l= _mm_unpacklo_epi16(m128iS22,m128iS23);
4854
0
                E12l= _mm_unpacklo_epi16(m128iS24,m128iS25);
4855
0
                E13l= _mm_unpacklo_epi16(m128iS26,m128iS27);
4856
0
                E14l= _mm_unpacklo_epi16(m128iS28,m128iS29);
4857
0
                E15l= _mm_unpacklo_epi16(m128iS30,m128iS31);
4858
4859
4860
0
                E0h= _mm_unpackhi_epi16(m128iS0,m128iS1);
4861
0
                E1h= _mm_unpackhi_epi16(m128iS2,m128iS3);
4862
0
                E2h= _mm_unpackhi_epi16(m128iS4,m128iS5);
4863
0
                E3h= _mm_unpackhi_epi16(m128iS6,m128iS7);
4864
0
                E4h= _mm_unpackhi_epi16(m128iS8,m128iS9);
4865
0
                E5h= _mm_unpackhi_epi16(m128iS10,m128iS11);
4866
0
                E6h= _mm_unpackhi_epi16(m128iS12,m128iS13);
4867
0
                E7h= _mm_unpackhi_epi16(m128iS14,m128iS15);
4868
0
                E8h= _mm_unpackhi_epi16(m128iS16,m128iS17);
4869
0
                E9h= _mm_unpackhi_epi16(m128iS18,m128iS19);
4870
0
                E10h= _mm_unpackhi_epi16(m128iS20,m128iS21);
4871
0
                E11h= _mm_unpackhi_epi16(m128iS22,m128iS23);
4872
0
                E12h= _mm_unpackhi_epi16(m128iS24,m128iS25);
4873
0
                E13h= _mm_unpackhi_epi16(m128iS26,m128iS27);
4874
0
                E14h= _mm_unpackhi_epi16(m128iS28,m128iS29);
4875
0
                E15h= _mm_unpackhi_epi16(m128iS30,m128iS31);
4876
4877
0
                m128Tmp0= _mm_unpacklo_epi32(E0l,E1l);
4878
0
                m128Tmp1= _mm_unpacklo_epi32(E2l,E3l);
4879
0
                m128Tmp2= _mm_unpacklo_epi32(E4l,E5l);
4880
0
                m128Tmp3= _mm_unpacklo_epi32(E6l,E7l);
4881
0
                m128Tmp4= _mm_unpacklo_epi32(E8l,E9l);
4882
0
                m128Tmp5= _mm_unpacklo_epi32(E10l,E11l);
4883
0
                m128Tmp6= _mm_unpacklo_epi32(E12l,E13l);
4884
0
                m128Tmp7= _mm_unpacklo_epi32(E14l,E15l);
4885
4886
0
                m128iS0= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter 1st row
4887
0
                m128iS1= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter 1st row
4888
4889
4890
0
                m128iS2= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter 1st row
4891
0
                m128iS3= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter 1st row
4892
4893
                //second row
4894
4895
0
                m128iS4= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4896
0
                m128iS5= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4897
4898
0
                m128iS6= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4899
0
                m128iS7= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4900
4901
               //third row
4902
4903
0
                m128Tmp0= _mm_unpackhi_epi32(E0l,E1l);
4904
0
                m128Tmp1= _mm_unpackhi_epi32(E2l,E3l);
4905
0
                m128Tmp2= _mm_unpackhi_epi32(E4l,E5l);
4906
0
                m128Tmp3= _mm_unpackhi_epi32(E6l,E7l);
4907
0
                m128Tmp4= _mm_unpackhi_epi32(E8l,E9l);
4908
0
                m128Tmp5= _mm_unpackhi_epi32(E10l,E11l);
4909
0
                m128Tmp6= _mm_unpackhi_epi32(E12l,E13l);
4910
0
                m128Tmp7= _mm_unpackhi_epi32(E14l,E15l);
4911
4912
4913
0
                m128iS8= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4914
0
                m128iS9= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4915
4916
0
                m128iS10= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4917
0
                m128iS11= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter
4918
4919
                //fourth row
4920
4921
0
                m128iS12= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4922
0
                m128iS13= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4923
4924
0
                m128iS14= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4925
0
                m128iS15= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4926
4927
                //fifth row
4928
4929
0
                m128Tmp0= _mm_unpacklo_epi32(E0h,E1h);
4930
0
                m128Tmp1= _mm_unpacklo_epi32(E2h,E3h);
4931
0
                m128Tmp2= _mm_unpacklo_epi32(E4h,E5h);
4932
0
                m128Tmp3= _mm_unpacklo_epi32(E6h,E7h);
4933
0
                m128Tmp4= _mm_unpacklo_epi32(E8h,E9h);
4934
0
                m128Tmp5= _mm_unpacklo_epi32(E10h,E11h);
4935
0
                m128Tmp6= _mm_unpacklo_epi32(E12h,E13h);
4936
0
                m128Tmp7= _mm_unpacklo_epi32(E14h,E15h);
4937
4938
0
                m128iS16= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4939
0
                m128iS17= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4940
4941
4942
0
                m128iS18= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4943
0
                m128iS19= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7);
4944
4945
                //sixth row
4946
4947
0
                m128iS20= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4948
0
                m128iS21= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4949
4950
4951
0
                m128iS22= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4952
0
                m128iS23= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4953
4954
               //seventh row
4955
4956
0
                m128Tmp0= _mm_unpackhi_epi32(E0h,E1h);
4957
0
                m128Tmp1= _mm_unpackhi_epi32(E2h,E3h);
4958
0
                m128Tmp2= _mm_unpackhi_epi32(E4h,E5h);
4959
0
                m128Tmp3= _mm_unpackhi_epi32(E6h,E7h);
4960
0
                m128Tmp4= _mm_unpackhi_epi32(E8h,E9h);
4961
0
                m128Tmp5= _mm_unpackhi_epi32(E10h,E11h);
4962
0
                m128Tmp6= _mm_unpackhi_epi32(E12h,E13h);
4963
0
                m128Tmp7= _mm_unpackhi_epi32(E14h,E15h);
4964
4965
4966
0
                m128iS24= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4967
0
                m128iS25= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4968
4969
4970
0
                m128iS26= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4971
0
                m128iS27= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter
4972
4973
                //last row
4974
4975
4976
0
                m128iS28= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4977
0
                m128iS29= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4978
4979
0
                m128iS30= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4980
0
                m128iS31= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4981
4982
4983
0
                m128Tmp0=_mm_setzero_si128();
4984
4985
4986
                //store
4987
0
                dst = (uint8_t*) _dst + i*stride;
4988
4989
4990
0
                E0l= _mm_load_si128((__m128i*)dst); //16 values
4991
0
                E1l= _mm_load_si128((__m128i*)(dst+16));
4992
0
                E2l= _mm_load_si128((__m128i*)(dst+stride));
4993
0
                E3l= _mm_load_si128((__m128i*)(dst+stride+16));
4994
0
                E4l= _mm_load_si128((__m128i*)(dst+2*stride));
4995
0
                E5l= _mm_load_si128((__m128i*)(dst+2*stride+16));
4996
0
                E6l= _mm_load_si128((__m128i*)(dst+3*stride));
4997
0
                E7l= _mm_load_si128((__m128i*)(dst+3*stride+16));
4998
0
                E8l= _mm_load_si128((__m128i*)(dst+4*stride));
4999
0
                E9l= _mm_load_si128((__m128i*)(dst+4*stride+16));
5000
0
                E10l= _mm_load_si128((__m128i*)(dst+5*stride));
5001
0
                E11l= _mm_load_si128((__m128i*)(dst+5*stride+16));
5002
0
                E12l= _mm_load_si128((__m128i*)(dst+6*stride));
5003
0
                E13l= _mm_load_si128((__m128i*)(dst+6*stride+16));
5004
0
                E14l= _mm_load_si128((__m128i*)(dst+7*stride));
5005
0
                E15l= _mm_load_si128((__m128i*)(dst+7*stride+16));
5006
5007
0
                m128iS0= _mm_adds_epi16(m128iS0,_mm_unpacklo_epi8(E0l,m128Tmp0));
5008
0
                m128iS1= _mm_adds_epi16(m128iS1,_mm_unpackhi_epi8(E0l,m128Tmp0));
5009
0
                m128iS0= _mm_packus_epi16(m128iS0,m128iS1);
5010
5011
0
                m128iS2= _mm_adds_epi16(m128iS2,_mm_unpacklo_epi8(E1l,m128Tmp0));
5012
0
                m128iS3= _mm_adds_epi16(m128iS3,_mm_unpackhi_epi8(E1l,m128Tmp0));
5013
0
                m128iS2= _mm_packus_epi16(m128iS2,m128iS3);
5014
5015
0
                m128iS4= _mm_adds_epi16(m128iS4,_mm_unpacklo_epi8(E2l,m128Tmp0));
5016
0
                m128iS5= _mm_adds_epi16(m128iS5,_mm_unpackhi_epi8(E2l,m128Tmp0));
5017
0
                m128iS4= _mm_packus_epi16(m128iS4,m128iS5);
5018
5019
0
                m128iS6= _mm_adds_epi16(m128iS6,_mm_unpacklo_epi8(E3l,m128Tmp0));
5020
0
                m128iS7= _mm_adds_epi16(m128iS7,_mm_unpackhi_epi8(E3l,m128Tmp0));
5021
0
                m128iS6= _mm_packus_epi16(m128iS6,m128iS7);
5022
5023
0
                m128iS8= _mm_adds_epi16(m128iS8,_mm_unpacklo_epi8(E4l,m128Tmp0));
5024
0
                m128iS9= _mm_adds_epi16(m128iS9,_mm_unpackhi_epi8(E4l,m128Tmp0));
5025
0
                m128iS8= _mm_packus_epi16(m128iS8,m128iS9);
5026
5027
0
                m128iS10= _mm_adds_epi16(m128iS10,_mm_unpacklo_epi8(E5l,m128Tmp0));
5028
0
                m128iS11= _mm_adds_epi16(m128iS11,_mm_unpackhi_epi8(E5l,m128Tmp0));
5029
0
                m128iS10= _mm_packus_epi16(m128iS10,m128iS11);
5030
5031
0
                m128iS12= _mm_adds_epi16(m128iS12,_mm_unpacklo_epi8(E6l,m128Tmp0));
5032
0
                m128iS13= _mm_adds_epi16(m128iS13,_mm_unpackhi_epi8(E6l,m128Tmp0));
5033
0
                m128iS12= _mm_packus_epi16(m128iS12,m128iS13);
5034
5035
0
                m128iS14= _mm_adds_epi16(m128iS14,_mm_unpacklo_epi8(E7l,m128Tmp0));
5036
0
                m128iS15= _mm_adds_epi16(m128iS15,_mm_unpackhi_epi8(E7l,m128Tmp0));
5037
0
                m128iS14= _mm_packus_epi16(m128iS14,m128iS15);
5038
5039
0
                m128iS16= _mm_adds_epi16(m128iS16,_mm_unpacklo_epi8(E8l,m128Tmp0));
5040
0
                m128iS17= _mm_adds_epi16(m128iS17,_mm_unpackhi_epi8(E8l,m128Tmp0));
5041
0
                m128iS16= _mm_packus_epi16(m128iS16,m128iS17);
5042
5043
0
                m128iS18= _mm_adds_epi16(m128iS18,_mm_unpacklo_epi8(E9l,m128Tmp0));
5044
0
                m128iS19= _mm_adds_epi16(m128iS19,_mm_unpackhi_epi8(E9l,m128Tmp0));
5045
0
                m128iS18= _mm_packus_epi16(m128iS18,m128iS19);
5046
5047
0
                m128iS20= _mm_adds_epi16(m128iS20,_mm_unpacklo_epi8(E10l,m128Tmp0));
5048
0
                m128iS21= _mm_adds_epi16(m128iS21,_mm_unpackhi_epi8(E10l,m128Tmp0));
5049
0
                m128iS20= _mm_packus_epi16(m128iS20,m128iS21);
5050
5051
0
                m128iS22= _mm_adds_epi16(m128iS22,_mm_unpacklo_epi8(E11l,m128Tmp0));
5052
0
                m128iS23= _mm_adds_epi16(m128iS23,_mm_unpackhi_epi8(E11l,m128Tmp0));
5053
0
                m128iS22= _mm_packus_epi16(m128iS22,m128iS23);
5054
5055
0
                m128iS24= _mm_adds_epi16(m128iS24,_mm_unpacklo_epi8(E12l,m128Tmp0));
5056
0
                m128iS25= _mm_adds_epi16(m128iS25,_mm_unpackhi_epi8(E12l,m128Tmp0));
5057
0
                m128iS24= _mm_packus_epi16(m128iS24,m128iS25);
5058
5059
0
                m128iS26= _mm_adds_epi16(m128iS26,_mm_unpacklo_epi8(E13l,m128Tmp0));
5060
0
                m128iS27= _mm_adds_epi16(m128iS27,_mm_unpackhi_epi8(E13l,m128Tmp0));
5061
0
                m128iS26= _mm_packus_epi16(m128iS26,m128iS27);
5062
5063
0
                m128iS28= _mm_adds_epi16(m128iS28,_mm_unpacklo_epi8(E14l,m128Tmp0));
5064
0
                m128iS29= _mm_adds_epi16(m128iS29,_mm_unpackhi_epi8(E14l,m128Tmp0));
5065
0
                m128iS28= _mm_packus_epi16(m128iS28,m128iS29);
5066
5067
0
                m128iS30= _mm_adds_epi16(m128iS30,_mm_unpacklo_epi8(E15l,m128Tmp0));
5068
0
                m128iS31= _mm_adds_epi16(m128iS31,_mm_unpackhi_epi8(E15l,m128Tmp0));
5069
0
                m128iS30= _mm_packus_epi16(m128iS30,m128iS31);
5070
5071
5072
0
                _mm_store_si128((__m128i*)dst,m128iS0);
5073
0
                _mm_store_si128((__m128i*)(dst+16),m128iS2);
5074
0
                _mm_store_si128((__m128i*)(dst+stride),m128iS4);
5075
0
                _mm_store_si128((__m128i*)(dst+stride+16),m128iS6);
5076
0
                _mm_store_si128((__m128i*)(dst+2*stride),m128iS8);
5077
0
                _mm_store_si128((__m128i*)(dst+2*stride+16),m128iS10);
5078
0
                _mm_store_si128((__m128i*)(dst+3*stride),m128iS12);
5079
0
                _mm_store_si128((__m128i*)(dst+3*stride+16),m128iS14);
5080
0
                _mm_store_si128((__m128i*)(dst+4*stride),m128iS16);
5081
0
                _mm_store_si128((__m128i*)(dst+4*stride+16),m128iS18);
5082
0
                _mm_store_si128((__m128i*)(dst+5*stride),m128iS20);
5083
0
                _mm_store_si128((__m128i*)(dst+5*stride+16),m128iS22);
5084
0
                _mm_store_si128((__m128i*)(dst+6*stride),m128iS24);
5085
0
                _mm_store_si128((__m128i*)(dst+6*stride+16),m128iS26);
5086
0
                _mm_store_si128((__m128i*)(dst+7*stride),m128iS28);
5087
0
                _mm_store_si128((__m128i*)(dst+7*stride+16),m128iS30);
5088
5089
5090
0
                if(i==0){
5091
                    //load next values :
5092
0
                    m128iS0 =  r1;
5093
0
                    m128iS1 =  r5;
5094
0
                    m128iS2 =  r9;
5095
0
                    m128iS3 =  r13;
5096
0
                    m128iS4 =  r17;
5097
0
                    m128iS5 =  r21;
5098
0
                    m128iS6 =  r25;
5099
0
                    m128iS7 =  r29;
5100
0
                    m128iS8 =  r33;
5101
0
                    m128iS9 =  r37;
5102
0
                    m128iS10 = r41;
5103
0
                    m128iS11 = r45;
5104
0
                    m128iS12 = r49;
5105
0
                    m128iS13 = r53;
5106
0
                    m128iS14 = r57;
5107
0
                    m128iS15 = r61;
5108
0
                    m128iS16 = r65;
5109
0
                    m128iS17 = r69;
5110
0
                    m128iS18 = r73;
5111
0
                    m128iS19 = r77;
5112
0
                    m128iS20 = r81;
5113
0
                    m128iS21 = r85;
5114
0
                    m128iS22 = r89;
5115
0
                    m128iS23 = r93;
5116
0
                    m128iS24 = r97;
5117
0
                    m128iS25 = r101;
5118
0
                    m128iS26 = r105;
5119
0
                    m128iS27 = r109;
5120
0
                    m128iS28 = r113;
5121
0
                    m128iS29 = r117;
5122
0
                    m128iS30 = r121;
5123
0
                    m128iS31 =r125;
5124
5125
0
                }else if(i ==8){
5126
                    //load next values :
5127
0
                    m128iS0 =  r2;
5128
0
                    m128iS1 =  r6;
5129
0
                    m128iS2 =  r10;
5130
0
                    m128iS3 =  r14;
5131
0
                    m128iS4 =  r18;
5132
0
                    m128iS5 =  r22;
5133
0
                    m128iS6 =  r26;
5134
0
                    m128iS7 =  r30;
5135
0
                    m128iS8 =  r34;
5136
0
                    m128iS9 =  r38;
5137
0
                    m128iS10 = r42;
5138
0
                    m128iS11 = r46;
5139
0
                    m128iS12 = r50;
5140
0
                    m128iS13 = r54;
5141
0
                    m128iS14 = r58;
5142
0
                    m128iS15 = r62;
5143
0
                    m128iS16 = r66;
5144
0
                    m128iS17 = r70;
5145
0
                    m128iS18 = r74;
5146
0
                    m128iS19 = r78;
5147
0
                    m128iS20 = r82;
5148
0
                    m128iS21 = r86;
5149
0
                    m128iS22 = r90;
5150
0
                    m128iS23 = r94;
5151
0
                    m128iS24 = r98;
5152
0
                    m128iS25 = r102;
5153
0
                    m128iS26 = r106;
5154
0
                    m128iS27 = r110;
5155
0
                    m128iS28 = r114;
5156
0
                    m128iS29 = r118;
5157
0
                    m128iS30 = r122;
5158
0
                    m128iS31 =r126;
5159
5160
0
                }else if(i==16)
5161
0
                {
5162
                    //load next values :
5163
0
                    m128iS0 =  r3;
5164
0
                    m128iS1 =  r7;
5165
0
                    m128iS2 =  r11;
5166
0
                    m128iS3 =  r15;
5167
0
                    m128iS4 =  r19;
5168
0
                    m128iS5 =  r23;
5169
0
                    m128iS6 =  r27;
5170
0
                    m128iS7 =  r31;
5171
0
                    m128iS8 =  r35;
5172
0
                    m128iS9 =  r39;
5173
0
                    m128iS10 = r43;
5174
0
                    m128iS11 = r47;
5175
0
                    m128iS12 = r51;
5176
0
                    m128iS13 = r55;
5177
0
                    m128iS14 = r59;
5178
0
                    m128iS15 = r63;
5179
0
                    m128iS16 = r67;
5180
0
                    m128iS17 = r71;
5181
0
                    m128iS18 = r75;
5182
0
                    m128iS19 = r79;
5183
0
                    m128iS20 = r83;
5184
0
                    m128iS21 = r87;
5185
0
                    m128iS22 = r91;
5186
0
                    m128iS23 = r95;
5187
0
                    m128iS24 = r99;
5188
0
                    m128iS25 = r103;
5189
0
                    m128iS26 = r107;
5190
0
                    m128iS27 = r111;
5191
0
                    m128iS28 = r115;
5192
0
                    m128iS29 = r119;
5193
0
                    m128iS30 = r123;
5194
0
                    m128iS31 =r127;
5195
0
                }
5196
0
            }
5197
0
        }
5198
0
    }
5199
0
}
5200
#endif
5201
5202
5203
#if 0
5204
void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
5205
        ptrdiff_t _stride) {
5206
    int i, j;
5207
    uint16_t *dst = (uint16_t*) _dst;
5208
    ptrdiff_t stride = _stride / 2;
5209
    int shift;
5210
    uint8_t shift_2nd = 10; //20 - bit depth
5211
    uint16_t add_2nd = 1<<9; //shift2 - 1
5212
    int16_t *src = coeffs;
5213
5214
    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
5215
            m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
5216
            m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
5217
            m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
5218
            E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
5219
            O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
5220
            E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
5221
    __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
5222
    __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
5223
            EEE0l, EEE1l, EEE0h, EEE1h;
5224
    __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
5225
            m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
5226
            m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
5227
            m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
5228
            O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
5229
            O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
5230
            EE4l, EE7h, EE6h, EE5h, EE4h;
5231
    m128iS0 = _mm_load_si128((__m128i *) (src));
5232
    m128iS1 = _mm_load_si128((__m128i *) (src + 32));
5233
    m128iS2 = _mm_load_si128((__m128i *) (src + 64));
5234
    m128iS3 = _mm_load_si128((__m128i *) (src + 96));
5235
    m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
5236
    m128iS5 = _mm_load_si128((__m128i *) (src + 160));
5237
    m128iS6 = _mm_load_si128((__m128i *) (src + 192));
5238
    m128iS7 = _mm_load_si128((__m128i *) (src + 224));
5239
    m128iS8 = _mm_load_si128((__m128i *) (src + 256));
5240
    m128iS9 = _mm_load_si128((__m128i *) (src + 288));
5241
    m128iS10 = _mm_load_si128((__m128i *) (src + 320));
5242
    m128iS11 = _mm_load_si128((__m128i *) (src + 352));
5243
    m128iS12 = _mm_loadu_si128((__m128i *) (src + 384));
5244
    m128iS13 = _mm_load_si128((__m128i *) (src + 416));
5245
    m128iS14 = _mm_load_si128((__m128i *) (src + 448));
5246
    m128iS15 = _mm_load_si128((__m128i *) (src + 480));
5247
    m128iS16 = _mm_load_si128((__m128i *) (src + 512));
5248
    m128iS17 = _mm_load_si128((__m128i *) (src + 544));
5249
    m128iS18 = _mm_load_si128((__m128i *) (src + 576));
5250
    m128iS19 = _mm_load_si128((__m128i *) (src + 608));
5251
    m128iS20 = _mm_load_si128((__m128i *) (src + 640));
5252
    m128iS21 = _mm_load_si128((__m128i *) (src + 672));
5253
    m128iS22 = _mm_load_si128((__m128i *) (src + 704));
5254
    m128iS23 = _mm_load_si128((__m128i *) (src + 736));
5255
    m128iS24 = _mm_load_si128((__m128i *) (src + 768));
5256
    m128iS25 = _mm_load_si128((__m128i *) (src + 800));
5257
    m128iS26 = _mm_load_si128((__m128i *) (src + 832));
5258
    m128iS27 = _mm_load_si128((__m128i *) (src + 864));
5259
    m128iS28 = _mm_load_si128((__m128i *) (src + 896));
5260
    m128iS29 = _mm_load_si128((__m128i *) (src + 928));
5261
    m128iS30 = _mm_load_si128((__m128i *) (src + 960));
5262
    m128iS31 = _mm_load_si128((__m128i *) (src + 992));
5263
5264
    shift = shift_1st;
5265
    m128iAdd = _mm_set1_epi32(add_1st);
5266
5267
    for (j = 0; j < 2; j++) {
5268
        for (i = 0; i < 32; i += 8) {
5269
            m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
5270
            E0l = _mm_madd_epi16(m128Tmp0,
5271
                    _mm_load_si128((__m128i *) (transform32x32[0][0])));
5272
            m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
5273
            E0h = _mm_madd_epi16(m128Tmp1,
5274
                    _mm_load_si128((__m128i *) (transform32x32[0][0])));
5275
5276
            m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
5277
            E1l = _mm_madd_epi16(m128Tmp2,
5278
                    _mm_load_si128((__m128i *) (transform32x32[1][0])));
5279
            m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
5280
            E1h = _mm_madd_epi16(m128Tmp3,
5281
                    _mm_load_si128((__m128i *) (transform32x32[1][0])));
5282
5283
            m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
5284
            E2l = _mm_madd_epi16(m128Tmp4,
5285
                    _mm_load_si128((__m128i *) (transform32x32[2][0])));
5286
            m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
5287
            E2h = _mm_madd_epi16(m128Tmp5,
5288
                    _mm_load_si128((__m128i *) (transform32x32[2][0])));
5289
5290
            m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
5291
            E3l = _mm_madd_epi16(m128Tmp6,
5292
                    _mm_load_si128((__m128i *) (transform32x32[3][0])));
5293
            m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
5294
            E3h = _mm_madd_epi16(m128Tmp7,
5295
                    _mm_load_si128((__m128i *) (transform32x32[3][0])));
5296
5297
            m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
5298
            E4l = _mm_madd_epi16(m128Tmp8,
5299
                    _mm_load_si128((__m128i *) (transform32x32[4][0])));
5300
            m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
5301
            E4h = _mm_madd_epi16(m128Tmp9,
5302
                    _mm_load_si128((__m128i *) (transform32x32[4][0])));
5303
5304
            m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
5305
            E5l = _mm_madd_epi16(m128Tmp10,
5306
                    _mm_load_si128((__m128i *) (transform32x32[5][0])));
5307
            m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
5308
            E5h = _mm_madd_epi16(m128Tmp11,
5309
                    _mm_load_si128((__m128i *) (transform32x32[5][0])));
5310
5311
            m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
5312
            E6l = _mm_madd_epi16(m128Tmp12,
5313
                    _mm_load_si128((__m128i *) (transform32x32[6][0])));
5314
            m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
5315
            E6h = _mm_madd_epi16(m128Tmp13,
5316
                    _mm_load_si128((__m128i *) (transform32x32[6][0])));
5317
5318
            m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
5319
            E7l = _mm_madd_epi16(m128Tmp14,
5320
                    _mm_load_si128((__m128i *) (transform32x32[7][0])));
5321
            m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
5322
            E7h = _mm_madd_epi16(m128Tmp15,
5323
                    _mm_load_si128((__m128i *) (transform32x32[7][0])));
5324
5325
            O0l = _mm_add_epi32(E0l, E1l);
5326
            O0l = _mm_add_epi32(O0l, E2l);
5327
            O0l = _mm_add_epi32(O0l, E3l);
5328
            O0l = _mm_add_epi32(O0l, E4l);
5329
            O0l = _mm_add_epi32(O0l, E5l);
5330
            O0l = _mm_add_epi32(O0l, E6l);
5331
            O0l = _mm_add_epi32(O0l, E7l);
5332
5333
            O0h = _mm_add_epi32(E0h, E1h);
5334
            O0h = _mm_add_epi32(O0h, E2h);
5335
            O0h = _mm_add_epi32(O0h, E3h);
5336
            O0h = _mm_add_epi32(O0h, E4h);
5337
            O0h = _mm_add_epi32(O0h, E5h);
5338
            O0h = _mm_add_epi32(O0h, E6h);
5339
            O0h = _mm_add_epi32(O0h, E7h);
5340
5341
            /* Compute O1*/
5342
            E0l = _mm_madd_epi16(m128Tmp0,
5343
                    _mm_load_si128((__m128i *) (transform32x32[0][1])));
5344
            E0h = _mm_madd_epi16(m128Tmp1,
5345
                    _mm_load_si128((__m128i *) (transform32x32[0][1])));
5346
            E1l = _mm_madd_epi16(m128Tmp2,
5347
                    _mm_load_si128((__m128i *) (transform32x32[1][1])));
5348
            E1h = _mm_madd_epi16(m128Tmp3,
5349
                    _mm_load_si128((__m128i *) (transform32x32[1][1])));
5350
            E2l = _mm_madd_epi16(m128Tmp4,
5351
                    _mm_load_si128((__m128i *) (transform32x32[2][1])));
5352
            E2h = _mm_madd_epi16(m128Tmp5,
5353
                    _mm_load_si128((__m128i *) (transform32x32[2][1])));
5354
            E3l = _mm_madd_epi16(m128Tmp6,
5355
                    _mm_load_si128((__m128i *) (transform32x32[3][1])));
5356
            E3h = _mm_madd_epi16(m128Tmp7,
5357
                    _mm_load_si128((__m128i *) (transform32x32[3][1])));
5358
5359
            E4l = _mm_madd_epi16(m128Tmp8,
5360
                    _mm_load_si128((__m128i *) (transform32x32[4][1])));
5361
            E4h = _mm_madd_epi16(m128Tmp9,
5362
                    _mm_load_si128((__m128i *) (transform32x32[4][1])));
5363
            E5l = _mm_madd_epi16(m128Tmp10,
5364
                    _mm_load_si128((__m128i *) (transform32x32[5][1])));
5365
            E5h = _mm_madd_epi16(m128Tmp11,
5366
                    _mm_load_si128((__m128i *) (transform32x32[5][1])));
5367
            E6l = _mm_madd_epi16(m128Tmp12,
5368
                    _mm_load_si128((__m128i *) (transform32x32[6][1])));
5369
            E6h = _mm_madd_epi16(m128Tmp13,
5370
                    _mm_load_si128((__m128i *) (transform32x32[6][1])));
5371
            E7l = _mm_madd_epi16(m128Tmp14,
5372
                    _mm_load_si128((__m128i *) (transform32x32[7][1])));
5373
            E7h = _mm_madd_epi16(m128Tmp15,
5374
                    _mm_load_si128((__m128i *) (transform32x32[7][1])));
5375
5376
            O1l = _mm_add_epi32(E0l, E1l);
5377
            O1l = _mm_add_epi32(O1l, E2l);
5378
            O1l = _mm_add_epi32(O1l, E3l);
5379
            O1l = _mm_add_epi32(O1l, E4l);
5380
            O1l = _mm_add_epi32(O1l, E5l);
5381
            O1l = _mm_add_epi32(O1l, E6l);
5382
            O1l = _mm_add_epi32(O1l, E7l);
5383
5384
            O1h = _mm_add_epi32(E0h, E1h);
5385
            O1h = _mm_add_epi32(O1h, E2h);
5386
            O1h = _mm_add_epi32(O1h, E3h);
5387
            O1h = _mm_add_epi32(O1h, E4h);
5388
            O1h = _mm_add_epi32(O1h, E5h);
5389
            O1h = _mm_add_epi32(O1h, E6h);
5390
            O1h = _mm_add_epi32(O1h, E7h);
5391
            /* Compute O2*/
5392
            E0l = _mm_madd_epi16(m128Tmp0,
5393
                    _mm_load_si128((__m128i *) (transform32x32[0][2])));
5394
            E0h = _mm_madd_epi16(m128Tmp1,
5395
                    _mm_load_si128((__m128i *) (transform32x32[0][2])));
5396
            E1l = _mm_madd_epi16(m128Tmp2,
5397
                    _mm_load_si128((__m128i *) (transform32x32[1][2])));
5398
            E1h = _mm_madd_epi16(m128Tmp3,
5399
                    _mm_load_si128((__m128i *) (transform32x32[1][2])));
5400
            E2l = _mm_madd_epi16(m128Tmp4,
5401
                    _mm_load_si128((__m128i *) (transform32x32[2][2])));
5402
            E2h = _mm_madd_epi16(m128Tmp5,
5403
                    _mm_load_si128((__m128i *) (transform32x32[2][2])));
5404
            E3l = _mm_madd_epi16(m128Tmp6,
5405
                    _mm_load_si128((__m128i *) (transform32x32[3][2])));
5406
            E3h = _mm_madd_epi16(m128Tmp7,
5407
                    _mm_load_si128((__m128i *) (transform32x32[3][2])));
5408
5409
            E4l = _mm_madd_epi16(m128Tmp8,
5410
                    _mm_load_si128((__m128i *) (transform32x32[4][2])));
5411
            E4h = _mm_madd_epi16(m128Tmp9,
5412
                    _mm_load_si128((__m128i *) (transform32x32[4][2])));
5413
            E5l = _mm_madd_epi16(m128Tmp10,
5414
                    _mm_load_si128((__m128i *) (transform32x32[5][2])));
5415
            E5h = _mm_madd_epi16(m128Tmp11,
5416
                    _mm_load_si128((__m128i *) (transform32x32[5][2])));
5417
            E6l = _mm_madd_epi16(m128Tmp12,
5418
                    _mm_load_si128((__m128i *) (transform32x32[6][2])));
5419
            E6h = _mm_madd_epi16(m128Tmp13,
5420
                    _mm_load_si128((__m128i *) (transform32x32[6][2])));
5421
            E7l = _mm_madd_epi16(m128Tmp14,
5422
                    _mm_load_si128((__m128i *) (transform32x32[7][2])));
5423
            E7h = _mm_madd_epi16(m128Tmp15,
5424
                    _mm_load_si128((__m128i *) (transform32x32[7][2])));
5425
5426
            O2l = _mm_add_epi32(E0l, E1l);
5427
            O2l = _mm_add_epi32(O2l, E2l);
5428
            O2l = _mm_add_epi32(O2l, E3l);
5429
            O2l = _mm_add_epi32(O2l, E4l);
5430
            O2l = _mm_add_epi32(O2l, E5l);
5431
            O2l = _mm_add_epi32(O2l, E6l);
5432
            O2l = _mm_add_epi32(O2l, E7l);
5433
5434
            O2h = _mm_add_epi32(E0h, E1h);
5435
            O2h = _mm_add_epi32(O2h, E2h);
5436
            O2h = _mm_add_epi32(O2h, E3h);
5437
            O2h = _mm_add_epi32(O2h, E4h);
5438
            O2h = _mm_add_epi32(O2h, E5h);
5439
            O2h = _mm_add_epi32(O2h, E6h);
5440
            O2h = _mm_add_epi32(O2h, E7h);
5441
            /* Compute O3*/
5442
            E0l = _mm_madd_epi16(m128Tmp0,
5443
                    _mm_load_si128((__m128i *) (transform32x32[0][3])));
5444
            E0h = _mm_madd_epi16(m128Tmp1,
5445
                    _mm_load_si128((__m128i *) (transform32x32[0][3])));
5446
            E1l = _mm_madd_epi16(m128Tmp2,
5447
                    _mm_load_si128((__m128i *) (transform32x32[1][3])));
5448
            E1h = _mm_madd_epi16(m128Tmp3,
5449
                    _mm_load_si128((__m128i *) (transform32x32[1][3])));
5450
            E2l = _mm_madd_epi16(m128Tmp4,
5451
                    _mm_load_si128((__m128i *) (transform32x32[2][3])));
5452
            E2h = _mm_madd_epi16(m128Tmp5,
5453
                    _mm_load_si128((__m128i *) (transform32x32[2][3])));
5454
            E3l = _mm_madd_epi16(m128Tmp6,
5455
                    _mm_load_si128((__m128i *) (transform32x32[3][3])));
5456
            E3h = _mm_madd_epi16(m128Tmp7,
5457
                    _mm_load_si128((__m128i *) (transform32x32[3][3])));
5458
5459
            E4l = _mm_madd_epi16(m128Tmp8,
5460
                    _mm_load_si128((__m128i *) (transform32x32[4][3])));
5461
            E4h = _mm_madd_epi16(m128Tmp9,
5462
                    _mm_load_si128((__m128i *) (transform32x32[4][3])));
5463
            E5l = _mm_madd_epi16(m128Tmp10,
5464
                    _mm_load_si128((__m128i *) (transform32x32[5][3])));
5465
            E5h = _mm_madd_epi16(m128Tmp11,
5466
                    _mm_load_si128((__m128i *) (transform32x32[5][3])));
5467
            E6l = _mm_madd_epi16(m128Tmp12,
5468
                    _mm_load_si128((__m128i *) (transform32x32[6][3])));
5469
            E6h = _mm_madd_epi16(m128Tmp13,
5470
                    _mm_load_si128((__m128i *) (transform32x32[6][3])));
5471
            E7l = _mm_madd_epi16(m128Tmp14,
5472
                    _mm_load_si128((__m128i *) (transform32x32[7][3])));
5473
            E7h = _mm_madd_epi16(m128Tmp15,
5474
                    _mm_load_si128((__m128i *) (transform32x32[7][3])));
5475
5476
            O3l = _mm_add_epi32(E0l, E1l);
5477
            O3l = _mm_add_epi32(O3l, E2l);
5478
            O3l = _mm_add_epi32(O3l, E3l);
5479
            O3l = _mm_add_epi32(O3l, E4l);
5480
            O3l = _mm_add_epi32(O3l, E5l);
5481
            O3l = _mm_add_epi32(O3l, E6l);
5482
            O3l = _mm_add_epi32(O3l, E7l);
5483
5484
            O3h = _mm_add_epi32(E0h, E1h);
5485
            O3h = _mm_add_epi32(O3h, E2h);
5486
            O3h = _mm_add_epi32(O3h, E3h);
5487
            O3h = _mm_add_epi32(O3h, E4h);
5488
            O3h = _mm_add_epi32(O3h, E5h);
5489
            O3h = _mm_add_epi32(O3h, E6h);
5490
            O3h = _mm_add_epi32(O3h, E7h);
5491
            /* Compute O4*/
5492
5493
            E0l = _mm_madd_epi16(m128Tmp0,
5494
                    _mm_load_si128((__m128i *) (transform32x32[0][4])));
5495
            E0h = _mm_madd_epi16(m128Tmp1,
5496
                    _mm_load_si128((__m128i *) (transform32x32[0][4])));
5497
            E1l = _mm_madd_epi16(m128Tmp2,
5498
                    _mm_load_si128((__m128i *) (transform32x32[1][4])));
5499
            E1h = _mm_madd_epi16(m128Tmp3,
5500
                    _mm_load_si128((__m128i *) (transform32x32[1][4])));
5501
            E2l = _mm_madd_epi16(m128Tmp4,
5502
                    _mm_load_si128((__m128i *) (transform32x32[2][4])));
5503
            E2h = _mm_madd_epi16(m128Tmp5,
5504
                    _mm_load_si128((__m128i *) (transform32x32[2][4])));
5505
            E3l = _mm_madd_epi16(m128Tmp6,
5506
                    _mm_load_si128((__m128i *) (transform32x32[3][4])));
5507
            E3h = _mm_madd_epi16(m128Tmp7,
5508
                    _mm_load_si128((__m128i *) (transform32x32[3][4])));
5509
5510
            E4l = _mm_madd_epi16(m128Tmp8,
5511
                    _mm_load_si128((__m128i *) (transform32x32[4][4])));
5512
            E4h = _mm_madd_epi16(m128Tmp9,
5513
                    _mm_load_si128((__m128i *) (transform32x32[4][4])));
5514
            E5l = _mm_madd_epi16(m128Tmp10,
5515
                    _mm_load_si128((__m128i *) (transform32x32[5][4])));
5516
            E5h = _mm_madd_epi16(m128Tmp11,
5517
                    _mm_load_si128((__m128i *) (transform32x32[5][4])));
5518
            E6l = _mm_madd_epi16(m128Tmp12,
5519
                    _mm_load_si128((__m128i *) (transform32x32[6][4])));
5520
            E6h = _mm_madd_epi16(m128Tmp13,
5521
                    _mm_load_si128((__m128i *) (transform32x32[6][4])));
5522
            E7l = _mm_madd_epi16(m128Tmp14,
5523
                    _mm_load_si128((__m128i *) (transform32x32[7][4])));
5524
            E7h = _mm_madd_epi16(m128Tmp15,
5525
                    _mm_load_si128((__m128i *) (transform32x32[7][4])));
5526
5527
            O4l = _mm_add_epi32(E0l, E1l);
5528
            O4l = _mm_add_epi32(O4l, E2l);
5529
            O4l = _mm_add_epi32(O4l, E3l);
5530
            O4l = _mm_add_epi32(O4l, E4l);
5531
            O4l = _mm_add_epi32(O4l, E5l);
5532
            O4l = _mm_add_epi32(O4l, E6l);
5533
            O4l = _mm_add_epi32(O4l, E7l);
5534
5535
            O4h = _mm_add_epi32(E0h, E1h);
5536
            O4h = _mm_add_epi32(O4h, E2h);
5537
            O4h = _mm_add_epi32(O4h, E3h);
5538
            O4h = _mm_add_epi32(O4h, E4h);
5539
            O4h = _mm_add_epi32(O4h, E5h);
5540
            O4h = _mm_add_epi32(O4h, E6h);
5541
            O4h = _mm_add_epi32(O4h, E7h);
5542
5543
            /* Compute O5*/
5544
            E0l = _mm_madd_epi16(m128Tmp0,
5545
                    _mm_load_si128((__m128i *) (transform32x32[0][5])));
5546
            E0h = _mm_madd_epi16(m128Tmp1,
5547
                    _mm_load_si128((__m128i *) (transform32x32[0][5])));
5548
            E1l = _mm_madd_epi16(m128Tmp2,
5549
                    _mm_load_si128((__m128i *) (transform32x32[1][5])));
5550
            E1h = _mm_madd_epi16(m128Tmp3,
5551
                    _mm_load_si128((__m128i *) (transform32x32[1][5])));
5552
            E2l = _mm_madd_epi16(m128Tmp4,
5553
                    _mm_load_si128((__m128i *) (transform32x32[2][5])));
5554
            E2h = _mm_madd_epi16(m128Tmp5,
5555
                    _mm_load_si128((__m128i *) (transform32x32[2][5])));
5556
            E3l = _mm_madd_epi16(m128Tmp6,
5557
                    _mm_load_si128((__m128i *) (transform32x32[3][5])));
5558
            E3h = _mm_madd_epi16(m128Tmp7,
5559
                    _mm_load_si128((__m128i *) (transform32x32[3][5])));
5560
5561
            E4l = _mm_madd_epi16(m128Tmp8,
5562
                    _mm_load_si128((__m128i *) (transform32x32[4][5])));
5563
            E4h = _mm_madd_epi16(m128Tmp9,
5564
                    _mm_load_si128((__m128i *) (transform32x32[4][5])));
5565
            E5l = _mm_madd_epi16(m128Tmp10,
5566
                    _mm_load_si128((__m128i *) (transform32x32[5][5])));
5567
            E5h = _mm_madd_epi16(m128Tmp11,
5568
                    _mm_load_si128((__m128i *) (transform32x32[5][5])));
5569
            E6l = _mm_madd_epi16(m128Tmp12,
5570
                    _mm_load_si128((__m128i *) (transform32x32[6][5])));
5571
            E6h = _mm_madd_epi16(m128Tmp13,
5572
                    _mm_load_si128((__m128i *) (transform32x32[6][5])));
5573
            E7l = _mm_madd_epi16(m128Tmp14,
5574
                    _mm_load_si128((__m128i *) (transform32x32[7][5])));
5575
            E7h = _mm_madd_epi16(m128Tmp15,
5576
                    _mm_load_si128((__m128i *) (transform32x32[7][5])));
5577
5578
            O5l = _mm_add_epi32(E0l, E1l);
5579
            O5l = _mm_add_epi32(O5l, E2l);
5580
            O5l = _mm_add_epi32(O5l, E3l);
5581
            O5l = _mm_add_epi32(O5l, E4l);
5582
            O5l = _mm_add_epi32(O5l, E5l);
5583
            O5l = _mm_add_epi32(O5l, E6l);
5584
            O5l = _mm_add_epi32(O5l, E7l);
5585
5586
            O5h = _mm_add_epi32(E0h, E1h);
5587
            O5h = _mm_add_epi32(O5h, E2h);
5588
            O5h = _mm_add_epi32(O5h, E3h);
5589
            O5h = _mm_add_epi32(O5h, E4h);
5590
            O5h = _mm_add_epi32(O5h, E5h);
5591
            O5h = _mm_add_epi32(O5h, E6h);
5592
            O5h = _mm_add_epi32(O5h, E7h);
5593
5594
            /* Compute O6*/
5595
5596
            E0l = _mm_madd_epi16(m128Tmp0,
5597
                    _mm_load_si128((__m128i *) (transform32x32[0][6])));
5598
            E0h = _mm_madd_epi16(m128Tmp1,
5599
                    _mm_load_si128((__m128i *) (transform32x32[0][6])));
5600
            E1l = _mm_madd_epi16(m128Tmp2,
5601
                    _mm_load_si128((__m128i *) (transform32x32[1][6])));
5602
            E1h = _mm_madd_epi16(m128Tmp3,
5603
                    _mm_load_si128((__m128i *) (transform32x32[1][6])));
5604
            E2l = _mm_madd_epi16(m128Tmp4,
5605
                    _mm_load_si128((__m128i *) (transform32x32[2][6])));
5606
            E2h = _mm_madd_epi16(m128Tmp5,
5607
                    _mm_load_si128((__m128i *) (transform32x32[2][6])));
5608
            E3l = _mm_madd_epi16(m128Tmp6,
5609
                    _mm_load_si128((__m128i *) (transform32x32[3][6])));
5610
            E3h = _mm_madd_epi16(m128Tmp7,
5611
                    _mm_load_si128((__m128i *) (transform32x32[3][6])));
5612
5613
            E4l = _mm_madd_epi16(m128Tmp8,
5614
                    _mm_load_si128((__m128i *) (transform32x32[4][6])));
5615
            E4h = _mm_madd_epi16(m128Tmp9,
5616
                    _mm_load_si128((__m128i *) (transform32x32[4][6])));
5617
            E5l = _mm_madd_epi16(m128Tmp10,
5618
                    _mm_load_si128((__m128i *) (transform32x32[5][6])));
5619
            E5h = _mm_madd_epi16(m128Tmp11,
5620
                    _mm_load_si128((__m128i *) (transform32x32[5][6])));
5621
            E6l = _mm_madd_epi16(m128Tmp12,
5622
                    _mm_load_si128((__m128i *) (transform32x32[6][6])));
5623
            E6h = _mm_madd_epi16(m128Tmp13,
5624
                    _mm_load_si128((__m128i *) (transform32x32[6][6])));
5625
            E7l = _mm_madd_epi16(m128Tmp14,
5626
                    _mm_load_si128((__m128i *) (transform32x32[7][6])));
5627
            E7h = _mm_madd_epi16(m128Tmp15,
5628
                    _mm_load_si128((__m128i *) (transform32x32[7][6])));
5629
5630
            O6l = _mm_add_epi32(E0l, E1l);
5631
            O6l = _mm_add_epi32(O6l, E2l);
5632
            O6l = _mm_add_epi32(O6l, E3l);
5633
            O6l = _mm_add_epi32(O6l, E4l);
5634
            O6l = _mm_add_epi32(O6l, E5l);
5635
            O6l = _mm_add_epi32(O6l, E6l);
5636
            O6l = _mm_add_epi32(O6l, E7l);
5637
5638
            O6h = _mm_add_epi32(E0h, E1h);
5639
            O6h = _mm_add_epi32(O6h, E2h);
5640
            O6h = _mm_add_epi32(O6h, E3h);
5641
            O6h = _mm_add_epi32(O6h, E4h);
5642
            O6h = _mm_add_epi32(O6h, E5h);
5643
            O6h = _mm_add_epi32(O6h, E6h);
5644
            O6h = _mm_add_epi32(O6h, E7h);
5645
5646
            /* Compute O7*/
5647
5648
            E0l = _mm_madd_epi16(m128Tmp0,
5649
                    _mm_load_si128((__m128i *) (transform32x32[0][7])));
5650
            E0h = _mm_madd_epi16(m128Tmp1,
5651
                    _mm_load_si128((__m128i *) (transform32x32[0][7])));
5652
            E1l = _mm_madd_epi16(m128Tmp2,
5653
                    _mm_load_si128((__m128i *) (transform32x32[1][7])));
5654
            E1h = _mm_madd_epi16(m128Tmp3,
5655
                    _mm_load_si128((__m128i *) (transform32x32[1][7])));
5656
            E2l = _mm_madd_epi16(m128Tmp4,
5657
                    _mm_load_si128((__m128i *) (transform32x32[2][7])));
5658
            E2h = _mm_madd_epi16(m128Tmp5,
5659
                    _mm_load_si128((__m128i *) (transform32x32[2][7])));
5660
            E3l = _mm_madd_epi16(m128Tmp6,
5661
                    _mm_load_si128((__m128i *) (transform32x32[3][7])));
5662
            E3h = _mm_madd_epi16(m128Tmp7,
5663
                    _mm_load_si128((__m128i *) (transform32x32[3][7])));
5664
5665
            E4l = _mm_madd_epi16(m128Tmp8,
5666
                    _mm_load_si128((__m128i *) (transform32x32[4][7])));
5667
            E4h = _mm_madd_epi16(m128Tmp9,
5668
                    _mm_load_si128((__m128i *) (transform32x32[4][7])));
5669
            E5l = _mm_madd_epi16(m128Tmp10,
5670
                    _mm_load_si128((__m128i *) (transform32x32[5][7])));
5671
            E5h = _mm_madd_epi16(m128Tmp11,
5672
                    _mm_load_si128((__m128i *) (transform32x32[5][7])));
5673
            E6l = _mm_madd_epi16(m128Tmp12,
5674
                    _mm_load_si128((__m128i *) (transform32x32[6][7])));
5675
            E6h = _mm_madd_epi16(m128Tmp13,
5676
                    _mm_load_si128((__m128i *) (transform32x32[6][7])));
5677
            E7l = _mm_madd_epi16(m128Tmp14,
5678
                    _mm_load_si128((__m128i *) (transform32x32[7][7])));
5679
            E7h = _mm_madd_epi16(m128Tmp15,
5680
                    _mm_load_si128((__m128i *) (transform32x32[7][7])));
5681
5682
            O7l = _mm_add_epi32(E0l, E1l);
5683
            O7l = _mm_add_epi32(O7l, E2l);
5684
            O7l = _mm_add_epi32(O7l, E3l);
5685
            O7l = _mm_add_epi32(O7l, E4l);
5686
            O7l = _mm_add_epi32(O7l, E5l);
5687
            O7l = _mm_add_epi32(O7l, E6l);
5688
            O7l = _mm_add_epi32(O7l, E7l);
5689
5690
            O7h = _mm_add_epi32(E0h, E1h);
5691
            O7h = _mm_add_epi32(O7h, E2h);
5692
            O7h = _mm_add_epi32(O7h, E3h);
5693
            O7h = _mm_add_epi32(O7h, E4h);
5694
            O7h = _mm_add_epi32(O7h, E5h);
5695
            O7h = _mm_add_epi32(O7h, E6h);
5696
            O7h = _mm_add_epi32(O7h, E7h);
5697
5698
            /* Compute O8*/
5699
5700
            E0l = _mm_madd_epi16(m128Tmp0,
5701
                    _mm_load_si128((__m128i *) (transform32x32[0][8])));
5702
            E0h = _mm_madd_epi16(m128Tmp1,
5703
                    _mm_load_si128((__m128i *) (transform32x32[0][8])));
5704
            E1l = _mm_madd_epi16(m128Tmp2,
5705
                    _mm_load_si128((__m128i *) (transform32x32[1][8])));
5706
            E1h = _mm_madd_epi16(m128Tmp3,
5707
                    _mm_load_si128((__m128i *) (transform32x32[1][8])));
5708
            E2l = _mm_madd_epi16(m128Tmp4,
5709
                    _mm_load_si128((__m128i *) (transform32x32[2][8])));
5710
            E2h = _mm_madd_epi16(m128Tmp5,
5711
                    _mm_load_si128((__m128i *) (transform32x32[2][8])));
5712
            E3l = _mm_madd_epi16(m128Tmp6,
5713
                    _mm_load_si128((__m128i *) (transform32x32[3][8])));
5714
            E3h = _mm_madd_epi16(m128Tmp7,
5715
                    _mm_load_si128((__m128i *) (transform32x32[3][8])));
5716
5717
            E4l = _mm_madd_epi16(m128Tmp8,
5718
                    _mm_load_si128((__m128i *) (transform32x32[4][8])));
5719
            E4h = _mm_madd_epi16(m128Tmp9,
5720
                    _mm_load_si128((__m128i *) (transform32x32[4][8])));
5721
            E5l = _mm_madd_epi16(m128Tmp10,
5722
                    _mm_load_si128((__m128i *) (transform32x32[5][8])));
5723
            E5h = _mm_madd_epi16(m128Tmp11,
5724
                    _mm_load_si128((__m128i *) (transform32x32[5][8])));
5725
            E6l = _mm_madd_epi16(m128Tmp12,
5726
                    _mm_load_si128((__m128i *) (transform32x32[6][8])));
5727
            E6h = _mm_madd_epi16(m128Tmp13,
5728
                    _mm_load_si128((__m128i *) (transform32x32[6][8])));
5729
            E7l = _mm_madd_epi16(m128Tmp14,
5730
                    _mm_load_si128((__m128i *) (transform32x32[7][8])));
5731
            E7h = _mm_madd_epi16(m128Tmp15,
5732
                    _mm_load_si128((__m128i *) (transform32x32[7][8])));
5733
5734
            O8l = _mm_add_epi32(E0l, E1l);
5735
            O8l = _mm_add_epi32(O8l, E2l);
5736
            O8l = _mm_add_epi32(O8l, E3l);
5737
            O8l = _mm_add_epi32(O8l, E4l);
5738
            O8l = _mm_add_epi32(O8l, E5l);
5739
            O8l = _mm_add_epi32(O8l, E6l);
5740
            O8l = _mm_add_epi32(O8l, E7l);
5741
5742
            O8h = _mm_add_epi32(E0h, E1h);
5743
            O8h = _mm_add_epi32(O8h, E2h);
5744
            O8h = _mm_add_epi32(O8h, E3h);
5745
            O8h = _mm_add_epi32(O8h, E4h);
5746
            O8h = _mm_add_epi32(O8h, E5h);
5747
            O8h = _mm_add_epi32(O8h, E6h);
5748
            O8h = _mm_add_epi32(O8h, E7h);
5749
5750
            /* Compute O9*/
5751
5752
            E0l = _mm_madd_epi16(m128Tmp0,
5753
                    _mm_load_si128((__m128i *) (transform32x32[0][9])));
5754
            E0h = _mm_madd_epi16(m128Tmp1,
5755
                    _mm_load_si128((__m128i *) (transform32x32[0][9])));
5756
            E1l = _mm_madd_epi16(m128Tmp2,
5757
                    _mm_load_si128((__m128i *) (transform32x32[1][9])));
5758
            E1h = _mm_madd_epi16(m128Tmp3,
5759
                    _mm_load_si128((__m128i *) (transform32x32[1][9])));
5760
            E2l = _mm_madd_epi16(m128Tmp4,
5761
                    _mm_load_si128((__m128i *) (transform32x32[2][9])));
5762
            E2h = _mm_madd_epi16(m128Tmp5,
5763
                    _mm_load_si128((__m128i *) (transform32x32[2][9])));
5764
            E3l = _mm_madd_epi16(m128Tmp6,
5765
                    _mm_load_si128((__m128i *) (transform32x32[3][9])));
5766
            E3h = _mm_madd_epi16(m128Tmp7,
5767
                    _mm_load_si128((__m128i *) (transform32x32[3][9])));
5768
5769
            E4l = _mm_madd_epi16(m128Tmp8,
5770
                    _mm_load_si128((__m128i *) (transform32x32[4][9])));
5771
            E4h = _mm_madd_epi16(m128Tmp9,
5772
                    _mm_load_si128((__m128i *) (transform32x32[4][9])));
5773
            E5l = _mm_madd_epi16(m128Tmp10,
5774
                    _mm_load_si128((__m128i *) (transform32x32[5][9])));
5775
            E5h = _mm_madd_epi16(m128Tmp11,
5776
                    _mm_load_si128((__m128i *) (transform32x32[5][9])));
5777
            E6l = _mm_madd_epi16(m128Tmp12,
5778
                    _mm_load_si128((__m128i *) (transform32x32[6][9])));
5779
            E6h = _mm_madd_epi16(m128Tmp13,
5780
                    _mm_load_si128((__m128i *) (transform32x32[6][9])));
5781
            E7l = _mm_madd_epi16(m128Tmp14,
5782
                    _mm_load_si128((__m128i *) (transform32x32[7][9])));
5783
            E7h = _mm_madd_epi16(m128Tmp15,
5784
                    _mm_load_si128((__m128i *) (transform32x32[7][9])));
5785
5786
            O9l = _mm_add_epi32(E0l, E1l);
5787
            O9l = _mm_add_epi32(O9l, E2l);
5788
            O9l = _mm_add_epi32(O9l, E3l);
5789
            O9l = _mm_add_epi32(O9l, E4l);
5790
            O9l = _mm_add_epi32(O9l, E5l);
5791
            O9l = _mm_add_epi32(O9l, E6l);
5792
            O9l = _mm_add_epi32(O9l, E7l);
5793
5794
            O9h = _mm_add_epi32(E0h, E1h);
5795
            O9h = _mm_add_epi32(O9h, E2h);
5796
            O9h = _mm_add_epi32(O9h, E3h);
5797
            O9h = _mm_add_epi32(O9h, E4h);
5798
            O9h = _mm_add_epi32(O9h, E5h);
5799
            O9h = _mm_add_epi32(O9h, E6h);
5800
            O9h = _mm_add_epi32(O9h, E7h);
5801
5802
            /* Compute O10 */
5803
5804
            E0l = _mm_madd_epi16(m128Tmp0,
5805
                    _mm_load_si128((__m128i *) (transform32x32[0][10])));
5806
            E0h = _mm_madd_epi16(m128Tmp1,
5807
                    _mm_load_si128((__m128i *) (transform32x32[0][10])));
5808
            E1l = _mm_madd_epi16(m128Tmp2,
5809
                    _mm_load_si128((__m128i *) (transform32x32[1][10])));
5810
            E1h = _mm_madd_epi16(m128Tmp3,
5811
                    _mm_load_si128((__m128i *) (transform32x32[1][10])));
5812
            E2l = _mm_madd_epi16(m128Tmp4,
5813
                    _mm_load_si128((__m128i *) (transform32x32[2][10])));
5814
            E2h = _mm_madd_epi16(m128Tmp5,
5815
                    _mm_load_si128((__m128i *) (transform32x32[2][10])));
5816
            E3l = _mm_madd_epi16(m128Tmp6,
5817
                    _mm_load_si128((__m128i *) (transform32x32[3][10])));
5818
            E3h = _mm_madd_epi16(m128Tmp7,
5819
                    _mm_load_si128((__m128i *) (transform32x32[3][10])));
5820
5821
            E4l = _mm_madd_epi16(m128Tmp8,
5822
                    _mm_load_si128((__m128i *) (transform32x32[4][10])));
5823
            E4h = _mm_madd_epi16(m128Tmp9,
5824
                    _mm_load_si128((__m128i *) (transform32x32[4][10])));
5825
            E5l = _mm_madd_epi16(m128Tmp10,
5826
                    _mm_load_si128((__m128i *) (transform32x32[5][10])));
5827
            E5h = _mm_madd_epi16(m128Tmp11,
5828
                    _mm_load_si128((__m128i *) (transform32x32[5][10])));
5829
            E6l = _mm_madd_epi16(m128Tmp12,
5830
                    _mm_load_si128((__m128i *) (transform32x32[6][10])));
5831
            E6h = _mm_madd_epi16(m128Tmp13,
5832
                    _mm_load_si128((__m128i *) (transform32x32[6][10])));
5833
            E7l = _mm_madd_epi16(m128Tmp14,
5834
                    _mm_load_si128((__m128i *) (transform32x32[7][10])));
5835
            E7h = _mm_madd_epi16(m128Tmp15,
5836
                    _mm_load_si128((__m128i *) (transform32x32[7][10])));
5837
5838
            O10l = _mm_add_epi32(E0l, E1l);
5839
            O10l = _mm_add_epi32(O10l, E2l);
5840
            O10l = _mm_add_epi32(O10l, E3l);
5841
            O10l = _mm_add_epi32(O10l, E4l);
5842
            O10l = _mm_add_epi32(O10l, E5l);
5843
            O10l = _mm_add_epi32(O10l, E6l);
5844
            O10l = _mm_add_epi32(O10l, E7l);
5845
5846
            O10h = _mm_add_epi32(E0h, E1h);
5847
            O10h = _mm_add_epi32(O10h, E2h);
5848
            O10h = _mm_add_epi32(O10h, E3h);
5849
            O10h = _mm_add_epi32(O10h, E4h);
5850
            O10h = _mm_add_epi32(O10h, E5h);
5851
            O10h = _mm_add_epi32(O10h, E6h);
5852
            O10h = _mm_add_epi32(O10h, E7h);
5853
5854
            /* Compute O11 */
5855
5856
            E0l = _mm_madd_epi16(m128Tmp0,
5857
                    _mm_load_si128((__m128i *) (transform32x32[0][11])));
5858
            E0h = _mm_madd_epi16(m128Tmp1,
5859
                    _mm_load_si128((__m128i *) (transform32x32[0][11])));
5860
            E1l = _mm_madd_epi16(m128Tmp2,
5861
                    _mm_load_si128((__m128i *) (transform32x32[1][11])));
5862
            E1h = _mm_madd_epi16(m128Tmp3,
5863
                    _mm_load_si128((__m128i *) (transform32x32[1][11])));
5864
            E2l = _mm_madd_epi16(m128Tmp4,
5865
                    _mm_load_si128((__m128i *) (transform32x32[2][11])));
5866
            E2h = _mm_madd_epi16(m128Tmp5,
5867
                    _mm_load_si128((__m128i *) (transform32x32[2][11])));
5868
            E3l = _mm_madd_epi16(m128Tmp6,
5869
                    _mm_load_si128((__m128i *) (transform32x32[3][11])));
5870
            E3h = _mm_madd_epi16(m128Tmp7,
5871
                    _mm_load_si128((__m128i *) (transform32x32[3][11])));
5872
5873
            E4l = _mm_madd_epi16(m128Tmp8,
5874
                    _mm_load_si128((__m128i *) (transform32x32[4][11])));
5875
            E4h = _mm_madd_epi16(m128Tmp9,
5876
                    _mm_load_si128((__m128i *) (transform32x32[4][11])));
5877
            E5l = _mm_madd_epi16(m128Tmp10,
5878
                    _mm_load_si128((__m128i *) (transform32x32[5][11])));
5879
            E5h = _mm_madd_epi16(m128Tmp11,
5880
                    _mm_load_si128((__m128i *) (transform32x32[5][11])));
5881
            E6l = _mm_madd_epi16(m128Tmp12,
5882
                    _mm_load_si128((__m128i *) (transform32x32[6][11])));
5883
            E6h = _mm_madd_epi16(m128Tmp13,
5884
                    _mm_load_si128((__m128i *) (transform32x32[6][11])));
5885
            E7l = _mm_madd_epi16(m128Tmp14,
5886
                    _mm_load_si128((__m128i *) (transform32x32[7][11])));
5887
            E7h = _mm_madd_epi16(m128Tmp15,
5888
                    _mm_load_si128((__m128i *) (transform32x32[7][11])));
5889
5890
            O11l = _mm_add_epi32(E0l, E1l);
5891
            O11l = _mm_add_epi32(O11l, E2l);
5892
            O11l = _mm_add_epi32(O11l, E3l);
5893
            O11l = _mm_add_epi32(O11l, E4l);
5894
            O11l = _mm_add_epi32(O11l, E5l);
5895
            O11l = _mm_add_epi32(O11l, E6l);
5896
            O11l = _mm_add_epi32(O11l, E7l);
5897
5898
            O11h = _mm_add_epi32(E0h, E1h);
5899
            O11h = _mm_add_epi32(O11h, E2h);
5900
            O11h = _mm_add_epi32(O11h, E3h);
5901
            O11h = _mm_add_epi32(O11h, E4h);
5902
            O11h = _mm_add_epi32(O11h, E5h);
5903
            O11h = _mm_add_epi32(O11h, E6h);
5904
            O11h = _mm_add_epi32(O11h, E7h);
5905
5906
            /* Compute O12 */
5907
5908
            E0l = _mm_madd_epi16(m128Tmp0,
5909
                    _mm_load_si128((__m128i *) (transform32x32[0][12])));
5910
            E0h = _mm_madd_epi16(m128Tmp1,
5911
                    _mm_load_si128((__m128i *) (transform32x32[0][12])));
5912
            E1l = _mm_madd_epi16(m128Tmp2,
5913
                    _mm_load_si128((__m128i *) (transform32x32[1][12])));
5914
            E1h = _mm_madd_epi16(m128Tmp3,
5915
                    _mm_load_si128((__m128i *) (transform32x32[1][12])));
5916
            E2l = _mm_madd_epi16(m128Tmp4,
5917
                    _mm_load_si128((__m128i *) (transform32x32[2][12])));
5918
            E2h = _mm_madd_epi16(m128Tmp5,
5919
                    _mm_load_si128((__m128i *) (transform32x32[2][12])));
5920
            E3l = _mm_madd_epi16(m128Tmp6,
5921
                    _mm_load_si128((__m128i *) (transform32x32[3][12])));
5922
            E3h = _mm_madd_epi16(m128Tmp7,
5923
                    _mm_load_si128((__m128i *) (transform32x32[3][12])));
5924
5925
            E4l = _mm_madd_epi16(m128Tmp8,
5926
                    _mm_load_si128((__m128i *) (transform32x32[4][12])));
5927
            E4h = _mm_madd_epi16(m128Tmp9,
5928
                    _mm_load_si128((__m128i *) (transform32x32[4][12])));
5929
            E5l = _mm_madd_epi16(m128Tmp10,
5930
                    _mm_load_si128((__m128i *) (transform32x32[5][12])));
5931
            E5h = _mm_madd_epi16(m128Tmp11,
5932
                    _mm_load_si128((__m128i *) (transform32x32[5][12])));
5933
            E6l = _mm_madd_epi16(m128Tmp12,
5934
                    _mm_load_si128((__m128i *) (transform32x32[6][12])));
5935
            E6h = _mm_madd_epi16(m128Tmp13,
5936
                    _mm_load_si128((__m128i *) (transform32x32[6][12])));
5937
            E7l = _mm_madd_epi16(m128Tmp14,
5938
                    _mm_load_si128((__m128i *) (transform32x32[7][12])));
5939
            E7h = _mm_madd_epi16(m128Tmp15,
5940
                    _mm_load_si128((__m128i *) (transform32x32[7][12])));
5941
5942
            O12l = _mm_add_epi32(E0l, E1l);
5943
            O12l = _mm_add_epi32(O12l, E2l);
5944
            O12l = _mm_add_epi32(O12l, E3l);
5945
            O12l = _mm_add_epi32(O12l, E4l);
5946
            O12l = _mm_add_epi32(O12l, E5l);
5947
            O12l = _mm_add_epi32(O12l, E6l);
5948
            O12l = _mm_add_epi32(O12l, E7l);
5949
5950
            O12h = _mm_add_epi32(E0h, E1h);
5951
            O12h = _mm_add_epi32(O12h, E2h);
5952
            O12h = _mm_add_epi32(O12h, E3h);
5953
            O12h = _mm_add_epi32(O12h, E4h);
5954
            O12h = _mm_add_epi32(O12h, E5h);
5955
            O12h = _mm_add_epi32(O12h, E6h);
5956
            O12h = _mm_add_epi32(O12h, E7h);
5957
5958
            /* Compute O13 */
5959
5960
            E0l = _mm_madd_epi16(m128Tmp0,
5961
                    _mm_load_si128((__m128i *) (transform32x32[0][13])));
5962
            E0h = _mm_madd_epi16(m128Tmp1,
5963
                    _mm_load_si128((__m128i *) (transform32x32[0][13])));
5964
            E1l = _mm_madd_epi16(m128Tmp2,
5965
                    _mm_load_si128((__m128i *) (transform32x32[1][13])));
5966
            E1h = _mm_madd_epi16(m128Tmp3,
5967
                    _mm_load_si128((__m128i *) (transform32x32[1][13])));
5968
            E2l = _mm_madd_epi16(m128Tmp4,
5969
                    _mm_load_si128((__m128i *) (transform32x32[2][13])));
5970
            E2h = _mm_madd_epi16(m128Tmp5,
5971
                    _mm_load_si128((__m128i *) (transform32x32[2][13])));
5972
            E3l = _mm_madd_epi16(m128Tmp6,
5973
                    _mm_load_si128((__m128i *) (transform32x32[3][13])));
5974
            E3h = _mm_madd_epi16(m128Tmp7,
5975
                    _mm_load_si128((__m128i *) (transform32x32[3][13])));
5976
5977
            E4l = _mm_madd_epi16(m128Tmp8,
5978
                    _mm_load_si128((__m128i *) (transform32x32[4][13])));
5979
            E4h = _mm_madd_epi16(m128Tmp9,
5980
                    _mm_load_si128((__m128i *) (transform32x32[4][13])));
5981
            E5l = _mm_madd_epi16(m128Tmp10,
5982
                    _mm_load_si128((__m128i *) (transform32x32[5][13])));
5983
            E5h = _mm_madd_epi16(m128Tmp11,
5984
                    _mm_load_si128((__m128i *) (transform32x32[5][13])));
5985
            E6l = _mm_madd_epi16(m128Tmp12,
5986
                    _mm_load_si128((__m128i *) (transform32x32[6][13])));
5987
            E6h = _mm_madd_epi16(m128Tmp13,
5988
                    _mm_load_si128((__m128i *) (transform32x32[6][13])));
5989
            E7l = _mm_madd_epi16(m128Tmp14,
5990
                    _mm_load_si128((__m128i *) (transform32x32[7][13])));
5991
            E7h = _mm_madd_epi16(m128Tmp15,
5992
                    _mm_load_si128((__m128i *) (transform32x32[7][13])));
5993
5994
            O13l = _mm_add_epi32(E0l, E1l);
5995
            O13l = _mm_add_epi32(O13l, E2l);
5996
            O13l = _mm_add_epi32(O13l, E3l);
5997
            O13l = _mm_add_epi32(O13l, E4l);
5998
            O13l = _mm_add_epi32(O13l, E5l);
5999
            O13l = _mm_add_epi32(O13l, E6l);
6000
            O13l = _mm_add_epi32(O13l, E7l);
6001
6002
            O13h = _mm_add_epi32(E0h, E1h);
6003
            O13h = _mm_add_epi32(O13h, E2h);
6004
            O13h = _mm_add_epi32(O13h, E3h);
6005
            O13h = _mm_add_epi32(O13h, E4h);
6006
            O13h = _mm_add_epi32(O13h, E5h);
6007
            O13h = _mm_add_epi32(O13h, E6h);
6008
            O13h = _mm_add_epi32(O13h, E7h);
6009
6010
            /* Compute O14  */
6011
6012
            E0l = _mm_madd_epi16(m128Tmp0,
6013
                    _mm_load_si128((__m128i *) (transform32x32[0][14])));
6014
            E0h = _mm_madd_epi16(m128Tmp1,
6015
                    _mm_load_si128((__m128i *) (transform32x32[0][14])));
6016
            E1l = _mm_madd_epi16(m128Tmp2,
6017
                    _mm_load_si128((__m128i *) (transform32x32[1][14])));
6018
            E1h = _mm_madd_epi16(m128Tmp3,
6019
                    _mm_load_si128((__m128i *) (transform32x32[1][14])));
6020
            E2l = _mm_madd_epi16(m128Tmp4,
6021
                    _mm_load_si128((__m128i *) (transform32x32[2][14])));
6022
            E2h = _mm_madd_epi16(m128Tmp5,
6023
                    _mm_load_si128((__m128i *) (transform32x32[2][14])));
6024
            E3l = _mm_madd_epi16(m128Tmp6,
6025
                    _mm_load_si128((__m128i *) (transform32x32[3][14])));
6026
            E3h = _mm_madd_epi16(m128Tmp7,
6027
                    _mm_load_si128((__m128i *) (transform32x32[3][14])));
6028
6029
            E4l = _mm_madd_epi16(m128Tmp8,
6030
                    _mm_load_si128((__m128i *) (transform32x32[4][14])));
6031
            E4h = _mm_madd_epi16(m128Tmp9,
6032
                    _mm_load_si128((__m128i *) (transform32x32[4][14])));
6033
            E5l = _mm_madd_epi16(m128Tmp10,
6034
                    _mm_load_si128((__m128i *) (transform32x32[5][14])));
6035
            E5h = _mm_madd_epi16(m128Tmp11,
6036
                    _mm_load_si128((__m128i *) (transform32x32[5][14])));
6037
            E6l = _mm_madd_epi16(m128Tmp12,
6038
                    _mm_load_si128((__m128i *) (transform32x32[6][14])));
6039
            E6h = _mm_madd_epi16(m128Tmp13,
6040
                    _mm_load_si128((__m128i *) (transform32x32[6][14])));
6041
            E7l = _mm_madd_epi16(m128Tmp14,
6042
                    _mm_load_si128((__m128i *) (transform32x32[7][14])));
6043
            E7h = _mm_madd_epi16(m128Tmp15,
6044
                    _mm_load_si128((__m128i *) (transform32x32[7][14])));
6045
6046
            O14l = _mm_add_epi32(E0l, E1l);
6047
            O14l = _mm_add_epi32(O14l, E2l);
6048
            O14l = _mm_add_epi32(O14l, E3l);
6049
            O14l = _mm_add_epi32(O14l, E4l);
6050
            O14l = _mm_add_epi32(O14l, E5l);
6051
            O14l = _mm_add_epi32(O14l, E6l);
6052
            O14l = _mm_add_epi32(O14l, E7l);
6053
6054
            O14h = _mm_add_epi32(E0h, E1h);
6055
            O14h = _mm_add_epi32(O14h, E2h);
6056
            O14h = _mm_add_epi32(O14h, E3h);
6057
            O14h = _mm_add_epi32(O14h, E4h);
6058
            O14h = _mm_add_epi32(O14h, E5h);
6059
            O14h = _mm_add_epi32(O14h, E6h);
6060
            O14h = _mm_add_epi32(O14h, E7h);
6061
6062
            /* Compute O15*/
6063
6064
            E0l = _mm_madd_epi16(m128Tmp0,
6065
                    _mm_load_si128((__m128i *) (transform32x32[0][15])));
6066
            E0h = _mm_madd_epi16(m128Tmp1,
6067
                    _mm_load_si128((__m128i *) (transform32x32[0][15])));
6068
            E1l = _mm_madd_epi16(m128Tmp2,
6069
                    _mm_load_si128((__m128i *) (transform32x32[1][15])));
6070
            E1h = _mm_madd_epi16(m128Tmp3,
6071
                    _mm_load_si128((__m128i *) (transform32x32[1][15])));
6072
            E2l = _mm_madd_epi16(m128Tmp4,
6073
                    _mm_load_si128((__m128i *) (transform32x32[2][15])));
6074
            E2h = _mm_madd_epi16(m128Tmp5,
6075
                    _mm_load_si128((__m128i *) (transform32x32[2][15])));
6076
            E3l = _mm_madd_epi16(m128Tmp6,
6077
                    _mm_load_si128((__m128i *) (transform32x32[3][15])));
6078
            E3h = _mm_madd_epi16(m128Tmp7,
6079
                    _mm_load_si128((__m128i *) (transform32x32[3][15])));
6080
6081
            E4l = _mm_madd_epi16(m128Tmp8,
6082
                    _mm_load_si128((__m128i *) (transform32x32[4][15])));
6083
            E4h = _mm_madd_epi16(m128Tmp9,
6084
                    _mm_load_si128((__m128i *) (transform32x32[4][15])));
6085
            E5l = _mm_madd_epi16(m128Tmp10,
6086
                    _mm_load_si128((__m128i *) (transform32x32[5][15])));
6087
            E5h = _mm_madd_epi16(m128Tmp11,
6088
                    _mm_load_si128((__m128i *) (transform32x32[5][15])));
6089
            E6l = _mm_madd_epi16(m128Tmp12,
6090
                    _mm_load_si128((__m128i *) (transform32x32[6][15])));
6091
            E6h = _mm_madd_epi16(m128Tmp13,
6092
                    _mm_load_si128((__m128i *) (transform32x32[6][15])));
6093
            E7l = _mm_madd_epi16(m128Tmp14,
6094
                    _mm_load_si128((__m128i *) (transform32x32[7][15])));
6095
            E7h = _mm_madd_epi16(m128Tmp15,
6096
                    _mm_load_si128((__m128i *) (transform32x32[7][15])));
6097
6098
            O15l = _mm_add_epi32(E0l, E1l);
6099
            O15l = _mm_add_epi32(O15l, E2l);
6100
            O15l = _mm_add_epi32(O15l, E3l);
6101
            O15l = _mm_add_epi32(O15l, E4l);
6102
            O15l = _mm_add_epi32(O15l, E5l);
6103
            O15l = _mm_add_epi32(O15l, E6l);
6104
            O15l = _mm_add_epi32(O15l, E7l);
6105
6106
            O15h = _mm_add_epi32(E0h, E1h);
6107
            O15h = _mm_add_epi32(O15h, E2h);
6108
            O15h = _mm_add_epi32(O15h, E3h);
6109
            O15h = _mm_add_epi32(O15h, E4h);
6110
            O15h = _mm_add_epi32(O15h, E5h);
6111
            O15h = _mm_add_epi32(O15h, E6h);
6112
            O15h = _mm_add_epi32(O15h, E7h);
6113
            /*  Compute E0  */
6114
6115
            m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
6116
            E0l = _mm_madd_epi16(m128Tmp0,
6117
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6118
            m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
6119
            E0h = _mm_madd_epi16(m128Tmp1,
6120
                    _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6121
6122
            m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
6123
            E0l = _mm_add_epi32(E0l,
6124
                    _mm_madd_epi16(m128Tmp2,
6125
                            _mm_load_si128(
6126
                                    (__m128i *) (transform16x16_1[1][0]))));
6127
            m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
6128
            E0h = _mm_add_epi32(E0h,
6129
                    _mm_madd_epi16(m128Tmp3,
6130
                            _mm_load_si128(
6131
                                    (__m128i *) (transform16x16_1[1][0]))));
6132
6133
            m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
6134
            E0l = _mm_add_epi32(E0l,
6135
                    _mm_madd_epi16(m128Tmp4,
6136
                            _mm_load_si128(
6137
                                    (__m128i *) (transform16x16_1[2][0]))));
6138
            m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
6139
            E0h = _mm_add_epi32(E0h,
6140
                    _mm_madd_epi16(m128Tmp5,
6141
                            _mm_load_si128(
6142
                                    (__m128i *) (transform16x16_1[2][0]))));
6143
6144
            m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
6145
            E0l = _mm_add_epi32(E0l,
6146
                    _mm_madd_epi16(m128Tmp6,
6147
                            _mm_load_si128(
6148
                                    (__m128i *) (transform16x16_1[3][0]))));
6149
            m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
6150
            E0h = _mm_add_epi32(E0h,
6151
                    _mm_madd_epi16(m128Tmp7,
6152
                            _mm_load_si128(
6153
                                    (__m128i *) (transform16x16_1[3][0]))));
6154
6155
            /*  Compute E1  */
6156
            E1l = _mm_madd_epi16(m128Tmp0,
6157
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6158
            E1h = _mm_madd_epi16(m128Tmp1,
6159
                    _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6160
            E1l = _mm_add_epi32(E1l,
6161
                    _mm_madd_epi16(m128Tmp2,
6162
                            _mm_load_si128(
6163
                                    (__m128i *) (transform16x16_1[1][1]))));
6164
            E1h = _mm_add_epi32(E1h,
6165
                    _mm_madd_epi16(m128Tmp3,
6166
                            _mm_load_si128(
6167
                                    (__m128i *) (transform16x16_1[1][1]))));
6168
            E1l = _mm_add_epi32(E1l,
6169
                    _mm_madd_epi16(m128Tmp4,
6170
                            _mm_load_si128(
6171
                                    (__m128i *) (transform16x16_1[2][1]))));
6172
            E1h = _mm_add_epi32(E1h,
6173
                    _mm_madd_epi16(m128Tmp5,
6174
                            _mm_load_si128(
6175
                                    (__m128i *) (transform16x16_1[2][1]))));
6176
            E1l = _mm_add_epi32(E1l,
6177
                    _mm_madd_epi16(m128Tmp6,
6178
                            _mm_load_si128(
6179
                                    (__m128i *) (transform16x16_1[3][1]))));
6180
            E1h = _mm_add_epi32(E1h,
6181
                    _mm_madd_epi16(m128Tmp7,
6182
                            _mm_load_si128(
6183
                                    (__m128i *) (transform16x16_1[3][1]))));
6184
6185
            /*  Compute E2  */
6186
            E2l = _mm_madd_epi16(m128Tmp0,
6187
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6188
            E2h = _mm_madd_epi16(m128Tmp1,
6189
                    _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6190
            E2l = _mm_add_epi32(E2l,
6191
                    _mm_madd_epi16(m128Tmp2,
6192
                            _mm_load_si128(
6193
                                    (__m128i *) (transform16x16_1[1][2]))));
6194
            E2h = _mm_add_epi32(E2h,
6195
                    _mm_madd_epi16(m128Tmp3,
6196
                            _mm_load_si128(
6197
                                    (__m128i *) (transform16x16_1[1][2]))));
6198
            E2l = _mm_add_epi32(E2l,
6199
                    _mm_madd_epi16(m128Tmp4,
6200
                            _mm_load_si128(
6201
                                    (__m128i *) (transform16x16_1[2][2]))));
6202
            E2h = _mm_add_epi32(E2h,
6203
                    _mm_madd_epi16(m128Tmp5,
6204
                            _mm_load_si128(
6205
                                    (__m128i *) (transform16x16_1[2][2]))));
6206
            E2l = _mm_add_epi32(E2l,
6207
                    _mm_madd_epi16(m128Tmp6,
6208
                            _mm_load_si128(
6209
                                    (__m128i *) (transform16x16_1[3][2]))));
6210
            E2h = _mm_add_epi32(E2h,
6211
                    _mm_madd_epi16(m128Tmp7,
6212
                            _mm_load_si128(
6213
                                    (__m128i *) (transform16x16_1[3][2]))));
6214
6215
            /*  Compute E3  */
6216
            E3l = _mm_madd_epi16(m128Tmp0,
6217
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6218
            E3h = _mm_madd_epi16(m128Tmp1,
6219
                    _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6220
            E3l = _mm_add_epi32(E3l,
6221
                    _mm_madd_epi16(m128Tmp2,
6222
                            _mm_load_si128(
6223
                                    (__m128i *) (transform16x16_1[1][3]))));
6224
            E3h = _mm_add_epi32(E3h,
6225
                    _mm_madd_epi16(m128Tmp3,
6226
                            _mm_load_si128(
6227
                                    (__m128i *) (transform16x16_1[1][3]))));
6228
            E3l = _mm_add_epi32(E3l,
6229
                    _mm_madd_epi16(m128Tmp4,
6230
                            _mm_load_si128(
6231
                                    (__m128i *) (transform16x16_1[2][3]))));
6232
            E3h = _mm_add_epi32(E3h,
6233
                    _mm_madd_epi16(m128Tmp5,
6234
                            _mm_load_si128(
6235
                                    (__m128i *) (transform16x16_1[2][3]))));
6236
            E3l = _mm_add_epi32(E3l,
6237
                    _mm_madd_epi16(m128Tmp6,
6238
                            _mm_load_si128(
6239
                                    (__m128i *) (transform16x16_1[3][3]))));
6240
            E3h = _mm_add_epi32(E3h,
6241
                    _mm_madd_epi16(m128Tmp7,
6242
                            _mm_load_si128(
6243
                                    (__m128i *) (transform16x16_1[3][3]))));
6244
6245
            /*  Compute E4  */
6246
            E4l = _mm_madd_epi16(m128Tmp0,
6247
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6248
            E4h = _mm_madd_epi16(m128Tmp1,
6249
                    _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6250
            E4l = _mm_add_epi32(E4l,
6251
                    _mm_madd_epi16(m128Tmp2,
6252
                            _mm_load_si128(
6253
                                    (__m128i *) (transform16x16_1[1][4]))));
6254
            E4h = _mm_add_epi32(E4h,
6255
                    _mm_madd_epi16(m128Tmp3,
6256
                            _mm_load_si128(
6257
                                    (__m128i *) (transform16x16_1[1][4]))));
6258
            E4l = _mm_add_epi32(E4l,
6259
                    _mm_madd_epi16(m128Tmp4,
6260
                            _mm_load_si128(
6261
                                    (__m128i *) (transform16x16_1[2][4]))));
6262
            E4h = _mm_add_epi32(E4h,
6263
                    _mm_madd_epi16(m128Tmp5,
6264
                            _mm_load_si128(
6265
                                    (__m128i *) (transform16x16_1[2][4]))));
6266
            E4l = _mm_add_epi32(E4l,
6267
                    _mm_madd_epi16(m128Tmp6,
6268
                            _mm_load_si128(
6269
                                    (__m128i *) (transform16x16_1[3][4]))));
6270
            E4h = _mm_add_epi32(E4h,
6271
                    _mm_madd_epi16(m128Tmp7,
6272
                            _mm_load_si128(
6273
                                    (__m128i *) (transform16x16_1[3][4]))));
6274
6275
            /*  Compute E5  */
6276
            E5l = _mm_madd_epi16(m128Tmp0,
6277
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6278
            E5h = _mm_madd_epi16(m128Tmp1,
6279
                    _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6280
            E5l = _mm_add_epi32(E5l,
6281
                    _mm_madd_epi16(m128Tmp2,
6282
                            _mm_load_si128(
6283
                                    (__m128i *) (transform16x16_1[1][5]))));
6284
            E5h = _mm_add_epi32(E5h,
6285
                    _mm_madd_epi16(m128Tmp3,
6286
                            _mm_load_si128(
6287
                                    (__m128i *) (transform16x16_1[1][5]))));
6288
            E5l = _mm_add_epi32(E5l,
6289
                    _mm_madd_epi16(m128Tmp4,
6290
                            _mm_load_si128(
6291
                                    (__m128i *) (transform16x16_1[2][5]))));
6292
            E5h = _mm_add_epi32(E5h,
6293
                    _mm_madd_epi16(m128Tmp5,
6294
                            _mm_load_si128(
6295
                                    (__m128i *) (transform16x16_1[2][5]))));
6296
            E5l = _mm_add_epi32(E5l,
6297
                    _mm_madd_epi16(m128Tmp6,
6298
                            _mm_load_si128(
6299
                                    (__m128i *) (transform16x16_1[3][5]))));
6300
            E5h = _mm_add_epi32(E5h,
6301
                    _mm_madd_epi16(m128Tmp7,
6302
                            _mm_load_si128(
6303
                                    (__m128i *) (transform16x16_1[3][5]))));
6304
6305
            /*  Compute E6  */
6306
            E6l = _mm_madd_epi16(m128Tmp0,
6307
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
6308
            E6h = _mm_madd_epi16(m128Tmp1,
6309
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
6310
            E6l = _mm_add_epi32(E6l,
6311
                    _mm_madd_epi16(m128Tmp2,
6312
                            _mm_load_si128(
6313
                                    (__m128i *) (transform16x16_1[1][6]))));
6314
            E6h = _mm_add_epi32(E6h,
6315
                    _mm_madd_epi16(m128Tmp3,
6316
                            _mm_load_si128(
6317
                                    (__m128i *) (transform16x16_1[1][6]))));
6318
            E6l = _mm_add_epi32(E6l,
6319
                    _mm_madd_epi16(m128Tmp4,
6320
                            _mm_load_si128(
6321
                                    (__m128i *) (transform16x16_1[2][6]))));
6322
            E6h = _mm_add_epi32(E6h,
6323
                    _mm_madd_epi16(m128Tmp5,
6324
                            _mm_load_si128(
6325
                                    (__m128i *) (transform16x16_1[2][6]))));
6326
            E6l = _mm_add_epi32(E6l,
6327
                    _mm_madd_epi16(m128Tmp6,
6328
                            _mm_load_si128(
6329
                                    (__m128i *) (transform16x16_1[3][6]))));
6330
            E6h = _mm_add_epi32(E6h,
6331
                    _mm_madd_epi16(m128Tmp7,
6332
                            _mm_load_si128(
6333
                                    (__m128i *) (transform16x16_1[3][6]))));
6334
6335
            /*  Compute E7  */
6336
            E7l = _mm_madd_epi16(m128Tmp0,
6337
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
6338
            E7h = _mm_madd_epi16(m128Tmp1,
6339
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
6340
            E7l = _mm_add_epi32(E7l,
6341
                    _mm_madd_epi16(m128Tmp2,
6342
                            _mm_load_si128(
6343
                                    (__m128i *) (transform16x16_1[1][7]))));
6344
            E7h = _mm_add_epi32(E7h,
6345
                    _mm_madd_epi16(m128Tmp3,
6346
                            _mm_load_si128(
6347
                                    (__m128i *) (transform16x16_1[1][7]))));
6348
            E7l = _mm_add_epi32(E7l,
6349
                    _mm_madd_epi16(m128Tmp4,
6350
                            _mm_load_si128(
6351
                                    (__m128i *) (transform16x16_1[2][7]))));
6352
            E7h = _mm_add_epi32(E7h,
6353
                    _mm_madd_epi16(m128Tmp5,
6354
                            _mm_load_si128(
6355
                                    (__m128i *) (transform16x16_1[2][7]))));
6356
            E7l = _mm_add_epi32(E7l,
6357
                    _mm_madd_epi16(m128Tmp6,
6358
                            _mm_load_si128(
6359
                                    (__m128i *) (transform16x16_1[3][7]))));
6360
            E7h = _mm_add_epi32(E7h,
6361
                    _mm_madd_epi16(m128Tmp7,
6362
                            _mm_load_si128(
6363
                                    (__m128i *) (transform16x16_1[3][7]))));
6364
6365
            /*  Compute E00 .. E03 (transform16x16_2 terms)  */
6366
6367
            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
6368
            E00l = _mm_madd_epi16(m128Tmp0,
6369
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
6370
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
6371
            E00h = _mm_madd_epi16(m128Tmp1,
6372
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
6373
6374
            m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
6375
            E00l = _mm_add_epi32(E00l,
6376
                    _mm_madd_epi16(m128Tmp2,
6377
                            _mm_load_si128(
6378
                                    (__m128i *) (transform16x16_2[1][0]))));
6379
            m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
6380
            E00h = _mm_add_epi32(E00h,
6381
                    _mm_madd_epi16(m128Tmp3,
6382
                            _mm_load_si128(
6383
                                    (__m128i *) (transform16x16_2[1][0]))));
6384
6385
            E01l = _mm_madd_epi16(m128Tmp0,
6386
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
6387
            E01h = _mm_madd_epi16(m128Tmp1,
6388
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
6389
            E01l = _mm_add_epi32(E01l,
6390
                    _mm_madd_epi16(m128Tmp2,
6391
                            _mm_load_si128(
6392
                                    (__m128i *) (transform16x16_2[1][1]))));
6393
            E01h = _mm_add_epi32(E01h,
6394
                    _mm_madd_epi16(m128Tmp3,
6395
                            _mm_load_si128(
6396
                                    (__m128i *) (transform16x16_2[1][1]))));
6397
6398
            E02l = _mm_madd_epi16(m128Tmp0,
6399
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
6400
            E02h = _mm_madd_epi16(m128Tmp1,
6401
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
6402
            E02l = _mm_add_epi32(E02l,
6403
                    _mm_madd_epi16(m128Tmp2,
6404
                            _mm_load_si128(
6405
                                    (__m128i *) (transform16x16_2[1][2]))));
6406
            E02h = _mm_add_epi32(E02h,
6407
                    _mm_madd_epi16(m128Tmp3,
6408
                            _mm_load_si128(
6409
                                    (__m128i *) (transform16x16_2[1][2]))));
6410
6411
            E03l = _mm_madd_epi16(m128Tmp0,
6412
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
6413
            E03h = _mm_madd_epi16(m128Tmp1,
6414
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
6415
            E03l = _mm_add_epi32(E03l,
6416
                    _mm_madd_epi16(m128Tmp2,
6417
                            _mm_load_si128(
6418
                                    (__m128i *) (transform16x16_2[1][3]))));
6419
            E03h = _mm_add_epi32(E03h,
6420
                    _mm_madd_epi16(m128Tmp3,
6421
                            _mm_load_si128(
6422
                                    (__m128i *) (transform16x16_2[1][3]))));
6423
6424
            /*  Compute EE0 and EEE */
6425
6426
            m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
6427
            EE0l = _mm_madd_epi16(m128Tmp0,
6428
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
6429
            m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
6430
            EE0h = _mm_madd_epi16(m128Tmp1,
6431
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
6432
6433
            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
6434
            EEE0l = _mm_madd_epi16(m128Tmp2,
6435
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
6436
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
6437
            EEE0h = _mm_madd_epi16(m128Tmp3,
6438
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
6439
6440
            EE1l = _mm_madd_epi16(m128Tmp0,
6441
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
6442
            EE1h = _mm_madd_epi16(m128Tmp1,
6443
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
6444
6445
            EEE1l = _mm_madd_epi16(m128Tmp2,
6446
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
6447
            EEE1h = _mm_madd_epi16(m128Tmp3,
6448
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
6449
6450
            /*  Compute EE    */
6451
6452
            EE2l = _mm_sub_epi32(EEE1l, EE1l);
6453
            EE3l = _mm_sub_epi32(EEE0l, EE0l);
6454
            EE2h = _mm_sub_epi32(EEE1h, EE1h);
6455
            EE3h = _mm_sub_epi32(EEE0h, EE0h);
6456
6457
            EE0l = _mm_add_epi32(EEE0l, EE0l);
6458
            EE1l = _mm_add_epi32(EEE1l, EE1l);
6459
            EE0h = _mm_add_epi32(EEE0h, EE0h);
6460
            EE1h = _mm_add_epi32(EEE1h, EE1h);
6461
            /**/
6462
6463
            EE7l = _mm_sub_epi32(EE0l, E00l);
6464
            EE6l = _mm_sub_epi32(EE1l, E01l);
6465
            EE5l = _mm_sub_epi32(EE2l, E02l);
6466
            EE4l = _mm_sub_epi32(EE3l, E03l);
6467
6468
            EE7h = _mm_sub_epi32(EE0h, E00h);
6469
            EE6h = _mm_sub_epi32(EE1h, E01h);
6470
            EE5h = _mm_sub_epi32(EE2h, E02h);
6471
            EE4h = _mm_sub_epi32(EE3h, E03h);
6472
6473
            EE0l = _mm_add_epi32(EE0l, E00l);
6474
            EE1l = _mm_add_epi32(EE1l, E01l);
6475
            EE2l = _mm_add_epi32(EE2l, E02l);
6476
            EE3l = _mm_add_epi32(EE3l, E03l);
6477
6478
            EE0h = _mm_add_epi32(EE0h, E00h);
6479
            EE1h = _mm_add_epi32(EE1h, E01h);
6480
            EE2h = _mm_add_epi32(EE2h, E02h);
6481
            EE3h = _mm_add_epi32(EE3h, E03h);
6482
            /*      Compute E       */
6483
6484
            E15l = _mm_sub_epi32(EE0l, E0l);
6485
            E15l = _mm_add_epi32(E15l, m128iAdd);
6486
            E14l = _mm_sub_epi32(EE1l, E1l);
6487
            E14l = _mm_add_epi32(E14l, m128iAdd);
6488
            E13l = _mm_sub_epi32(EE2l, E2l);
6489
            E13l = _mm_add_epi32(E13l, m128iAdd);
6490
            E12l = _mm_sub_epi32(EE3l, E3l);
6491
            E12l = _mm_add_epi32(E12l, m128iAdd);
6492
            E11l = _mm_sub_epi32(EE4l, E4l);
6493
            E11l = _mm_add_epi32(E11l, m128iAdd);
6494
            E10l = _mm_sub_epi32(EE5l, E5l);
6495
            E10l = _mm_add_epi32(E10l, m128iAdd);
6496
            E9l = _mm_sub_epi32(EE6l, E6l);
6497
            E9l = _mm_add_epi32(E9l, m128iAdd);
6498
            E8l = _mm_sub_epi32(EE7l, E7l);
6499
            E8l = _mm_add_epi32(E8l, m128iAdd);
6500
6501
            E0l = _mm_add_epi32(EE0l, E0l);
6502
            E0l = _mm_add_epi32(E0l, m128iAdd);
6503
            E1l = _mm_add_epi32(EE1l, E1l);
6504
            E1l = _mm_add_epi32(E1l, m128iAdd);
6505
            E2l = _mm_add_epi32(EE2l, E2l);
6506
            E2l = _mm_add_epi32(E2l, m128iAdd);
6507
            E3l = _mm_add_epi32(EE3l, E3l);
6508
            E3l = _mm_add_epi32(E3l, m128iAdd);
6509
            E4l = _mm_add_epi32(EE4l, E4l);
6510
            E4l = _mm_add_epi32(E4l, m128iAdd);
6511
            E5l = _mm_add_epi32(EE5l, E5l);
6512
            E5l = _mm_add_epi32(E5l, m128iAdd);
6513
            E6l = _mm_add_epi32(EE6l, E6l);
6514
            E6l = _mm_add_epi32(E6l, m128iAdd);
6515
            E7l = _mm_add_epi32(EE7l, E7l);
6516
            E7l = _mm_add_epi32(E7l, m128iAdd);
6517
6518
            E15h = _mm_sub_epi32(EE0h, E0h);
6519
            E15h = _mm_add_epi32(E15h, m128iAdd);
6520
            E14h = _mm_sub_epi32(EE1h, E1h);
6521
            E14h = _mm_add_epi32(E14h, m128iAdd);
6522
            E13h = _mm_sub_epi32(EE2h, E2h);
6523
            E13h = _mm_add_epi32(E13h, m128iAdd);
6524
            E12h = _mm_sub_epi32(EE3h, E3h);
6525
            E12h = _mm_add_epi32(E12h, m128iAdd);
6526
            E11h = _mm_sub_epi32(EE4h, E4h);
6527
            E11h = _mm_add_epi32(E11h, m128iAdd);
6528
            E10h = _mm_sub_epi32(EE5h, E5h);
6529
            E10h = _mm_add_epi32(E10h, m128iAdd);
6530
            E9h = _mm_sub_epi32(EE6h, E6h);
6531
            E9h = _mm_add_epi32(E9h, m128iAdd);
6532
            E8h = _mm_sub_epi32(EE7h, E7h);
6533
            E8h = _mm_add_epi32(E8h, m128iAdd);
6534
6535
            E0h = _mm_add_epi32(EE0h, E0h);
6536
            E0h = _mm_add_epi32(E0h, m128iAdd);
6537
            E1h = _mm_add_epi32(EE1h, E1h);
6538
            E1h = _mm_add_epi32(E1h, m128iAdd);
6539
            E2h = _mm_add_epi32(EE2h, E2h);
6540
            E2h = _mm_add_epi32(E2h, m128iAdd);
6541
            E3h = _mm_add_epi32(EE3h, E3h);
6542
            E3h = _mm_add_epi32(E3h, m128iAdd);
6543
            E4h = _mm_add_epi32(EE4h, E4h);
6544
            E4h = _mm_add_epi32(E4h, m128iAdd);
6545
            E5h = _mm_add_epi32(EE5h, E5h);
6546
            E5h = _mm_add_epi32(E5h, m128iAdd);
6547
            E6h = _mm_add_epi32(EE6h, E6h);
6548
            E6h = _mm_add_epi32(E6h, m128iAdd);
6549
            E7h = _mm_add_epi32(EE7h, E7h);
6550
            E7h = _mm_add_epi32(E7h, m128iAdd);
6551
6552
            m128iS0 = _mm_packs_epi32(
6553
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
6554
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
6555
            m128iS1 = _mm_packs_epi32(
6556
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
6557
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
6558
            m128iS2 = _mm_packs_epi32(
6559
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
6560
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
6561
            m128iS3 = _mm_packs_epi32(
6562
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
6563
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
6564
            m128iS4 = _mm_packs_epi32(
6565
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
6566
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
6567
            m128iS5 = _mm_packs_epi32(
6568
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
6569
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
6570
            m128iS6 = _mm_packs_epi32(
6571
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
6572
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
6573
            m128iS7 = _mm_packs_epi32(
6574
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
6575
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
6576
            m128iS8 = _mm_packs_epi32(
6577
                    _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
6578
                    _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
6579
            m128iS9 = _mm_packs_epi32(
6580
                    _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
6581
                    _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
6582
            m128iS10 = _mm_packs_epi32(
6583
                    _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
6584
                    _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
6585
            m128iS11 = _mm_packs_epi32(
6586
                    _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
6587
                    _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
6588
            m128iS12 = _mm_packs_epi32(
6589
                    _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
6590
                    _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
6591
            m128iS13 = _mm_packs_epi32(
6592
                    _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
6593
                    _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
6594
            m128iS14 = _mm_packs_epi32(
6595
                    _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
6596
                    _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
6597
            m128iS15 = _mm_packs_epi32(
6598
                    _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
6599
                    _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
6600
6601
            m128iS31 = _mm_packs_epi32(
6602
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
6603
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
6604
            m128iS30 = _mm_packs_epi32(
6605
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
6606
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
6607
            m128iS29 = _mm_packs_epi32(
6608
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
6609
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
6610
            m128iS28 = _mm_packs_epi32(
6611
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
6612
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
6613
            m128iS27 = _mm_packs_epi32(
6614
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
6615
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
6616
            m128iS26 = _mm_packs_epi32(
6617
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
6618
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
6619
            m128iS25 = _mm_packs_epi32(
6620
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
6621
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
6622
            m128iS24 = _mm_packs_epi32(
6623
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
6624
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
6625
            m128iS23 = _mm_packs_epi32(
6626
                    _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
6627
                    _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
6628
            m128iS22 = _mm_packs_epi32(
6629
                    _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
6630
                    _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
6631
            m128iS21 = _mm_packs_epi32(
6632
                    _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
6633
                    _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
6634
            m128iS20 = _mm_packs_epi32(
6635
                    _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
6636
                    _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
6637
            m128iS19 = _mm_packs_epi32(
6638
                    _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
6639
                    _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
6640
            m128iS18 = _mm_packs_epi32(
6641
                    _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
6642
                    _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
6643
            m128iS17 = _mm_packs_epi32(
6644
                    _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
6645
                    _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
6646
            m128iS16 = _mm_packs_epi32(
6647
                    _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
6648
                    _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
6649
6650
            if (!j) {
6651
                /*      Inverse the matrix      */
6652
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
6653
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
6654
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
6655
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
6656
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
6657
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
6658
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
6659
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
6660
                E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
6661
                E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
6662
                E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
6663
                E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
6664
                E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
6665
                E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
6666
                E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
6667
                E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
6668
6669
                O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
6670
                O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
6671
                O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
6672
                O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
6673
                O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
6674
                O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
6675
                O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
6676
                O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
6677
                O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
6678
                O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
6679
                O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
6680
                O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
6681
                O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
6682
                O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
6683
                O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
6684
                O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
6685
6686
                E0h = _mm_unpacklo_epi16(E0l, E8l);
6687
                E1h = _mm_unpacklo_epi16(E1l, E9l);
6688
                E2h = _mm_unpacklo_epi16(E2l, E10l);
6689
                E3h = _mm_unpacklo_epi16(E3l, E11l);
6690
                E4h = _mm_unpacklo_epi16(E4l, E12l);
6691
                E5h = _mm_unpacklo_epi16(E5l, E13l);
6692
                E6h = _mm_unpacklo_epi16(E6l, E14l);
6693
                E7h = _mm_unpacklo_epi16(E7l, E15l);
6694
6695
                E8h = _mm_unpackhi_epi16(E0l, E8l);
6696
                E9h = _mm_unpackhi_epi16(E1l, E9l);
6697
                E10h = _mm_unpackhi_epi16(E2l, E10l);
6698
                E11h = _mm_unpackhi_epi16(E3l, E11l);
6699
                E12h = _mm_unpackhi_epi16(E4l, E12l);
6700
                E13h = _mm_unpackhi_epi16(E5l, E13l);
6701
                E14h = _mm_unpackhi_epi16(E6l, E14l);
6702
                E15h = _mm_unpackhi_epi16(E7l, E15l);
6703
6704
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
6705
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
6706
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
6707
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
6708
6709
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6710
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6711
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6712
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6713
6714
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6715
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6716
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6717
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6718
6719
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
6720
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
6721
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
6722
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
6723
6724
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6725
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6726
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6727
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6728
6729
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6730
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6731
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6732
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6733
6734
                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
6735
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
6736
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
6737
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
6738
6739
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6740
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6741
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6742
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6743
6744
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6745
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6746
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6747
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6748
6749
                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
6750
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
6751
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
6752
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
6753
6754
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6755
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6756
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6757
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6758
6759
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6760
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6761
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6762
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6763
6764
                /*  */
6765
                E0h = _mm_unpacklo_epi16(O0l, O8l);
6766
                E1h = _mm_unpacklo_epi16(O1l, O9l);
6767
                E2h = _mm_unpacklo_epi16(O2l, O10l);
6768
                E3h = _mm_unpacklo_epi16(O3l, O11l);
6769
                E4h = _mm_unpacklo_epi16(O4l, O12l);
6770
                E5h = _mm_unpacklo_epi16(O5l, O13l);
6771
                E6h = _mm_unpacklo_epi16(O6l, O14l);
6772
                E7h = _mm_unpacklo_epi16(O7l, O15l);
6773
6774
                E8h = _mm_unpackhi_epi16(O0l, O8l);
6775
                E9h = _mm_unpackhi_epi16(O1l, O9l);
6776
                E10h = _mm_unpackhi_epi16(O2l, O10l);
6777
                E11h = _mm_unpackhi_epi16(O3l, O11l);
6778
                E12h = _mm_unpackhi_epi16(O4l, O12l);
6779
                E13h = _mm_unpackhi_epi16(O5l, O13l);
6780
                E14h = _mm_unpackhi_epi16(O6l, O14l);
6781
                E15h = _mm_unpackhi_epi16(O7l, O15l);
6782
6783
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
6784
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
6785
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
6786
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
6787
6788
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6789
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6790
                m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6791
                m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6792
6793
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6794
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6795
                m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6796
                m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6797
6798
                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
6799
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
6800
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
6801
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
6802
6803
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6804
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6805
                m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6806
                m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6807
6808
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6809
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6810
                m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6811
                m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6812
6813
                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
6814
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
6815
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
6816
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
6817
6818
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6819
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6820
                m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6821
                m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6822
6823
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6824
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6825
                m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6826
                m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6827
6828
                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
6829
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
6830
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
6831
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
6832
6833
                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6834
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6835
                m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6836
                m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6837
6838
                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6839
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6840
                m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6841
                m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6842
                /*  */
6843
                _mm_store_si128((__m128i *) (src + i), m128iS0);
6844
                _mm_store_si128((__m128i *) (src + 32 + i), m128iS1);
6845
                _mm_store_si128((__m128i *) (src + 64 + i), m128iS2);
6846
                _mm_store_si128((__m128i *) (src + 96 + i), m128iS3);
6847
                _mm_store_si128((__m128i *) (src + 128 + i), m128iS4);
6848
                _mm_store_si128((__m128i *) (src + 160 + i), m128iS5);
6849
                _mm_store_si128((__m128i *) (src + 192 + i), m128iS6);
6850
                _mm_store_si128((__m128i *) (src + 224 + i), m128iS7);
6851
                _mm_store_si128((__m128i *) (src + 256 + i), m128iS8);
6852
                _mm_store_si128((__m128i *) (src + 288 + i), m128iS9);
6853
                _mm_store_si128((__m128i *) (src + 320 + i), m128iS10);
6854
                _mm_store_si128((__m128i *) (src + 352 + i), m128iS11);
6855
                _mm_store_si128((__m128i *) (src + 384 + i), m128iS12);
6856
                _mm_store_si128((__m128i *) (src + 416 + i), m128iS13);
6857
                _mm_store_si128((__m128i *) (src + 448 + i), m128iS14);
6858
                _mm_store_si128((__m128i *) (src + 480 + i), m128iS15);
6859
                _mm_store_si128((__m128i *) (src + 512 + i), m128iS16);
6860
                _mm_store_si128((__m128i *) (src + 544 + i), m128iS17);
6861
                _mm_store_si128((__m128i *) (src + 576 + i), m128iS18);
6862
                _mm_store_si128((__m128i *) (src + 608 + i), m128iS19);
6863
                _mm_store_si128((__m128i *) (src + 640 + i), m128iS20);
6864
                _mm_store_si128((__m128i *) (src + 672 + i), m128iS21);
6865
                _mm_store_si128((__m128i *) (src + 704 + i), m128iS22);
6866
                _mm_store_si128((__m128i *) (src + 736 + i), m128iS23);
6867
                _mm_store_si128((__m128i *) (src + 768 + i), m128iS24);
6868
                _mm_store_si128((__m128i *) (src + 800 + i), m128iS25);
6869
                _mm_store_si128((__m128i *) (src + 832 + i), m128iS26);
6870
                _mm_store_si128((__m128i *) (src + 864 + i), m128iS27);
6871
                _mm_store_si128((__m128i *) (src + 896 + i), m128iS28);
6872
                _mm_store_si128((__m128i *) (src + 928 + i), m128iS29);
6873
                _mm_store_si128((__m128i *) (src + 960 + i), m128iS30);
6874
                _mm_store_si128((__m128i *) (src + 992 + i), m128iS31);
6875
6876
                if (i <= 16) {
6877
                    int k = i + 8;
6878
                    m128iS0 = _mm_load_si128((__m128i *) (src + k));
6879
                    m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
6880
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
6881
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
6882
                    m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
6883
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
6884
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
6885
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
6886
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
6887
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
6888
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
6889
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
6890
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
6891
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
6892
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
6893
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));
6894
6895
                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
6896
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
6897
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
6898
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
6899
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
6900
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
6901
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
6902
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
6903
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
6904
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
6905
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
6906
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
6907
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
6908
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
6909
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
6910
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
6911
                } else {
6912
                    m128iS0 = _mm_load_si128((__m128i *) (src));
6913
                    m128iS1 = _mm_load_si128((__m128i *) (src + 128));
6914
                    m128iS2 = _mm_load_si128((__m128i *) (src + 256));
6915
                    m128iS3 = _mm_load_si128((__m128i *) (src + 384));
6916
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 512));
6917
                    m128iS5 = _mm_load_si128((__m128i *) (src + 640));
6918
                    m128iS6 = _mm_load_si128((__m128i *) (src + 768));
6919
                    m128iS7 = _mm_load_si128((__m128i *) (src + 896));
6920
                    m128iS8 = _mm_load_si128((__m128i *) (src + 8));
6921
                    m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8));
6922
                    m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8));
6923
                    m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8));
6924
                    m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8));
6925
                    m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8));
6926
                    m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8));
6927
                    m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8));
6928
                    m128iS16 = _mm_load_si128((__m128i *) (src + 16));
6929
                    m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16));
6930
                    m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16));
6931
                    m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16));
6932
                    m128iS20 = _mm_loadu_si128((__m128i *) (src + 512 + 16));
6933
                    m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16));
6934
                    m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16));
6935
                    m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16));
6936
                    m128iS24 = _mm_load_si128((__m128i *) (src + 24));
6937
                    m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24));
6938
                    m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24));
6939
                    m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24));
6940
                    m128iS28 = _mm_loadu_si128((__m128i *) (src + 512 + 24));
6941
                    m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24));
6942
                    m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24));
6943
                    m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24));
6944
                    shift = shift_2nd;
6945
                    m128iAdd = _mm_set1_epi32(add_2nd);
6946
                }
6947
6948
            } else {
6949
                int k, m = 0;
6950
                _mm_storeu_si128((__m128i *) (src), m128iS0);
6951
                _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
6952
                _mm_storeu_si128((__m128i *) (src + 16), m128iS2);
6953
                _mm_storeu_si128((__m128i *) (src + 24), m128iS3);
6954
                _mm_storeu_si128((__m128i *) (src + 128), m128iS4);
6955
                _mm_storeu_si128((__m128i *) (src + 128 + 8), m128iS5);
6956
                _mm_storeu_si128((__m128i *) (src + 128 + 16), m128iS6);
6957
                _mm_storeu_si128((__m128i *) (src + 128 + 24), m128iS7);
6958
                _mm_storeu_si128((__m128i *) (src + 256), m128iS8);
6959
                _mm_storeu_si128((__m128i *) (src + 256 + 8), m128iS9);
6960
                _mm_storeu_si128((__m128i *) (src + 256 + 16), m128iS10);
6961
                _mm_storeu_si128((__m128i *) (src + 256 + 24), m128iS11);
6962
                _mm_storeu_si128((__m128i *) (src + 384), m128iS12);
6963
                _mm_storeu_si128((__m128i *) (src + 384 + 8), m128iS13);
6964
                _mm_storeu_si128((__m128i *) (src + 384 + 16), m128iS14);
6965
                _mm_storeu_si128((__m128i *) (src + 384 + 24), m128iS15);
6966
6967
                _mm_storeu_si128((__m128i *) (src + 512), m128iS16);
6968
                _mm_storeu_si128((__m128i *) (src + 512 + 8), m128iS17);
6969
                _mm_storeu_si128((__m128i *) (src + 512 + 16), m128iS18);
6970
                _mm_storeu_si128((__m128i *) (src + 512 + 24), m128iS19);
6971
                _mm_storeu_si128((__m128i *) (src + 640), m128iS20);
6972
                _mm_storeu_si128((__m128i *) (src + 640 + 8), m128iS21);
6973
                _mm_storeu_si128((__m128i *) (src + 640 + 16), m128iS22);
6974
                _mm_storeu_si128((__m128i *) (src + 640 + 24), m128iS23);
6975
                _mm_storeu_si128((__m128i *) (src + 768), m128iS24);
6976
                _mm_storeu_si128((__m128i *) (src + 768 + 8), m128iS25);
6977
                _mm_storeu_si128((__m128i *) (src + 768 + 16), m128iS26);
6978
                _mm_storeu_si128((__m128i *) (src + 768 + 24), m128iS27);
6979
                _mm_storeu_si128((__m128i *) (src + 896), m128iS28);
6980
                _mm_storeu_si128((__m128i *) (src + 896 + 8), m128iS29);
6981
                _mm_storeu_si128((__m128i *) (src + 896 + 16), m128iS30);
6982
                _mm_storeu_si128((__m128i *) (src + 896 + 24), m128iS31);
6983
                dst = (uint16_t*) _dst + (i * stride);
6984
                for (k = 0; k < 8; k++) {
6985
                    dst[0] = av_clip_uintp2(dst[0] + src[m],10);
6986
                    dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
6987
                    dst[2] = av_clip_uintp2(dst[2] + src[m + 16],10);
6988
                    dst[3] = av_clip_uintp2(dst[3] + src[m + 24],10);
6989
                    dst[4] = av_clip_uintp2(
6990
                            dst[4] + src[m + 128],10);
6991
                    dst[5] = av_clip_uintp2(
6992
                            dst[5] + src[m + 128 + 8],10);
6993
                    dst[6] = av_clip_uintp2(
6994
                            dst[6] + src[m + 128 + 16],10);
6995
                    dst[7] = av_clip_uintp2(
6996
                            dst[7] + src[m + 128 + 24],10);
6997
6998
                    dst[8] = av_clip_uintp2(
6999
                            dst[8] + src[m + 256],10);
7000
                    dst[9] = av_clip_uintp2(
7001
                            dst[9] + src[m + 256 + 8],10);
7002
                    dst[10] = av_clip_uintp2(
7003
                            dst[10] + src[m + 256 + 16],10);
7004
                    dst[11] = av_clip_uintp2(
7005
                            dst[11] + src[m + 256 + 24],10);
7006
                    dst[12] = av_clip_uintp2(
7007
                            dst[12] + src[m + 384],10);
7008
                    dst[13] = av_clip_uintp2(
7009
                            dst[13] + src[m + 384 + 8],10);
7010
                    dst[14] = av_clip_uintp2(
7011
                            dst[14] + src[m + 384 + 16],10);
7012
                    dst[15] = av_clip_uintp2(
7013
                            dst[15] + src[m + 384 + 24],10);
7014
7015
                    dst[16] = av_clip_uintp2(
7016
                            dst[16] + src[m + 512],10);
7017
                    dst[17] = av_clip_uintp2(
7018
                            dst[17] + src[m + 512 + 8],10);
7019
                    dst[18] = av_clip_uintp2(
7020
                            dst[18] + src[m + 512 + 16],10);
7021
                    dst[19] = av_clip_uintp2(
7022
                            dst[19] + src[m + 512 + 24],10);
7023
                    dst[20] = av_clip_uintp2(
7024
                            dst[20] + src[m + 640],10);
7025
                    dst[21] = av_clip_uintp2(
7026
                            dst[21] + src[m + 640 + 8],10);
7027
                    dst[22] = av_clip_uintp2(
7028
                            dst[22] + src[m + 640 + 16],10);
7029
                    dst[23] = av_clip_uintp2(
7030
                            dst[23] + src[m + 640 + 24],10);
7031
7032
                    dst[24] = av_clip_uintp2(
7033
                            dst[24] + src[m + 768],10);
7034
                    dst[25] = av_clip_uintp2(
7035
                            dst[25] + src[m + 768 + 8],10);
7036
                    dst[26] = av_clip_uintp2(
7037
                            dst[26] + src[m + 768 + 16],10);
7038
                    dst[27] = av_clip_uintp2(
7039
                            dst[27] + src[m + 768 + 24],10);
7040
                    dst[28] = av_clip_uintp2(
7041
                            dst[28] + src[m + 896],10);
7042
                    dst[29] = av_clip_uintp2(
7043
                            dst[29] + src[m + 896 + 8],10);
7044
                    dst[30] = av_clip_uintp2(
7045
                            dst[30] + src[m + 896 + 16],10);
7046
                    dst[31] = av_clip_uintp2(
7047
                            dst[31] + src[m + 896 + 24],10);
7048
7049
                    m += 1;
7050
                    dst += stride;
7051
                }
7052
                if (i <= 16) {
7053
                    int k = (i + 8) * 4;
7054
                    m128iS0 = _mm_load_si128((__m128i *) (src + k));
7055
                    m128iS1 = _mm_load_si128((__m128i *) (src + 128 + k));
7056
                    m128iS2 = _mm_load_si128((__m128i *) (src + 256 + k));
7057
                    m128iS3 = _mm_load_si128((__m128i *) (src + 384 + k));
7058
                    m128iS4 = _mm_loadu_si128((__m128i *) (src + 512 + k));
7059
                    m128iS5 = _mm_load_si128((__m128i *) (src + 640 + k));
7060
                    m128iS6 = _mm_load_si128((__m128i *) (src + 768 + k));
7061
                    m128iS7 = _mm_load_si128((__m128i *) (src + 896 + k));
7062
                    m128iS8 = _mm_load_si128((__m128i *) (src + 8 + k));
7063
                    m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8 + k));
7064
                    m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8 + k));
7065
                    m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8 + k));
7066
                    m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8 + k));
7067
                    m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8 + k));
7068
                    m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8 + k));
7069
                    m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8 + k));
7070
                    m128iS16 = _mm_load_si128((__m128i *) (src + 16 + k));
7071
                    m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16 + k));
7072
                    m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16 + k));
7073
                    m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16 + k));
7074
                    m128iS20 = _mm_loadu_si128(
7075
                            (__m128i *) (src + 512 + 16 + k));
7076
                    m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16 + k));
7077
                    m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16 + k));
7078
                    m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16 + k));
7079
                    m128iS24 = _mm_load_si128((__m128i *) (src + 24 + k));
7080
                    m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24 + k));
7081
                    m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24 + k));
7082
                    m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24 + k));
7083
                    m128iS28 = _mm_loadu_si128(
7084
                            (__m128i *) (src + 512 + 24 + k));
7085
                    m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24 + k));
7086
                    m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24 + k));
7087
                    m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24 + k));
7088
                }
7089
            }
7090
        }
7091
    }
7092
}
7093
#endif
7094