Coverage Report

Created: 2024-10-01 06:54

/src/Simd/src/Simd/SimdSse41Winograd2.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2022 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdMemory.h"
25
#include "Simd/SimdStore.h"
26
#include "Simd/SimdWinograd.h"
27
#include "Simd/SimdBase.h"
28
29
namespace Simd
30
{
31
#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)    
32
    namespace Sse41
33
    {
34
        // Expands four filter vectors into nine output vectors and writes them
        // with unaligned stores to dst + k * stride for k = 0..8. The outputs are
        // the inputs themselves, four pairwise sums and the sum of all four.
        SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter(const __m128 src[4], float * dst, size_t stride)
        {
            const __m128 sum01 = _mm_add_ps(src[0], src[1]);
            const __m128 sum23 = _mm_add_ps(src[2], src[3]);
            const __m128 sum02 = _mm_add_ps(src[0], src[2]);
            const __m128 sum13 = _mm_add_ps(src[1], src[3]);
            const __m128 out[9] =
            {
                src[0], sum01, src[1],
                sum02, _mm_add_ps(sum01, sum23), sum13,
                src[2], sum23, src[3]
            };
            for (size_t k = 0; k < 9; ++k)
                _mm_storeu_ps(dst + k * stride, out[k]);
        }
48
49
        // Gathers four vectors from src through Load4 (called with a step of 4)
        // and passes them to the vector overload, which writes to dst.
        SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter4n(const float* src, float* dst, size_t stride)
        {
            __m128 quad[4];
            Load4(src, 4, &quad[0]);
            WinogradKernel2x2Block2x2SetFilter(quad, dst, stride);
        }
55
56
        // Reads four vectors located stride floats apart in src (unaligned loads)
        // and passes them to the vector overload, which writes to dst using the same stride.
        SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter4t(const float* src, float* dst, size_t stride)
        {
            __m128 quad[4];
            for (size_t k = 0; k < 4; ++k)
                quad[k] = _mm_loadu_ps(src + k * stride);
            WinogradKernel2x2Block2x2SetFilter(quad, dst, stride);
        }
65
66
        void WinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans)
67
0
        {
68
0
            size_t size4 = AlignLo(size, 4), i = 0;
69
0
            if (trans)
70
0
            {
71
0
                for (; i < size4; i += 4)
72
0
                    WinogradKernel2x2Block2x2SetFilter4t(src + i, dst + i, size);
73
0
                for (; i < size; i += 1)
74
0
                    Base::WinogradKernel2x2Block2x2SetFilter1t(src + i, dst + i, size);
75
0
            }
76
0
            else
77
0
            {
78
0
                for (; i < size4; i += 4, src += 16, dst += 4)
79
0
                    WinogradKernel2x2Block2x2SetFilter4n(src, dst, size);
80
0
                for (; i < size; i += 1, src += 4, dst += 1)
81
0
                    Base::WinogradKernel2x2Block2x2SetFilter1n(src, dst, size);
82
0
            }
83
0
        }
84
85
        //-----------------------------------------------------------------------
86
87
        // Combines a 3x3 window of vectors (src[0..8], row-major) into nine
        // output vectors and stores them at dst + k * stride, k = 0..8.
        // src[4] is the window centre; every output is a signed combination of
        // the centre and its neighbours.
        SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4Store(const __m128* src, float* dst, size_t stride)
        {
            const __m128 centre = src[4];
            const __m128 centreMinusLeft = _mm_sub_ps(centre, src[3]);
            const __m128 centreMinusRight = _mm_sub_ps(centre, src[5]);
            const __m128 out[9] =
            {
                _mm_add_ps(_mm_sub_ps(src[0], src[1]), centreMinusLeft),
                _mm_sub_ps(src[1], centre),
                _mm_add_ps(_mm_sub_ps(src[2], src[1]), centreMinusRight),
                _mm_sub_ps(src[3], centre),
                centre,
                _mm_sub_ps(src[5], centre),
                _mm_add_ps(centreMinusLeft, _mm_sub_ps(src[6], src[7])),
                _mm_sub_ps(src[7], centre),
                _mm_add_ps(centreMinusRight, _mm_sub_ps(src[8], src[7]))
            };
            for (size_t k = 0; k < 9; ++k)
                _mm_storeu_ps(dst + k * stride, out[k]);
        }
99
100
        // Loads a full 3x3 window of vectors into dst (row-major):
        // rows are srcS floats apart, columns are srcC floats apart.
        SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcS, size_t srcC, __m128 dst[9])
        {
            for (size_t r = 0; r < 3; ++r)
                for (size_t c = 0; c < 3; ++c)
                    dst[r * 3 + c] = _mm_loadu_ps(src + r * srcS + c * srcC);
        }
112
113
        // Transforms one full 3x3 window for all srcC channels, F channels per step.
        // If srcC is not a multiple of F, the last F channels are processed once
        // more starting at offset srcC - F, overlapping the previous group.
        // The public entry point only takes this path when srcChannels >= F.
        SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride)
        {
            const size_t rowStep = srcW * srcC;
            const size_t fullC = AlignLo(srcC, F);
            __m128 window[9];
            size_t c = 0;
            while (c < fullC)
            {
                WinogradKernel2x2Block2x2SetInput4t(src + c, rowStep, srcC, window);
                WinogradKernel2x2Block2x2SetInput4Store(window, dst + c, dstStride);
                c += F;
            }
            if (fullC < srcC)
            {
                WinogradKernel2x2Block2x2SetInput4t(src + srcC - F, rowStep, srcC, window);
                WinogradKernel2x2Block2x2SetInput4Store(window, dst + srcC - F, dstStride);
            }
        }
130
131
        // Loads the part of a 3x3 window inside [rowB, rowE) x [colB, colE);
        // every other cell of dst is set to zero.
        SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m128 dst[9])
        {
            const __m128 zero = _mm_setzero_ps();
            for (size_t k = 0; k < 9; ++k)
                dst[k] = zero;
            for (size_t r = rowB; r < rowE; ++r)
            {
                for (size_t c = colB; c < colE; ++c)
                    dst[r * 3 + c] = _mm_loadu_ps(src + r * srcS + c * srcC);
            }
        }
139
140
        // Transforms one partially valid 3x3 window for all srcC channels.
        // Cells outside [rowB, rowE) x [colB, colE) are treated as zero.
        // Channels are processed F at a time; a non-multiple-of-F remainder is
        // handled by reprocessing the last F channels from offset srcC - F.
        SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride)
        {
            const size_t rowStep = srcW * srcC;
            const size_t fullC = AlignLo(srcC, F);
            __m128 window[9];
            size_t c = 0;
            while (c < fullC)
            {
                WinogradKernel2x2Block2x2SetInput4t(src + c, rowStep, srcC, rowB, rowE, colB, colE, window);
                WinogradKernel2x2Block2x2SetInput4Store(window, dst + c, dstStride);
                c += F;
            }
            if (fullC < srcC)
            {
                WinogradKernel2x2Block2x2SetInput4t(src + srcC - F, rowStep, srcC, rowB, rowE, colB, colE, window);
                WinogradKernel2x2Block2x2SetInput4Store(window, dst + srcC - F, dstStride);
            }
        }
157
158
        void WinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
159
            size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans)
160
0
        {
161
0
            assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1));
162
0
            if (trans ? (srcChannels < F) : true)
163
0
            {
164
0
                Base::WinogradKernel2x2Block2x2SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans);
165
0
                return;
166
0
            }
167
0
            size_t dstH = srcHeight - 1 + padY + padH;
168
0
            size_t dstW = srcWidth - 1 + padX + padW;
169
0
            size_t dstH2 = AlignLo(dstH, 2);
170
0
            size_t dstW2 = AlignLo(dstW, 2);
171
0
            size_t noseW = Simd::Min<size_t>(3, dstW + 1);
172
0
            size_t noseH = Simd::Min<size_t>(3, dstH + 1);
173
0
            size_t startY = padY ? 2 : 0;
174
0
            size_t startX = padX ? 2 : 0;
175
0
            if (padY || padH)
176
0
            {
177
0
                if (dstH == dstH2)
178
0
                    dstH2 -= 2;
179
0
                if (dstW == dstW2)
180
0
                    dstW2 -= 2;
181
0
                if (padY)
182
0
                    src -= (srcWidth + 1) * (trans ? srcChannels : 1);
183
0
            }
184
0
            size_t tailW = dstW - dstW2 + (padW ? 0 : 1);
185
0
            size_t tailH = dstH - dstH2 + (padH ? 0 : 1);
186
0
            size_t row = 0, col = 0;
187
0
            if (padY)
188
0
            {
189
0
                if (padX)
190
0
                    WinogradKernel2x2Block2x2SetInput4t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels;
191
0
                for (col = startX; col < dstW2; col += 2)
192
0
                    WinogradKernel2x2Block2x2SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 3, dst, dstStride), dst += srcChannels;
193
0
                if (col < dstW)
194
0
                    WinogradKernel2x2Block2x2SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels;
195
0
            }
196
0
            for (row = startY; row < dstH2; row += 2)
197
0
            {
198
0
                if (padX)
199
0
                    WinogradKernel2x2Block2x2SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 3, 1, noseW, dst, dstStride), dst += srcChannels;
200
0
                for (col = startX; col < dstW2; col += 2)
201
0
                    WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels;
202
0
                if (col < dstW)
203
0
                    WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 3, 0, tailW, dst, dstStride), dst += srcChannels;
204
0
            }
205
0
            if (row < dstH)
206
0
            {
207
0
                if (padX)
208
0
                    WinogradKernel2x2Block2x2SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels;
209
0
                for (col = startX; col < dstW2; col += 2)
210
0
                    WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 3, dst, dstStride), dst += srcChannels;
211
0
                if (col < dstW)
212
0
                    WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels;
213
0
            }
214
0
        }
215
216
        //-----------------------------------------------------------------------
217
218
        // Loads nine vectors spaced stride floats apart (a row-major 3x3 grid)
        // and reduces them to four vectors: dst[k] is the sum of the 2x2
        // sub-grid whose top-left cell is (k / 2, k % 2).
        SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputLoad9(const float* src, size_t stride, __m128* dst)
        {
            __m128 cell[9];
            for (size_t k = 0; k < 9; ++k)
                cell[k] = _mm_loadu_ps(src + k * stride);

            // Horizontal pair sums, shared between vertically adjacent outputs.
            const __m128 pair01 = _mm_add_ps(cell[0], cell[1]);
            const __m128 pair12 = _mm_add_ps(cell[1], cell[2]);
            const __m128 pair34 = _mm_add_ps(cell[3], cell[4]);
            const __m128 pair45 = _mm_add_ps(cell[4], cell[5]);
            const __m128 pair67 = _mm_add_ps(cell[6], cell[7]);
            const __m128 pair78 = _mm_add_ps(cell[7], cell[8]);

            dst[0] = _mm_add_ps(pair01, pair34);
            dst[1] = _mm_add_ps(pair12, pair45);
            dst[2] = _mm_add_ps(pair34, pair67);
            dst[3] = _mm_add_ps(pair45, pair78);
        }
235
236
        // Stores a full 2x2 block of vectors (src, row-major):
        // rows are dstS floats apart, columns are dstC floats apart.
        SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputStore4(const __m128 src[4], float* dst, size_t dstS, size_t dstC)
        {
            for (size_t r = 0; r < 2; ++r)
                for (size_t c = 0; c < 2; ++c)
                    _mm_storeu_ps(dst + r * dstS + c * dstC, src[r * 2 + c]);
        }
243
244
        // Produces one full 2x2 output block for all dstC channels, F channels per step.
        // If dstC is not a multiple of F, the last F channels are processed once
        // more from offset dstC - F, overlapping the previous group.
        SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC)
        {
            const size_t rowStep = dstW * dstC;
            const size_t fullC = AlignLo(dstC, F);
            __m128 block[4];
            size_t d = 0;
            while (d < fullC)
            {
                WinogradKernel2x2Block2x2SetOutputLoad9(src + d, srcStride, block);
                WinogradKernel2x2Block2x2SetOutputStore4(block, dst + d, rowStep, dstC);
                d += F;
            }
            if (fullC < dstC)
            {
                WinogradKernel2x2Block2x2SetOutputLoad9(src + dstC - F, srcStride, block);
                WinogradKernel2x2Block2x2SetOutputStore4(block, dst + dstC - F, rowStep, dstC);
            }
        }
260
261
        // Stores only the first rowE rows and colE columns of a 2x2 block of
        // vectors (src, row-major); rows are dstS floats apart, columns dstC.
        SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputStore4(const __m128 src[4], float* dst, size_t dstS, size_t dstC, size_t rowE, size_t colE)
        {
            for (size_t r = 0; r != rowE; ++r)
            {
                for (size_t c = 0; c != colE; ++c)
                {
                    float* target = dst + r * dstS + c * dstC;
                    _mm_storeu_ps(target, src[r * 2 + c]);
                }
            }
        }
267
268
        // Produces a partial output block (rowE rows by colE columns) for all
        // dstC channels, F channels per step. A non-multiple-of-F remainder is
        // handled by reprocessing the last F channels from offset dstC - F.
        SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE)
        {
            const size_t rowStep = dstW * dstC;
            const size_t fullC = AlignLo(dstC, F);
            __m128 block[4];
            size_t d = 0;
            while (d < fullC)
            {
                WinogradKernel2x2Block2x2SetOutputLoad9(src + d, srcStride, block);
                WinogradKernel2x2Block2x2SetOutputStore4(block, dst + d, rowStep, dstC, rowE, colE);
                d += F;
            }
            if (fullC < dstC)
            {
                WinogradKernel2x2Block2x2SetOutputLoad9(src + dstC - F, srcStride, block);
                WinogradKernel2x2Block2x2SetOutputStore4(block, dst + dstC - F, rowStep, dstC, rowE, colE);
            }
        }
284
285
        void WinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans)
286
0
        {
287
0
            if (trans ? (dstChannels < F) : true)
288
0
            {
289
0
                Base::WinogradKernel2x2Block2x2SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans);
290
0
                return;
291
0
            }
292
0
            size_t tileH = (dstHeight + 1) / 2;
293
0
            size_t tileW = (dstWidth + 1) / 2;
294
0
            size_t dstH2 = AlignLo(dstHeight, 2);
295
0
            size_t dstW2 = AlignLo(dstWidth, 2);
296
0
            size_t row, col;
297
0
            for (row = 0; row < dstH2; row += 2)
298
0
            {
299
0
                for (col = 0; col < dstW2; col += 2)
300
0
                    WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels;
301
0
                if (col < dstWidth)
302
0
                    WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 2, dstWidth - col), src += dstChannels;
303
0
            }
304
0
            if (row < dstHeight)
305
0
            {
306
0
                for (col = 0; col < dstW2; col += 2)
307
0
                    WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 2), src += dstChannels;
308
0
                if (col < dstWidth)
309
0
                    WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels;
310
0
            }
311
0
        }
312
313
        //-----------------------------------------------------------------------
314
315
        // Expands a pair of vectors (t[0], t[1]) into five weighted combinations
        // and stores them at dst + k * stride, k = 0..4.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetFilterRow(const __m128* t, float* dst, size_t stride)
        {
            const __m128 half = _mm_set1_ps(1.0f / 2.0f);
            const __m128 third = _mm_set1_ps(1.0f / 3.0f);
            const __m128 sixth = _mm_set1_ps(1.0f / 6.0f);
            const __m128 minusHalf = _mm_set1_ps(-1.0f / 2.0f);

            const __m128 a = t[0];
            const __m128 b = t[1];
            const __m128 out[5] =
            {
                _mm_mul_ps(half, a),
                _mm_mul_ps(minusHalf, _mm_add_ps(a, b)),
                _mm_mul_ps(sixth, _mm_sub_ps(b, a)),
                _mm_add_ps(_mm_mul_ps(sixth, a), _mm_mul_ps(third, b)),
                b
            };
            for (size_t k = 0; k < 5; ++k)
                _mm_storeu_ps(dst + k * stride, out[k]);
        }
328
329
        // Expands four filter vectors into a 5x5 grid of outputs. The vertical
        // pass combines src[j] with src[j + 2] (j = 0, 1) into five rows of two
        // vectors; each row is then expanded horizontally and stored by
        // WinogradKernel2x2Block4x4SetFilterRow at dst + 5 * row * stride.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter(const __m128 src[4], float* dst, size_t stride)
        {
            const __m128 half = _mm_set1_ps(1.0f / 2.0f);
            const __m128 third = _mm_set1_ps(1.0f / 3.0f);
            const __m128 sixth = _mm_set1_ps(1.0f / 6.0f);
            const __m128 minusHalf = _mm_set1_ps(-1.0f / 2.0f);

            __m128 rows[5][2];
            for (size_t j = 0; j < 2; ++j)
            {
                const __m128 upper = src[j];
                const __m128 lower = src[j + 2];
                rows[0][j] = _mm_mul_ps(half, upper);
                rows[1][j] = _mm_mul_ps(minusHalf, _mm_add_ps(upper, lower));
                rows[2][j] = _mm_mul_ps(sixth, _mm_sub_ps(lower, upper));
                rows[3][j] = _mm_add_ps(_mm_mul_ps(sixth, upper), _mm_mul_ps(third, lower));
                rows[4][j] = lower;
            }
            for (size_t r = 0; r < 5; ++r)
                WinogradKernel2x2Block4x4SetFilterRow(rows[r], dst + 5 * r * stride, stride);
        }
357
358
        // Gathers four vectors from src through Load4 (called with a step of 4)
        // and passes them to the vector overload, which writes to dst.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter4n(const float* src, float* dst, size_t stride)
        {
            __m128 quad[4];
            Load4(src, 4, &quad[0]);
            WinogradKernel2x2Block4x4SetFilter(quad, dst, stride);
        }
364
365
        // Reads four vectors located stride floats apart in src (unaligned loads)
        // and passes them to the vector overload, which writes to dst using the same stride.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter4t(const float* src, float* dst, size_t stride)
        {
            __m128 quad[4];
            for (size_t k = 0; k < 4; ++k)
                quad[k] = _mm_loadu_ps(src + k * stride);
            WinogradKernel2x2Block4x4SetFilter(quad, dst, stride);
        }
374
375
        void WinogradKernel2x2Block4x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans)
376
0
        {
377
0
            size_t size4 = AlignLo(size, 4), i = 0;
378
0
            if (trans)
379
0
            {
380
0
                for (; i < size4; i += 4)
381
0
                    WinogradKernel2x2Block4x4SetFilter4t(src + i, dst + i, size);
382
0
                for (; i < size; i += 1)
383
0
                    Base::WinogradKernel2x2Block4x4SetFilter1t(src + i, dst + i, size);
384
0
            }
385
0
            else
386
0
            {
387
0
                for (; i < size4; i += 4, src += 16, dst += 4)
388
0
                    WinogradKernel2x2Block4x4SetFilter4n(src, dst, size);
389
0
                for (; i < size; i += 1, src += 4, dst += 1)
390
0
                    Base::WinogradKernel2x2Block4x4SetFilter1n(src, dst, size);
391
0
            }
392
0
        }
393
394
        //-----------------------------------------------------------------------
395
396
        // Combines five vectors (tmp[0..4]) into five outputs using the weights
        // 1, 2 and 3 and stores them at dst + k * stride, k = 0..4.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetInputStoreRow(const __m128 tmp[5], float* dst, size_t stride)
        {
            const __m128 two = _mm_set1_ps(2.0f);
            const __m128 three = _mm_set1_ps(3.0f);
            // Doubled inputs that several outputs share.
            const __m128 twice0 = _mm_mul_ps(two, tmp[0]);
            const __m128 twice1 = _mm_mul_ps(two, tmp[1]);
            const __m128 twice2 = _mm_mul_ps(two, tmp[2]);
            const __m128 twice3 = _mm_mul_ps(two, tmp[3]);

            _mm_storeu_ps(dst + 0 * stride, _mm_add_ps(_mm_sub_ps(twice0, tmp[1]), _mm_sub_ps(tmp[3], twice2)));
            _mm_storeu_ps(dst + 1 * stride, _mm_sub_ps(tmp[3], _mm_add_ps(twice1, tmp[2])));
            _mm_storeu_ps(dst + 2 * stride, _mm_add_ps(_mm_sub_ps(twice1, _mm_mul_ps(three, tmp[2])), tmp[3]));
            _mm_storeu_ps(dst + 3 * stride, _mm_sub_ps(tmp[3], tmp[1]));
            _mm_storeu_ps(dst + 4 * stride, _mm_add_ps(_mm_sub_ps(twice1, tmp[2]), _mm_sub_ps(tmp[4], twice3)));
        }
406
407
        // Transforms a 5x5 window of vectors (src[0..24], row-major).
        // For each of the five output rows, the vertical pass combines the
        // window rows column by column into five vectors, which
        // WinogradKernel2x2Block4x4SetInputStoreRow then combines horizontally
        // and stores starting at dst + 5 * row * stride.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetInputStore(const __m128* src, float* dst, size_t stride)
        {
            const __m128 two = _mm_set1_ps(2.0f);
            const __m128 three = _mm_set1_ps(3.0f);
            __m128 line[5];
            size_t c;

            // Output row 0: 2 * r0 - r1 - 2 * r2 + r3.
            for (c = 0; c < 5; ++c)
                line[c] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(two, src[c]), src[5 + c]), _mm_sub_ps(src[15 + c], _mm_mul_ps(two, src[10 + c])));
            WinogradKernel2x2Block4x4SetInputStoreRow(line, dst + 0 * stride, stride);

            // Output row 1: r3 - (2 * r1 + r2).
            for (c = 0; c < 5; ++c)
                line[c] = _mm_sub_ps(src[15 + c], _mm_add_ps(_mm_mul_ps(two, src[5 + c]), src[10 + c]));
            WinogradKernel2x2Block4x4SetInputStoreRow(line, dst + 5 * stride, stride);

            // Output row 2: 2 * r1 - 3 * r2 + r3.
            for (c = 0; c < 5; ++c)
                line[c] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(two, src[5 + c]), _mm_mul_ps(three, src[10 + c])), src[15 + c]);
            WinogradKernel2x2Block4x4SetInputStoreRow(line, dst + 10 * stride, stride);

            // Output row 3: r3 - r1.
            for (c = 0; c < 5; ++c)
                line[c] = _mm_sub_ps(src[15 + c], src[5 + c]);
            WinogradKernel2x2Block4x4SetInputStoreRow(line, dst + 15 * stride, stride);

            // Output row 4: 2 * r1 - r2 - 2 * r3 + r4.
            for (c = 0; c < 5; ++c)
                line[c] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(two, src[5 + c]), src[10 + c]), _mm_sub_ps(src[20 + c], _mm_mul_ps(two, src[15 + c])));
            WinogradKernel2x2Block4x4SetInputStoreRow(line, dst + 20 * stride, stride);
        }
447
448
        // Loads a full 5x5 window of vectors into dst (row-major):
        // rows are srcS floats apart, columns are srcC floats apart.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcS, size_t srcC, __m128 dst[25])
        {
            for (size_t r = 0; r < 5; ++r)
                for (size_t c = 0; c < 5; ++c)
                    dst[r * 5 + c] = _mm_loadu_ps(src + r * srcS + c * srcC);
        }
476
477
        // Transforms one full 5x5 window for all srcC channels, F channels per step.
        // If srcC is not a multiple of F, the last F channels are processed once
        // more starting at offset srcC - F, overlapping the previous group.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride)
        {
            const size_t rowStep = srcW * srcC;
            const size_t fullC = AlignLo(srcC, F);
            __m128 window[25];
            size_t c = 0;
            while (c < fullC)
            {
                WinogradKernel2x2Block4x4SetInput4t(src + c, rowStep, srcC, window);
                WinogradKernel2x2Block4x4SetInputStore(window, dst + c, dstStride);
                c += F;
            }
            if (fullC < srcC)
            {
                WinogradKernel2x2Block4x4SetInput4t(src + srcC - F, rowStep, srcC, window);
                WinogradKernel2x2Block4x4SetInputStore(window, dst + srcC - F, dstStride);
            }
        }
494
495
        // Loads the part of a 5x5 window inside [rowB, rowE) x [colB, colE);
        // every other cell of dst is set to zero.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m128 dst[25])
        {
            const __m128 zero = _mm_setzero_ps();
            for (size_t k = 0; k < 25; ++k)
                dst[k] = zero;
            for (size_t r = rowB; r < rowE; ++r)
            {
                for (size_t c = colB; c < colE; ++c)
                    dst[r * 5 + c] = _mm_loadu_ps(src + r * srcS + c * srcC);
            }
        }
503
504
        // Transforms one partially valid 5x5 window for all srcC channels.
        // Cells outside [rowB, rowE) x [colB, colE) are treated as zero.
        // Channels are processed F at a time; a non-multiple-of-F remainder is
        // handled by reprocessing the last F channels from offset srcC - F.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride)
        {
            const size_t rowStep = srcW * srcC;
            const size_t fullC = AlignLo(srcC, F);
            __m128 window[25];
            size_t c = 0;
            while (c < fullC)
            {
                WinogradKernel2x2Block4x4SetInput4t(src + c, rowStep, srcC, rowB, rowE, colB, colE, window);
                WinogradKernel2x2Block4x4SetInputStore(window, dst + c, dstStride);
                c += F;
            }
            if (fullC < srcC)
            {
                WinogradKernel2x2Block4x4SetInput4t(src + srcC - F, rowStep, srcC, rowB, rowE, colB, colE, window);
                WinogradKernel2x2Block4x4SetInputStore(window, dst + srcC - F, dstStride);
            }
        }
521
522
        void WinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,
523
            size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans)
524
0
        {
525
0
            assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1));
526
0
            if (trans ? (srcChannels < F) : true)
527
0
            {
528
0
                Base::WinogradKernel2x2Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans);
529
0
                return;
530
0
            }
531
0
            size_t dstH = srcHeight - 1 + padY + padH;
532
0
            size_t dstW = srcWidth - 1 + padX + padW;
533
0
            size_t dstH4 = AlignLo(dstH, 4);
534
0
            size_t dstW4 = AlignLo(dstW, 4);
535
0
            size_t noseW = Simd::Min<size_t>(5, dstW + 1);
536
0
            size_t noseH = Simd::Min<size_t>(5, dstH + 1);
537
0
            size_t startY = padY ? 4 : 0;
538
0
            size_t startX = padX ? 4 : 0;
539
0
            if (padY || padH)
540
0
            {
541
0
                if (dstH == dstH4)
542
0
                    dstH4 -= 4;
543
0
                if (dstW == dstW4)
544
0
                    dstW4 -= 4;
545
0
                if (padY)
546
0
                    src -= (srcWidth + 1) * (trans ? srcChannels : 1);
547
0
            }
548
0
            size_t tailW = dstW - dstW4 + (padW ? 0 : 1);
549
0
            size_t tailH = dstH - dstH4 + (padH ? 0 : 1);
550
0
            size_t row = 0, col = 0;
551
0
            if (padY)
552
0
            {
553
0
                if (padX)
554
0
                    WinogradKernel2x2Block4x4SetInput4t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels;
555
0
                for (col = startX; col < dstW4; col += 4)
556
0
                    WinogradKernel2x2Block4x4SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 5, dst, dstStride), dst += srcChannels;
557
0
                if (col < dstW)
558
0
                    WinogradKernel2x2Block4x4SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels;
559
0
            }
560
0
            for (row = startY; row < dstH4; row += 4)
561
0
            {
562
0
                if (padX)
563
0
                    WinogradKernel2x2Block4x4SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 5, 1, noseW, dst, dstStride), dst += srcChannels;
564
0
                for (col = startX; col < dstW4; col += 4)
565
0
                    WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels;
566
0
                if (col < dstW)
567
0
                    WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 5, 0, tailW, dst, dstStride), dst += srcChannels;
568
0
            }
569
0
            if (row < dstH)
570
0
            {
571
0
                if (padX)
572
0
                    WinogradKernel2x2Block4x4SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels;
573
0
                for (col = startX; col < dstW4; col += 4)
574
0
                    WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 5, dst, dstStride), dst += srcChannels;
575
0
                if (col < dstW)
576
0
                    WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels;
577
0
            }
578
0
        }
579
580
        //-----------------------------------------------------------------------
581
582
        // Reduces five vectors s[0..4] to four vectors d[0..3]:
        //   d[0] = (s0 + s1) + (s2 + s3)
        //   d[1] = (s1 - s2) + 2 * s3
        //   d[2] = (s1 + s2) + 4 * s3
        //   d[3] = (s1 - s2) + (8 * s3 + s4)
        SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputGetRow(const __m128 * s, __m128 * d)
        {
            const __m128 k2 = _mm_set1_ps(2.0f);
            const __m128 k4 = _mm_set1_ps(4.0f);
            const __m128 k8 = _mm_set1_ps(8.0f);
            __m128 lhs, rhs;

            lhs = _mm_add_ps(s[0], s[1]);
            rhs = _mm_add_ps(s[2], s[3]);
            d[0] = _mm_add_ps(lhs, rhs);

            lhs = _mm_sub_ps(s[1], s[2]);
            rhs = _mm_mul_ps(k2, s[3]);
            d[1] = _mm_add_ps(lhs, rhs);

            lhs = _mm_add_ps(s[1], s[2]);
            rhs = _mm_mul_ps(k4, s[3]);
            d[2] = _mm_add_ps(lhs, rhs);

            lhs = _mm_sub_ps(s[1], s[2]);
            rhs = _mm_add_ps(_mm_mul_ps(k8, s[3]), s[4]);
            d[3] = _mm_add_ps(lhs, rhs);
        }
592
593
        // Loads 25 vectors (element i is read from src + i * stride), treats them as
        // a 5x5 grid in row-major order, combines the five grid rows column by
        // column into four intermediate rows, and passes each of those through
        // WinogradKernel2x2Block4x4SetOutputGetRow. The 16 results are written to
        // dst[0..15], four per intermediate row.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputLoad25(const float* src, size_t stride, __m128* dst)
        {
            __m128 in[25];
            for (size_t r = 0; r < 5; ++r)
                for (size_t c = 0; c < 5; ++c)
                    in[r * 5 + c] = _mm_loadu_ps(src + (r * 5 + c) * stride);

            const __m128 k2 = _mm_set1_ps(2.0f);
            const __m128 k4 = _mm_set1_ps(4.0f);
            const __m128 k8 = _mm_set1_ps(8.0f);
            __m128 line[5];

            // row0 + row1 + row2 + row3
            for (size_t c = 0; c < 5; ++c)
                line[c] = _mm_add_ps(_mm_add_ps(in[c], in[5 + c]), _mm_add_ps(in[10 + c], in[15 + c]));
            WinogradKernel2x2Block4x4SetOutputGetRow(line, dst + 0);

            // row1 - row2 + 2 * row3
            for (size_t c = 0; c < 5; ++c)
                line[c] = _mm_add_ps(_mm_sub_ps(in[5 + c], in[10 + c]), _mm_mul_ps(k2, in[15 + c]));
            WinogradKernel2x2Block4x4SetOutputGetRow(line, dst + 4);

            // row1 + row2 + 4 * row3
            for (size_t c = 0; c < 5; ++c)
                line[c] = _mm_add_ps(_mm_add_ps(in[5 + c], in[10 + c]), _mm_mul_ps(k4, in[15 + c]));
            WinogradKernel2x2Block4x4SetOutputGetRow(line, dst + 8);

            // row1 - row2 + 8 * row3 + row4
            for (size_t c = 0; c < 5; ++c)
                line[c] = _mm_add_ps(_mm_sub_ps(in[5 + c], in[10 + c]), _mm_add_ps(_mm_mul_ps(k8, in[15 + c]), in[20 + c]));
            WinogradKernel2x2Block4x4SetOutputGetRow(line, dst + 12);
        }
654
655
        // Stores a complete 4x4 tile of vectors: src[row * 4 + col] is written
        // (unaligned) to dst + row * dstS + col * dstC, in row-major order.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputStore16(const __m128 src[16], float* dst, size_t dstS, size_t dstC)
        {
            for (size_t row = 0; row < 4; ++row)
            {
                float* const line = dst + row * dstS;
                const __m128* const vals = src + row * 4;
                for (size_t col = 0; col < 4; ++col)
                    _mm_storeu_ps(line + col * dstC, vals[col]);
            }
        }
674
675
        // Produces one complete 4x4 output tile for all dstC channels, F channels
        // per step. When dstC is not a multiple of F, one extra step is run at
        // channel offset dstC - F to cover the remainder.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC)
        {
            const size_t rowStep = dstW * dstC;
            const size_t bodyC = AlignLo(dstC, F);
            __m128 tile[16];

            size_t c = 0;
            while (c < bodyC)
            {
                WinogradKernel2x2Block4x4SetOutputLoad25(src + c, srcStride, tile);
                WinogradKernel2x2Block4x4SetOutputStore16(tile, dst + c, rowStep, dstC);
                c += F;
            }
            if (bodyC < dstC)
            {
                const size_t last = dstC - F;
                WinogradKernel2x2Block4x4SetOutputLoad25(src + last, srcStride, tile);
                WinogradKernel2x2Block4x4SetOutputStore16(tile, dst + last, rowStep, dstC);
            }
        }
691
692
        // Stores the top-left rowE x colE part of a 4x4 tile of vectors:
        // src[row * 4 + col] is written (unaligned) to dst + row * dstS + col * dstC.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputStore16(const __m128 src[16], float* dst, size_t dstS, size_t dstC, size_t rowE, size_t colE)
        {
            for (size_t row = 0; row < rowE; ++row)
            {
                float* const line = dst + row * dstS;
                const __m128* const vals = src + row * 4;
                for (size_t col = 0; col < colE; ++col)
                    _mm_storeu_ps(line + col * dstC, vals[col]);
            }
        }
698
699
        // Produces a partial output tile (rowE rows by colE columns) for all dstC
        // channels, F channels per step. When dstC is not a multiple of F, one
        // extra step is run at channel offset dstC - F to cover the remainder.
        SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE)
        {
            const size_t rowStep = dstW * dstC;
            const size_t bodyC = AlignLo(dstC, F);
            __m128 tile[16];

            size_t c = 0;
            while (c < bodyC)
            {
                WinogradKernel2x2Block4x4SetOutputLoad25(src + c, srcStride, tile);
                WinogradKernel2x2Block4x4SetOutputStore16(tile, dst + c, rowStep, dstC, rowE, colE);
                c += F;
            }
            if (bodyC < dstC)
            {
                const size_t last = dstC - F;
                WinogradKernel2x2Block4x4SetOutputLoad25(src + last, srcStride, tile);
                WinogradKernel2x2Block4x4SetOutputStore16(tile, dst + last, rowStep, dstC, rowE, colE);
            }
        }
715
716
        void WinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans)
717
0
        {
718
0
            if (trans ? (dstChannels < F) : true)
719
0
            {
720
0
                Base::WinogradKernel2x2Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans);
721
0
                return;
722
0
            }
723
0
            size_t tileH = (dstHeight + 3) / 4;
724
0
            size_t tileW = (dstWidth + 3) / 4;
725
0
            size_t dstH4 = AlignLo(dstHeight, 4);
726
0
            size_t dstW4 = AlignLo(dstWidth, 4);
727
0
            size_t row, col;
728
0
            for (row = 0; row < dstH4; row += 4)
729
0
            {
730
0
                for (col = 0; col < dstW4; col += 4)
731
0
                    WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels;
732
0
                if (col < dstWidth)
733
0
                    WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 4, dstWidth - col), src += dstChannels;
734
0
            }
735
0
            if (row < dstHeight)
736
0
            {
737
0
                for (col = 0; col < dstW4; col += 4)
738
0
                    WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 4), src += dstChannels;
739
0
                if (col < dstWidth)
740
0
                    WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels;
741
0
            }
742
0
        }
743
    }
744
#endif
745
}