/src/Simd/src/Simd/SimdSse41Winograd2.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*  | 
2  |  | * Simd Library (http://ermig1979.github.io/Simd).  | 
3  |  | *  | 
4  |  | * Copyright (c) 2011-2022 Yermalayeu Ihar.  | 
5  |  | *  | 
6  |  | * Permission is hereby granted, free of charge, to any person obtaining a copy  | 
7  |  | * of this software and associated documentation files (the "Software"), to deal  | 
8  |  | * in the Software without restriction, including without limitation the rights  | 
9  |  | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell  | 
10  |  | * copies of the Software, and to permit persons to whom the Software is  | 
11  |  | * furnished to do so, subject to the following conditions:  | 
12  |  | *  | 
13  |  | * The above copyright notice and this permission notice shall be included in  | 
14  |  | * all copies or substantial portions of the Software.  | 
15  |  | *  | 
16  |  | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR  | 
17  |  | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,  | 
18  |  | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE  | 
19  |  | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER  | 
20  |  | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,  | 
21  |  | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE  | 
22  |  | * SOFTWARE.  | 
23  |  | */  | 
24  |  | #include "Simd/SimdMemory.h"  | 
25  |  | #include "Simd/SimdStore.h"  | 
26  |  | #include "Simd/SimdWinograd.h"  | 
27  |  | #include "Simd/SimdBase.h"  | 
28  |  |  | 
29  |  | namespace Simd  | 
30  |  | { | 
31  |  | #if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)      | 
32  |  |     namespace Sse41  | 
33  |  |     { | 
34  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter(const __m128 src[4], float * dst, size_t stride)  | 
35  | 0  |         { | 
36  | 0  |             _mm_storeu_ps(dst + 0 * stride, src[0]);  | 
37  | 0  |             _mm_storeu_ps(dst + 1 * stride, _mm_add_ps(src[0], src[1]));  | 
38  | 0  |             _mm_storeu_ps(dst + 2 * stride, src[1]);  | 
39  |  | 
  | 
40  | 0  |             _mm_storeu_ps(dst + 3 * stride, _mm_add_ps(src[0], src[2]));  | 
41  | 0  |             _mm_storeu_ps(dst + 4 * stride, _mm_add_ps(_mm_add_ps(src[0], src[1]), _mm_add_ps(src[2], src[3])));  | 
42  | 0  |             _mm_storeu_ps(dst + 5 * stride, _mm_add_ps(src[1], src[3]));  | 
43  |  | 
  | 
44  | 0  |             _mm_storeu_ps(dst + 6 * stride, src[2]);  | 
45  | 0  |             _mm_storeu_ps(dst + 7 * stride, _mm_add_ps(src[2], src[3]));  | 
46  | 0  |             _mm_storeu_ps(dst + 8 * stride, src[3]);  | 
47  | 0  |         }  | 
48  |  |  | 
49  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter4n(const float* src, float* dst, size_t stride)  | 
50  | 0  |         { | 
51  | 0  |             __m128 _src[4];  | 
52  | 0  |             Load4(src + 0, 4, _src + 0);  | 
53  | 0  |             WinogradKernel2x2Block2x2SetFilter(_src, dst, stride);  | 
54  | 0  |         }  | 
55  |  |  | 
56  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetFilter4t(const float* src, float* dst, size_t stride)  | 
57  | 0  |         { | 
58  | 0  |             __m128 _src[4];  | 
59  | 0  |             _src[0] = _mm_loadu_ps(src + 0 * stride);  | 
60  | 0  |             _src[1] = _mm_loadu_ps(src + 1 * stride);  | 
61  | 0  |             _src[2] = _mm_loadu_ps(src + 2 * stride);  | 
62  | 0  |             _src[3] = _mm_loadu_ps(src + 3 * stride);  | 
63  | 0  |             WinogradKernel2x2Block2x2SetFilter(_src, dst, stride);  | 
64  | 0  |         }  | 
65  |  |  | 
66  |  |         void WinogradKernel2x2Block2x2SetFilter(const float* src, size_t size, float* dst, SimdBool trans)  | 
67  | 0  |         { | 
68  | 0  |             size_t size4 = AlignLo(size, 4), i = 0;  | 
69  | 0  |             if (trans)  | 
70  | 0  |             { | 
71  | 0  |                 for (; i < size4; i += 4)  | 
72  | 0  |                     WinogradKernel2x2Block2x2SetFilter4t(src + i, dst + i, size);  | 
73  | 0  |                 for (; i < size; i += 1)  | 
74  | 0  |                     Base::WinogradKernel2x2Block2x2SetFilter1t(src + i, dst + i, size);  | 
75  | 0  |             }  | 
76  | 0  |             else  | 
77  | 0  |             { | 
78  | 0  |                 for (; i < size4; i += 4, src += 16, dst += 4)  | 
79  | 0  |                     WinogradKernel2x2Block2x2SetFilter4n(src, dst, size);  | 
80  | 0  |                 for (; i < size; i += 1, src += 4, dst += 1)  | 
81  | 0  |                     Base::WinogradKernel2x2Block2x2SetFilter1n(src, dst, size);  | 
82  | 0  |             }  | 
83  | 0  |         }  | 
84  |  |  | 
85  |  |         //-----------------------------------------------------------------------  | 
86  |  |  | 
87  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4Store(const __m128* src, float* dst, size_t stride)  | 
88  | 0  |         { | 
89  | 0  |             _mm_storeu_ps(dst + 0 * stride, _mm_add_ps(_mm_sub_ps(src[0], src[1]), _mm_sub_ps(src[4], src[3])));  | 
90  | 0  |             _mm_storeu_ps(dst + 1 * stride, _mm_sub_ps(src[1], src[4]));  | 
91  | 0  |             _mm_storeu_ps(dst + 2 * stride, _mm_add_ps(_mm_sub_ps(src[2], src[1]), _mm_sub_ps(src[4], src[5])));  | 
92  | 0  |             _mm_storeu_ps(dst + 3 * stride, _mm_sub_ps(src[3], src[4]));  | 
93  | 0  |             _mm_storeu_ps(dst + 4 * stride, src[4]);  | 
94  | 0  |             _mm_storeu_ps(dst + 5 * stride, _mm_sub_ps(src[5], src[4]));  | 
95  | 0  |             _mm_storeu_ps(dst + 6 * stride, _mm_add_ps(_mm_sub_ps(src[4], src[3]), _mm_sub_ps(src[6], src[7])));  | 
96  | 0  |             _mm_storeu_ps(dst + 7 * stride, _mm_sub_ps(src[7], src[4]));  | 
97  | 0  |             _mm_storeu_ps(dst + 8 * stride, _mm_add_ps(_mm_sub_ps(src[4], src[5]), _mm_sub_ps(src[8], src[7])));  | 
98  | 0  |         }  | 
99  |  |  | 
100  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcS, size_t srcC, __m128 dst[9])  | 
101  | 0  |         { | 
102  | 0  |             dst[0] = _mm_loadu_ps(src + 0 * srcS + 0 * srcC);  | 
103  | 0  |             dst[1] = _mm_loadu_ps(src + 0 * srcS + 1 * srcC);  | 
104  | 0  |             dst[2] = _mm_loadu_ps(src + 0 * srcS + 2 * srcC);  | 
105  | 0  |             dst[3] = _mm_loadu_ps(src + 1 * srcS + 0 * srcC);  | 
106  | 0  |             dst[4] = _mm_loadu_ps(src + 1 * srcS + 1 * srcC);  | 
107  | 0  |             dst[5] = _mm_loadu_ps(src + 1 * srcS + 2 * srcC);  | 
108  | 0  |             dst[6] = _mm_loadu_ps(src + 2 * srcS + 0 * srcC);  | 
109  | 0  |             dst[7] = _mm_loadu_ps(src + 2 * srcS + 1 * srcC);  | 
110  | 0  |             dst[8] = _mm_loadu_ps(src + 2 * srcS + 2 * srcC);  | 
111  | 0  |         }  | 
112  |  |  | 
113  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride)  | 
114  | 0  |         { | 
115  | 0  |             size_t srcS = srcW * srcC;  | 
116  | 0  |             size_t srcCF = AlignLo(srcC, F);  | 
117  | 0  |             for (size_t c = 0; c < srcCF; c += F)  | 
118  | 0  |             { | 
119  | 0  |                 __m128 tmp[9];  | 
120  | 0  |                 WinogradKernel2x2Block2x2SetInput4t(src + c, srcS, srcC, tmp);  | 
121  | 0  |                 WinogradKernel2x2Block2x2SetInput4Store(tmp, dst + c, dstStride);  | 
122  | 0  |             }  | 
123  | 0  |             if (srcCF < srcC)  | 
124  | 0  |             { | 
125  | 0  |                 __m128 tmp[9];  | 
126  | 0  |                 WinogradKernel2x2Block2x2SetInput4t(src + srcC - F, srcS, srcC, tmp);  | 
127  | 0  |                 WinogradKernel2x2Block2x2SetInput4Store(tmp, dst + srcC - F, dstStride);  | 
128  | 0  |             }  | 
129  | 0  |         }  | 
130  |  |  | 
131  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m128 dst[9])  | 
132  | 0  |         { | 
133  | 0  |             for (size_t i = 0; i < 9; ++i)  | 
134  | 0  |                 dst[i] = _mm_setzero_ps();  | 
135  | 0  |             for (size_t row = rowB; row < rowE; ++row)  | 
136  | 0  |                 for (size_t col = colB; col < colE; ++col)  | 
137  | 0  |                     dst[row * 3 + col] = _mm_loadu_ps(src + row * srcS + col * srcC);  | 
138  | 0  |         }  | 
139  |  |  | 
140  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetInput4t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride)  | 
141  | 0  |         { | 
142  | 0  |             size_t srcS = srcW * srcC;  | 
143  | 0  |             size_t srcCF = AlignLo(srcC, F);  | 
144  | 0  |             for (size_t c = 0; c < srcCF; c += F)  | 
145  | 0  |             { | 
146  | 0  |                 __m128 tmp[9];  | 
147  | 0  |                 WinogradKernel2x2Block2x2SetInput4t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp);  | 
148  | 0  |                 WinogradKernel2x2Block2x2SetInput4Store(tmp, dst + c, dstStride);  | 
149  | 0  |             }  | 
150  | 0  |             if (srcCF < srcC)  | 
151  | 0  |             { | 
152  | 0  |                 __m128 tmp[9];  | 
153  | 0  |                 WinogradKernel2x2Block2x2SetInput4t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp);  | 
154  | 0  |                 WinogradKernel2x2Block2x2SetInput4Store(tmp, dst + srcC - F, dstStride);  | 
155  | 0  |             }  | 
156  | 0  |         }  | 
157  |  |  | 
158  |  |         void WinogradKernel2x2Block2x2SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,  | 
159  |  |             size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans)  | 
160  | 0  |         { | 
161  | 0  |             assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1));  | 
162  | 0  |             if (trans ? (srcChannels < F) : true)  | 
163  | 0  |             { | 
164  | 0  |                 Base::WinogradKernel2x2Block2x2SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans);  | 
165  | 0  |                 return;  | 
166  | 0  |             }  | 
167  | 0  |             size_t dstH = srcHeight - 1 + padY + padH;  | 
168  | 0  |             size_t dstW = srcWidth - 1 + padX + padW;  | 
169  | 0  |             size_t dstH2 = AlignLo(dstH, 2);  | 
170  | 0  |             size_t dstW2 = AlignLo(dstW, 2);  | 
171  | 0  |             size_t noseW = Simd::Min<size_t>(3, dstW + 1);  | 
172  | 0  |             size_t noseH = Simd::Min<size_t>(3, dstH + 1);  | 
173  | 0  |             size_t startY = padY ? 2 : 0;  | 
174  | 0  |             size_t startX = padX ? 2 : 0;  | 
175  | 0  |             if (padY || padH)  | 
176  | 0  |             { | 
177  | 0  |                 if (dstH == dstH2)  | 
178  | 0  |                     dstH2 -= 2;  | 
179  | 0  |                 if (dstW == dstW2)  | 
180  | 0  |                     dstW2 -= 2;  | 
181  | 0  |                 if (padY)  | 
182  | 0  |                     src -= (srcWidth + 1) * (trans ? srcChannels : 1);  | 
183  | 0  |             }  | 
184  | 0  |             size_t tailW = dstW - dstW2 + (padW ? 0 : 1);  | 
185  | 0  |             size_t tailH = dstH - dstH2 + (padH ? 0 : 1);  | 
186  | 0  |             size_t row = 0, col = 0;  | 
187  | 0  |             if (padY)  | 
188  | 0  |             { | 
189  | 0  |                 if (padX)  | 
190  | 0  |                     WinogradKernel2x2Block2x2SetInput4t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels;  | 
191  | 0  |                 for (col = startX; col < dstW2; col += 2)  | 
192  | 0  |                     WinogradKernel2x2Block2x2SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 3, dst, dstStride), dst += srcChannels;  | 
193  | 0  |                 if (col < dstW)  | 
194  | 0  |                     WinogradKernel2x2Block2x2SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels;  | 
195  | 0  |             }  | 
196  | 0  |             for (row = startY; row < dstH2; row += 2)  | 
197  | 0  |             { | 
198  | 0  |                 if (padX)  | 
199  | 0  |                     WinogradKernel2x2Block2x2SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 3, 1, noseW, dst, dstStride), dst += srcChannels;  | 
200  | 0  |                 for (col = startX; col < dstW2; col += 2)  | 
201  | 0  |                     WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels;  | 
202  | 0  |                 if (col < dstW)  | 
203  | 0  |                     WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 3, 0, tailW, dst, dstStride), dst += srcChannels;  | 
204  | 0  |             }  | 
205  | 0  |             if (row < dstH)  | 
206  | 0  |             { | 
207  | 0  |                 if (padX)  | 
208  | 0  |                     WinogradKernel2x2Block2x2SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels;  | 
209  | 0  |                 for (col = startX; col < dstW2; col += 2)  | 
210  | 0  |                     WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 3, dst, dstStride), dst += srcChannels;  | 
211  | 0  |                 if (col < dstW)  | 
212  | 0  |                     WinogradKernel2x2Block2x2SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels;  | 
213  | 0  |             }  | 
214  | 0  |         }  | 
215  |  |  | 
216  |  |         //-----------------------------------------------------------------------  | 
217  |  |  | 
218  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputLoad9(const float* src, size_t stride, __m128* dst)  | 
219  | 0  |         { | 
220  | 0  |             __m128 s[9];  | 
221  | 0  |             s[0] = _mm_loadu_ps(src + 0 * stride);  | 
222  | 0  |             s[1] = _mm_loadu_ps(src + 1 * stride);  | 
223  | 0  |             s[2] = _mm_loadu_ps(src + 2 * stride);  | 
224  | 0  |             s[3] = _mm_loadu_ps(src + 3 * stride);  | 
225  | 0  |             s[4] = _mm_loadu_ps(src + 4 * stride);  | 
226  | 0  |             s[5] = _mm_loadu_ps(src + 5 * stride);  | 
227  | 0  |             s[6] = _mm_loadu_ps(src + 6 * stride);  | 
228  | 0  |             s[7] = _mm_loadu_ps(src + 7 * stride);  | 
229  | 0  |             s[8] = _mm_loadu_ps(src + 8 * stride);  | 
230  | 0  |             dst[0] = _mm_add_ps(_mm_add_ps(s[0], s[1]), _mm_add_ps(s[3], s[4]));  | 
231  | 0  |             dst[1] = _mm_add_ps(_mm_add_ps(s[1], s[2]), _mm_add_ps(s[4], s[5]));  | 
232  | 0  |             dst[2] = _mm_add_ps(_mm_add_ps(s[3], s[4]), _mm_add_ps(s[6], s[7]));  | 
233  | 0  |             dst[3] = _mm_add_ps(_mm_add_ps(s[4], s[5]), _mm_add_ps(s[7], s[8]));  | 
234  | 0  |         }  | 
235  |  |  | 
236  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputStore4(const __m128 src[4], float* dst, size_t dstS, size_t dstC)  | 
237  | 0  |         { | 
238  | 0  |             _mm_storeu_ps(dst + 0 * dstS + 0 * dstC, src[0]);  | 
239  | 0  |             _mm_storeu_ps(dst + 0 * dstS + 1 * dstC, src[1]);  | 
240  | 0  |             _mm_storeu_ps(dst + 1 * dstS + 0 * dstC, src[2]);  | 
241  | 0  |             _mm_storeu_ps(dst + 1 * dstS + 1 * dstC, src[3]);  | 
242  | 0  |         }  | 
243  |  |  | 
244  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC)  | 
245  | 0  |         { | 
246  | 0  |             size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F);  | 
247  | 0  |             for (size_t d = 0; d < dstCF; d += F)  | 
248  | 0  |             { | 
249  | 0  |                 __m128 tmp[4];  | 
250  | 0  |                 WinogradKernel2x2Block2x2SetOutputLoad9(src + d, srcStride, tmp);  | 
251  | 0  |                 WinogradKernel2x2Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC);  | 
252  | 0  |             }  | 
253  | 0  |             if (dstCF < dstC)  | 
254  | 0  |             { | 
255  | 0  |                 __m128 tmp[4];  | 
256  | 0  |                 WinogradKernel2x2Block2x2SetOutputLoad9(src + dstC - F, srcStride, tmp);  | 
257  | 0  |                 WinogradKernel2x2Block2x2SetOutputStore4(tmp, dst + dstC - F, dstS, dstC);  | 
258  | 0  |             }  | 
259  | 0  |         }  | 
260  |  |  | 
261  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetOutputStore4(const __m128 src[4], float* dst, size_t dstS, size_t dstC, size_t rowE, size_t colE)  | 
262  | 0  |         { | 
263  | 0  |             for (size_t row = 0; row < rowE; ++row)  | 
264  | 0  |                 for (size_t col = 0; col < colE; ++col)  | 
265  | 0  |                     _mm_storeu_ps(dst + row * dstS + col * dstC, src[row * 2 + col]);  | 
266  | 0  |         }  | 
267  |  |  | 
268  |  |         SIMD_INLINE void WinogradKernel2x2Block2x2SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE)  | 
269  | 0  |         { | 
270  | 0  |             size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F);  | 
271  | 0  |             for (size_t d = 0; d < dstCF; d += F)  | 
272  | 0  |             { | 
273  | 0  |                 __m128 tmp[4];  | 
274  | 0  |                 WinogradKernel2x2Block2x2SetOutputLoad9(src + d, srcStride, tmp);  | 
275  | 0  |                 WinogradKernel2x2Block2x2SetOutputStore4(tmp, dst + d, dstS, dstC, rowE, colE);  | 
276  | 0  |             }  | 
277  | 0  |             if (dstCF < dstC)  | 
278  | 0  |             { | 
279  | 0  |                 __m128 tmp[4];  | 
280  | 0  |                 WinogradKernel2x2Block2x2SetOutputLoad9(src + dstC - F, srcStride, tmp);  | 
281  | 0  |                 WinogradKernel2x2Block2x2SetOutputStore4(tmp, dst + dstC - F, dstS, dstC, rowE, colE);  | 
282  | 0  |             }  | 
283  | 0  |         }  | 
284  |  |  | 
285  |  |         void WinogradKernel2x2Block2x2SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans)  | 
286  | 0  |         { | 
287  | 0  |             if (trans ? (dstChannels < F) : true)  | 
288  | 0  |             { | 
289  | 0  |                 Base::WinogradKernel2x2Block2x2SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans);  | 
290  | 0  |                 return;  | 
291  | 0  |             }  | 
292  | 0  |             size_t tileH = (dstHeight + 1) / 2;  | 
293  | 0  |             size_t tileW = (dstWidth + 1) / 2;  | 
294  | 0  |             size_t dstH2 = AlignLo(dstHeight, 2);  | 
295  | 0  |             size_t dstW2 = AlignLo(dstWidth, 2);  | 
296  | 0  |             size_t row, col;  | 
297  | 0  |             for (row = 0; row < dstH2; row += 2)  | 
298  | 0  |             { | 
299  | 0  |                 for (col = 0; col < dstW2; col += 2)  | 
300  | 0  |                     WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels;  | 
301  | 0  |                 if (col < dstWidth)  | 
302  | 0  |                     WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 2, dstWidth - col), src += dstChannels;  | 
303  | 0  |             }  | 
304  | 0  |             if (row < dstHeight)  | 
305  | 0  |             { | 
306  | 0  |                 for (col = 0; col < dstW2; col += 2)  | 
307  | 0  |                     WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 2), src += dstChannels;  | 
308  | 0  |                 if (col < dstWidth)  | 
309  | 0  |                     WinogradKernel2x2Block2x2SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels;  | 
310  | 0  |             }  | 
311  | 0  |         }  | 
312  |  |  | 
313  |  |         //-----------------------------------------------------------------------  | 
314  |  |  | 
315  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetFilterRow(const __m128* t, float* dst, size_t stride)  | 
316  | 0  |         { | 
317  | 0  |             const __m128 r2 = _mm_set1_ps(1.0f / 2.0f);  | 
318  | 0  |             const __m128 r3 = _mm_set1_ps(1.0f / 3.0f);  | 
319  | 0  |             const __m128 r6 = _mm_set1_ps(1.0f / 6.0f);  | 
320  | 0  |             const __m128 mr2 = _mm_set1_ps(-1.0f / 2.0f);  | 
321  |  | 
  | 
322  | 0  |             _mm_storeu_ps(dst + 0 * stride, _mm_mul_ps(r2, t[0]));  | 
323  | 0  |             _mm_storeu_ps(dst + 1 * stride, _mm_mul_ps(mr2, _mm_add_ps(t[0], t[1])));  | 
324  | 0  |             _mm_storeu_ps(dst + 2 * stride, _mm_mul_ps(r6, _mm_sub_ps(t[1], t[0])));  | 
325  | 0  |             _mm_storeu_ps(dst + 3 * stride, _mm_add_ps(_mm_mul_ps(r6, t[0]), _mm_mul_ps(r3, t[1])));  | 
326  | 0  |             _mm_storeu_ps(dst + 4 * stride, t[1]);  | 
327  | 0  |         }  | 
328  |  |  | 
329  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter(const __m128 src[4], float* dst, size_t stride)  | 
330  | 0  |         { | 
331  | 0  |             const __m128 r2 = _mm_set1_ps(1.0f / 2.0f);  | 
332  | 0  |             const __m128 r3 = _mm_set1_ps(1.0f / 3.0f);  | 
333  | 0  |             const __m128 r6 = _mm_set1_ps(1.0f / 6.0f);  | 
334  | 0  |             const __m128 mr2 = _mm_set1_ps(-1.0f / 2.0f);  | 
335  |  | 
  | 
336  | 0  |             __m128 t[2];  | 
337  | 0  |             t[0] = _mm_mul_ps(r2, src[0]);  | 
338  | 0  |             t[1] = _mm_mul_ps(r2, src[1]);  | 
339  | 0  |             WinogradKernel2x2Block4x4SetFilterRow(t, dst + 0 * stride, stride);  | 
340  |  | 
  | 
341  | 0  |             t[0] = _mm_mul_ps(mr2, _mm_add_ps(src[0], src[2]));  | 
342  | 0  |             t[1] = _mm_mul_ps(mr2, _mm_add_ps(src[1], src[3]));  | 
343  | 0  |             WinogradKernel2x2Block4x4SetFilterRow(t, dst + 5 * stride, stride);  | 
344  |  | 
  | 
345  | 0  |             t[0] = _mm_mul_ps(r6, _mm_sub_ps(src[2], src[0]));  | 
346  | 0  |             t[1] = _mm_mul_ps(r6, _mm_sub_ps(src[3], src[1]));  | 
347  | 0  |             WinogradKernel2x2Block4x4SetFilterRow(t, dst + 10 * stride, stride);  | 
348  |  | 
  | 
349  | 0  |             t[0] = _mm_add_ps(_mm_mul_ps(r6, src[0]), _mm_mul_ps(r3, src[2]));  | 
350  | 0  |             t[1] = _mm_add_ps(_mm_mul_ps(r6, src[1]), _mm_mul_ps(r3, src[3]));  | 
351  | 0  |             WinogradKernel2x2Block4x4SetFilterRow(t, dst + 15 * stride, stride);  | 
352  |  | 
  | 
353  | 0  |             t[0] = src[2];  | 
354  | 0  |             t[1] = src[3];  | 
355  | 0  |             WinogradKernel2x2Block4x4SetFilterRow(t, dst + 20 * stride, stride);  | 
356  | 0  |         }  | 
357  |  |  | 
358  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter4n(const float* src, float* dst, size_t stride)  | 
359  | 0  |         { | 
360  | 0  |             __m128 _src[4];  | 
361  | 0  |             Load4(src + 0, 4, _src + 0);  | 
362  | 0  |             WinogradKernel2x2Block4x4SetFilter(_src, dst, stride);  | 
363  | 0  |         }  | 
364  |  |  | 
365  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetFilter4t(const float* src, float* dst, size_t stride)  | 
366  | 0  |         { | 
367  | 0  |             __m128 _src[4];  | 
368  | 0  |             _src[0] = _mm_loadu_ps(src + 0 * stride);  | 
369  | 0  |             _src[1] = _mm_loadu_ps(src + 1 * stride);  | 
370  | 0  |             _src[2] = _mm_loadu_ps(src + 2 * stride);  | 
371  | 0  |             _src[3] = _mm_loadu_ps(src + 3 * stride);  | 
372  | 0  |             WinogradKernel2x2Block4x4SetFilter(_src, dst, stride);  | 
373  | 0  |         }  | 
374  |  |  | 
375  |  |         void WinogradKernel2x2Block4x4SetFilter(const float* src, size_t size, float* dst, SimdBool trans)  | 
376  | 0  |         { | 
377  | 0  |             size_t size4 = AlignLo(size, 4), i = 0;  | 
378  | 0  |             if (trans)  | 
379  | 0  |             { | 
380  | 0  |                 for (; i < size4; i += 4)  | 
381  | 0  |                     WinogradKernel2x2Block4x4SetFilter4t(src + i, dst + i, size);  | 
382  | 0  |                 for (; i < size; i += 1)  | 
383  | 0  |                     Base::WinogradKernel2x2Block4x4SetFilter1t(src + i, dst + i, size);  | 
384  | 0  |             }  | 
385  | 0  |             else  | 
386  | 0  |             { | 
387  | 0  |                 for (; i < size4; i += 4, src += 16, dst += 4)  | 
388  | 0  |                     WinogradKernel2x2Block4x4SetFilter4n(src, dst, size);  | 
389  | 0  |                 for (; i < size; i += 1, src += 4, dst += 1)  | 
390  | 0  |                     Base::WinogradKernel2x2Block4x4SetFilter1n(src, dst, size);  | 
391  | 0  |             }  | 
392  | 0  |         }  | 
393  |  |  | 
394  |  |         //-----------------------------------------------------------------------  | 
395  |  |  | 
396  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetInputStoreRow(const __m128 tmp[5], float* dst, size_t stride)  | 
397  | 0  |         { | 
398  | 0  |             const __m128 _2 = _mm_set1_ps(2.0f);  | 
399  | 0  |             const __m128 _3 = _mm_set1_ps(3.0f);  | 
400  | 0  |             _mm_storeu_ps(dst + 0 * stride, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, tmp[0]), tmp[1]), _mm_sub_ps(tmp[3], _mm_mul_ps(_2, tmp[2]))));  | 
401  | 0  |             _mm_storeu_ps(dst + 1 * stride, _mm_sub_ps(tmp[3], _mm_add_ps(_mm_mul_ps(_2, tmp[1]), tmp[2])));  | 
402  | 0  |             _mm_storeu_ps(dst + 2 * stride, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, tmp[1]), _mm_mul_ps(_3, tmp[2])), tmp[3]));  | 
403  | 0  |             _mm_storeu_ps(dst + 3 * stride, _mm_sub_ps(tmp[3], tmp[1]));  | 
404  | 0  |             _mm_storeu_ps(dst + 4 * stride, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, tmp[1]), tmp[2]), _mm_sub_ps(tmp[4], _mm_mul_ps(_2, tmp[3]))));  | 
405  | 0  |         }  | 
406  |  |  | 
407  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetInputStore(const __m128* src, float* dst, size_t stride)  | 
408  | 0  |         { | 
409  | 0  |             const __m128 _2 = _mm_set1_ps(2.0f);  | 
410  | 0  |             const __m128 _3 = _mm_set1_ps(3.0f);  | 
411  | 0  |             __m128 tmp[5];  | 
412  | 0  |             tmp[0] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[0]), src[5]), _mm_sub_ps(src[15], _mm_mul_ps(_2, src[10])));  | 
413  | 0  |             tmp[1] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[1]), src[6]), _mm_sub_ps(src[16], _mm_mul_ps(_2, src[11])));  | 
414  | 0  |             tmp[2] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[2]), src[7]), _mm_sub_ps(src[17], _mm_mul_ps(_2, src[12])));  | 
415  | 0  |             tmp[3] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[3]), src[8]), _mm_sub_ps(src[18], _mm_mul_ps(_2, src[13])));  | 
416  | 0  |             tmp[4] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[4]), src[9]), _mm_sub_ps(src[19], _mm_mul_ps(_2, src[14])));  | 
417  | 0  |             WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 0 * stride, stride);  | 
418  |  | 
  | 
419  | 0  |             tmp[0] = _mm_sub_ps(src[15], _mm_add_ps(_mm_mul_ps(_2, src[5]), src[10]));  | 
420  | 0  |             tmp[1] = _mm_sub_ps(src[16], _mm_add_ps(_mm_mul_ps(_2, src[6]), src[11]));  | 
421  | 0  |             tmp[2] = _mm_sub_ps(src[17], _mm_add_ps(_mm_mul_ps(_2, src[7]), src[12]));  | 
422  | 0  |             tmp[3] = _mm_sub_ps(src[18], _mm_add_ps(_mm_mul_ps(_2, src[8]), src[13]));  | 
423  | 0  |             tmp[4] = _mm_sub_ps(src[19], _mm_add_ps(_mm_mul_ps(_2, src[9]), src[14]));  | 
424  | 0  |             WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 5 * stride, stride);  | 
425  |  | 
  | 
426  | 0  |             tmp[0] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[5]), _mm_mul_ps(_3, src[10])), src[15]);  | 
427  | 0  |             tmp[1] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[6]), _mm_mul_ps(_3, src[11])), src[16]);  | 
428  | 0  |             tmp[2] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[7]), _mm_mul_ps(_3, src[12])), src[17]);  | 
429  | 0  |             tmp[3] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[8]), _mm_mul_ps(_3, src[13])), src[18]);  | 
430  | 0  |             tmp[4] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[9]), _mm_mul_ps(_3, src[14])), src[19]);  | 
431  | 0  |             WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 10 * stride, stride);  | 
432  |  | 
  | 
433  | 0  |             tmp[0] = _mm_sub_ps(src[15], src[5]);  | 
434  | 0  |             tmp[1] = _mm_sub_ps(src[16], src[6]);  | 
435  | 0  |             tmp[2] = _mm_sub_ps(src[17], src[7]);  | 
436  | 0  |             tmp[3] = _mm_sub_ps(src[18], src[8]);  | 
437  | 0  |             tmp[4] = _mm_sub_ps(src[19], src[9]);  | 
438  | 0  |             WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 15 * stride, stride);  | 
439  |  | 
  | 
440  | 0  |             tmp[0] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[5]), src[10]), _mm_sub_ps(src[20], _mm_mul_ps(_2, src[15])));  | 
441  | 0  |             tmp[1] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[6]), src[11]), _mm_sub_ps(src[21], _mm_mul_ps(_2, src[16])));  | 
442  | 0  |             tmp[2] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[7]), src[12]), _mm_sub_ps(src[22], _mm_mul_ps(_2, src[17])));  | 
443  | 0  |             tmp[3] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[8]), src[13]), _mm_sub_ps(src[23], _mm_mul_ps(_2, src[18])));  | 
444  | 0  |             tmp[4] = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(_2, src[9]), src[14]), _mm_sub_ps(src[24], _mm_mul_ps(_2, src[19])));  | 
445  | 0  |             WinogradKernel2x2Block4x4SetInputStoreRow(tmp, dst + 20 * stride, stride);  | 
446  | 0  |         }  | 
447  |  |  | 
448  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcS, size_t srcC, __m128 dst[25])  | 
449  | 0  |         { | 
450  | 0  |             dst[0] = _mm_loadu_ps(src + 0 * srcS + 0 * srcC);  | 
451  | 0  |             dst[1] = _mm_loadu_ps(src + 0 * srcS + 1 * srcC);  | 
452  | 0  |             dst[2] = _mm_loadu_ps(src + 0 * srcS + 2 * srcC);  | 
453  | 0  |             dst[3] = _mm_loadu_ps(src + 0 * srcS + 3 * srcC);  | 
454  | 0  |             dst[4] = _mm_loadu_ps(src + 0 * srcS + 4 * srcC);  | 
455  | 0  |             dst[5] = _mm_loadu_ps(src + 1 * srcS + 0 * srcC);  | 
456  | 0  |             dst[6] = _mm_loadu_ps(src + 1 * srcS + 1 * srcC);  | 
457  | 0  |             dst[7] = _mm_loadu_ps(src + 1 * srcS + 2 * srcC);  | 
458  | 0  |             dst[8] = _mm_loadu_ps(src + 1 * srcS + 3 * srcC);  | 
459  | 0  |             dst[9] = _mm_loadu_ps(src + 1 * srcS + 4 * srcC);  | 
460  | 0  |             dst[10] = _mm_loadu_ps(src + 2 * srcS + 0 * srcC);  | 
461  | 0  |             dst[11] = _mm_loadu_ps(src + 2 * srcS + 1 * srcC);  | 
462  | 0  |             dst[12] = _mm_loadu_ps(src + 2 * srcS + 2 * srcC);  | 
463  | 0  |             dst[13] = _mm_loadu_ps(src + 2 * srcS + 3 * srcC);  | 
464  | 0  |             dst[14] = _mm_loadu_ps(src + 2 * srcS + 4 * srcC);  | 
465  | 0  |             dst[15] = _mm_loadu_ps(src + 3 * srcS + 0 * srcC);  | 
466  | 0  |             dst[16] = _mm_loadu_ps(src + 3 * srcS + 1 * srcC);  | 
467  | 0  |             dst[17] = _mm_loadu_ps(src + 3 * srcS + 2 * srcC);  | 
468  | 0  |             dst[18] = _mm_loadu_ps(src + 3 * srcS + 3 * srcC);  | 
469  | 0  |             dst[19] = _mm_loadu_ps(src + 3 * srcS + 4 * srcC);  | 
470  | 0  |             dst[20] = _mm_loadu_ps(src + 4 * srcS + 0 * srcC);  | 
471  | 0  |             dst[21] = _mm_loadu_ps(src + 4 * srcS + 1 * srcC);  | 
472  | 0  |             dst[22] = _mm_loadu_ps(src + 4 * srcS + 2 * srcC);  | 
473  | 0  |             dst[23] = _mm_loadu_ps(src + 4 * srcS + 3 * srcC);  | 
474  | 0  |             dst[24] = _mm_loadu_ps(src + 4 * srcS + 4 * srcC);  | 
475  | 0  |         }  | 
476  |  |  | 
477  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcW, size_t srcC, float* dst, size_t dstStride)  | 
478  | 0  |         { | 
479  | 0  |             size_t srcS = srcW * srcC;  | 
480  | 0  |             size_t srcCF = AlignLo(srcC, F);  | 
481  | 0  |             for (size_t c = 0; c < srcCF; c += F)  | 
482  | 0  |             { | 
483  | 0  |                 __m128 tmp[25];  | 
484  | 0  |                 WinogradKernel2x2Block4x4SetInput4t(src + c, srcS, srcC, tmp);  | 
485  | 0  |                 WinogradKernel2x2Block4x4SetInputStore(tmp, dst + c, dstStride);  | 
486  | 0  |             }  | 
487  | 0  |             if (srcCF < srcC)  | 
488  | 0  |             { | 
489  | 0  |                 __m128 tmp[25];  | 
490  | 0  |                 WinogradKernel2x2Block4x4SetInput4t(src + srcC - F, srcS, srcC, tmp);  | 
491  | 0  |                 WinogradKernel2x2Block4x4SetInputStore(tmp, dst + srcC - F, dstStride);  | 
492  | 0  |             }  | 
493  | 0  |         }  | 
494  |  |  | 
495  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcS, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, __m128 dst[25])  | 
496  | 0  |         { | 
497  | 0  |             for (size_t i = 0; i < 25; ++i)  | 
498  | 0  |                 dst[i] = _mm_setzero_ps();  | 
499  | 0  |             for (size_t row = rowB; row < rowE; ++row)  | 
500  | 0  |                 for (size_t col = colB; col < colE; ++col)  | 
501  | 0  |                     dst[row * 5 + col] = _mm_loadu_ps(src + row * srcS + col * srcC);  | 
502  | 0  |         }  | 
503  |  |  | 
504  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetInput4t(const float* src, size_t srcW, size_t srcC, size_t rowB, size_t rowE, size_t colB, size_t colE, float* dst, size_t dstStride)  | 
505  | 0  |         { | 
506  | 0  |             size_t srcS = srcW * srcC;  | 
507  | 0  |             size_t srcCF = AlignLo(srcC, F);  | 
508  | 0  |             for (size_t c = 0; c < srcCF; c += F)  | 
509  | 0  |             { | 
510  | 0  |                 __m128 tmp[25];  | 
511  | 0  |                 WinogradKernel2x2Block4x4SetInput4t(src + c, srcS, srcC, rowB, rowE, colB, colE, tmp);  | 
512  | 0  |                 WinogradKernel2x2Block4x4SetInputStore(tmp, dst + c, dstStride);  | 
513  | 0  |             }  | 
514  | 0  |             if (srcCF < srcC)  | 
515  | 0  |             { | 
516  | 0  |                 __m128 tmp[25];  | 
517  | 0  |                 WinogradKernel2x2Block4x4SetInput4t(src + srcC - F, srcS, srcC, rowB, rowE, colB, colE, tmp);  | 
518  | 0  |                 WinogradKernel2x2Block4x4SetInputStore(tmp, dst + srcC - F, dstStride);  | 
519  | 0  |             }  | 
520  | 0  |         }  | 
521  |  |  | 
522  |  |         void WinogradKernel2x2Block4x4SetInput(const float* src, size_t srcChannels, size_t srcHeight, size_t srcWidth,  | 
523  |  |             size_t padY, size_t padX, size_t padH, size_t padW, float* dst, size_t dstStride, SimdBool trans)  | 
524  | 0  |         { | 
525  | 0  |             assert(padY == padX && padW == padH && (padY + padH == 0 || padY + padH == 1));  | 
526  | 0  |             if (trans ? (srcChannels < F) : true)  | 
527  | 0  |             { | 
528  | 0  |                 Base::WinogradKernel2x2Block4x4SetInput(src, srcChannels, srcHeight, srcWidth, padY, padX, padH, padW, dst, dstStride, trans);  | 
529  | 0  |                 return;  | 
530  | 0  |             }  | 
531  | 0  |             size_t dstH = srcHeight - 1 + padY + padH;  | 
532  | 0  |             size_t dstW = srcWidth - 1 + padX + padW;  | 
533  | 0  |             size_t dstH4 = AlignLo(dstH, 4);  | 
534  | 0  |             size_t dstW4 = AlignLo(dstW, 4);  | 
535  | 0  |             size_t noseW = Simd::Min<size_t>(5, dstW + 1);  | 
536  | 0  |             size_t noseH = Simd::Min<size_t>(5, dstH + 1);  | 
537  | 0  |             size_t startY = padY ? 4 : 0;  | 
538  | 0  |             size_t startX = padX ? 4 : 0;  | 
539  | 0  |             if (padY || padH)  | 
540  | 0  |             { | 
541  | 0  |                 if (dstH == dstH4)  | 
542  | 0  |                     dstH4 -= 4;  | 
543  | 0  |                 if (dstW == dstW4)  | 
544  | 0  |                     dstW4 -= 4;  | 
545  | 0  |                 if (padY)  | 
546  | 0  |                     src -= (srcWidth + 1) * (trans ? srcChannels : 1);  | 
547  | 0  |             }  | 
548  | 0  |             size_t tailW = dstW - dstW4 + (padW ? 0 : 1);  | 
549  | 0  |             size_t tailH = dstH - dstH4 + (padH ? 0 : 1);  | 
550  | 0  |             size_t row = 0, col = 0;  | 
551  | 0  |             if (padY)  | 
552  | 0  |             { | 
553  | 0  |                 if (padX)  | 
554  | 0  |                     WinogradKernel2x2Block4x4SetInput4t(src, srcWidth, srcChannels, 1, noseH, 1, noseW, dst, dstStride), dst += srcChannels;  | 
555  | 0  |                 for (col = startX; col < dstW4; col += 4)  | 
556  | 0  |                     WinogradKernel2x2Block4x4SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, 5, dst, dstStride), dst += srcChannels;  | 
557  | 0  |                 if (col < dstW)  | 
558  | 0  |                     WinogradKernel2x2Block4x4SetInput4t(src + col * srcChannels, srcWidth, srcChannels, 1, noseH, 0, tailW, dst, dstStride), dst += srcChannels;  | 
559  | 0  |             }  | 
560  | 0  |             for (row = startY; row < dstH4; row += 4)  | 
561  | 0  |             { | 
562  | 0  |                 if (padX)  | 
563  | 0  |                     WinogradKernel2x2Block4x4SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, 5, 1, noseW, dst, dstStride), dst += srcChannels;  | 
564  | 0  |                 for (col = startX; col < dstW4; col += 4)  | 
565  | 0  |                     WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, dst, dstStride), dst += srcChannels;  | 
566  | 0  |                 if (col < dstW)  | 
567  | 0  |                     WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, 5, 0, tailW, dst, dstStride), dst += srcChannels;  | 
568  | 0  |             }  | 
569  | 0  |             if (row < dstH)  | 
570  | 0  |             { | 
571  | 0  |                 if (padX)  | 
572  | 0  |                     WinogradKernel2x2Block4x4SetInput4t(src + row * srcWidth * srcChannels, srcWidth, srcChannels, 0, tailH, 1, noseW, dst, dstStride), dst += srcChannels;  | 
573  | 0  |                 for (col = startX; col < dstW4; col += 4)  | 
574  | 0  |                     WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, 5, dst, dstStride), dst += srcChannels;  | 
575  | 0  |                 if (col < dstW)  | 
576  | 0  |                     WinogradKernel2x2Block4x4SetInput4t(src + (row * srcWidth + col) * srcChannels, srcWidth, srcChannels, 0, tailH, 0, tailW, dst, dstStride), dst += srcChannels;  | 
577  | 0  |             }  | 
578  | 0  |         }  | 
579  |  |  | 
580  |  |         //-----------------------------------------------------------------------  | 
581  |  |  | 
582  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputGetRow(const __m128 * s, __m128 * d)  | 
583  | 0  |         { | 
584  | 0  |             const __m128 _2 = _mm_set1_ps(2.0f);  | 
585  | 0  |             const __m128 _4 = _mm_set1_ps(4.0f);  | 
586  | 0  |             const __m128 _8 = _mm_set1_ps(8.0f);  | 
587  | 0  |             d[0] = _mm_add_ps(_mm_add_ps(s[0], s[1]), _mm_add_ps(s[2], s[3]));  | 
588  | 0  |             d[1] = _mm_add_ps(_mm_sub_ps(s[1], s[2]), _mm_mul_ps(_2, s[3]));  | 
589  | 0  |             d[2] = _mm_add_ps(_mm_add_ps(s[1], s[2]), _mm_mul_ps(_4, s[3]));  | 
590  | 0  |             d[3] = _mm_add_ps(_mm_sub_ps(s[1], s[2]), _mm_add_ps(_mm_mul_ps(_8, s[3]), s[4]));  | 
591  | 0  |         }  | 
592  |  |  | 
593  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputLoad25(const float* src, size_t stride, __m128* dst)  | 
594  | 0  |         { | 
595  | 0  |             __m128 s[25];  | 
596  | 0  |             s[0] = _mm_loadu_ps(src + 0 * stride);  | 
597  | 0  |             s[1] = _mm_loadu_ps(src + 1 * stride);  | 
598  | 0  |             s[2] = _mm_loadu_ps(src + 2 * stride);  | 
599  | 0  |             s[3] = _mm_loadu_ps(src + 3 * stride);  | 
600  | 0  |             s[4] = _mm_loadu_ps(src + 4 * stride);  | 
601  | 0  |             s[5] = _mm_loadu_ps(src + 5 * stride);  | 
602  | 0  |             s[6] = _mm_loadu_ps(src + 6 * stride);  | 
603  | 0  |             s[7] = _mm_loadu_ps(src + 7 * stride);  | 
604  | 0  |             s[8] = _mm_loadu_ps(src + 8 * stride);  | 
605  | 0  |             s[9] = _mm_loadu_ps(src + 9 * stride);  | 
606  | 0  |             s[10] = _mm_loadu_ps(src + 10 * stride);  | 
607  | 0  |             s[11] = _mm_loadu_ps(src + 11 * stride);  | 
608  | 0  |             s[12] = _mm_loadu_ps(src + 12 * stride);  | 
609  | 0  |             s[13] = _mm_loadu_ps(src + 13 * stride);  | 
610  | 0  |             s[14] = _mm_loadu_ps(src + 14 * stride);  | 
611  | 0  |             s[15] = _mm_loadu_ps(src + 15 * stride);  | 
612  | 0  |             s[16] = _mm_loadu_ps(src + 16 * stride);  | 
613  | 0  |             s[17] = _mm_loadu_ps(src + 17 * stride);  | 
614  | 0  |             s[18] = _mm_loadu_ps(src + 18 * stride);  | 
615  | 0  |             s[19] = _mm_loadu_ps(src + 19 * stride);  | 
616  | 0  |             s[20] = _mm_loadu_ps(src + 20 * stride);  | 
617  | 0  |             s[21] = _mm_loadu_ps(src + 21 * stride);  | 
618  | 0  |             s[22] = _mm_loadu_ps(src + 22 * stride);  | 
619  | 0  |             s[23] = _mm_loadu_ps(src + 23 * stride);  | 
620  | 0  |             s[24] = _mm_loadu_ps(src + 24 * stride);  | 
621  |  | 
  | 
622  | 0  |             const __m128 _2 = _mm_set1_ps(2.0f);  | 
623  | 0  |             const __m128 _4 = _mm_set1_ps(4.0f);  | 
624  | 0  |             const __m128 _8 = _mm_set1_ps(8.0f);  | 
625  | 0  |             __m128 t[5];  | 
626  | 0  |             t[0] = _mm_add_ps(_mm_add_ps(s[0], s[5]), _mm_add_ps(s[10], s[15]));  | 
627  | 0  |             t[1] = _mm_add_ps(_mm_add_ps(s[1], s[6]), _mm_add_ps(s[11], s[16]));  | 
628  | 0  |             t[2] = _mm_add_ps(_mm_add_ps(s[2], s[7]), _mm_add_ps(s[12], s[17]));  | 
629  | 0  |             t[3] = _mm_add_ps(_mm_add_ps(s[3], s[8]), _mm_add_ps(s[13], s[18]));  | 
630  | 0  |             t[4] = _mm_add_ps(_mm_add_ps(s[4], s[9]), _mm_add_ps(s[14], s[19]));  | 
631  | 0  |             WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 0);  | 
632  |  | 
  | 
633  | 0  |             t[0] = _mm_add_ps(_mm_sub_ps(s[5], s[10]), _mm_mul_ps(_2, s[15]));  | 
634  | 0  |             t[1] = _mm_add_ps(_mm_sub_ps(s[6], s[11]), _mm_mul_ps(_2, s[16]));  | 
635  | 0  |             t[2] = _mm_add_ps(_mm_sub_ps(s[7], s[12]), _mm_mul_ps(_2, s[17]));  | 
636  | 0  |             t[3] = _mm_add_ps(_mm_sub_ps(s[8], s[13]), _mm_mul_ps(_2, s[18]));  | 
637  | 0  |             t[4] = _mm_add_ps(_mm_sub_ps(s[9], s[14]), _mm_mul_ps(_2, s[19]));  | 
638  | 0  |             WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 4);  | 
639  |  | 
  | 
640  | 0  |             t[0] = _mm_add_ps(_mm_add_ps(s[5], s[10]), _mm_mul_ps(_4, s[15]));  | 
641  | 0  |             t[1] = _mm_add_ps(_mm_add_ps(s[6], s[11]), _mm_mul_ps(_4, s[16]));  | 
642  | 0  |             t[2] = _mm_add_ps(_mm_add_ps(s[7], s[12]), _mm_mul_ps(_4, s[17]));  | 
643  | 0  |             t[3] = _mm_add_ps(_mm_add_ps(s[8], s[13]), _mm_mul_ps(_4, s[18]));  | 
644  | 0  |             t[4] = _mm_add_ps(_mm_add_ps(s[9], s[14]), _mm_mul_ps(_4, s[19]));  | 
645  | 0  |             WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 8);  | 
646  |  | 
  | 
647  | 0  |             t[0] = _mm_add_ps(_mm_sub_ps(s[5], s[10]), _mm_add_ps(_mm_mul_ps(_8, s[15]), s[20]));  | 
648  | 0  |             t[1] = _mm_add_ps(_mm_sub_ps(s[6], s[11]), _mm_add_ps(_mm_mul_ps(_8, s[16]), s[21]));  | 
649  | 0  |             t[2] = _mm_add_ps(_mm_sub_ps(s[7], s[12]), _mm_add_ps(_mm_mul_ps(_8, s[17]), s[22]));  | 
650  | 0  |             t[3] = _mm_add_ps(_mm_sub_ps(s[8], s[13]), _mm_add_ps(_mm_mul_ps(_8, s[18]), s[23]));  | 
651  | 0  |             t[4] = _mm_add_ps(_mm_sub_ps(s[9], s[14]), _mm_add_ps(_mm_mul_ps(_8, s[19]), s[24]));  | 
652  | 0  |             WinogradKernel2x2Block4x4SetOutputGetRow(t, dst + 12);  | 
653  | 0  |         }  | 
654  |  |  | 
655  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputStore16(const __m128 src[16], float* dst, size_t dstS, size_t dstC)  | 
656  | 0  |         { | 
657  | 0  |             _mm_storeu_ps(dst + 0 * dstS + 0 * dstC, src[0]);  | 
658  | 0  |             _mm_storeu_ps(dst + 0 * dstS + 1 * dstC, src[1]);  | 
659  | 0  |             _mm_storeu_ps(dst + 0 * dstS + 2 * dstC, src[2]);  | 
660  | 0  |             _mm_storeu_ps(dst + 0 * dstS + 3 * dstC, src[3]);  | 
661  | 0  |             _mm_storeu_ps(dst + 1 * dstS + 0 * dstC, src[4]);  | 
662  | 0  |             _mm_storeu_ps(dst + 1 * dstS + 1 * dstC, src[5]);  | 
663  | 0  |             _mm_storeu_ps(dst + 1 * dstS + 2 * dstC, src[6]);  | 
664  | 0  |             _mm_storeu_ps(dst + 1 * dstS + 3 * dstC, src[7]);  | 
665  | 0  |             _mm_storeu_ps(dst + 2 * dstS + 0 * dstC, src[8]);  | 
666  | 0  |             _mm_storeu_ps(dst + 2 * dstS + 1 * dstC, src[9]);  | 
667  | 0  |             _mm_storeu_ps(dst + 2 * dstS + 2 * dstC, src[10]);  | 
668  | 0  |             _mm_storeu_ps(dst + 2 * dstS + 3 * dstC, src[11]);  | 
669  | 0  |             _mm_storeu_ps(dst + 3 * dstS + 0 * dstC, src[12]);  | 
670  | 0  |             _mm_storeu_ps(dst + 3 * dstS + 1 * dstC, src[13]);  | 
671  | 0  |             _mm_storeu_ps(dst + 3 * dstS + 2 * dstC, src[14]);  | 
672  | 0  |             _mm_storeu_ps(dst + 3 * dstS + 3 * dstC, src[15]);  | 
673  | 0  |         }  | 
674  |  |  | 
675  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC)  | 
676  | 0  |         { | 
677  | 0  |             size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F);  | 
678  | 0  |             for (size_t d = 0; d < dstCF; d += F)  | 
679  | 0  |             { | 
680  | 0  |                 __m128 tmp[16];  | 
681  | 0  |                 WinogradKernel2x2Block4x4SetOutputLoad25(src + d, srcStride, tmp);  | 
682  | 0  |                 WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC);  | 
683  | 0  |             }  | 
684  | 0  |             if (dstCF < dstC)  | 
685  | 0  |             { | 
686  | 0  |                 __m128 tmp[16];  | 
687  | 0  |                 WinogradKernel2x2Block4x4SetOutputLoad25(src + dstC - F, srcStride, tmp);  | 
688  | 0  |                 WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + dstC - F, dstS, dstC);  | 
689  | 0  |             }  | 
690  | 0  |         }  | 
691  |  |  | 
692  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetOutputStore16(const __m128 src[16], float* dst, size_t dstS, size_t dstC, size_t rowE, size_t colE)  | 
693  | 0  |         { | 
694  | 0  |             for (size_t row = 0; row < rowE; ++row)  | 
695  | 0  |                 for (size_t col = 0; col < colE; ++col)  | 
696  | 0  |                     _mm_storeu_ps(dst + row * dstS + col * dstC, src[row * 4 + col]);  | 
697  | 0  |         }  | 
698  |  |  | 
699  |  |         SIMD_INLINE void WinogradKernel2x2Block4x4SetOutput4t(const float* src, size_t srcStride, float* dst, size_t dstW, size_t dstC, size_t rowE, size_t colE)  | 
700  | 0  |         { | 
701  | 0  |             size_t dstS = dstW * dstC, dstCF = AlignLo(dstC, F);  | 
702  | 0  |             for (size_t d = 0; d < dstCF; d += F)  | 
703  | 0  |             { | 
704  | 0  |                 __m128 tmp[16];  | 
705  | 0  |                 WinogradKernel2x2Block4x4SetOutputLoad25(src + d, srcStride, tmp);  | 
706  | 0  |                 WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + d, dstS, dstC, rowE, colE);  | 
707  | 0  |             }  | 
708  | 0  |             if (dstCF < dstC)  | 
709  | 0  |             { | 
710  | 0  |                 __m128 tmp[16];  | 
711  | 0  |                 WinogradKernel2x2Block4x4SetOutputLoad25(src + dstC - F, srcStride, tmp);  | 
712  | 0  |                 WinogradKernel2x2Block4x4SetOutputStore16(tmp, dst + dstC - F, dstS, dstC, rowE, colE);  | 
713  | 0  |             }  | 
714  | 0  |         }  | 
715  |  |  | 
716  |  |         void WinogradKernel2x2Block4x4SetOutput(const float* src, size_t srcStride, float* dst, size_t dstChannels, size_t dstHeight, size_t dstWidth, SimdBool trans)  | 
717  | 0  |         { | 
718  | 0  |             if (trans ? (dstChannels < F) : true)  | 
719  | 0  |             { | 
720  | 0  |                 Base::WinogradKernel2x2Block4x4SetOutput(src, srcStride, dst, dstChannels, dstHeight, dstWidth, trans);  | 
721  | 0  |                 return;  | 
722  | 0  |             }  | 
723  | 0  |             size_t tileH = (dstHeight + 3) / 4;  | 
724  | 0  |             size_t tileW = (dstWidth + 3) / 4;  | 
725  | 0  |             size_t dstH4 = AlignLo(dstHeight, 4);  | 
726  | 0  |             size_t dstW4 = AlignLo(dstWidth, 4);  | 
727  | 0  |             size_t row, col;  | 
728  | 0  |             for (row = 0; row < dstH4; row += 4)  | 
729  | 0  |             { | 
730  | 0  |                 for (col = 0; col < dstW4; col += 4)  | 
731  | 0  |                     WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels), src += dstChannels;  | 
732  | 0  |                 if (col < dstWidth)  | 
733  | 0  |                     WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, 4, dstWidth - col), src += dstChannels;  | 
734  | 0  |             }  | 
735  | 0  |             if (row < dstHeight)  | 
736  | 0  |             { | 
737  | 0  |                 for (col = 0; col < dstW4; col += 4)  | 
738  | 0  |                     WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, 4), src += dstChannels;  | 
739  | 0  |                 if (col < dstWidth)  | 
740  | 0  |                     WinogradKernel2x2Block4x4SetOutput4t(src, srcStride, dst + (row * dstWidth + col) * dstChannels, dstWidth, dstChannels, dstHeight - row, dstWidth - col), src += dstChannels;  | 
741  | 0  |             }  | 
742  | 0  |         }  | 
743  |  |     }  | 
744  |  | #endif  | 
745  |  | }  |