/src/Simd/src/Simd/SimdSse41SynetConvolution32fDirectNhwc.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*  | 
2  |  | * Simd Library (http://ermig1979.github.io/Simd).  | 
3  |  | *  | 
4  |  | * Copyright (c) 2011-2024 Yermalayeu Ihar.  | 
5  |  | *  | 
6  |  | * Permission is hereby granted, free of charge, to any person obtaining a copy  | 
7  |  | * of this software and associated documentation files (the "Software"), to deal  | 
8  |  | * in the Software without restriction, including without limitation the rights  | 
9  |  | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell  | 
10  |  | * copies of the Software, and to permit persons to whom the Software is  | 
11  |  | * furnished to do so, subject to the following conditions:  | 
12  |  | *  | 
13  |  | * The above copyright notice and this permission notice shall be included in  | 
14  |  | * all copies or substantial portions of the Software.  | 
15  |  | *  | 
16  |  | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR  | 
17  |  | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,  | 
18  |  | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE  | 
19  |  | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER  | 
20  |  | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,  | 
21  |  | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE  | 
22  |  | * SOFTWARE.  | 
23  |  | */  | 
24  |  | #include "Simd/SimdSynetConvolution32f.h"  | 
25  |  | #include "Simd/SimdSynetConvolution32fCommon.h"  | 
26  |  | #include "Simd/SimdExtract.h"  | 
27  |  | #include "Simd/SimdSynet.h"  | 
28  |  | #include "Simd/SimdGemm.h"  | 
29  |  | #include "Simd/SimdExp.h"  | 
30  |  |  | 
31  |  | namespace Simd  | 
32  |  | { | 
33  |  | #if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)     | 
34  |  |     namespace Sse41  | 
35  |  |     { | 
// Builds the Sse41 flavour of the direct NHWC convolution object: the
// parameters are forwarded to the Base constructor, then the member
// _convolutionBiasActivation is set to whatever SetConvolutionBiasActivation()
// returns (that selector is defined outside this chunk).
36  |  |         SynetConvolution32fDirectNhwc::SynetConvolution32fDirectNhwc(const ConvParam & p)  | 
37  |  |             : Base::SynetConvolution32fDirectNhwc(p)  | 
38  | 0  |         { | 
39  | 0  |             _convolutionBiasActivation = SetConvolutionBiasActivation();  | 
40  | 0  |         }  | 
41  |  |  | 
// Heuristic deciding whether this direct implementation should be used for
// the convolution described by p.
//   - Returns false unless p.IsDilation(1) holds and p.trans is non-zero
//     (presumably trans != 0 marks the NHWC layout -- TODO confirm).
//   - group == 1: returns false when the kernel is larger than the source in
//     either dimension; otherwise returns true only when
//     srcC / (kernelX * kernelY) is below 2.0.
//   - depthwise (p.IsDepthwise()): always returns true.
//   - any other grouping: returns false.
42  |  |         bool SynetConvolution32fDirectNhwc::Preferable(const ConvParam & p)  | 
43  | 0  |         { | 
44  | 0  |             if (!p.IsDilation(1) || p.trans == 0)  | 
45  | 0  |                 return false;  | 
46  | 0  |             if (p.group == 1)  | 
47  | 0  |             { | 
48  | 0  |                 if (p.kernelY > p.srcH || p.kernelX > p.srcW)  | 
49  | 0  |                     return false;  | 
                      // Ratio of input channels to kernel area, computed in double.
50  | 0  |                 double k = double(p.srcC) / p.kernelX / p.kernelY;  | 
51  | 0  |                 return k < 2.0;  | 
52  | 0  |             }  | 
53  | 0  |             else if (p.IsDepthwise())  | 
54  | 0  |             { | 
55  | 0  |                 return true;  | 
56  | 0  |             }  | 
57  | 0  |             return false;  | 
58  | 0  |         }  | 
59  |  |  | 
// Accumulates a (possibly clipped) kernel window into `sum` for one vector of
// output channels.
//   src    - first source value covered by the window.
//   kH, kW - number of kernel rows / columns actually applied.
//   weight - weights for the first applied kernel element, already offset to
//            the output-channel group being computed.
//   sum    - in/out accumulator; the caller supplies the initial value.
// For every applied row, kW * srcC source values are visited: each one is
// broadcast, multiplied by a vector loaded from `weight` and added to `sum`,
// with `weight` advancing by dstC per source value. After each row, `weight`
// skips the (kernelX - kW) * srcC * dstC weights of the columns that were not
// applied and `src` moves down one source row (srcW * srcC floats).
60  |  |         SIMD_INLINE void KernelHwcDefaultEdge(const float * src, const ConvParam & p, size_t kH, size_t kW, const float * weight, __m128 & sum)  | 
61  | 0  |         { | 
62  | 0  |             size_t size = kW * p.srcC, tail = (p.kernelX - kW)*p.srcC*p.dstC, dstC = p.dstC, stride = p.srcW * p.srcC;  | 
63  | 0  |             for (size_t ky = 0; ky < kH; ++ky)  | 
64  | 0  |             { | 
65  | 0  |                 for (size_t i = 0; i < size; ++i, weight += dstC)  | 
66  | 0  |                     sum = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(src[i]), _mm_loadu_ps(weight)), sum);  | 
67  | 0  |                 weight += tail;  | 
68  | 0  |                 src += stride;  | 
69  | 0  |             }  | 
70  | 0  |         }  | 
71  |  |  | 
// Computes every output channel of a single output pixel whose kernel window
// is kH x kW (used by the driver for border pixels and for leftover interior
// pixels).
//   bias   - optional; when null the accumulators start from zero.
//   params - forwarded to Activate<type> together with the channel offset.
//   dst    - receives p.dstC floats.
// Channels are processed F at a time (F is defined outside this chunk; the
// 128-bit loads/stores used here move four floats, so F is presumably 4 --
// TODO confirm).
72  |  |         template<::SimdConvolutionActivationType type>  | 
73  |  |         SIMD_INLINE void KernelHwcDefaultEdge(const float * src, const ConvParam & p, size_t kH, size_t kW, const float * weight, const float * bias, const float * params, float * dst)  | 
74  | 0  |         { | 
75  | 0  |             size_t dstC = p.dstC;  | 
76  | 0  |             size_t dstCF = AlignLo(dstC, F);  | 
77  | 0  |             size_t dc = 0;  | 
78  | 0  |             for (; dc < dstCF; dc += F)  | 
79  | 0  |             { | 
80  | 0  |                 __m128 conv = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps();  | 
81  | 0  |                 KernelHwcDefaultEdge(src, p, kH, kW, weight + dc, conv);  | 
82  | 0  |                 _mm_storeu_ps(dst + dc, Activate<type>(conv, params, dc));  | 
83  | 0  |             }  | 
                  // Remainder channels: recompute the last F channels so the final store
                  // ends exactly at dstC (it overlaps channels already written above).
                  // NOTE(review): dstC - F wraps around when dstC < F; nothing in this
                  // function guards against that -- presumably the caller guarantees
                  // dstC >= F, confirm.
84  | 0  |             if (dc < dstC)  | 
85  | 0  |             { | 
86  | 0  |                 dc = dstC - F;  | 
87  | 0  |                 __m128 conv = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps();  | 
88  | 0  |                 KernelHwcDefaultEdge(src, p, kH, kW, weight + dc, conv);  | 
89  | 0  |                 _mm_storeu_ps(dst + dc, Activate<type>(conv, params, dc));  | 
90  | 0  |             }  | 
91  | 0  |         } Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: 
void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)  | 
92  |  |  | 
// Full-kernel accumulation for 2 output pixels x 2 vectors of output channels.
//   src    - source position of the first pixel's window; the second pixel's
//            window starts srcC * strideX floats further on.
//   weight - weights already offset to the first of the two channel vectors;
//            the second vector is read F floats later.
//   sums   - in/out accumulators indexed [pixel][channel vector].
// Each weight pair is loaded once and reused for both pixels; `weight`
// advances by dstC per visited source value, running continuously across
// kernel rows because the whole kernelX * srcC row is consumed.
93  |  |         SIMD_INLINE void KernelHwcDefaultBody2x2(const float * src, const ConvParam & p, const float * weight, __m128 sums[2][2])  | 
94  | 0  |         { | 
95  | 0  |             size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX;  | 
96  | 0  |             const float * src0 = src + 0 * step;  | 
97  | 0  |             const float * src1 = src + 1 * step;  | 
98  | 0  |             __m128 w0, w1, s0;  | 
99  | 0  |             for (size_t ky = 0; ky < p.kernelY; ++ky)  | 
100  | 0  |             { | 
101  | 0  |                 size_t offset = ky * stride;  | 
102  | 0  |                 for (size_t end = offset + size; offset < end; ++offset)  | 
103  | 0  |                 { | 
104  | 0  |                     w0 = _mm_loadu_ps(weight + 0 * F);  | 
105  | 0  |                     w1 = _mm_loadu_ps(weight + 1 * F);  | 
106  | 0  |                     s0 = _mm_set1_ps(src0[offset]);  | 
107  | 0  |                     sums[0][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[0][0]);  | 
108  | 0  |                     sums[0][1] = _mm_add_ps(_mm_mul_ps(s0, w1), sums[0][1]);  | 
109  | 0  |                     s0 = _mm_set1_ps(src1[offset]);  | 
110  | 0  |                     sums[1][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[1][0]);  | 
111  | 0  |                     sums[1][1] = _mm_add_ps(_mm_mul_ps(s0, w1), sums[1][1]);  | 
112  | 0  |                     weight += dstC;  | 
113  | 0  |                 }  | 
114  | 0  |             }  | 
115  | 0  |         }  | 
116  |  |  | 
// Full-kernel accumulation for 2 output pixels x 1 vector of output channels.
// Same traversal as KernelHwcDefaultBody2x2, but only a single weight vector
// is loaded per source value.
//   sums - in/out accumulators indexed [pixel][0].
117  |  |         SIMD_INLINE void KernelHwcDefaultBody2x1(const float * src, const ConvParam & p, const float * weight, __m128 sums[2][1])  | 
118  | 0  |         { | 
119  | 0  |             size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX;  | 
120  | 0  |             const float * src0 = src + 0 * step;  | 
121  | 0  |             const float * src1 = src + 1 * step;  | 
122  | 0  |             __m128 w0, s0;  | 
123  | 0  |             for (size_t ky = 0; ky < p.kernelY; ++ky)  | 
124  | 0  |             { | 
125  | 0  |                 size_t offset = ky * stride;  | 
126  | 0  |                 for (size_t end = offset + size; offset < end; ++offset)  | 
127  | 0  |                 { | 
128  | 0  |                     w0 = _mm_loadu_ps(weight + 0 * F);  | 
129  | 0  |                     s0 = _mm_set1_ps(src0[offset]);  | 
130  | 0  |                     sums[0][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[0][0]);  | 
131  | 0  |                     s0 = _mm_set1_ps(src1[offset]);  | 
132  | 0  |                     sums[1][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[1][0]);  | 
133  | 0  |                     weight += dstC;  | 
134  | 0  |                 }  | 
135  | 0  |             }  | 
136  | 0  |         }  | 
137  |  |  | 
// Computes all output channels of 2 horizontally consecutive output pixels
// whose kernel windows lie fully inside the source.
//   bias   - optional; when null the accumulators start from zero.
//   params - forwarded to Activate<type> together with the channel offset.
//   dst    - receives 2 * p.dstC floats; pixel i is written at dst + i * dstC.
// Channels are handled 2 * F at a time, then F at a time, then a final
// overlapping group for any remainder.
138  |  |         template<::SimdConvolutionActivationType type>  | 
139  |  |         SIMD_INLINE void KernelHwcDefaultBody2(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)  | 
140  | 0  |         { | 
141  | 0  |             size_t dstC = p.dstC;  | 
142  | 0  |             size_t dstCF1 = AlignLo(dstC, 1 * F);  | 
143  | 0  |             size_t dstCF2 = AlignLo(dstC, 2 * F);  | 
144  | 0  |             size_t dc = 0;  | 
145  | 0  |             for (; dc < dstCF2; dc += 2 * F)  | 
146  | 0  |             { | 
147  | 0  |                 __m128 sums[2][2];  | 
148  | 0  |                 __m128 bias0 = bias ? _mm_loadu_ps(bias + dc + 0 * F) : _mm_setzero_ps();  | 
149  | 0  |                 __m128 bias1 = bias ? _mm_loadu_ps(bias + dc + 1 * F) : _mm_setzero_ps();  | 
150  | 0  |                 sums[0][0] = bias0;  | 
151  | 0  |                 sums[0][1] = bias1;  | 
152  | 0  |                 sums[1][0] = bias0;  | 
153  | 0  |                 sums[1][1] = bias1;  | 
154  | 0  |                 KernelHwcDefaultBody2x2(src, p, weight + dc, sums);  | 
155  | 0  |                 _mm_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate<type>(sums[0][0], params, dc + 0 * F));  | 
156  | 0  |                 _mm_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate<type>(sums[0][1], params, dc + 1 * F));  | 
157  | 0  |                 _mm_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate<type>(sums[1][0], params, dc + 0 * F));  | 
158  | 0  |                 _mm_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate<type>(sums[1][1], params, dc + 1 * F));  | 
159  | 0  |             }  | 
160  | 0  |             for (; dc < dstCF1; dc += 1 * F)  | 
161  | 0  |             { | 
162  | 0  |                 __m128 sums[2][1];  | 
163  | 0  |                 __m128 bias0 = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps();  | 
164  | 0  |                 sums[0][0] = bias0;  | 
165  | 0  |                 sums[1][0] = bias0;  | 
166  | 0  |                 KernelHwcDefaultBody2x1(src, p, weight + dc, sums);  | 
167  | 0  |                 _mm_storeu_ps(dst + dc + 0 * dstC, Activate<type>(sums[0][0], params, dc));  | 
168  | 0  |                 _mm_storeu_ps(dst + dc + 1 * dstC, Activate<type>(sums[1][0], params, dc));  | 
169  | 0  |             }  | 
                  // Remainder channels: redo the last F channels so the stores end exactly
                  // at dstC (overlapping channels already written).
                  // NOTE(review): dstC - F wraps around when dstC < F; presumably the
                  // caller guarantees dstC >= F -- confirm.
170  | 0  |             if (dc < dstC)  | 
171  | 0  |             { | 
172  | 0  |                 dc = dstC - F;  | 
173  | 0  |                 __m128 sums[2][1];  | 
174  | 0  |                 __m128 bias0 = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps();  | 
175  | 0  |                 sums[0][0] = bias0;  | 
176  | 0  |                 sums[1][0] = bias0;  | 
177  | 0  |                 KernelHwcDefaultBody2x1(src, p, weight + dc, sums);  | 
178  | 0  |                 _mm_storeu_ps(dst + dc + 0 * dstC, Activate<type>(sums[0][0], params, dc));  | 
179  | 0  |                 _mm_storeu_ps(dst + dc + 1 * dstC, Activate<type>(sums[1][0], params, dc));  | 
180  | 0  |             }  | 
181  | 0  |         } Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void 
Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)  | 
182  |  |  | 
// Full-kernel accumulation for 6 output pixels x 2 vectors of output channels.
//   src    - source position of the first pixel's window; pixel i's window
//            starts i * srcC * strideX floats further on.
//   weight - weights already offset to the first of the two channel vectors;
//            the second vector is read F floats later.
//   sums   - in/out accumulators indexed [pixel][channel vector].
// Each weight pair is loaded once per source offset and reused for all six
// pixels; `weight` advances by dstC per visited source value.
183  |  |         SIMD_INLINE void KernelHwcDefaultBody6x2(const float * src, const ConvParam & p, const float * weight, __m128 sums[6][2])  | 
184  | 0  |         { | 
185  | 0  |             size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX;  | 
186  | 0  |             const float * src0 = src + 0 * step;  | 
187  | 0  |             const float * src1 = src + 1 * step;  | 
188  | 0  |             const float * src2 = src + 2 * step;  | 
189  | 0  |             const float * src3 = src + 3 * step;  | 
190  | 0  |             const float * src4 = src + 4 * step;  | 
191  | 0  |             const float * src5 = src + 5 * step;  | 
192  | 0  |             __m128 w0, w1, s0;  | 
193  | 0  |             for (size_t ky = 0; ky < p.kernelY; ++ky)  | 
194  | 0  |             { | 
195  | 0  |                 size_t offset = ky * stride;  | 
196  | 0  |                 for (size_t end = offset + size; offset < end; ++offset)  | 
197  | 0  |                 { | 
198  | 0  |                     w0 = _mm_loadu_ps(weight + 0 * F);  | 
199  | 0  |                     w1 = _mm_loadu_ps(weight + 1 * F);  | 
200  | 0  |                     s0 = _mm_set1_ps(src0[offset]);  | 
201  | 0  |                     sums[0][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[0][0]);  | 
202  | 0  |                     sums[0][1] = _mm_add_ps(_mm_mul_ps(s0, w1), sums[0][1]);  | 
203  | 0  |                     s0 = _mm_set1_ps(src1[offset]);  | 
204  | 0  |                     sums[1][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[1][0]);  | 
205  | 0  |                     sums[1][1] = _mm_add_ps(_mm_mul_ps(s0, w1), sums[1][1]);  | 
206  | 0  |                     s0 = _mm_set1_ps(src2[offset]);  | 
207  | 0  |                     sums[2][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[2][0]);  | 
208  | 0  |                     sums[2][1] = _mm_add_ps(_mm_mul_ps(s0, w1), sums[2][1]);  | 
209  | 0  |                     s0 = _mm_set1_ps(src3[offset]);  | 
210  | 0  |                     sums[3][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[3][0]);  | 
211  | 0  |                     sums[3][1] = _mm_add_ps(_mm_mul_ps(s0, w1), sums[3][1]);  | 
212  | 0  |                     s0 = _mm_set1_ps(src4[offset]);  | 
213  | 0  |                     sums[4][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[4][0]);  | 
214  | 0  |                     sums[4][1] = _mm_add_ps(_mm_mul_ps(s0, w1), sums[4][1]);  | 
215  | 0  |                     s0 = _mm_set1_ps(src5[offset]);  | 
216  | 0  |                     sums[5][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[5][0]);  | 
217  | 0  |                     sums[5][1] = _mm_add_ps(_mm_mul_ps(s0, w1), sums[5][1]);  | 
218  | 0  |                     weight += dstC;  | 
219  | 0  |                 }  | 
220  | 0  |             }  | 
221  | 0  |         }  | 
222  |  |  | 
// Full-kernel accumulation for 6 output pixels x 1 vector of output channels.
// Same traversal as KernelHwcDefaultBody6x2, but only a single weight vector
// is loaded per source value.
//   sums - in/out accumulators indexed [pixel][0].
223  |  |         SIMD_INLINE void KernelHwcDefaultBody6x1(const float * src, const ConvParam & p, const float * weight, __m128 sums[6][1])  | 
224  | 0  |         { | 
225  | 0  |             size_t size = p.kernelX * p.srcC, dstC = p.dstC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX;  | 
226  | 0  |             const float * src0 = src + 0 * step;  | 
227  | 0  |             const float * src1 = src + 1 * step;  | 
228  | 0  |             const float * src2 = src + 2 * step;  | 
229  | 0  |             const float * src3 = src + 3 * step;  | 
230  | 0  |             const float * src4 = src + 4 * step;  | 
231  | 0  |             const float * src5 = src + 5 * step;  | 
232  | 0  |             __m128 w0, s0;  | 
233  | 0  |             for (size_t ky = 0; ky < p.kernelY; ++ky)  | 
234  | 0  |             { | 
235  | 0  |                 size_t offset = ky * stride;  | 
236  | 0  |                 for (size_t end = offset + size; offset < end; ++offset)  | 
237  | 0  |                 { | 
238  | 0  |                     w0 = _mm_loadu_ps(weight + 0 * F);  | 
239  | 0  |                     s0 = _mm_set1_ps(src0[offset]);  | 
240  | 0  |                     sums[0][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[0][0]);  | 
241  | 0  |                     s0 = _mm_set1_ps(src1[offset]);  | 
242  | 0  |                     sums[1][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[1][0]);  | 
243  | 0  |                     s0 = _mm_set1_ps(src2[offset]);  | 
244  | 0  |                     sums[2][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[2][0]);  | 
245  | 0  |                     s0 = _mm_set1_ps(src3[offset]);  | 
246  | 0  |                     sums[3][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[3][0]);  | 
247  | 0  |                     s0 = _mm_set1_ps(src4[offset]);  | 
248  | 0  |                     sums[4][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[4][0]);  | 
249  | 0  |                     s0 = _mm_set1_ps(src5[offset]);  | 
250  | 0  |                     sums[5][0] = _mm_add_ps(_mm_mul_ps(s0, w0), sums[5][0]);  | 
251  | 0  |                     weight += dstC;  | 
252  | 0  |                 }  | 
253  | 0  |             }  | 
254  | 0  |         }  | 
255  |  |  | 
// Computes all output channels of 6 horizontally consecutive output pixels
// whose kernel windows lie fully inside the source.
//   bias   - optional; when null the accumulators start from zero.
//   params - forwarded to Activate<type> together with the channel offset.
//   dst    - receives 6 * p.dstC floats; pixel i is written at dst + i * dstC.
// Channels are handled 2 * F at a time, then F at a time, then a final
// overlapping group for any remainder.
256  |  |         template<::SimdConvolutionActivationType type>  | 
257  |  |         SIMD_INLINE void KernelHwcDefaultBody6(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)  | 
258  | 0  |         { | 
259  | 0  |             size_t dstC = p.dstC;  | 
260  | 0  |             size_t dstCF1 = AlignLo(dstC, 1 * F);  | 
261  | 0  |             size_t dstCF2 = AlignLo(dstC, 2 * F);  | 
262  | 0  |             size_t dc = 0;  | 
263  | 0  |             for (; dc < dstCF2; dc += 2 * F)  | 
264  | 0  |             { | 
265  | 0  |                 __m128 sums[6][2];  | 
266  | 0  |                 __m128 bias0 = bias ? _mm_loadu_ps(bias + dc + 0 * F) : _mm_setzero_ps();  | 
267  | 0  |                 __m128 bias1 = bias ? _mm_loadu_ps(bias + dc + 1 * F) : _mm_setzero_ps();  | 
268  | 0  |                 sums[0][0] = bias0;  | 
269  | 0  |                 sums[0][1] = bias1;  | 
270  | 0  |                 sums[1][0] = bias0;  | 
271  | 0  |                 sums[1][1] = bias1;  | 
272  | 0  |                 sums[2][0] = bias0;  | 
273  | 0  |                 sums[2][1] = bias1;  | 
274  | 0  |                 sums[3][0] = bias0;  | 
275  | 0  |                 sums[3][1] = bias1;  | 
276  | 0  |                 sums[4][0] = bias0;  | 
277  | 0  |                 sums[4][1] = bias1;  | 
278  | 0  |                 sums[5][0] = bias0;  | 
279  | 0  |                 sums[5][1] = bias1;  | 
280  | 0  |                 KernelHwcDefaultBody6x2(src, p, weight + dc, sums);  | 
281  | 0  |                 _mm_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate<type>(sums[0][0], params, dc + 0 * F));  | 
282  | 0  |                 _mm_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate<type>(sums[0][1], params, dc + 1 * F));  | 
283  | 0  |                 _mm_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate<type>(sums[1][0], params, dc + 0 * F));  | 
284  | 0  |                 _mm_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate<type>(sums[1][1], params, dc + 1 * F));  | 
285  | 0  |                 _mm_storeu_ps(dst + dc + 2 * dstC + 0 * F, Activate<type>(sums[2][0], params, dc + 0 * F));  | 
286  | 0  |                 _mm_storeu_ps(dst + dc + 2 * dstC + 1 * F, Activate<type>(sums[2][1], params, dc + 1 * F));  | 
287  | 0  |                 _mm_storeu_ps(dst + dc + 3 * dstC + 0 * F, Activate<type>(sums[3][0], params, dc + 0 * F));  | 
288  | 0  |                 _mm_storeu_ps(dst + dc + 3 * dstC + 1 * F, Activate<type>(sums[3][1], params, dc + 1 * F));  | 
289  | 0  |                 _mm_storeu_ps(dst + dc + 4 * dstC + 0 * F, Activate<type>(sums[4][0], params, dc + 0 * F));  | 
290  | 0  |                 _mm_storeu_ps(dst + dc + 4 * dstC + 1 * F, Activate<type>(sums[4][1], params, dc + 1 * F));  | 
291  | 0  |                 _mm_storeu_ps(dst + dc + 5 * dstC + 0 * F, Activate<type>(sums[5][0], params, dc + 0 * F));  | 
292  | 0  |                 _mm_storeu_ps(dst + dc + 5 * dstC + 1 * F, Activate<type>(sums[5][1], params, dc + 1 * F));  | 
293  | 0  |             }  | 
294  | 0  |             for (; dc < dstCF1; dc += 1 * F)  | 
295  | 0  |             { | 
296  | 0  |                 __m128 sums[6][1];  | 
297  | 0  |                 __m128 bias0 = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps();  | 
298  | 0  |                 sums[0][0] = bias0;  | 
299  | 0  |                 sums[1][0] = bias0;  | 
300  | 0  |                 sums[2][0] = bias0;  | 
301  | 0  |                 sums[3][0] = bias0;  | 
302  | 0  |                 sums[4][0] = bias0;  | 
303  | 0  |                 sums[5][0] = bias0;  | 
304  | 0  |                 KernelHwcDefaultBody6x1(src, p, weight + dc, sums);  | 
305  | 0  |                 _mm_storeu_ps(dst + dc + 0 * dstC, Activate<type>(sums[0][0], params, dc));  | 
306  | 0  |                 _mm_storeu_ps(dst + dc + 1 * dstC, Activate<type>(sums[1][0], params, dc));  | 
307  | 0  |                 _mm_storeu_ps(dst + dc + 2 * dstC, Activate<type>(sums[2][0], params, dc));  | 
308  | 0  |                 _mm_storeu_ps(dst + dc + 3 * dstC, Activate<type>(sums[3][0], params, dc));  | 
309  | 0  |                 _mm_storeu_ps(dst + dc + 4 * dstC, Activate<type>(sums[4][0], params, dc));  | 
310  | 0  |                 _mm_storeu_ps(dst + dc + 5 * dstC, Activate<type>(sums[5][0], params, dc));  | 
311  | 0  |             }  | 
                  // Remainder channels: redo the last F channels so the stores end exactly
                  // at dstC (overlapping channels already written).
                  // NOTE(review): dstC - F wraps around when dstC < F; presumably the
                  // caller guarantees dstC >= F -- confirm.
312  | 0  |             if (dc < dstC)  | 
313  | 0  |             { | 
314  | 0  |                 dc = dstC - F;  | 
315  | 0  |                 __m128 sums[6][1];  | 
316  | 0  |                 __m128 bias0 = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps();  | 
317  | 0  |                 sums[0][0] = bias0;  | 
318  | 0  |                 sums[1][0] = bias0;  | 
319  | 0  |                 sums[2][0] = bias0;  | 
320  | 0  |                 sums[3][0] = bias0;  | 
321  | 0  |                 sums[4][0] = bias0;  | 
322  | 0  |                 sums[5][0] = bias0;  | 
323  | 0  |                 KernelHwcDefaultBody6x1(src, p, weight + dc, sums);  | 
324  | 0  |                 _mm_storeu_ps(dst + dc + 0 * dstC, Activate<type>(sums[0][0], params, dc));  | 
325  | 0  |                 _mm_storeu_ps(dst + dc + 1 * dstC, Activate<type>(sums[1][0], params, dc));  | 
326  | 0  |                 _mm_storeu_ps(dst + dc + 2 * dstC, Activate<type>(sums[2][0], params, dc));  | 
327  | 0  |                 _mm_storeu_ps(dst + dc + 3 * dstC, Activate<type>(sums[3][0], params, dc));  | 
328  | 0  |                 _mm_storeu_ps(dst + dc + 4 * dstC, Activate<type>(sums[4][0], params, dc));  | 
329  | 0  |                 _mm_storeu_ps(dst + dc + 5 * dstC, Activate<type>(sums[5][0], params, dc));  | 
330  | 0  |             }  | 
331  | 0  |         } Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void 
Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)  | 
332  |  |  | 
// Driver for the direct convolution with fused bias and activation.
//   src    - source image; rows are srcW * srcC floats apart.
//   p      - convolution geometry (sizes, strides, padding).
//   weight - weights walked as kernelY rows of kernelX * srcC * dstC floats.
//   bias   - optional per-channel bias (null means zero).
//   params - activation parameters forwarded to Activate<type>.
//   dst    - output; advanced by dstC floats per produced pixel.
// The output is split vertically and horizontally into three zones:
//   nose - windows clipped by the top / left padding (padY / padX),
//   body - windows fully inside the source,
//   tail - windows clipped by the bottom / right padding (padH / padW).
// Clipped windows go through KernelHwcDefaultEdge with a reduced kH / kW;
// interior pixels of body rows are processed 6, then 2, then 1 at a time.
//
// Fix: in the top (nose) rows the weight pointer must skip whole kernel rows.
// One kernel row holds kernelX * srcC * dstC weights (this is the per-row
// distance KernelHwcDefaultEdge walks), so the skip is
// (noseH - sy) * kernelX * wS. The previous factor kernelY was only correct
// for square kernels.
333  |  |         template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDefault(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)  | 
334  | 0  |         { | 
335  | 0  |             size_t noseH = p.padY, noseW = p.padX;  | 
336  | 0  |             size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW;  | 
337  | 0  |             size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW;  | 
                  // Upper bounds (in sx) up to which whole groups of 2 / 6 interior
                  // pixels still fit inside the body zone.
338  | 0  |             size_t bodyW2 = AlignLoAny(bodyW - noseW, 2 * p.strideX) + noseW;  | 
339  | 0  |             size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW;  | 
                  // wS: number of weights belonging to one kernel column (srcC * dstC).
340  | 0  |             size_t wS = p.srcC*p.dstC;  | 
341  | 0  |             size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1;  | 
342  | 0  |             size_t sy = 0;  | 
                  // Top rows: the first (noseH - sy) kernel rows fall into the padding.
343  | 0  |             for (; sy < noseH; sy += p.strideY)  | 
344  | 0  |             { | 
345  | 0  |                 size_t sx = 0;  | 
                      // Skip the clipped kernel rows; a kernel row is kernelX * wS weights.
346  | 0  |                 const float * w = weight + (noseH - sy) * p.kernelX * wS;  | 
347  | 0  |                 for (; sx < noseW; sx += p.strideX, dst += p.dstC)  | 
348  | 0  |                     KernelHwcDefaultEdge<type>(src, p, kY + sy, kX + sx, w + (noseW - sx)*wS, bias, params, dst);  | 
349  | 0  |                 for (; sx < bodyW; sx += p.strideX, dst += p.dstC)  | 
350  | 0  |                     KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, w, bias, params, dst);  | 
351  | 0  |                 for (; sx < tailW; sx += p.strideX, dst += p.dstC)  | 
352  | 0  |                     KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, kY + sy, kW - sx, w, bias, params, dst);  | 
353  | 0  |             }  | 
                  // Move src to the source row of the first unclipped window (sy may have
                  // stepped past noseH when strideY > 1).
354  | 0  |             src += (sy - noseH)*p.srcW*p.srcC;  | 
                  // Middle rows: full kernel height; only the left / right ends are clipped.
355  | 0  |             for (; sy < bodyH; sy += p.strideY)  | 
356  | 0  |             { | 
357  | 0  |                 size_t sx = 0;  | 
358  | 0  |                 for (; sx < noseW; sx += p.strideX, dst += p.dstC)  | 
359  | 0  |                     KernelHwcDefaultEdge<type>(src, p, p.kernelY, kX + sx, weight + (noseW - sx)*wS, bias, params, dst);  | 
360  | 0  |                 for (; sx < bodyW6; sx += 6 * p.strideX, dst += 6 * p.dstC)  | 
361  | 0  |                     KernelHwcDefaultBody6<type>(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst);  | 
362  | 0  |                 for (; sx < bodyW2; sx += 2 * p.strideX, dst += 2 * p.dstC)  | 
363  | 0  |                     KernelHwcDefaultBody2<type>(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst);  | 
364  | 0  |                 for (; sx < bodyW; sx += p.strideX, dst += p.dstC)  | 
365  | 0  |                     KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, weight, bias, params, dst);  | 
366  | 0  |                 for (; sx < tailW; sx += p.strideX, dst += p.dstC)  | 
367  | 0  |                     KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, p.kernelY, kW - sx, weight, bias, params, dst);  | 
368  | 0  |                 src += p.strideY*p.srcW*p.srcC;  | 
369  | 0  |             }  | 
                  // Bottom rows: only the first (kH - sy) kernel rows are applied.
370  | 0  |             for (; sy < tailH; sy += p.strideY)  | 
371  | 0  |             { | 
372  | 0  |                 size_t sx = 0;  | 
373  | 0  |                 for (; sx < noseW; sx += p.strideX, dst += p.dstC)  | 
374  | 0  |                     KernelHwcDefaultEdge<type>(src, p, kH - sy, kX + sx, weight + (noseW - sx)*wS, bias, params, dst);  | 
375  | 0  |                 for (; sx < bodyW; sx += p.strideX, dst += p.dstC)  | 
376  | 0  |                     KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, weight, bias, params, dst);  | 
377  | 0  |                 for (; sx < tailW; sx += p.strideX, dst += p.dstC)  | 
378  | 0  |                     KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, kH - sy, kW - sx, weight, bias, params, dst);  | 
379  | 0  |                 src += p.strideY*p.srcW*p.srcC;  | 
380  | 0  |             }  | 
381  | 0  |         } Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, 
float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)  | 
382  |  |  | 
383  |  |         //-------------------------------------------------------------------------------------------------  | 
384  |  |  | 
385  |  |         template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)  | 
386  | 0  |         { | 
387  | 0  |             size_t size = p.group;  | 
388  | 0  |             size_t sizeF = AlignLo(size, F);  | 
389  | 0  |             size_t size2F = AlignLo(size, 2 * F);  | 
390  | 0  |             size_t size4F = AlignLo(size, 4 * F);  | 
391  | 0  |             size_t size8F = AlignLo(size, 8 * F);  | 
392  | 0  |             for (size_t dy = 0; dy < p.dstH; ++dy)  | 
393  | 0  |             { | 
394  | 0  |                 for (size_t dx = 0; dx < p.dstW; ++dx)  | 
395  | 0  |                 { | 
396  | 0  |                     size_t i = 0;  | 
397  | 0  |                     for (; i < size8F; i += 8 * F)  | 
398  | 0  |                     { | 
399  | 0  |                         __m128 sums[8];  | 
400  | 0  |                         if (bias)  | 
401  | 0  |                         { | 
402  | 0  |                             sums[0] = _mm_loadu_ps(bias + i + 0 * F);  | 
403  | 0  |                             sums[1] = _mm_loadu_ps(bias + i + 1 * F);  | 
404  | 0  |                             sums[2] = _mm_loadu_ps(bias + i + 2 * F);  | 
405  | 0  |                             sums[3] = _mm_loadu_ps(bias + i + 3 * F);  | 
406  | 0  |                             sums[4] = _mm_loadu_ps(bias + i + 4 * F);  | 
407  | 0  |                             sums[5] = _mm_loadu_ps(bias + i + 5 * F);  | 
408  | 0  |                             sums[6] = _mm_loadu_ps(bias + i + 6 * F);  | 
409  | 0  |                             sums[7] = _mm_loadu_ps(bias + i + 7 * F);  | 
410  | 0  |                         }  | 
411  | 0  |                         else  | 
412  | 0  |                         { | 
413  | 0  |                             sums[0] = _mm_setzero_ps();  | 
414  | 0  |                             sums[1] = _mm_setzero_ps();  | 
415  | 0  |                             sums[2] = _mm_setzero_ps();  | 
416  | 0  |                             sums[3] = _mm_setzero_ps();  | 
417  | 0  |                             sums[4] = _mm_setzero_ps();  | 
418  | 0  |                             sums[5] = _mm_setzero_ps();  | 
419  | 0  |                             sums[6] = _mm_setzero_ps();  | 
420  | 0  |                             sums[7] = _mm_setzero_ps();  | 
421  | 0  |                         }  | 
422  | 0  |                         for (size_t ky = 0; ky < p.kernelY; ++ky)  | 
423  | 0  |                         { | 
424  | 0  |                             size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;  | 
425  | 0  |                             if (sy < p.srcH)  | 
426  | 0  |                             { | 
427  | 0  |                                 for (size_t kx = 0; kx < p.kernelX; ++kx)  | 
428  | 0  |                                 { | 
429  | 0  |                                     size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;  | 
430  | 0  |                                     if (sx < p.srcW)  | 
431  | 0  |                                     { | 
432  | 0  |                                         const float * pw = weight + (ky*p.kernelX + kx)*size + i;  | 
433  | 0  |                                         const float * ps = src + (sy*p.srcW + sx)*size + i;  | 
434  | 0  |                                         sums[0] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * F), _mm_loadu_ps(pw + 0 * F)), sums[0]);  | 
435  | 0  |                                         sums[1] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * F), _mm_loadu_ps(pw + 1 * F)), sums[1]);  | 
436  | 0  |                                         sums[2] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * F), _mm_loadu_ps(pw + 2 * F)), sums[2]);  | 
437  | 0  |                                         sums[3] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * F), _mm_loadu_ps(pw + 3 * F)), sums[3]);  | 
438  | 0  |                                         sums[4] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 4 * F), _mm_loadu_ps(pw + 4 * F)), sums[4]);  | 
439  | 0  |                                         sums[5] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 5 * F), _mm_loadu_ps(pw + 5 * F)), sums[5]);  | 
440  | 0  |                                         sums[6] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 6 * F), _mm_loadu_ps(pw + 6 * F)), sums[6]);  | 
441  | 0  |                                         sums[7] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 7 * F), _mm_loadu_ps(pw + 7 * F)), sums[7]);  | 
442  | 0  |                                     }  | 
443  | 0  |                                 }  | 
444  | 0  |                             }  | 
445  | 0  |                         }  | 
446  | 0  |                         _mm_storeu_ps(dst + i + 0 * F, Activate<type>(sums[0], params, i + 0 * F));  | 
447  | 0  |                         _mm_storeu_ps(dst + i + 1 * F, Activate<type>(sums[1], params, i + 1 * F));  | 
448  | 0  |                         _mm_storeu_ps(dst + i + 2 * F, Activate<type>(sums[2], params, i + 2 * F));  | 
449  | 0  |                         _mm_storeu_ps(dst + i + 3 * F, Activate<type>(sums[3], params, i + 3 * F));  | 
450  | 0  |                         _mm_storeu_ps(dst + i + 4 * F, Activate<type>(sums[4], params, i + 4 * F));  | 
451  | 0  |                         _mm_storeu_ps(dst + i + 5 * F, Activate<type>(sums[5], params, i + 5 * F));  | 
452  | 0  |                         _mm_storeu_ps(dst + i + 6 * F, Activate<type>(sums[6], params, i + 6 * F));  | 
453  | 0  |                         _mm_storeu_ps(dst + i + 7 * F, Activate<type>(sums[7], params, i + 7 * F));  | 
454  | 0  |                     }  | 
455  | 0  |                     for (; i < size4F; i += 4 * F)  | 
456  | 0  |                     { | 
457  | 0  |                         __m128 sums[4];  | 
458  | 0  |                         if (bias)  | 
459  | 0  |                         { | 
460  | 0  |                             sums[0] = _mm_loadu_ps(bias + i + 0 * F);  | 
461  | 0  |                             sums[1] = _mm_loadu_ps(bias + i + 1 * F);  | 
462  | 0  |                             sums[2] = _mm_loadu_ps(bias + i + 2 * F);  | 
463  | 0  |                             sums[3] = _mm_loadu_ps(bias + i + 3 * F);  | 
464  | 0  |                         }  | 
465  | 0  |                         else  | 
466  | 0  |                         { | 
467  | 0  |                             sums[0] = _mm_setzero_ps();  | 
468  | 0  |                             sums[1] = _mm_setzero_ps();  | 
469  | 0  |                             sums[2] = _mm_setzero_ps();  | 
470  | 0  |                             sums[3] = _mm_setzero_ps();  | 
471  | 0  |                         }  | 
472  | 0  |                         for (size_t ky = 0; ky < p.kernelY; ++ky)  | 
473  | 0  |                         { | 
474  | 0  |                             size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;  | 
475  | 0  |                             if (sy < p.srcH)  | 
476  | 0  |                             { | 
477  | 0  |                                 for (size_t kx = 0; kx < p.kernelX; ++kx)  | 
478  | 0  |                                 { | 
479  | 0  |                                     size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;  | 
480  | 0  |                                     if (sx < p.srcW)  | 
481  | 0  |                                     { | 
482  | 0  |                                         const float * pw = weight + (ky*p.kernelX + kx)*size + i;  | 
483  | 0  |                                         const float * ps = src + (sy*p.srcW + sx)*size + i;  | 
484  | 0  |                                         sums[0] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * F), _mm_loadu_ps(pw + 0 * F)), sums[0]);  | 
485  | 0  |                                         sums[1] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * F), _mm_loadu_ps(pw + 1 * F)), sums[1]);  | 
486  | 0  |                                         sums[2] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * F), _mm_loadu_ps(pw + 2 * F)), sums[2]);  | 
487  | 0  |                                         sums[3] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * F), _mm_loadu_ps(pw + 3 * F)), sums[3]);  | 
488  | 0  |                                     }  | 
489  | 0  |                                 }  | 
490  | 0  |                             }  | 
491  | 0  |                         }  | 
492  | 0  |                         _mm_storeu_ps(dst + i + 0 * F, Activate<type>(sums[0], params, i + 0 * F));  | 
493  | 0  |                         _mm_storeu_ps(dst + i + 1 * F, Activate<type>(sums[1], params, i + 1 * F));  | 
494  | 0  |                         _mm_storeu_ps(dst + i + 2 * F, Activate<type>(sums[2], params, i + 2 * F));  | 
495  | 0  |                         _mm_storeu_ps(dst + i + 3 * F, Activate<type>(sums[3], params, i + 3 * F));  | 
496  | 0  |                     }  | 
497  | 0  |                     for (; i < size2F; i += 2 * F)  | 
498  | 0  |                     { | 
499  | 0  |                         __m128 sums[2];  | 
500  | 0  |                         if (bias)  | 
501  | 0  |                         { | 
502  | 0  |                             sums[0] = _mm_loadu_ps(bias + i + 0 * F);  | 
503  | 0  |                             sums[1] = _mm_loadu_ps(bias + i + 1 * F);  | 
504  | 0  |                         }  | 
505  | 0  |                         else  | 
506  | 0  |                         { | 
507  | 0  |                             sums[0] = _mm_setzero_ps();  | 
508  | 0  |                             sums[1] = _mm_setzero_ps();  | 
509  | 0  |                         }  | 
510  | 0  |                         for (size_t ky = 0; ky < p.kernelY; ++ky)  | 
511  | 0  |                         { | 
512  | 0  |                             size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;  | 
513  | 0  |                             if (sy < p.srcH)  | 
514  | 0  |                             { | 
515  | 0  |                                 for (size_t kx = 0; kx < p.kernelX; ++kx)  | 
516  | 0  |                                 { | 
517  | 0  |                                     size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;  | 
518  | 0  |                                     if (sx < p.srcW)  | 
519  | 0  |                                     { | 
520  | 0  |                                         const float * pw = weight + (ky*p.kernelX + kx)*size + i;  | 
521  | 0  |                                         const float * ps = src + (sy*p.srcW + sx)*size + i;  | 
522  | 0  |                                         sums[0] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * F), _mm_loadu_ps(pw + 0 * F)), sums[0]);  | 
523  | 0  |                                         sums[1] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * F), _mm_loadu_ps(pw + 1 * F)), sums[1]);  | 
524  | 0  |                                     }  | 
525  | 0  |                                 }  | 
526  | 0  |                             }  | 
527  | 0  |                         }  | 
528  | 0  |                         _mm_storeu_ps(dst + i + 0 * F, Activate<type>(sums[0], params, i + 0 * F));  | 
529  | 0  |                         _mm_storeu_ps(dst + i + 1 * F, Activate<type>(sums[1], params, i + 1 * F));  | 
530  | 0  |                     }  | 
531  | 0  |                     for (; i < size; i += F)  | 
532  | 0  |                     { | 
533  | 0  |                         size_t ci = i >= sizeF ? size - F : i;  | 
534  | 0  |                         __m128 sum = bias ? _mm_loadu_ps(bias + ci) : _mm_setzero_ps();  | 
535  | 0  |                         for (size_t ky = 0; ky < p.kernelY; ++ky)  | 
536  | 0  |                         { | 
537  | 0  |                             size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;  | 
538  | 0  |                             if (sy < p.srcH)  | 
539  | 0  |                             { | 
540  | 0  |                                 for (size_t kx = 0; kx < p.kernelX; ++kx)  | 
541  | 0  |                                 { | 
542  | 0  |                                     size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;  | 
543  | 0  |                                     if (sx < p.srcW)  | 
544  | 0  |                                     { | 
545  | 0  |                                         const float * pw = weight + (ky*p.kernelX + kx)*size + ci;  | 
546  | 0  |                                         const float * ps = src + (sy*p.srcW + sx)*size + ci;  | 
547  | 0  |                                         sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum);  | 
548  | 0  |                                     }  | 
549  | 0  |                                 }  | 
550  | 0  |                             }  | 
551  | 0  |                         }  | 
552  | 0  |                         _mm_storeu_ps(dst + ci, Activate<type>(sum, params, ci));  | 
553  | 0  |                     }  | 
554  | 0  |                     dst += p.dstC;  | 
555  | 0  |                 }  | 
556  | 0  |             }  | 
557  | 0  |         } Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, 
float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)  | 
558  |  |  | 
559  |  |         //-------------------------------------------------------------------------------------------------  | 
560  |  |  | 
561  |  |         template<SimdConvolutionActivationType type>  | 
562  |  |         SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(const float * src, const ConvParam & p, size_t dy, size_t dx, const float * weight, const float * bias, const float * params, float * dst)  | 
563  | 0  |         { | 
564  | 0  |             size_t srcC = p.srcC;  | 
565  | 0  |             size_t srcCF = AlignLo(srcC, F);  | 
566  | 0  |             size_t c = 0;  | 
567  | 0  |             for (; c < srcCF; c += F)  | 
568  | 0  |             { | 
569  | 0  |                 __m128 sum = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();  | 
570  | 0  |                 for (size_t ky = 0; ky < 3; ++ky)  | 
571  | 0  |                 { | 
572  | 0  |                     size_t sy = dy * p.strideY + ky - p.padY;  | 
573  | 0  |                     if (sy < p.srcH)  | 
574  | 0  |                     { | 
575  | 0  |                         for (size_t kx = 0; kx < 3; ++kx)  | 
576  | 0  |                         { | 
577  | 0  |                             size_t sx = dx * p.strideX + kx - p.padX;  | 
578  | 0  |                             if (sx < p.srcW)  | 
579  | 0  |                             { | 
580  | 0  |                                 const float * pw = weight + (ky * 3 + kx) * srcC;  | 
581  | 0  |                                 const float * ps = src + (sy*p.srcW + sx) * srcC;  | 
582  | 0  |                                 sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum);  | 
583  | 0  |                             }  | 
584  | 0  |                         }  | 
585  | 0  |                     }  | 
586  | 0  |                 }  | 
587  | 0  |                 _mm_storeu_ps(dst + c, Activate<type>(sum, params, c));  | 
588  | 0  |                 src += F;  | 
589  | 0  |                 weight += F;  | 
590  | 0  |             }  | 
591  | 0  |             if (c < srcC)  | 
592  | 0  |             { | 
593  | 0  |                 c = srcC - F;  | 
594  | 0  |                 src -= srcCF - c;  | 
595  | 0  |                 weight -= srcCF - c;  | 
596  | 0  |                 __m128 sum = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();  | 
597  | 0  |                 for (size_t ky = 0; ky < 3; ++ky)  | 
598  | 0  |                 { | 
599  | 0  |                     size_t sy = dy * p.strideY + ky - p.padY;  | 
600  | 0  |                     if (sy < p.srcH)  | 
601  | 0  |                     { | 
602  | 0  |                         for (size_t kx = 0; kx < 3; ++kx)  | 
603  | 0  |                         { | 
604  | 0  |                             size_t sx = dx * p.strideX + kx - p.padX;  | 
605  | 0  |                             if (sx < p.srcW)  | 
606  | 0  |                             { | 
607  | 0  |                                 const float * pw = weight + (ky * 3 + kx) * srcC;  | 
608  | 0  |                                 const float * ps = src + (sy*p.srcW + sx) * srcC;  | 
609  | 0  |                                 sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum);  | 
610  | 0  |                             }  | 
611  | 0  |                         }  | 
612  | 0  |                     }  | 
613  | 0  |                 }  | 
614  | 0  |                 _mm_storeu_ps(dst + c, Activate<type>(sum, params, c));  | 
615  | 0  |             }  | 
616  | 0  |         } Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)7>(float const*, 
Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)  | 
617  |  |  | 
618  |  |         template<::SimdConvolutionActivationType type>  | 
619  |  |         SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1(const float * src, size_t srcS, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)  | 
620  | 0  |         { | 
621  | 0  |             size_t srcCF = AlignLo(srcC, F);  | 
622  | 0  |             size_t c = 0;  | 
623  | 0  |             for (; c < srcCF; c += F)  | 
624  | 0  |             { | 
625  | 0  |                 __m128 sum = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();  | 
626  | 0  |                 for (size_t ky = 0; ky < 3; ++ky)  | 
627  | 0  |                 { | 
628  | 0  |                     const float * ps = src + ky * srcS;  | 
629  | 0  |                     const float * pw = weight + ky * 3 * srcC;  | 
630  | 0  |                     sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * srcC), _mm_loadu_ps(pw + 0 * srcC)), sum);  | 
631  | 0  |                     sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * srcC), _mm_loadu_ps(pw + 1 * srcC)), sum);  | 
632  | 0  |                     sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * srcC), _mm_loadu_ps(pw + 2 * srcC)), sum);  | 
633  | 0  |                 }  | 
634  | 0  |                 _mm_storeu_ps(dst + c, Activate<type>(sum, params, c));  | 
635  | 0  |                 src += F;  | 
636  | 0  |                 weight += F;  | 
637  | 0  |             }  | 
638  | 0  |             if (c < srcC)  | 
639  | 0  |             { | 
640  | 0  |                 c = srcC - F;  | 
641  | 0  |                 src -= srcCF - c;  | 
642  | 0  |                 weight -= srcCF - c;  | 
643  | 0  |                 __m128 sum = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();  | 
644  | 0  |                 for (size_t ky = 0; ky < 3; ++ky)  | 
645  | 0  |                 { | 
646  | 0  |                     const float * ps = src + ky * srcS;  | 
647  | 0  |                     const float * pw = weight + ky * 3 * srcC;  | 
648  | 0  |                     sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * srcC), _mm_loadu_ps(pw + 0 * srcC)), sum);  | 
649  | 0  |                     sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * srcC), _mm_loadu_ps(pw + 1 * srcC)), sum);  | 
650  | 0  |                     sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * srcC), _mm_loadu_ps(pw + 2 * srcC)), sum);  | 
651  | 0  |                 }  | 
652  | 0  |                 _mm_storeu_ps(dst + c, Activate<type>(sum, params, c));  | 
653  | 0  |             }  | 
654  | 0  |         } Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void 
Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)  | 
655  |  |  | 
656  |  |         template<::SimdConvolutionActivationType type>  | 
657  |  |         SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)  | 
658  | 0  |         { | 
659  | 0  |             size_t srcCF = AlignLo(srcC, F);  | 
660  | 0  |             size_t c = 0;  | 
661  | 0  |             __m128 sum0, sum1, w0;  | 
662  | 0  |             for (; c < srcCF; c += F)  | 
663  | 0  |             { | 
664  | 0  |                 sum0 = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();  | 
665  | 0  |                 sum1 = sum0;  | 
666  | 0  |                 const float * pw = weight + c;  | 
667  | 0  |                 for (size_t ky = 0; ky < 3; ++ky)  | 
668  | 0  |                 { | 
669  | 0  |                     const float * ps0 = src + ky * srcS;  | 
670  | 0  |                     const float * ps1 = ps0 + srcX;  | 
671  | 0  |                     w0 = _mm_loadu_ps(pw);  | 
672  | 0  |                     sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 0 * srcC), w0), sum0);  | 
673  | 0  |                     sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 0 * srcC), w0), sum1);  | 
674  | 0  |                     pw += srcC;  | 
675  | 0  |                     w0 = _mm_loadu_ps(pw);  | 
676  | 0  |                     sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 1 * srcC), w0), sum0);  | 
677  | 0  |                     sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 1 * srcC), w0), sum1);  | 
678  | 0  |                     pw += srcC;  | 
679  | 0  |                     w0 = _mm_loadu_ps(pw);  | 
680  | 0  |                     sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 2 * srcC), w0), sum0);  | 
681  | 0  |                     sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 2 * srcC), w0), sum1);  | 
682  | 0  |                     pw += srcC;  | 
683  | 0  |                 }  | 
684  | 0  |                 _mm_storeu_ps(dst + c, Activate<type>(sum0, params, c));  | 
685  | 0  |                 _mm_storeu_ps(dst + c + srcC, Activate<type>(sum1, params, c));  | 
686  | 0  |                 src += F;  | 
687  | 0  |             }  | 
688  | 0  |             if (c < srcC)  | 
689  | 0  |             { | 
690  | 0  |                 c = srcC - F;  | 
691  | 0  |                 src -= srcCF - c;  | 
692  | 0  |                 sum0 = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();  | 
693  | 0  |                 sum1 = sum0;  | 
694  | 0  |                 const float * pw = weight + c;  | 
695  | 0  |                 for (size_t ky = 0; ky < 3; ++ky)  | 
696  | 0  |                 { | 
697  | 0  |                     const float * ps0 = src + ky * srcS;  | 
698  | 0  |                     const float * ps1 = ps0 + srcX;  | 
699  | 0  |                     w0 = _mm_loadu_ps(pw);  | 
700  | 0  |                     sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 0 * srcC), w0), sum0);  | 
701  | 0  |                     sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 0 * srcC), w0), sum1);  | 
702  | 0  |                     pw += srcC;  | 
703  | 0  |                     w0 = _mm_loadu_ps(pw);  | 
704  | 0  |                     sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 1 * srcC), w0), sum0);  | 
705  | 0  |                     sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 1 * srcC), w0), sum1);  | 
706  | 0  |                     pw += srcC;  | 
707  | 0  |                     w0 = _mm_loadu_ps(pw);  | 
708  | 0  |                     sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 2 * srcC), w0), sum0);  | 
709  | 0  |                     sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 2 * srcC), w0), sum1);  | 
710  | 0  |                     pw += srcC;  | 
711  | 0  |                 }  | 
712  | 0  |                 _mm_storeu_ps(dst + c, Activate<type>(sum0, params, c));  | 
713  | 0  |                 _mm_storeu_ps(dst + c + srcC, Activate<type>(sum1, params, c));  | 
714  | 0  |             }  | 
715  | 0  |         } Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, unsigned long, float const*, float 
const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)  | 
716  |  |  | 
717  |  |         template<::SimdConvolutionActivationType type>  | 
718  |  |         SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)  | 
719  | 0  |         { | 
720  | 0  |             size_t srcCF = AlignLo(srcC, F);  | 
721  | 0  |             size_t c = 0;  | 
722  | 0  |             for (; c < srcCF; c += F)  | 
723  | 0  |             { | 
724  | 0  |                 __m128 sum0, sum1, sum2, sum3, w0;  | 
725  | 0  |                 sum0 = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();  | 
726  | 0  |                 sum1 = sum0;  | 
727  | 0  |                 sum2 = sum0;  | 
728  | 0  |                 sum3 = sum0;  | 
729  | 0  |                 const float * pw = weight + c;  | 
730  | 0  |                 const float * ps0 = src + 0 * srcX;  | 
731  | 0  |                 const float * ps1 = src + 1 * srcX;  | 
732  | 0  |                 const float * ps2 = src + 2 * srcX;  | 
733  | 0  |                 const float * ps3 = src + 3 * srcX;  | 
734  | 0  |                 for (size_t ky = 0; ky < 3; ++ky)  | 
735  | 0  |                 { | 
736  | 0  |                     size_t offset = ky * srcS;  | 
737  | 0  |                     w0 = _mm_loadu_ps(pw);  | 
738  | 0  |                     sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);  | 
739  | 0  |                     sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);  | 
740  | 0  |                     sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);  | 
741  | 0  |                     sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);  | 
742  | 0  |                     pw += srcC, offset += srcC;  | 
743  | 0  |                     w0 = _mm_loadu_ps(pw);  | 
744  | 0  |                     sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);  | 
745  | 0  |                     sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);  | 
746  | 0  |                     sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);  | 
747  | 0  |                     sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);  | 
748  | 0  |                     pw += srcC, offset += srcC;  | 
749  | 0  |                     w0 = _mm_loadu_ps(pw);  | 
750  | 0  |                     sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);  | 
751  | 0  |                     sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);  | 
752  | 0  |                     sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);  | 
753  | 0  |                     sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);  | 
754  | 0  |                     pw += srcC, offset += srcC;  | 
755  | 0  |                 }  | 
756  | 0  |                 _mm_storeu_ps(dst + 0 * srcC, Activate<type>(sum0, params, c));  | 
757  | 0  |                 _mm_storeu_ps(dst + 1 * srcC, Activate<type>(sum1, params, c));  | 
758  | 0  |                 _mm_storeu_ps(dst + 2 * srcC, Activate<type>(sum2, params, c));  | 
759  | 0  |                 _mm_storeu_ps(dst + 3 * srcC, Activate<type>(sum3, params, c));  | 
760  | 0  |                 src += F;  | 
761  | 0  |                 dst += F;  | 
762  | 0  |             }  | 
763  | 0  |             if (c < srcC)  | 
764  | 0  |             { | 
765  | 0  |                 c = srcC - F;  | 
766  | 0  |                 src -= srcCF - c;  | 
767  | 0  |                 dst -= srcCF - c;  | 
768  | 0  |                 __m128 sum0, sum1, sum2, sum3, w0;  | 
769  | 0  |                 sum0 = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();  | 
770  | 0  |                 sum1 = sum0;  | 
771  | 0  |                 sum2 = sum0;  | 
772  | 0  |                 sum3 = sum0;  | 
773  | 0  |                 const float * pw = weight + c;  | 
774  | 0  |                 const float * ps0 = src + 0 * srcX;  | 
775  | 0  |                 const float * ps1 = src + 1 * srcX;  | 
776  | 0  |                 const float * ps2 = src + 2 * srcX;  | 
777  | 0  |                 const float * ps3 = src + 3 * srcX;  | 
778  | 0  |                 for (size_t ky = 0; ky < 3; ++ky)  | 
779  | 0  |                 { | 
780  | 0  |                     size_t offset = ky * srcS;  | 
781  | 0  |                     w0 = _mm_loadu_ps(pw);  | 
782  | 0  |                     sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);  | 
783  | 0  |                     sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);  | 
784  | 0  |                     sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);  | 
785  | 0  |                     sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);  | 
786  | 0  |                     pw += srcC, offset += srcC;  | 
787  | 0  |                     w0 = _mm_loadu_ps(pw);  | 
788  | 0  |                     sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);  | 
789  | 0  |                     sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);  | 
790  | 0  |                     sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);  | 
791  | 0  |                     sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);  | 
792  | 0  |                     pw += srcC, offset += srcC;  | 
793  | 0  |                     w0 = _mm_loadu_ps(pw);  | 
794  | 0  |                     sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);  | 
795  | 0  |                     sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);  | 
796  | 0  |                     sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);  | 
797  | 0  |                     sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);  | 
798  | 0  |                     pw += srcC, offset += srcC;  | 
799  | 0  |                 }  | 
800  | 0  |                 _mm_storeu_ps(dst + 0 * srcC, Activate<type>(sum0, params, c));  | 
801  | 0  |                 _mm_storeu_ps(dst + 1 * srcC, Activate<type>(sum1, params, c));  | 
802  | 0  |                 _mm_storeu_ps(dst + 2 * srcC, Activate<type>(sum2, params, c));  | 
803  | 0  |                 _mm_storeu_ps(dst + 3 * srcC, Activate<type>(sum3, params, c));  | 
804  | 0  |             }  | 
805  | 0  |         } Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, unsigned long, float const*, float 
const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)  | 
806  |  |  | 
807  |  |         template<::SimdConvolutionActivationType type>  | 
808  |  |         SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4(const float * src, const ConvParam & p, size_t dy, size_t dx, const __m128 * weight, __m128 bias, const float * params, float * dst)  | 
809  | 0  |         { | 
810  | 0  |             __m128 sum = bias;  | 
811  | 0  |             for (size_t ky = 0; ky < 3; ++ky)  | 
812  | 0  |             { | 
813  | 0  |                 size_t sy = dy * p.strideY + ky - p.padY;  | 
814  | 0  |                 if (sy < p.srcH)  | 
815  | 0  |                 { | 
816  | 0  |                     for (size_t kx = 0; kx < 3; ++kx)  | 
817  | 0  |                     { | 
818  | 0  |                         size_t sx = dx * p.strideX + kx - p.padX;  | 
819  | 0  |                         if (sx < p.srcW)  | 
820  | 0  |                         { | 
821  | 0  |                             const float * ps = src + (sy*p.srcW + sx) * F;  | 
822  | 0  |                             sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), weight[ky * 3 + kx]), sum);  | 
823  | 0  |                         }  | 
824  | 0  |                     }  | 
825  | 0  |                 }  | 
826  | 0  |             }  | 
827  | 0  |             _mm_storeu_ps(dst, Activate<type>(sum, params, 0));  | 
828  | 0  |         } Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void 
Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)  | 
829  |  |  | 
830  |  |         template<::SimdConvolutionActivationType type>  | 
831  |  |         SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1(const float * src, size_t srcS, const __m128 * weight, __m128 bias, const float * params, float * dst)  | 
832  | 0  |         { | 
833  | 0  |             __m128 sum = bias;  | 
834  | 0  |             sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 0 * F), weight[0]), sum);  | 
835  | 0  |             sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 1 * F), weight[1]), sum);  | 
836  | 0  |             sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 2 * F), weight[2]), sum);  | 
837  | 0  |             src += srcS;  | 
838  | 0  |             sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 0 * F), weight[3]), sum);  | 
839  | 0  |             sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 1 * F), weight[4]), sum);  | 
840  | 0  |             sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 2 * F), weight[5]), sum);  | 
841  | 0  |             src += srcS;  | 
842  | 0  |             sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 0 * F), weight[6]), sum);  | 
843  | 0  |             sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 1 * F), weight[7]), sum);  | 
844  | 0  |             sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 2 * F), weight[8]), sum);  | 
845  | 0  |             _mm_storeu_ps(dst, Activate<type>(sum, params, 0));  | 
846  | 0  |         } Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)0>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)1>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)2>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)3>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)4>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)5>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)6>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)7>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void 
Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)8>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)9>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)10>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)  | 
847  |  |  | 
848  |  |         template<::SimdConvolutionActivationType type>  | 
849  |  |         SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2(const float * src, size_t srcS, const __m128 * weight, __m128 bias, const float * params, float * dst)  | 
850  | 0  |         { | 
851  | 0  |             __m128 sum0 = bias;  | 
852  | 0  |             __m128 sum1 = bias;  | 
853  | 0  |             for (size_t ky = 0; ky < 3; ++ky)  | 
854  | 0  |             { | 
855  | 0  |                 __m128 s0 = _mm_loadu_ps(src + 0 * F);  | 
856  | 0  |                 __m128 s1 = _mm_loadu_ps(src + 1 * F);  | 
857  | 0  |                 __m128 s2 = _mm_loadu_ps(src + 2 * F);  | 
858  | 0  |                 __m128 s3 = _mm_loadu_ps(src + 3 * F);  | 
859  | 0  |                 sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[0]), sum0);  | 
860  | 0  |                 sum1 = _mm_add_ps(_mm_mul_ps(s1, weight[0]), sum1);  | 
861  | 0  |                 sum0 = _mm_add_ps(_mm_mul_ps(s1, weight[1]), sum0);  | 
862  | 0  |                 sum1 = _mm_add_ps(_mm_mul_ps(s2, weight[1]), sum1);  | 
863  | 0  |                 sum0 = _mm_add_ps(_mm_mul_ps(s2, weight[2]), sum0);  | 
864  | 0  |                 sum1 = _mm_add_ps(_mm_mul_ps(s3, weight[2]), sum1);  | 
865  | 0  |                 src += srcS;  | 
866  | 0  |                 weight += 3;  | 
867  | 0  |             }  | 
868  | 0  |             _mm_storeu_ps(dst + 0, Activate<type>(sum0, params, 0));  | 
869  | 0  |             _mm_storeu_ps(dst + F, Activate<type>(sum1, params, 0));  | 
870  | 0  |         } Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)0>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)1>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)2>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)3>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)4>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)5>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)6>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)7>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void 
Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)8>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)9>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)10>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)  | 
871  |  |  | 
872  |  |         template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)  | 
873  | 0  |         { | 
874  | 0  |             size_t srcS = p.srcC*p.srcW;  | 
875  | 0  |             size_t srcX = p.srcC*p.strideX;  | 
876  | 0  |             size_t dstH = p.dstH - p.padH;  | 
877  | 0  |             size_t dstW = p.dstW - p.padW;  | 
878  | 0  |             size_t dstW2 = AlignLo(dstW - p.padX, 2) + p.padX;  | 
879  | 0  |             size_t dstW4 = AlignLo(dstW - p.padX, 4) + p.padX;  | 
880  | 0  |             if (p.dstC == F && p.strideX == 1)  | 
881  | 0  |             { | 
882  | 0  |                 __m128 _weight[9];  | 
883  | 0  |                 for (size_t i = 0; i < 9; ++i)  | 
884  | 0  |                     _weight[i] = _mm_loadu_ps(weight + i * F);  | 
885  | 0  |                 __m128 _bias = bias ? _mm_loadu_ps(bias) : _mm_setzero_ps();  | 
886  | 0  |                 size_t dy = 0;  | 
887  | 0  |                 for (; dy < p.padY; ++dy)  | 
888  | 0  |                     for (size_t dx = 0; dx < p.dstW; ++dx)  | 
889  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;  | 
890  | 0  |                 for (; dy < dstH; ++dy)  | 
891  | 0  |                 { | 
892  | 0  |                     size_t dx = 0;  | 
893  | 0  |                     for (; dx < p.padX; ++dx)  | 
894  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;  | 
895  | 0  |                     size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC;  | 
896  | 0  |                     for (; dx < dstW2; dx += 2)  | 
897  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<type>(src + offset, srcS, _weight, _bias, params, dst), offset += 2 * F, dst += 2 * F;  | 
898  | 0  |                     for (; dx < dstW; ++dx)  | 
899  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<type>(src + offset, srcS, _weight, _bias, params, dst), offset += F, dst += F;  | 
900  | 0  |                     for (; dx < p.dstW; ++dx)  | 
901  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;  | 
902  | 0  |                 }  | 
903  | 0  |                 for (; dy < p.dstH; ++dy)  | 
904  | 0  |                     for (size_t dx = 0; dx < p.dstW; ++dx)  | 
905  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;  | 
906  | 0  |             }  | 
907  | 0  |             else  | 
908  | 0  |             { | 
909  | 0  |                 size_t dy = 0;  | 
910  | 0  |                 for (; dy < p.padY; ++dy)  | 
911  | 0  |                     for (size_t dx = 0; dx < p.dstW; ++dx)  | 
912  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;  | 
913  | 0  |                 for (; dy < dstH; ++dy)  | 
914  | 0  |                 { | 
915  | 0  |                     size_t dx = 0;  | 
916  | 0  |                     for (; dx < p.padX; ++dx)  | 
917  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;  | 
918  | 0  |                     size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC;  | 
919  | 0  |                     for (; dx < dstW4; dx += 4)  | 
920  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<type>(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 4 * p.dstC, offset += 4 * srcX;  | 
921  | 0  |                     for (; dx < dstW2; dx += 2)  | 
922  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<type>(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 2 * p.dstC, offset += 2 * srcX;  | 
923  | 0  |                     for (; dx < dstW; ++dx)  | 
924  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<type>(src + offset, srcS, p.srcC, weight, bias, params, dst), dst += p.dstC, offset += srcX;  | 
925  | 0  |                     for (; dx < p.dstW; ++dx)  | 
926  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;  | 
927  | 0  |                 }  | 
928  | 0  |                 for (; dy < p.dstH; ++dy)  | 
929  | 0  |                     for (size_t dx = 0; dx < p.dstW; ++dx)  | 
930  | 0  |                         ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;  | 
931  | 0  |             }  | 
932  | 0  |         } Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, 
float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)  | 
933  |  |  | 
934  |  |         template <::SimdConvolutionActivationType type> SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr GetConvolutionBiasActivation(const ConvParam & p)  | 
935  | 0  |         { | 
936  | 0  |             if (p.group == 1)  | 
937  | 0  |                 return ConvolutionDirectNhwcConvolutionBiasActivationDefault<type>;  | 
938  | 0  |             else if (p.IsDepthwise())  | 
939  | 0  |             { | 
940  | 0  |                 if (p.IsKernel(3) && p.IsDilation(1))  | 
941  | 0  |                     return ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<type>;  | 
942  | 0  |                 else  | 
943  | 0  |                     return ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<type>;  | 
944  | 0  |             }  | 
945  | 0  |             return NULL;  | 
946  | 0  |         } Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)0>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)1>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)2>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)3>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)4>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)5>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)6>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)7>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)8>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, 
float const*, float*) Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)9>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)10>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)  | 
947  |  |  | 
948  |  |         SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation()  | 
949  | 0  |         { | 
950  | 0  |             const ConvParam & p = _param;  | 
951  | 0  |             SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr func = NULL;  | 
952  | 0  |             if (p.dstC >= F && p.dstH >= p.padY + p.padH && p.dstW >= p.padX + p.padW)  | 
953  | 0  |             { | 
954  | 0  |                 switch (p.activation)  | 
955  | 0  |                 { | 
956  | 0  |                 case ::SimdConvolutionActivationIdentity: func = GetConvolutionBiasActivation<::SimdConvolutionActivationIdentity>(p); break;  | 
957  | 0  |                 case ::SimdConvolutionActivationRelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationRelu>(p); break;  | 
958  | 0  |                 case ::SimdConvolutionActivationLeakyRelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationLeakyRelu>(p); break;  | 
959  | 0  |                 case ::SimdConvolutionActivationRestrictRange: func = GetConvolutionBiasActivation<::SimdConvolutionActivationRestrictRange>(p); break;  | 
960  | 0  |                 case ::SimdConvolutionActivationPrelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationPrelu>(p); break;  | 
961  | 0  |                 case ::SimdConvolutionActivationElu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationElu>(p); break;  | 
962  | 0  |                 case ::SimdConvolutionActivationHswish: func = GetConvolutionBiasActivation<::SimdConvolutionActivationHswish>(p); break;  | 
963  | 0  |                 case ::SimdConvolutionActivationMish: func = GetConvolutionBiasActivation<::SimdConvolutionActivationMish>(p); break;  | 
964  | 0  |                 case ::SimdConvolutionActivationHardSigmoid: func = GetConvolutionBiasActivation<::SimdConvolutionActivationHardSigmoid>(p); break;  | 
965  | 0  |                 case ::SimdConvolutionActivationSwish: func = GetConvolutionBiasActivation<::SimdConvolutionActivationSwish>(p); break;  | 
966  | 0  |                 case ::SimdConvolutionActivationGelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationGelu>(p); break;  | 
967  | 0  |                 }  | 
968  | 0  |             }  | 
969  | 0  |             return func ? func : Base::SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation();  | 
970  | 0  |         };  | 
971  |  |     }  | 
972  |  | #endif  | 
973  |  | }  |