Coverage Report

Created: 2024-10-01 06:54

/src/Simd/src/Simd/SimdSse41SynetConvolution32fDirectNhwc.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2024 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdSynetConvolution32f.h"
25
#include "Simd/SimdSynetConvolution32fCommon.h"
26
#include "Simd/SimdExtract.h"
27
#include "Simd/SimdSynet.h"
28
#include "Simd/SimdGemm.h"
29
#include "Simd/SimdExp.h"
30
31
namespace Simd
32
{
33
#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)   
34
    namespace Sse41
35
    {
36
        // Constructs the SSE4.1 direct NHWC convolution object.
        // The Base implementation is initialised with the same parameters; afterwards the
        // _convolutionBiasActivation member is overwritten with whatever
        // SetConvolutionBiasActivation() returns.
        SynetConvolution32fDirectNhwc::SynetConvolution32fDirectNhwc(const ConvParam & p)
            : Base::SynetConvolution32fDirectNhwc(p)
        {
            this->_convolutionBiasActivation = this->SetConvolutionBiasActivation();
        }
41
42
        bool SynetConvolution32fDirectNhwc::Preferable(const ConvParam & p)
43
0
        {
44
0
            if (!p.IsDilation(1) || p.trans == 0)
45
0
                return false;
46
0
            if (p.group == 1)
47
0
            {
48
0
                if (p.kernelY > p.srcH || p.kernelX > p.srcW)
49
0
                    return false;
50
0
                double k = double(p.srcC) / p.kernelX / p.kernelY;
51
0
                return k < 2.0;
52
0
            }
53
0
            else if (p.IsDepthwise())
54
0
            {
55
0
                return true;
56
0
            }
57
0
            return false;
58
0
        }
59
60
        // Accumulates into 'sum' the contribution of a kH x kW window of 'src' for one
        // vector of F consecutive output channels.
        //
        // src    - first source value of the window; rows are p.srcW * p.srcC floats apart.
        // kH, kW - number of kernel rows / columns that are actually applied.
        // weight - weights of the first applied tap for the processed output channels;
        //          consecutive taps are p.dstC floats apart.
        // sum    - accumulator, updated in place.
        SIMD_INLINE void KernelHwcDefaultEdge(const float * src, const ConvParam & p, size_t kH, size_t kW, const float * weight, __m128 & sum)
        {
            const size_t rowTaps = kW * p.srcC;
            const size_t weightStep = p.dstC;
            // Weights of the kernel columns that are not applied in this window.
            const size_t skippedWeights = (p.kernelX - kW) * p.srcC * p.dstC;
            const size_t srcRowStride = p.srcW * p.srcC;

            const float * row = src;
            for (size_t ky = 0; ky != kH; ++ky, row += srcRowStride)
            {
                for (const float * value = row, * rowEnd = row + rowTaps; value != rowEnd; ++value)
                {
                    const __m128 broadcast = _mm_set1_ps(*value);
                    sum = _mm_add_ps(_mm_mul_ps(broadcast, _mm_loadu_ps(weight)), sum);
                    weight += weightStep;
                }
                weight += skippedWeights;
            }
        }
71
72
        template<::SimdConvolutionActivationType type>
73
        SIMD_INLINE void KernelHwcDefaultEdge(const float * src, const ConvParam & p, size_t kH, size_t kW, const float * weight, const float * bias, const float * params, float * dst)
74
0
        {
75
0
            size_t dstC = p.dstC;
76
0
            size_t dstCF = AlignLo(dstC, F);
77
0
            size_t dc = 0;
78
0
            for (; dc < dstCF; dc += F)
79
0
            {
80
0
                __m128 conv = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps();
81
0
                KernelHwcDefaultEdge(src, p, kH, kW, weight + dc, conv);
82
0
                _mm_storeu_ps(dst + dc, Activate<type>(conv, params, dc));
83
0
            }
84
0
            if (dc < dstC)
85
0
            {
86
0
                dc = dstC - F;
87
0
                __m128 conv = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps();
88
0
                KernelHwcDefaultEdge(src, p, kH, kW, weight + dc, conv);
89
0
                _mm_storeu_ps(dst + dc, Activate<type>(conv, params, dc));
90
0
            }
91
0
        }
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultEdge<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
92
93
        // Accumulates the full p.kernelY x p.kernelX convolution for 2 horizontally adjacent
        // output pixels and 2 vectors (2 * F) of output channels.
        //
        // src    - source value under the first kernel tap of the first output pixel; the
        //          second pixel starts p.srcC * p.strideX floats further.
        // weight - weights of the first tap for the processed channels; taps are p.dstC
        //          floats apart.
        // sums   - accumulators indexed as [pixel][channel vector], updated in place.
        SIMD_INLINE void KernelHwcDefaultBody2x2(const float * src, const ConvParam & p, const float * weight, __m128 sums[2][2])
        {
            const size_t taps = p.kernelX * p.srcC;
            const size_t weightStep = p.dstC;
            const size_t rowStride = p.srcW * p.srcC;
            const size_t pixelStep = p.srcC * p.strideX;
            for (size_t ky = 0; ky < p.kernelY; ++ky)
            {
                const float * pixel0 = src + ky * rowStride;
                const float * pixel1 = pixel0 + pixelStep;
                for (size_t i = 0; i < taps; ++i, weight += weightStep)
                {
                    const __m128 wLo = _mm_loadu_ps(weight + 0 * F);
                    const __m128 wHi = _mm_loadu_ps(weight + 1 * F);
                    __m128 value = _mm_set1_ps(pixel0[i]);
                    sums[0][0] = _mm_add_ps(_mm_mul_ps(value, wLo), sums[0][0]);
                    sums[0][1] = _mm_add_ps(_mm_mul_ps(value, wHi), sums[0][1]);
                    value = _mm_set1_ps(pixel1[i]);
                    sums[1][0] = _mm_add_ps(_mm_mul_ps(value, wLo), sums[1][0]);
                    sums[1][1] = _mm_add_ps(_mm_mul_ps(value, wHi), sums[1][1]);
                }
            }
        }
116
117
        // Accumulates the full p.kernelY x p.kernelX convolution for 2 horizontally adjacent
        // output pixels and a single vector (F) of output channels.
        //
        // src    - source value under the first kernel tap of the first output pixel; the
        //          second pixel starts p.srcC * p.strideX floats further.
        // weight - weights of the first tap for the processed channels; taps are p.dstC
        //          floats apart.
        // sums   - accumulators indexed as [pixel][0], updated in place.
        SIMD_INLINE void KernelHwcDefaultBody2x1(const float * src, const ConvParam & p, const float * weight, __m128 sums[2][1])
        {
            const size_t taps = p.kernelX * p.srcC;
            const size_t weightStep = p.dstC;
            const size_t rowStride = p.srcW * p.srcC;
            const size_t pixelStep = p.srcC * p.strideX;
            for (size_t ky = 0; ky < p.kernelY; ++ky)
            {
                const float * pixel0 = src + ky * rowStride;
                const float * pixel1 = pixel0 + pixelStep;
                for (size_t i = 0; i < taps; ++i, weight += weightStep)
                {
                    const __m128 w = _mm_loadu_ps(weight + 0 * F);
                    __m128 value = _mm_set1_ps(pixel0[i]);
                    sums[0][0] = _mm_add_ps(_mm_mul_ps(value, w), sums[0][0]);
                    value = _mm_set1_ps(pixel1[i]);
                    sums[1][0] = _mm_add_ps(_mm_mul_ps(value, w), sums[1][0]);
                }
            }
        }
137
138
        template<::SimdConvolutionActivationType type>
139
        SIMD_INLINE void KernelHwcDefaultBody2(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)
140
0
        {
141
0
            size_t dstC = p.dstC;
142
0
            size_t dstCF1 = AlignLo(dstC, 1 * F);
143
0
            size_t dstCF2 = AlignLo(dstC, 2 * F);
144
0
            size_t dc = 0;
145
0
            for (; dc < dstCF2; dc += 2 * F)
146
0
            {
147
0
                __m128 sums[2][2];
148
0
                __m128 bias0 = bias ? _mm_loadu_ps(bias + dc + 0 * F) : _mm_setzero_ps();
149
0
                __m128 bias1 = bias ? _mm_loadu_ps(bias + dc + 1 * F) : _mm_setzero_ps();
150
0
                sums[0][0] = bias0;
151
0
                sums[0][1] = bias1;
152
0
                sums[1][0] = bias0;
153
0
                sums[1][1] = bias1;
154
0
                KernelHwcDefaultBody2x2(src, p, weight + dc, sums);
155
0
                _mm_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate<type>(sums[0][0], params, dc + 0 * F));
156
0
                _mm_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate<type>(sums[0][1], params, dc + 1 * F));
157
0
                _mm_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate<type>(sums[1][0], params, dc + 0 * F));
158
0
                _mm_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate<type>(sums[1][1], params, dc + 1 * F));
159
0
            }
160
0
            for (; dc < dstCF1; dc += 1 * F)
161
0
            {
162
0
                __m128 sums[2][1];
163
0
                __m128 bias0 = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps();
164
0
                sums[0][0] = bias0;
165
0
                sums[1][0] = bias0;
166
0
                KernelHwcDefaultBody2x1(src, p, weight + dc, sums);
167
0
                _mm_storeu_ps(dst + dc + 0 * dstC, Activate<type>(sums[0][0], params, dc));
168
0
                _mm_storeu_ps(dst + dc + 1 * dstC, Activate<type>(sums[1][0], params, dc));
169
0
            }
170
0
            if (dc < dstC)
171
0
            {
172
0
                dc = dstC - F;
173
0
                __m128 sums[2][1];
174
0
                __m128 bias0 = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps();
175
0
                sums[0][0] = bias0;
176
0
                sums[1][0] = bias0;
177
0
                KernelHwcDefaultBody2x1(src, p, weight + dc, sums);
178
0
                _mm_storeu_ps(dst + dc + 0 * dstC, Activate<type>(sums[0][0], params, dc));
179
0
                _mm_storeu_ps(dst + dc + 1 * dstC, Activate<type>(sums[1][0], params, dc));
180
0
            }
181
0
        }
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody2<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
182
183
        // Accumulates the full p.kernelY x p.kernelX convolution for 6 horizontally adjacent
        // output pixels and 2 vectors (2 * F) of output channels.
        //
        // src    - source value under the first kernel tap of the first output pixel;
        //          consecutive pixels start p.srcC * p.strideX floats apart.
        // weight - weights of the first tap for the processed channels; taps are p.dstC
        //          floats apart.
        // sums   - accumulators indexed as [pixel][channel vector], updated in place.
        SIMD_INLINE void KernelHwcDefaultBody6x2(const float * src, const ConvParam & p, const float * weight, __m128 sums[6][2])
        {
            const size_t taps = p.kernelX * p.srcC;
            const size_t weightStep = p.dstC;
            const size_t rowStride = p.srcW * p.srcC;
            const size_t pixelStep = p.srcC * p.strideX;
            for (size_t ky = 0; ky < p.kernelY; ++ky)
            {
                const float * pixel0 = src + ky * rowStride;
                const float * pixel1 = pixel0 + pixelStep;
                const float * pixel2 = pixel1 + pixelStep;
                const float * pixel3 = pixel2 + pixelStep;
                const float * pixel4 = pixel3 + pixelStep;
                const float * pixel5 = pixel4 + pixelStep;
                for (size_t i = 0; i < taps; ++i, weight += weightStep)
                {
                    const __m128 wLo = _mm_loadu_ps(weight + 0 * F);
                    const __m128 wHi = _mm_loadu_ps(weight + 1 * F);
                    __m128 value = _mm_set1_ps(pixel0[i]);
                    sums[0][0] = _mm_add_ps(_mm_mul_ps(value, wLo), sums[0][0]);
                    sums[0][1] = _mm_add_ps(_mm_mul_ps(value, wHi), sums[0][1]);
                    value = _mm_set1_ps(pixel1[i]);
                    sums[1][0] = _mm_add_ps(_mm_mul_ps(value, wLo), sums[1][0]);
                    sums[1][1] = _mm_add_ps(_mm_mul_ps(value, wHi), sums[1][1]);
                    value = _mm_set1_ps(pixel2[i]);
                    sums[2][0] = _mm_add_ps(_mm_mul_ps(value, wLo), sums[2][0]);
                    sums[2][1] = _mm_add_ps(_mm_mul_ps(value, wHi), sums[2][1]);
                    value = _mm_set1_ps(pixel3[i]);
                    sums[3][0] = _mm_add_ps(_mm_mul_ps(value, wLo), sums[3][0]);
                    sums[3][1] = _mm_add_ps(_mm_mul_ps(value, wHi), sums[3][1]);
                    value = _mm_set1_ps(pixel4[i]);
                    sums[4][0] = _mm_add_ps(_mm_mul_ps(value, wLo), sums[4][0]);
                    sums[4][1] = _mm_add_ps(_mm_mul_ps(value, wHi), sums[4][1]);
                    value = _mm_set1_ps(pixel5[i]);
                    sums[5][0] = _mm_add_ps(_mm_mul_ps(value, wLo), sums[5][0]);
                    sums[5][1] = _mm_add_ps(_mm_mul_ps(value, wHi), sums[5][1]);
                }
            }
        }
222
223
        // Accumulates the full p.kernelY x p.kernelX convolution for 6 horizontally adjacent
        // output pixels and a single vector (F) of output channels.
        //
        // src    - source value under the first kernel tap of the first output pixel;
        //          consecutive pixels start p.srcC * p.strideX floats apart.
        // weight - weights of the first tap for the processed channels; taps are p.dstC
        //          floats apart.
        // sums   - accumulators indexed as [pixel][0], updated in place.
        SIMD_INLINE void KernelHwcDefaultBody6x1(const float * src, const ConvParam & p, const float * weight, __m128 sums[6][1])
        {
            const size_t taps = p.kernelX * p.srcC;
            const size_t weightStep = p.dstC;
            const size_t rowStride = p.srcW * p.srcC;
            const size_t pixelStep = p.srcC * p.strideX;
            for (size_t ky = 0; ky < p.kernelY; ++ky)
            {
                const float * pixel0 = src + ky * rowStride;
                const float * pixel1 = pixel0 + pixelStep;
                const float * pixel2 = pixel1 + pixelStep;
                const float * pixel3 = pixel2 + pixelStep;
                const float * pixel4 = pixel3 + pixelStep;
                const float * pixel5 = pixel4 + pixelStep;
                for (size_t i = 0; i < taps; ++i, weight += weightStep)
                {
                    const __m128 w = _mm_loadu_ps(weight + 0 * F);
                    __m128 value = _mm_set1_ps(pixel0[i]);
                    sums[0][0] = _mm_add_ps(_mm_mul_ps(value, w), sums[0][0]);
                    value = _mm_set1_ps(pixel1[i]);
                    sums[1][0] = _mm_add_ps(_mm_mul_ps(value, w), sums[1][0]);
                    value = _mm_set1_ps(pixel2[i]);
                    sums[2][0] = _mm_add_ps(_mm_mul_ps(value, w), sums[2][0]);
                    value = _mm_set1_ps(pixel3[i]);
                    sums[3][0] = _mm_add_ps(_mm_mul_ps(value, w), sums[3][0]);
                    value = _mm_set1_ps(pixel4[i]);
                    sums[4][0] = _mm_add_ps(_mm_mul_ps(value, w), sums[4][0]);
                    value = _mm_set1_ps(pixel5[i]);
                    sums[5][0] = _mm_add_ps(_mm_mul_ps(value, w), sums[5][0]);
                }
            }
        }
255
256
        template<::SimdConvolutionActivationType type>
257
        SIMD_INLINE void KernelHwcDefaultBody6(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)
258
0
        {
259
0
            size_t dstC = p.dstC;
260
0
            size_t dstCF1 = AlignLo(dstC, 1 * F);
261
0
            size_t dstCF2 = AlignLo(dstC, 2 * F);
262
0
            size_t dc = 0;
263
0
            for (; dc < dstCF2; dc += 2 * F)
264
0
            {
265
0
                __m128 sums[6][2];
266
0
                __m128 bias0 = bias ? _mm_loadu_ps(bias + dc + 0 * F) : _mm_setzero_ps();
267
0
                __m128 bias1 = bias ? _mm_loadu_ps(bias + dc + 1 * F) : _mm_setzero_ps();
268
0
                sums[0][0] = bias0;
269
0
                sums[0][1] = bias1;
270
0
                sums[1][0] = bias0;
271
0
                sums[1][1] = bias1;
272
0
                sums[2][0] = bias0;
273
0
                sums[2][1] = bias1;
274
0
                sums[3][0] = bias0;
275
0
                sums[3][1] = bias1;
276
0
                sums[4][0] = bias0;
277
0
                sums[4][1] = bias1;
278
0
                sums[5][0] = bias0;
279
0
                sums[5][1] = bias1;
280
0
                KernelHwcDefaultBody6x2(src, p, weight + dc, sums);
281
0
                _mm_storeu_ps(dst + dc + 0 * dstC + 0 * F, Activate<type>(sums[0][0], params, dc + 0 * F));
282
0
                _mm_storeu_ps(dst + dc + 0 * dstC + 1 * F, Activate<type>(sums[0][1], params, dc + 1 * F));
283
0
                _mm_storeu_ps(dst + dc + 1 * dstC + 0 * F, Activate<type>(sums[1][0], params, dc + 0 * F));
284
0
                _mm_storeu_ps(dst + dc + 1 * dstC + 1 * F, Activate<type>(sums[1][1], params, dc + 1 * F));
285
0
                _mm_storeu_ps(dst + dc + 2 * dstC + 0 * F, Activate<type>(sums[2][0], params, dc + 0 * F));
286
0
                _mm_storeu_ps(dst + dc + 2 * dstC + 1 * F, Activate<type>(sums[2][1], params, dc + 1 * F));
287
0
                _mm_storeu_ps(dst + dc + 3 * dstC + 0 * F, Activate<type>(sums[3][0], params, dc + 0 * F));
288
0
                _mm_storeu_ps(dst + dc + 3 * dstC + 1 * F, Activate<type>(sums[3][1], params, dc + 1 * F));
289
0
                _mm_storeu_ps(dst + dc + 4 * dstC + 0 * F, Activate<type>(sums[4][0], params, dc + 0 * F));
290
0
                _mm_storeu_ps(dst + dc + 4 * dstC + 1 * F, Activate<type>(sums[4][1], params, dc + 1 * F));
291
0
                _mm_storeu_ps(dst + dc + 5 * dstC + 0 * F, Activate<type>(sums[5][0], params, dc + 0 * F));
292
0
                _mm_storeu_ps(dst + dc + 5 * dstC + 1 * F, Activate<type>(sums[5][1], params, dc + 1 * F));
293
0
            }
294
0
            for (; dc < dstCF1; dc += 1 * F)
295
0
            {
296
0
                __m128 sums[6][1];
297
0
                __m128 bias0 = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps();
298
0
                sums[0][0] = bias0;
299
0
                sums[1][0] = bias0;
300
0
                sums[2][0] = bias0;
301
0
                sums[3][0] = bias0;
302
0
                sums[4][0] = bias0;
303
0
                sums[5][0] = bias0;
304
0
                KernelHwcDefaultBody6x1(src, p, weight + dc, sums);
305
0
                _mm_storeu_ps(dst + dc + 0 * dstC, Activate<type>(sums[0][0], params, dc));
306
0
                _mm_storeu_ps(dst + dc + 1 * dstC, Activate<type>(sums[1][0], params, dc));
307
0
                _mm_storeu_ps(dst + dc + 2 * dstC, Activate<type>(sums[2][0], params, dc));
308
0
                _mm_storeu_ps(dst + dc + 3 * dstC, Activate<type>(sums[3][0], params, dc));
309
0
                _mm_storeu_ps(dst + dc + 4 * dstC, Activate<type>(sums[4][0], params, dc));
310
0
                _mm_storeu_ps(dst + dc + 5 * dstC, Activate<type>(sums[5][0], params, dc));
311
0
            }
312
0
            if (dc < dstC)
313
0
            {
314
0
                dc = dstC - F;
315
0
                __m128 sums[6][1];
316
0
                __m128 bias0 = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps();
317
0
                sums[0][0] = bias0;
318
0
                sums[1][0] = bias0;
319
0
                sums[2][0] = bias0;
320
0
                sums[3][0] = bias0;
321
0
                sums[4][0] = bias0;
322
0
                sums[5][0] = bias0;
323
0
                KernelHwcDefaultBody6x1(src, p, weight + dc, sums);
324
0
                _mm_storeu_ps(dst + dc + 0 * dstC, Activate<type>(sums[0][0], params, dc));
325
0
                _mm_storeu_ps(dst + dc + 1 * dstC, Activate<type>(sums[1][0], params, dc));
326
0
                _mm_storeu_ps(dst + dc + 2 * dstC, Activate<type>(sums[2][0], params, dc));
327
0
                _mm_storeu_ps(dst + dc + 3 * dstC, Activate<type>(sums[3][0], params, dc));
328
0
                _mm_storeu_ps(dst + dc + 4 * dstC, Activate<type>(sums[4][0], params, dc));
329
0
                _mm_storeu_ps(dst + dc + 5 * dstC, Activate<type>(sums[5][0], params, dc));
330
0
            }
331
0
        }
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::KernelHwcDefaultBody6<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
332
333
        template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDefault(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)
334
0
        {
335
0
            size_t noseH = p.padY, noseW = p.padX;
336
0
            size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW;
337
0
            size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW;
338
0
            size_t bodyW2 = AlignLoAny(bodyW - noseW, 2 * p.strideX) + noseW;
339
0
            size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW;
340
0
            size_t wS = p.srcC*p.dstC;
341
0
            size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1;
342
0
            size_t sy = 0;
343
0
            for (; sy < noseH; sy += p.strideY)
344
0
            {
345
0
                size_t sx = 0;
346
0
                const float * w = weight + (noseH - sy) * p.kernelY * wS;
347
0
                for (; sx < noseW; sx += p.strideX, dst += p.dstC)
348
0
                    KernelHwcDefaultEdge<type>(src, p, kY + sy, kX + sx, w + (noseW - sx)*wS, bias, params, dst);
349
0
                for (; sx < bodyW; sx += p.strideX, dst += p.dstC)
350
0
                    KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, kY + sy, p.kernelX, w, bias, params, dst);
351
0
                for (; sx < tailW; sx += p.strideX, dst += p.dstC)
352
0
                    KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, kY + sy, kW - sx, w, bias, params, dst);
353
0
            }
354
0
            src += (sy - noseH)*p.srcW*p.srcC;
355
0
            for (; sy < bodyH; sy += p.strideY)
356
0
            {
357
0
                size_t sx = 0;
358
0
                for (; sx < noseW; sx += p.strideX, dst += p.dstC)
359
0
                    KernelHwcDefaultEdge<type>(src, p, p.kernelY, kX + sx, weight + (noseW - sx)*wS, bias, params, dst);
360
0
                for (; sx < bodyW6; sx += 6 * p.strideX, dst += 6 * p.dstC)
361
0
                    KernelHwcDefaultBody6<type>(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst);
362
0
                for (; sx < bodyW2; sx += 2 * p.strideX, dst += 2 * p.dstC)
363
0
                    KernelHwcDefaultBody2<type>(src + (sx - noseW) * p.srcC, p, weight, bias, params, dst);
364
0
                for (; sx < bodyW; sx += p.strideX, dst += p.dstC)
365
0
                    KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, p.kernelY, p.kernelX, weight, bias, params, dst);
366
0
                for (; sx < tailW; sx += p.strideX, dst += p.dstC)
367
0
                    KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, p.kernelY, kW - sx, weight, bias, params, dst);
368
0
                src += p.strideY*p.srcW*p.srcC;
369
0
            }
370
0
            for (; sy < tailH; sy += p.strideY)
371
0
            {
372
0
                size_t sx = 0;
373
0
                for (; sx < noseW; sx += p.strideX, dst += p.dstC)
374
0
                    KernelHwcDefaultEdge<type>(src, p, kH - sy, kX + sx, weight + (noseW - sx)*wS, bias, params, dst);
375
0
                for (; sx < bodyW; sx += p.strideX, dst += p.dstC)
376
0
                    KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, kH - sy, p.kernelX, weight, bias, params, dst);
377
0
                for (; sx < tailW; sx += p.strideX, dst += p.dstC)
378
0
                    KernelHwcDefaultEdge<type>(src + (sx - noseW) * p.srcC, p, kH - sy, kW - sx, weight, bias, params, dst);
379
0
                src += p.strideY*p.srcW*p.srcC;
380
0
            }
381
0
        }
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDefault<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
382
383
        //-------------------------------------------------------------------------------------------------
384
385
        template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)
386
0
        {
387
0
            size_t size = p.group;
388
0
            size_t sizeF = AlignLo(size, F);
389
0
            size_t size2F = AlignLo(size, 2 * F);
390
0
            size_t size4F = AlignLo(size, 4 * F);
391
0
            size_t size8F = AlignLo(size, 8 * F);
392
0
            for (size_t dy = 0; dy < p.dstH; ++dy)
393
0
            {
394
0
                for (size_t dx = 0; dx < p.dstW; ++dx)
395
0
                {
396
0
                    size_t i = 0;
397
0
                    for (; i < size8F; i += 8 * F)
398
0
                    {
399
0
                        __m128 sums[8];
400
0
                        if (bias)
401
0
                        {
402
0
                            sums[0] = _mm_loadu_ps(bias + i + 0 * F);
403
0
                            sums[1] = _mm_loadu_ps(bias + i + 1 * F);
404
0
                            sums[2] = _mm_loadu_ps(bias + i + 2 * F);
405
0
                            sums[3] = _mm_loadu_ps(bias + i + 3 * F);
406
0
                            sums[4] = _mm_loadu_ps(bias + i + 4 * F);
407
0
                            sums[5] = _mm_loadu_ps(bias + i + 5 * F);
408
0
                            sums[6] = _mm_loadu_ps(bias + i + 6 * F);
409
0
                            sums[7] = _mm_loadu_ps(bias + i + 7 * F);
410
0
                        }
411
0
                        else
412
0
                        {
413
0
                            sums[0] = _mm_setzero_ps();
414
0
                            sums[1] = _mm_setzero_ps();
415
0
                            sums[2] = _mm_setzero_ps();
416
0
                            sums[3] = _mm_setzero_ps();
417
0
                            sums[4] = _mm_setzero_ps();
418
0
                            sums[5] = _mm_setzero_ps();
419
0
                            sums[6] = _mm_setzero_ps();
420
0
                            sums[7] = _mm_setzero_ps();
421
0
                        }
422
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
423
0
                        {
424
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
425
0
                            if (sy < p.srcH)
426
0
                            {
427
0
                                for (size_t kx = 0; kx < p.kernelX; ++kx)
428
0
                                {
429
0
                                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
430
0
                                    if (sx < p.srcW)
431
0
                                    {
432
0
                                        const float * pw = weight + (ky*p.kernelX + kx)*size + i;
433
0
                                        const float * ps = src + (sy*p.srcW + sx)*size + i;
434
0
                                        sums[0] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * F), _mm_loadu_ps(pw + 0 * F)), sums[0]);
435
0
                                        sums[1] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * F), _mm_loadu_ps(pw + 1 * F)), sums[1]);
436
0
                                        sums[2] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * F), _mm_loadu_ps(pw + 2 * F)), sums[2]);
437
0
                                        sums[3] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * F), _mm_loadu_ps(pw + 3 * F)), sums[3]);
438
0
                                        sums[4] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 4 * F), _mm_loadu_ps(pw + 4 * F)), sums[4]);
439
0
                                        sums[5] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 5 * F), _mm_loadu_ps(pw + 5 * F)), sums[5]);
440
0
                                        sums[6] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 6 * F), _mm_loadu_ps(pw + 6 * F)), sums[6]);
441
0
                                        sums[7] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 7 * F), _mm_loadu_ps(pw + 7 * F)), sums[7]);
442
0
                                    }
443
0
                                }
444
0
                            }
445
0
                        }
446
0
                        _mm_storeu_ps(dst + i + 0 * F, Activate<type>(sums[0], params, i + 0 * F));
447
0
                        _mm_storeu_ps(dst + i + 1 * F, Activate<type>(sums[1], params, i + 1 * F));
448
0
                        _mm_storeu_ps(dst + i + 2 * F, Activate<type>(sums[2], params, i + 2 * F));
449
0
                        _mm_storeu_ps(dst + i + 3 * F, Activate<type>(sums[3], params, i + 3 * F));
450
0
                        _mm_storeu_ps(dst + i + 4 * F, Activate<type>(sums[4], params, i + 4 * F));
451
0
                        _mm_storeu_ps(dst + i + 5 * F, Activate<type>(sums[5], params, i + 5 * F));
452
0
                        _mm_storeu_ps(dst + i + 6 * F, Activate<type>(sums[6], params, i + 6 * F));
453
0
                        _mm_storeu_ps(dst + i + 7 * F, Activate<type>(sums[7], params, i + 7 * F));
454
0
                    }
455
0
                    for (; i < size4F; i += 4 * F)
456
0
                    {
457
0
                        __m128 sums[4];
458
0
                        if (bias)
459
0
                        {
460
0
                            sums[0] = _mm_loadu_ps(bias + i + 0 * F);
461
0
                            sums[1] = _mm_loadu_ps(bias + i + 1 * F);
462
0
                            sums[2] = _mm_loadu_ps(bias + i + 2 * F);
463
0
                            sums[3] = _mm_loadu_ps(bias + i + 3 * F);
464
0
                        }
465
0
                        else
466
0
                        {
467
0
                            sums[0] = _mm_setzero_ps();
468
0
                            sums[1] = _mm_setzero_ps();
469
0
                            sums[2] = _mm_setzero_ps();
470
0
                            sums[3] = _mm_setzero_ps();
471
0
                        }
472
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
473
0
                        {
474
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
475
0
                            if (sy < p.srcH)
476
0
                            {
477
0
                                for (size_t kx = 0; kx < p.kernelX; ++kx)
478
0
                                {
479
0
                                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
480
0
                                    if (sx < p.srcW)
481
0
                                    {
482
0
                                        const float * pw = weight + (ky*p.kernelX + kx)*size + i;
483
0
                                        const float * ps = src + (sy*p.srcW + sx)*size + i;
484
0
                                        sums[0] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * F), _mm_loadu_ps(pw + 0 * F)), sums[0]);
485
0
                                        sums[1] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * F), _mm_loadu_ps(pw + 1 * F)), sums[1]);
486
0
                                        sums[2] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * F), _mm_loadu_ps(pw + 2 * F)), sums[2]);
487
0
                                        sums[3] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * F), _mm_loadu_ps(pw + 3 * F)), sums[3]);
488
0
                                    }
489
0
                                }
490
0
                            }
491
0
                        }
492
0
                        _mm_storeu_ps(dst + i + 0 * F, Activate<type>(sums[0], params, i + 0 * F));
493
0
                        _mm_storeu_ps(dst + i + 1 * F, Activate<type>(sums[1], params, i + 1 * F));
494
0
                        _mm_storeu_ps(dst + i + 2 * F, Activate<type>(sums[2], params, i + 2 * F));
495
0
                        _mm_storeu_ps(dst + i + 3 * F, Activate<type>(sums[3], params, i + 3 * F));
496
0
                    }
497
0
                    for (; i < size2F; i += 2 * F)
498
0
                    {
499
0
                        __m128 sums[2];
500
0
                        if (bias)
501
0
                        {
502
0
                            sums[0] = _mm_loadu_ps(bias + i + 0 * F);
503
0
                            sums[1] = _mm_loadu_ps(bias + i + 1 * F);
504
0
                        }
505
0
                        else
506
0
                        {
507
0
                            sums[0] = _mm_setzero_ps();
508
0
                            sums[1] = _mm_setzero_ps();
509
0
                        }
510
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
511
0
                        {
512
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
513
0
                            if (sy < p.srcH)
514
0
                            {
515
0
                                for (size_t kx = 0; kx < p.kernelX; ++kx)
516
0
                                {
517
0
                                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
518
0
                                    if (sx < p.srcW)
519
0
                                    {
520
0
                                        const float * pw = weight + (ky*p.kernelX + kx)*size + i;
521
0
                                        const float * ps = src + (sy*p.srcW + sx)*size + i;
522
0
                                        sums[0] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * F), _mm_loadu_ps(pw + 0 * F)), sums[0]);
523
0
                                        sums[1] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * F), _mm_loadu_ps(pw + 1 * F)), sums[1]);
524
0
                                    }
525
0
                                }
526
0
                            }
527
0
                        }
528
0
                        _mm_storeu_ps(dst + i + 0 * F, Activate<type>(sums[0], params, i + 0 * F));
529
0
                        _mm_storeu_ps(dst + i + 1 * F, Activate<type>(sums[1], params, i + 1 * F));
530
0
                    }
531
0
                    for (; i < size; i += F)
532
0
                    {
533
0
                        size_t ci = i >= sizeF ? size - F : i;
534
0
                        __m128 sum = bias ? _mm_loadu_ps(bias + ci) : _mm_setzero_ps();
535
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
536
0
                        {
537
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
538
0
                            if (sy < p.srcH)
539
0
                            {
540
0
                                for (size_t kx = 0; kx < p.kernelX; ++kx)
541
0
                                {
542
0
                                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
543
0
                                    if (sx < p.srcW)
544
0
                                    {
545
0
                                        const float * pw = weight + (ky*p.kernelX + kx)*size + ci;
546
0
                                        const float * ps = src + (sy*p.srcW + sx)*size + ci;
547
0
                                        sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum);
548
0
                                    }
549
0
                                }
550
0
                            }
551
0
                        }
552
0
                        _mm_storeu_ps(dst + ci, Activate<type>(sum, params, ci));
553
0
                    }
554
0
                    dst += p.dstC;
555
0
                }
556
0
            }
557
0
        }
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
558
559
        //-------------------------------------------------------------------------------------------------
560
561
        template<SimdConvolutionActivationType type>
562
        SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge(const float * src, const ConvParam & p, size_t dy, size_t dx, const float * weight, const float * bias, const float * params, float * dst)
563
0
        {
564
0
            size_t srcC = p.srcC;
565
0
            size_t srcCF = AlignLo(srcC, F);
566
0
            size_t c = 0;
567
0
            for (; c < srcCF; c += F)
568
0
            {
569
0
                __m128 sum = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
570
0
                for (size_t ky = 0; ky < 3; ++ky)
571
0
                {
572
0
                    size_t sy = dy * p.strideY + ky - p.padY;
573
0
                    if (sy < p.srcH)
574
0
                    {
575
0
                        for (size_t kx = 0; kx < 3; ++kx)
576
0
                        {
577
0
                            size_t sx = dx * p.strideX + kx - p.padX;
578
0
                            if (sx < p.srcW)
579
0
                            {
580
0
                                const float * pw = weight + (ky * 3 + kx) * srcC;
581
0
                                const float * ps = src + (sy*p.srcW + sx) * srcC;
582
0
                                sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum);
583
0
                            }
584
0
                        }
585
0
                    }
586
0
                }
587
0
                _mm_storeu_ps(dst + c, Activate<type>(sum, params, c));
588
0
                src += F;
589
0
                weight += F;
590
0
            }
591
0
            if (c < srcC)
592
0
            {
593
0
                c = srcC - F;
594
0
                src -= srcCF - c;
595
0
                weight -= srcCF - c;
596
0
                __m128 sum = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
597
0
                for (size_t ky = 0; ky < 3; ++ky)
598
0
                {
599
0
                    size_t sy = dy * p.strideY + ky - p.padY;
600
0
                    if (sy < p.srcH)
601
0
                    {
602
0
                        for (size_t kx = 0; kx < 3; ++kx)
603
0
                        {
604
0
                            size_t sx = dx * p.strideX + kx - p.padX;
605
0
                            if (sx < p.srcW)
606
0
                            {
607
0
                                const float * pw = weight + (ky * 3 + kx) * srcC;
608
0
                                const float * ps = src + (sy*p.srcW + sx) * srcC;
609
0
                                sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum);
610
0
                            }
611
0
                        }
612
0
                    }
613
0
                }
614
0
                _mm_storeu_ps(dst + c, Activate<type>(sum, params, c));
615
0
            }
616
0
        }
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
617
618
        template<::SimdConvolutionActivationType type>
619
        SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1(const float * src, size_t srcS, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)
620
0
        {
621
0
            size_t srcCF = AlignLo(srcC, F);
622
0
            size_t c = 0;
623
0
            for (; c < srcCF; c += F)
624
0
            {
625
0
                __m128 sum = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
626
0
                for (size_t ky = 0; ky < 3; ++ky)
627
0
                {
628
0
                    const float * ps = src + ky * srcS;
629
0
                    const float * pw = weight + ky * 3 * srcC;
630
0
                    sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * srcC), _mm_loadu_ps(pw + 0 * srcC)), sum);
631
0
                    sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * srcC), _mm_loadu_ps(pw + 1 * srcC)), sum);
632
0
                    sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * srcC), _mm_loadu_ps(pw + 2 * srcC)), sum);
633
0
                }
634
0
                _mm_storeu_ps(dst + c, Activate<type>(sum, params, c));
635
0
                src += F;
636
0
                weight += F;
637
0
            }
638
0
            if (c < srcC)
639
0
            {
640
0
                c = srcC - F;
641
0
                src -= srcCF - c;
642
0
                weight -= srcCF - c;
643
0
                __m128 sum = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
644
0
                for (size_t ky = 0; ky < 3; ++ky)
645
0
                {
646
0
                    const float * ps = src + ky * srcS;
647
0
                    const float * pw = weight + ky * 3 * srcC;
648
0
                    sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * srcC), _mm_loadu_ps(pw + 0 * srcC)), sum);
649
0
                    sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * srcC), _mm_loadu_ps(pw + 1 * srcC)), sum);
650
0
                    sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * srcC), _mm_loadu_ps(pw + 2 * srcC)), sum);
651
0
                }
652
0
                _mm_storeu_ps(dst + c, Activate<type>(sum, params, c));
653
0
            }
654
0
        }
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
655
656
        template<::SimdConvolutionActivationType type>
657
        SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)
658
0
        {
659
0
            size_t srcCF = AlignLo(srcC, F);
660
0
            size_t c = 0;
661
0
            __m128 sum0, sum1, w0;
662
0
            for (; c < srcCF; c += F)
663
0
            {
664
0
                sum0 = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
665
0
                sum1 = sum0;
666
0
                const float * pw = weight + c;
667
0
                for (size_t ky = 0; ky < 3; ++ky)
668
0
                {
669
0
                    const float * ps0 = src + ky * srcS;
670
0
                    const float * ps1 = ps0 + srcX;
671
0
                    w0 = _mm_loadu_ps(pw);
672
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 0 * srcC), w0), sum0);
673
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 0 * srcC), w0), sum1);
674
0
                    pw += srcC;
675
0
                    w0 = _mm_loadu_ps(pw);
676
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 1 * srcC), w0), sum0);
677
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 1 * srcC), w0), sum1);
678
0
                    pw += srcC;
679
0
                    w0 = _mm_loadu_ps(pw);
680
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 2 * srcC), w0), sum0);
681
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 2 * srcC), w0), sum1);
682
0
                    pw += srcC;
683
0
                }
684
0
                _mm_storeu_ps(dst + c, Activate<type>(sum0, params, c));
685
0
                _mm_storeu_ps(dst + c + srcC, Activate<type>(sum1, params, c));
686
0
                src += F;
687
0
            }
688
0
            if (c < srcC)
689
0
            {
690
0
                c = srcC - F;
691
0
                src -= srcCF - c;
692
0
                sum0 = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
693
0
                sum1 = sum0;
694
0
                const float * pw = weight + c;
695
0
                for (size_t ky = 0; ky < 3; ++ky)
696
0
                {
697
0
                    const float * ps0 = src + ky * srcS;
698
0
                    const float * ps1 = ps0 + srcX;
699
0
                    w0 = _mm_loadu_ps(pw);
700
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 0 * srcC), w0), sum0);
701
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 0 * srcC), w0), sum1);
702
0
                    pw += srcC;
703
0
                    w0 = _mm_loadu_ps(pw);
704
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 1 * srcC), w0), sum0);
705
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 1 * srcC), w0), sum1);
706
0
                    pw += srcC;
707
0
                    w0 = _mm_loadu_ps(pw);
708
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 2 * srcC), w0), sum0);
709
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 2 * srcC), w0), sum1);
710
0
                    pw += srcC;
711
0
                }
712
0
                _mm_storeu_ps(dst + c, Activate<type>(sum0, params, c));
713
0
                _mm_storeu_ps(dst + c + srcC, Activate<type>(sum1, params, c));
714
0
            }
715
0
        }
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
716
717
        template<::SimdConvolutionActivationType type>
718
        SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)
719
0
        {
720
0
            size_t srcCF = AlignLo(srcC, F);
721
0
            size_t c = 0;
722
0
            for (; c < srcCF; c += F)
723
0
            {
724
0
                __m128 sum0, sum1, sum2, sum3, w0;
725
0
                sum0 = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
726
0
                sum1 = sum0;
727
0
                sum2 = sum0;
728
0
                sum3 = sum0;
729
0
                const float * pw = weight + c;
730
0
                const float * ps0 = src + 0 * srcX;
731
0
                const float * ps1 = src + 1 * srcX;
732
0
                const float * ps2 = src + 2 * srcX;
733
0
                const float * ps3 = src + 3 * srcX;
734
0
                for (size_t ky = 0; ky < 3; ++ky)
735
0
                {
736
0
                    size_t offset = ky * srcS;
737
0
                    w0 = _mm_loadu_ps(pw);
738
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);
739
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);
740
0
                    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);
741
0
                    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);
742
0
                    pw += srcC, offset += srcC;
743
0
                    w0 = _mm_loadu_ps(pw);
744
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);
745
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);
746
0
                    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);
747
0
                    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);
748
0
                    pw += srcC, offset += srcC;
749
0
                    w0 = _mm_loadu_ps(pw);
750
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);
751
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);
752
0
                    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);
753
0
                    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);
754
0
                    pw += srcC, offset += srcC;
755
0
                }
756
0
                _mm_storeu_ps(dst + 0 * srcC, Activate<type>(sum0, params, c));
757
0
                _mm_storeu_ps(dst + 1 * srcC, Activate<type>(sum1, params, c));
758
0
                _mm_storeu_ps(dst + 2 * srcC, Activate<type>(sum2, params, c));
759
0
                _mm_storeu_ps(dst + 3 * srcC, Activate<type>(sum3, params, c));
760
0
                src += F;
761
0
                dst += F;
762
0
            }
763
0
            if (c < srcC)
764
0
            {
765
0
                c = srcC - F;
766
0
                src -= srcCF - c;
767
0
                dst -= srcCF - c;
768
0
                __m128 sum0, sum1, sum2, sum3, w0;
769
0
                sum0 = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
770
0
                sum1 = sum0;
771
0
                sum2 = sum0;
772
0
                sum3 = sum0;
773
0
                const float * pw = weight + c;
774
0
                const float * ps0 = src + 0 * srcX;
775
0
                const float * ps1 = src + 1 * srcX;
776
0
                const float * ps2 = src + 2 * srcX;
777
0
                const float * ps3 = src + 3 * srcX;
778
0
                for (size_t ky = 0; ky < 3; ++ky)
779
0
                {
780
0
                    size_t offset = ky * srcS;
781
0
                    w0 = _mm_loadu_ps(pw);
782
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);
783
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);
784
0
                    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);
785
0
                    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);
786
0
                    pw += srcC, offset += srcC;
787
0
                    w0 = _mm_loadu_ps(pw);
788
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);
789
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);
790
0
                    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);
791
0
                    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);
792
0
                    pw += srcC, offset += srcC;
793
0
                    w0 = _mm_loadu_ps(pw);
794
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);
795
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);
796
0
                    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);
797
0
                    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);
798
0
                    pw += srcC, offset += srcC;
799
0
                }
800
0
                _mm_storeu_ps(dst + 0 * srcC, Activate<type>(sum0, params, c));
801
0
                _mm_storeu_ps(dst + 1 * srcC, Activate<type>(sum1, params, c));
802
0
                _mm_storeu_ps(dst + 2 * srcC, Activate<type>(sum2, params, c));
803
0
                _mm_storeu_ps(dst + 3 * srcC, Activate<type>(sum3, params, c));
804
0
            }
805
0
        }
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
806
807
        template<::SimdConvolutionActivationType type>
808
        SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4(const float * src, const ConvParam & p, size_t dy, size_t dx, const __m128 * weight, __m128 bias, const float * params, float * dst)
809
0
        {
810
0
            __m128 sum = bias;
811
0
            for (size_t ky = 0; ky < 3; ++ky)
812
0
            {
813
0
                size_t sy = dy * p.strideY + ky - p.padY;
814
0
                if (sy < p.srcH)
815
0
                {
816
0
                    for (size_t kx = 0; kx < 3; ++kx)
817
0
                    {
818
0
                        size_t sx = dx * p.strideX + kx - p.padX;
819
0
                        if (sx < p.srcW)
820
0
                        {
821
0
                            const float * ps = src + (sy*p.srcW + sx) * F;
822
0
                            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), weight[ky * 3 + kx]), sum);
823
0
                        }
824
0
                    }
825
0
                }
826
0
            }
827
0
            _mm_storeu_ps(dst, Activate<type>(sum, params, 0));
828
0
        }
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
829
830
        template<::SimdConvolutionActivationType type>
831
        SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1(const float * src, size_t srcS, const __m128 * weight, __m128 bias, const float * params, float * dst)
832
0
        {
833
0
            __m128 sum = bias;
834
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 0 * F), weight[0]), sum);
835
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 1 * F), weight[1]), sum);
836
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 2 * F), weight[2]), sum);
837
0
            src += srcS;
838
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 0 * F), weight[3]), sum);
839
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 1 * F), weight[4]), sum);
840
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 2 * F), weight[5]), sum);
841
0
            src += srcS;
842
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 0 * F), weight[6]), sum);
843
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 1 * F), weight[7]), sum);
844
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 2 * F), weight[8]), sum);
845
0
            _mm_storeu_ps(dst, Activate<type>(sum, params, 0));
846
0
        }
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)0>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)1>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)2>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)3>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)4>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)5>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)6>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)7>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)8>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)9>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<(SimdConvolutionActivationType)10>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
847
848
        template<::SimdConvolutionActivationType type>
849
        SIMD_INLINE void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2(const float * src, size_t srcS, const __m128 * weight, __m128 bias, const float * params, float * dst)
850
0
        {
851
0
            __m128 sum0 = bias;
852
0
            __m128 sum1 = bias;
853
0
            for (size_t ky = 0; ky < 3; ++ky)
854
0
            {
855
0
                __m128 s0 = _mm_loadu_ps(src + 0 * F);
856
0
                __m128 s1 = _mm_loadu_ps(src + 1 * F);
857
0
                __m128 s2 = _mm_loadu_ps(src + 2 * F);
858
0
                __m128 s3 = _mm_loadu_ps(src + 3 * F);
859
0
                sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[0]), sum0);
860
0
                sum1 = _mm_add_ps(_mm_mul_ps(s1, weight[0]), sum1);
861
0
                sum0 = _mm_add_ps(_mm_mul_ps(s1, weight[1]), sum0);
862
0
                sum1 = _mm_add_ps(_mm_mul_ps(s2, weight[1]), sum1);
863
0
                sum0 = _mm_add_ps(_mm_mul_ps(s2, weight[2]), sum0);
864
0
                sum1 = _mm_add_ps(_mm_mul_ps(s3, weight[2]), sum1);
865
0
                src += srcS;
866
0
                weight += 3;
867
0
            }
868
0
            _mm_storeu_ps(dst + 0, Activate<type>(sum0, params, 0));
869
0
            _mm_storeu_ps(dst + F, Activate<type>(sum1, params, 0));
870
0
        }
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)0>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)1>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)2>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)3>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)4>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)5>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)6>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)7>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)8>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)9>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<(SimdConvolutionActivationType)10>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
871
872
        template<::SimdConvolutionActivationType type> void ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)
873
0
        {
874
0
            size_t srcS = p.srcC*p.srcW;
875
0
            size_t srcX = p.srcC*p.strideX;
876
0
            size_t dstH = p.dstH - p.padH;
877
0
            size_t dstW = p.dstW - p.padW;
878
0
            size_t dstW2 = AlignLo(dstW - p.padX, 2) + p.padX;
879
0
            size_t dstW4 = AlignLo(dstW - p.padX, 4) + p.padX;
880
0
            if (p.dstC == F && p.strideX == 1)
881
0
            {
882
0
                __m128 _weight[9];
883
0
                for (size_t i = 0; i < 9; ++i)
884
0
                    _weight[i] = _mm_loadu_ps(weight + i * F);
885
0
                __m128 _bias = bias ? _mm_loadu_ps(bias) : _mm_setzero_ps();
886
0
                size_t dy = 0;
887
0
                for (; dy < p.padY; ++dy)
888
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
889
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
890
0
                for (; dy < dstH; ++dy)
891
0
                {
892
0
                    size_t dx = 0;
893
0
                    for (; dx < p.padX; ++dx)
894
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
895
0
                    size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC;
896
0
                    for (; dx < dstW2; dx += 2)
897
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x2<type>(src + offset, srcS, _weight, _bias, params, dst), offset += 2 * F, dst += 2 * F;
898
0
                    for (; dx < dstW; ++dx)
899
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4x1<type>(src + offset, srcS, _weight, _bias, params, dst), offset += F, dst += F;
900
0
                    for (; dx < p.dstW; ++dx)
901
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
902
0
                }
903
0
                for (; dy < p.dstH; ++dy)
904
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
905
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge4<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
906
0
            }
907
0
            else
908
0
            {
909
0
                size_t dy = 0;
910
0
                for (; dy < p.padY; ++dy)
911
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
912
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
913
0
                for (; dy < dstH; ++dy)
914
0
                {
915
0
                    size_t dx = 0;
916
0
                    for (; dx < p.padX; ++dx)
917
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
918
0
                    size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC;
919
0
                    for (; dx < dstW4; dx += 4)
920
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main4<type>(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 4 * p.dstC, offset += 4 * srcX;
921
0
                    for (; dx < dstW2; dx += 2)
922
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main2<type>(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 2 * p.dstC, offset += 2 * srcX;
923
0
                    for (; dx < dstW; ++dx)
924
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Main1<type>(src + offset, srcS, p.srcC, weight, bias, params, dst), dst += p.dstC, offset += srcX;
925
0
                    for (; dx < p.dstW; ++dx)
926
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
927
0
                }
928
0
                for (; dy < p.dstH; ++dy)
929
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
930
0
                        ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
931
0
            }
932
0
        }
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
933
934
        template <::SimdConvolutionActivationType type> SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr GetConvolutionBiasActivation(const ConvParam & p)
935
0
        {
936
0
            if (p.group == 1)
937
0
                return ConvolutionDirectNhwcConvolutionBiasActivationDefault<type>;
938
0
            else if (p.IsDepthwise())
939
0
            {
940
0
                if (p.IsKernel(3) && p.IsDilation(1))
941
0
                    return ConvolutionDirectNhwcConvolutionBiasActivationDepthwise3x3<type>;
942
0
                else
943
0
                    return ConvolutionDirectNhwcConvolutionBiasActivationDepthwise<type>;
944
0
            }
945
0
            return NULL;
946
0
        }
Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)0>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)1>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)2>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)3>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)4>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)5>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)6>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)7>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)8>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)9>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::GetConvolutionBiasActivation<(SimdConvolutionActivationType)10>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
947
948
        SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation()
949
0
        {
950
0
            const ConvParam & p = _param;
951
0
            SynetConvolution32fDirectNhwc::ConvolutionBiasActivationPtr func = NULL;
952
0
            if (p.dstC >= F && p.dstH >= p.padY + p.padH && p.dstW >= p.padX + p.padW)
953
0
            {
954
0
                switch (p.activation)
955
0
                {
956
0
                case ::SimdConvolutionActivationIdentity: func = GetConvolutionBiasActivation<::SimdConvolutionActivationIdentity>(p); break;
957
0
                case ::SimdConvolutionActivationRelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationRelu>(p); break;
958
0
                case ::SimdConvolutionActivationLeakyRelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationLeakyRelu>(p); break;
959
0
                case ::SimdConvolutionActivationRestrictRange: func = GetConvolutionBiasActivation<::SimdConvolutionActivationRestrictRange>(p); break;
960
0
                case ::SimdConvolutionActivationPrelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationPrelu>(p); break;
961
0
                case ::SimdConvolutionActivationElu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationElu>(p); break;
962
0
                case ::SimdConvolutionActivationHswish: func = GetConvolutionBiasActivation<::SimdConvolutionActivationHswish>(p); break;
963
0
                case ::SimdConvolutionActivationMish: func = GetConvolutionBiasActivation<::SimdConvolutionActivationMish>(p); break;
964
0
                case ::SimdConvolutionActivationHardSigmoid: func = GetConvolutionBiasActivation<::SimdConvolutionActivationHardSigmoid>(p); break;
965
0
                case ::SimdConvolutionActivationSwish: func = GetConvolutionBiasActivation<::SimdConvolutionActivationSwish>(p); break;
966
0
                case ::SimdConvolutionActivationGelu: func = GetConvolutionBiasActivation<::SimdConvolutionActivationGelu>(p); break;
967
0
                }
968
0
            }
969
0
            return func ? func : Base::SynetConvolution32fDirectNhwc::SetConvolutionBiasActivation();
970
0
        };
971
    }
972
#endif
973
}