Coverage Report

Created: 2025-11-16 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdSse41SynetConvolution32fNhwcDepthwise.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2024 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdSynetConvolution32f.h"
25
#include "Simd/SimdSynetConvolution32fCommon.h"
26
#include "Simd/SimdExtract.h"
27
#include "Simd/SimdSynet.h"
28
#include "Simd/SimdGemm.h"
29
#include "Simd/SimdExp.h"
30
31
namespace Simd
32
{
33
#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)   
34
    namespace Sse41
35
    {
36
        template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwiseDefault(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)
37
0
        {
38
0
            size_t size = p.group;
39
0
            size_t sizeF = AlignLo(size, F);
40
0
            size_t size2F = AlignLo(size, 2 * F);
41
0
            size_t size4F = AlignLo(size, 4 * F);
42
0
            size_t size8F = AlignLo(size, 8 * F);
43
0
            for (size_t dy = 0; dy < p.dstH; ++dy)
44
0
            {
45
0
                for (size_t dx = 0; dx < p.dstW; ++dx)
46
0
                {
47
0
                    size_t i = 0;
48
0
                    for (; i < size8F; i += 8 * F)
49
0
                    {
50
0
                        __m128 sums[8];
51
0
                        if (bias)
52
0
                        {
53
0
                            sums[0] = _mm_loadu_ps(bias + i + 0 * F);
54
0
                            sums[1] = _mm_loadu_ps(bias + i + 1 * F);
55
0
                            sums[2] = _mm_loadu_ps(bias + i + 2 * F);
56
0
                            sums[3] = _mm_loadu_ps(bias + i + 3 * F);
57
0
                            sums[4] = _mm_loadu_ps(bias + i + 4 * F);
58
0
                            sums[5] = _mm_loadu_ps(bias + i + 5 * F);
59
0
                            sums[6] = _mm_loadu_ps(bias + i + 6 * F);
60
0
                            sums[7] = _mm_loadu_ps(bias + i + 7 * F);
61
0
                        }
62
0
                        else
63
0
                        {
64
0
                            sums[0] = _mm_setzero_ps();
65
0
                            sums[1] = _mm_setzero_ps();
66
0
                            sums[2] = _mm_setzero_ps();
67
0
                            sums[3] = _mm_setzero_ps();
68
0
                            sums[4] = _mm_setzero_ps();
69
0
                            sums[5] = _mm_setzero_ps();
70
0
                            sums[6] = _mm_setzero_ps();
71
0
                            sums[7] = _mm_setzero_ps();
72
0
                        }
73
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
74
0
                        {
75
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
76
0
                            if (sy < p.srcH)
77
0
                            {
78
0
                                for (size_t kx = 0; kx < p.kernelX; ++kx)
79
0
                                {
80
0
                                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
81
0
                                    if (sx < p.srcW)
82
0
                                    {
83
0
                                        const float * pw = weight + (ky*p.kernelX + kx)*size + i;
84
0
                                        const float * ps = src + (sy*p.srcW + sx)*size + i;
85
0
                                        sums[0] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * F), _mm_loadu_ps(pw + 0 * F)), sums[0]);
86
0
                                        sums[1] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * F), _mm_loadu_ps(pw + 1 * F)), sums[1]);
87
0
                                        sums[2] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * F), _mm_loadu_ps(pw + 2 * F)), sums[2]);
88
0
                                        sums[3] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * F), _mm_loadu_ps(pw + 3 * F)), sums[3]);
89
0
                                        sums[4] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 4 * F), _mm_loadu_ps(pw + 4 * F)), sums[4]);
90
0
                                        sums[5] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 5 * F), _mm_loadu_ps(pw + 5 * F)), sums[5]);
91
0
                                        sums[6] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 6 * F), _mm_loadu_ps(pw + 6 * F)), sums[6]);
92
0
                                        sums[7] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 7 * F), _mm_loadu_ps(pw + 7 * F)), sums[7]);
93
0
                                    }
94
0
                                }
95
0
                            }
96
0
                        }
97
0
                        _mm_storeu_ps(dst + i + 0 * F, Activate<type>(sums[0], params, i + 0 * F));
98
0
                        _mm_storeu_ps(dst + i + 1 * F, Activate<type>(sums[1], params, i + 1 * F));
99
0
                        _mm_storeu_ps(dst + i + 2 * F, Activate<type>(sums[2], params, i + 2 * F));
100
0
                        _mm_storeu_ps(dst + i + 3 * F, Activate<type>(sums[3], params, i + 3 * F));
101
0
                        _mm_storeu_ps(dst + i + 4 * F, Activate<type>(sums[4], params, i + 4 * F));
102
0
                        _mm_storeu_ps(dst + i + 5 * F, Activate<type>(sums[5], params, i + 5 * F));
103
0
                        _mm_storeu_ps(dst + i + 6 * F, Activate<type>(sums[6], params, i + 6 * F));
104
0
                        _mm_storeu_ps(dst + i + 7 * F, Activate<type>(sums[7], params, i + 7 * F));
105
0
                    }
106
0
                    for (; i < size4F; i += 4 * F)
107
0
                    {
108
0
                        __m128 sums[4];
109
0
                        if (bias)
110
0
                        {
111
0
                            sums[0] = _mm_loadu_ps(bias + i + 0 * F);
112
0
                            sums[1] = _mm_loadu_ps(bias + i + 1 * F);
113
0
                            sums[2] = _mm_loadu_ps(bias + i + 2 * F);
114
0
                            sums[3] = _mm_loadu_ps(bias + i + 3 * F);
115
0
                        }
116
0
                        else
117
0
                        {
118
0
                            sums[0] = _mm_setzero_ps();
119
0
                            sums[1] = _mm_setzero_ps();
120
0
                            sums[2] = _mm_setzero_ps();
121
0
                            sums[3] = _mm_setzero_ps();
122
0
                        }
123
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
124
0
                        {
125
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
126
0
                            if (sy < p.srcH)
127
0
                            {
128
0
                                for (size_t kx = 0; kx < p.kernelX; ++kx)
129
0
                                {
130
0
                                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
131
0
                                    if (sx < p.srcW)
132
0
                                    {
133
0
                                        const float * pw = weight + (ky*p.kernelX + kx)*size + i;
134
0
                                        const float * ps = src + (sy*p.srcW + sx)*size + i;
135
0
                                        sums[0] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * F), _mm_loadu_ps(pw + 0 * F)), sums[0]);
136
0
                                        sums[1] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * F), _mm_loadu_ps(pw + 1 * F)), sums[1]);
137
0
                                        sums[2] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * F), _mm_loadu_ps(pw + 2 * F)), sums[2]);
138
0
                                        sums[3] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * F), _mm_loadu_ps(pw + 3 * F)), sums[3]);
139
0
                                    }
140
0
                                }
141
0
                            }
142
0
                        }
143
0
                        _mm_storeu_ps(dst + i + 0 * F, Activate<type>(sums[0], params, i + 0 * F));
144
0
                        _mm_storeu_ps(dst + i + 1 * F, Activate<type>(sums[1], params, i + 1 * F));
145
0
                        _mm_storeu_ps(dst + i + 2 * F, Activate<type>(sums[2], params, i + 2 * F));
146
0
                        _mm_storeu_ps(dst + i + 3 * F, Activate<type>(sums[3], params, i + 3 * F));
147
0
                    }
148
0
                    for (; i < size2F; i += 2 * F)
149
0
                    {
150
0
                        __m128 sums[2];
151
0
                        if (bias)
152
0
                        {
153
0
                            sums[0] = _mm_loadu_ps(bias + i + 0 * F);
154
0
                            sums[1] = _mm_loadu_ps(bias + i + 1 * F);
155
0
                        }
156
0
                        else
157
0
                        {
158
0
                            sums[0] = _mm_setzero_ps();
159
0
                            sums[1] = _mm_setzero_ps();
160
0
                        }
161
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
162
0
                        {
163
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
164
0
                            if (sy < p.srcH)
165
0
                            {
166
0
                                for (size_t kx = 0; kx < p.kernelX; ++kx)
167
0
                                {
168
0
                                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
169
0
                                    if (sx < p.srcW)
170
0
                                    {
171
0
                                        const float * pw = weight + (ky*p.kernelX + kx)*size + i;
172
0
                                        const float * ps = src + (sy*p.srcW + sx)*size + i;
173
0
                                        sums[0] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * F), _mm_loadu_ps(pw + 0 * F)), sums[0]);
174
0
                                        sums[1] = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * F), _mm_loadu_ps(pw + 1 * F)), sums[1]);
175
0
                                    }
176
0
                                }
177
0
                            }
178
0
                        }
179
0
                        _mm_storeu_ps(dst + i + 0 * F, Activate<type>(sums[0], params, i + 0 * F));
180
0
                        _mm_storeu_ps(dst + i + 1 * F, Activate<type>(sums[1], params, i + 1 * F));
181
0
                    }
182
0
                    for (; i < size; i += F)
183
0
                    {
184
0
                        size_t ci = i >= sizeF ? size - F : i;
185
0
                        __m128 sum = bias ? _mm_loadu_ps(bias + ci) : _mm_setzero_ps();
186
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
187
0
                        {
188
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
189
0
                            if (sy < p.srcH)
190
0
                            {
191
0
                                for (size_t kx = 0; kx < p.kernelX; ++kx)
192
0
                                {
193
0
                                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
194
0
                                    if (sx < p.srcW)
195
0
                                    {
196
0
                                        const float * pw = weight + (ky*p.kernelX + kx)*size + ci;
197
0
                                        const float * ps = src + (sy*p.srcW + sx)*size + ci;
198
0
                                        sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum);
199
0
                                    }
200
0
                                }
201
0
                            }
202
0
                        }
203
0
                        _mm_storeu_ps(dst + ci, Activate<type>(sum, params, ci));
204
0
                    }
205
0
                    dst += p.dstC;
206
0
                }
207
0
            }
208
0
        }
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
209
210
        //-------------------------------------------------------------------------------------------------
211
212
        template<SimdConvolutionActivationType type>
213
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Edge(const float * src, const ConvParam & p, size_t dy, size_t dx, const float * weight, const float * bias, const float * params, float * dst)
214
0
        {
215
0
            size_t srcC = p.srcC;
216
0
            size_t srcCF = AlignLo(srcC, F);
217
0
            size_t c = 0;
218
0
            for (; c < srcCF; c += F)
219
0
            {
220
0
                __m128 sum = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
221
0
                for (size_t ky = 0; ky < 3; ++ky)
222
0
                {
223
0
                    size_t sy = dy * p.strideY + ky - p.padY;
224
0
                    if (sy < p.srcH)
225
0
                    {
226
0
                        for (size_t kx = 0; kx < 3; ++kx)
227
0
                        {
228
0
                            size_t sx = dx * p.strideX + kx - p.padX;
229
0
                            if (sx < p.srcW)
230
0
                            {
231
0
                                const float * pw = weight + (ky * 3 + kx) * srcC;
232
0
                                const float * ps = src + (sy*p.srcW + sx) * srcC;
233
0
                                sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum);
234
0
                            }
235
0
                        }
236
0
                    }
237
0
                }
238
0
                _mm_storeu_ps(dst + c, Activate<type>(sum, params, c));
239
0
                src += F;
240
0
                weight += F;
241
0
            }
242
0
            if (c < srcC)
243
0
            {
244
0
                c = srcC - F;
245
0
                src -= srcCF - c;
246
0
                weight -= srcCF - c;
247
0
                __m128 sum = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
248
0
                for (size_t ky = 0; ky < 3; ++ky)
249
0
                {
250
0
                    size_t sy = dy * p.strideY + ky - p.padY;
251
0
                    if (sy < p.srcH)
252
0
                    {
253
0
                        for (size_t kx = 0; kx < 3; ++kx)
254
0
                        {
255
0
                            size_t sx = dx * p.strideX + kx - p.padX;
256
0
                            if (sx < p.srcW)
257
0
                            {
258
0
                                const float * pw = weight + (ky * 3 + kx) * srcC;
259
0
                                const float * ps = src + (sy*p.srcW + sx) * srcC;
260
0
                                sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum);
261
0
                            }
262
0
                        }
263
0
                    }
264
0
                }
265
0
                _mm_storeu_ps(dst + c, Activate<type>(sum, params, c));
266
0
            }
267
0
        }
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
268
269
        template<::SimdConvolutionActivationType type>
270
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main1(const float * src, size_t srcS, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)
271
0
        {
272
0
            size_t srcCF = AlignLo(srcC, F);
273
0
            size_t c = 0;
274
0
            for (; c < srcCF; c += F)
275
0
            {
276
0
                __m128 sum = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
277
0
                for (size_t ky = 0; ky < 3; ++ky)
278
0
                {
279
0
                    const float * ps = src + ky * srcS;
280
0
                    const float * pw = weight + ky * 3 * srcC;
281
0
                    sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * srcC), _mm_loadu_ps(pw + 0 * srcC)), sum);
282
0
                    sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * srcC), _mm_loadu_ps(pw + 1 * srcC)), sum);
283
0
                    sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * srcC), _mm_loadu_ps(pw + 2 * srcC)), sum);
284
0
                }
285
0
                _mm_storeu_ps(dst + c, Activate<type>(sum, params, c));
286
0
                src += F;
287
0
                weight += F;
288
0
            }
289
0
            if (c < srcC)
290
0
            {
291
0
                c = srcC - F;
292
0
                src -= srcCF - c;
293
0
                weight -= srcCF - c;
294
0
                __m128 sum = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
295
0
                for (size_t ky = 0; ky < 3; ++ky)
296
0
                {
297
0
                    const float * ps = src + ky * srcS;
298
0
                    const float * pw = weight + ky * 3 * srcC;
299
0
                    sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * srcC), _mm_loadu_ps(pw + 0 * srcC)), sum);
300
0
                    sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * srcC), _mm_loadu_ps(pw + 1 * srcC)), sum);
301
0
                    sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * srcC), _mm_loadu_ps(pw + 2 * srcC)), sum);
302
0
                }
303
0
                _mm_storeu_ps(dst + c, Activate<type>(sum, params, c));
304
0
            }
305
0
        }
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
306
307
        template<::SimdConvolutionActivationType type>
308
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main2(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)
309
0
        {
310
0
            size_t srcCF = AlignLo(srcC, F);
311
0
            size_t c = 0;
312
0
            __m128 sum0, sum1, w0;
313
0
            for (; c < srcCF; c += F)
314
0
            {
315
0
                sum0 = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
316
0
                sum1 = sum0;
317
0
                const float * pw = weight + c;
318
0
                for (size_t ky = 0; ky < 3; ++ky)
319
0
                {
320
0
                    const float * ps0 = src + ky * srcS;
321
0
                    const float * ps1 = ps0 + srcX;
322
0
                    w0 = _mm_loadu_ps(pw);
323
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 0 * srcC), w0), sum0);
324
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 0 * srcC), w0), sum1);
325
0
                    pw += srcC;
326
0
                    w0 = _mm_loadu_ps(pw);
327
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 1 * srcC), w0), sum0);
328
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 1 * srcC), w0), sum1);
329
0
                    pw += srcC;
330
0
                    w0 = _mm_loadu_ps(pw);
331
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 2 * srcC), w0), sum0);
332
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 2 * srcC), w0), sum1);
333
0
                    pw += srcC;
334
0
                }
335
0
                _mm_storeu_ps(dst + c, Activate<type>(sum0, params, c));
336
0
                _mm_storeu_ps(dst + c + srcC, Activate<type>(sum1, params, c));
337
0
                src += F;
338
0
            }
339
0
            if (c < srcC)
340
0
            {
341
0
                c = srcC - F;
342
0
                src -= srcCF - c;
343
0
                sum0 = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
344
0
                sum1 = sum0;
345
0
                const float * pw = weight + c;
346
0
                for (size_t ky = 0; ky < 3; ++ky)
347
0
                {
348
0
                    const float * ps0 = src + ky * srcS;
349
0
                    const float * ps1 = ps0 + srcX;
350
0
                    w0 = _mm_loadu_ps(pw);
351
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 0 * srcC), w0), sum0);
352
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 0 * srcC), w0), sum1);
353
0
                    pw += srcC;
354
0
                    w0 = _mm_loadu_ps(pw);
355
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 1 * srcC), w0), sum0);
356
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 1 * srcC), w0), sum1);
357
0
                    pw += srcC;
358
0
                    w0 = _mm_loadu_ps(pw);
359
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + 2 * srcC), w0), sum0);
360
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + 2 * srcC), w0), sum1);
361
0
                    pw += srcC;
362
0
                }
363
0
                _mm_storeu_ps(dst + c, Activate<type>(sum0, params, c));
364
0
                _mm_storeu_ps(dst + c + srcC, Activate<type>(sum1, params, c));
365
0
            }
366
0
        }
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
367
368
        template<::SimdConvolutionActivationType type>
369
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main4(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)
370
0
        {
371
0
            size_t srcCF = AlignLo(srcC, F);
372
0
            size_t c = 0;
373
0
            for (; c < srcCF; c += F)
374
0
            {
375
0
                __m128 sum0, sum1, sum2, sum3, w0;
376
0
                sum0 = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
377
0
                sum1 = sum0;
378
0
                sum2 = sum0;
379
0
                sum3 = sum0;
380
0
                const float * pw = weight + c;
381
0
                const float * ps0 = src + 0 * srcX;
382
0
                const float * ps1 = src + 1 * srcX;
383
0
                const float * ps2 = src + 2 * srcX;
384
0
                const float * ps3 = src + 3 * srcX;
385
0
                for (size_t ky = 0; ky < 3; ++ky)
386
0
                {
387
0
                    size_t offset = ky * srcS;
388
0
                    w0 = _mm_loadu_ps(pw);
389
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);
390
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);
391
0
                    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);
392
0
                    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);
393
0
                    pw += srcC, offset += srcC;
394
0
                    w0 = _mm_loadu_ps(pw);
395
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);
396
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);
397
0
                    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);
398
0
                    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);
399
0
                    pw += srcC, offset += srcC;
400
0
                    w0 = _mm_loadu_ps(pw);
401
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);
402
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);
403
0
                    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);
404
0
                    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);
405
0
                    pw += srcC, offset += srcC;
406
0
                }
407
0
                _mm_storeu_ps(dst + 0 * srcC, Activate<type>(sum0, params, c));
408
0
                _mm_storeu_ps(dst + 1 * srcC, Activate<type>(sum1, params, c));
409
0
                _mm_storeu_ps(dst + 2 * srcC, Activate<type>(sum2, params, c));
410
0
                _mm_storeu_ps(dst + 3 * srcC, Activate<type>(sum3, params, c));
411
0
                src += F;
412
0
                dst += F;
413
0
            }
414
0
            if (c < srcC)
415
0
            {
416
0
                c = srcC - F;
417
0
                src -= srcCF - c;
418
0
                dst -= srcCF - c;
419
0
                __m128 sum0, sum1, sum2, sum3, w0;
420
0
                sum0 = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps();
421
0
                sum1 = sum0;
422
0
                sum2 = sum0;
423
0
                sum3 = sum0;
424
0
                const float * pw = weight + c;
425
0
                const float * ps0 = src + 0 * srcX;
426
0
                const float * ps1 = src + 1 * srcX;
427
0
                const float * ps2 = src + 2 * srcX;
428
0
                const float * ps3 = src + 3 * srcX;
429
0
                for (size_t ky = 0; ky < 3; ++ky)
430
0
                {
431
0
                    size_t offset = ky * srcS;
432
0
                    w0 = _mm_loadu_ps(pw);
433
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);
434
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);
435
0
                    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);
436
0
                    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);
437
0
                    pw += srcC, offset += srcC;
438
0
                    w0 = _mm_loadu_ps(pw);
439
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);
440
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);
441
0
                    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);
442
0
                    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);
443
0
                    pw += srcC, offset += srcC;
444
0
                    w0 = _mm_loadu_ps(pw);
445
0
                    sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps0 + offset), w0), sum0);
446
0
                    sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps1 + offset), w0), sum1);
447
0
                    sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps2 + offset), w0), sum2);
448
0
                    sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps3 + offset), w0), sum3);
449
0
                    pw += srcC, offset += srcC;
450
0
                }
451
0
                _mm_storeu_ps(dst + 0 * srcC, Activate<type>(sum0, params, c));
452
0
                _mm_storeu_ps(dst + 1 * srcC, Activate<type>(sum1, params, c));
453
0
                _mm_storeu_ps(dst + 2 * srcC, Activate<type>(sum2, params, c));
454
0
                _mm_storeu_ps(dst + 3 * srcC, Activate<type>(sum3, params, c));
455
0
            }
456
0
        }
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
457
458
        template<::SimdConvolutionActivationType type>
459
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Edge4(const float * src, const ConvParam & p, size_t dy, size_t dx, const __m128 * weight, __m128 bias, const float * params, float * dst)
460
0
        {
461
0
            __m128 sum = bias;
462
0
            for (size_t ky = 0; ky < 3; ++ky)
463
0
            {
464
0
                size_t sy = dy * p.strideY + ky - p.padY;
465
0
                if (sy < p.srcH)
466
0
                {
467
0
                    for (size_t kx = 0; kx < 3; ++kx)
468
0
                    {
469
0
                        size_t sx = dx * p.strideX + kx - p.padX;
470
0
                        if (sx < p.srcW)
471
0
                        {
472
0
                            const float * ps = src + (sy*p.srcW + sx) * F;
473
0
                            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), weight[ky * 3 + kx]), sum);
474
0
                        }
475
0
                    }
476
0
                }
477
0
            }
478
0
            _mm_storeu_ps(dst, Activate<type>(sum, params, 0));
479
0
        }
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge4<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge4<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge4<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge4<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge4<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge4<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge4<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge4<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge4<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge4<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Edge4<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
480
481
        template<::SimdConvolutionActivationType type>
482
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main4x1(const float * src, size_t srcS, const __m128 * weight, __m128 bias, const float * params, float * dst)
483
0
        {
484
0
            __m128 sum = bias;
485
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 0 * F), weight[0]), sum);
486
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 1 * F), weight[1]), sum);
487
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 2 * F), weight[2]), sum);
488
0
            src += srcS;
489
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 0 * F), weight[3]), sum);
490
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 1 * F), weight[4]), sum);
491
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 2 * F), weight[5]), sum);
492
0
            src += srcS;
493
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 0 * F), weight[6]), sum);
494
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 1 * F), weight[7]), sum);
495
0
            sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + 2 * F), weight[8]), sum);
496
0
            _mm_storeu_ps(dst, Activate<type>(sum, params, 0));
497
0
        }
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x1<(SimdConvolutionActivationType)0>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x1<(SimdConvolutionActivationType)1>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x1<(SimdConvolutionActivationType)2>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x1<(SimdConvolutionActivationType)3>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x1<(SimdConvolutionActivationType)4>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x1<(SimdConvolutionActivationType)5>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x1<(SimdConvolutionActivationType)6>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x1<(SimdConvolutionActivationType)7>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x1<(SimdConvolutionActivationType)8>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x1<(SimdConvolutionActivationType)9>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x1<(SimdConvolutionActivationType)10>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
498
499
        template<::SimdConvolutionActivationType type>
500
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main4x2(const float * src, size_t srcS, const __m128 * weight, __m128 bias, const float * params, float * dst)
501
0
        {
502
0
            __m128 sum0 = bias;
503
0
            __m128 sum1 = bias;
504
0
            for (size_t ky = 0; ky < 3; ++ky)
505
0
            {
506
0
                __m128 s0 = _mm_loadu_ps(src + 0 * F);
507
0
                __m128 s1 = _mm_loadu_ps(src + 1 * F);
508
0
                __m128 s2 = _mm_loadu_ps(src + 2 * F);
509
0
                __m128 s3 = _mm_loadu_ps(src + 3 * F);
510
0
                sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[0]), sum0);
511
0
                sum1 = _mm_add_ps(_mm_mul_ps(s1, weight[0]), sum1);
512
0
                sum0 = _mm_add_ps(_mm_mul_ps(s1, weight[1]), sum0);
513
0
                sum1 = _mm_add_ps(_mm_mul_ps(s2, weight[1]), sum1);
514
0
                sum0 = _mm_add_ps(_mm_mul_ps(s2, weight[2]), sum0);
515
0
                sum1 = _mm_add_ps(_mm_mul_ps(s3, weight[2]), sum1);
516
0
                src += srcS;
517
0
                weight += 3;
518
0
            }
519
0
            _mm_storeu_ps(dst + 0, Activate<type>(sum0, params, 0));
520
0
            _mm_storeu_ps(dst + F, Activate<type>(sum1, params, 0));
521
0
        }
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x2<(SimdConvolutionActivationType)0>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x2<(SimdConvolutionActivationType)1>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x2<(SimdConvolutionActivationType)2>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x2<(SimdConvolutionActivationType)3>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x2<(SimdConvolutionActivationType)4>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x2<(SimdConvolutionActivationType)5>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x2<(SimdConvolutionActivationType)6>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x2<(SimdConvolutionActivationType)7>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x2<(SimdConvolutionActivationType)8>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x2<(SimdConvolutionActivationType)9>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3Main4x2<(SimdConvolutionActivationType)10>(float const*, unsigned long, float __vector(4) const*, float __vector(4), float const*, float*)
522
523
        template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwise3x3(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)
524
0
        {
525
0
            size_t srcS = p.srcC*p.srcW;
526
0
            size_t srcX = p.srcC*p.strideX;
527
0
            size_t dstH = p.dstH - p.padH;
528
0
            size_t dstW = p.dstW - p.padW;
529
0
            size_t dstW2 = AlignLo(dstW - p.padX, 2) + p.padX;
530
0
            size_t dstW4 = AlignLo(dstW - p.padX, 4) + p.padX;
531
0
            if (p.dstC == F && p.strideX == 1)
532
0
            {
533
0
                __m128 _weight[9];
534
0
                for (size_t i = 0; i < 9; ++i)
535
0
                    _weight[i] = _mm_loadu_ps(weight + i * F);
536
0
                __m128 _bias = bias ? _mm_loadu_ps(bias) : _mm_setzero_ps();
537
0
                size_t dy = 0;
538
0
                for (; dy < p.padY; ++dy)
539
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
540
0
                        Convolution32fNhwcDepthwise3x3Edge4<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
541
0
                for (; dy < dstH; ++dy)
542
0
                {
543
0
                    size_t dx = 0;
544
0
                    for (; dx < p.padX; ++dx)
545
0
                        Convolution32fNhwcDepthwise3x3Edge4<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
546
0
                    size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC;
547
0
                    for (; dx < dstW2; dx += 2)
548
0
                        Convolution32fNhwcDepthwise3x3Main4x2<type>(src + offset, srcS, _weight, _bias, params, dst), offset += 2 * F, dst += 2 * F;
549
0
                    for (; dx < dstW; ++dx)
550
0
                        Convolution32fNhwcDepthwise3x3Main4x1<type>(src + offset, srcS, _weight, _bias, params, dst), offset += F, dst += F;
551
0
                    for (; dx < p.dstW; ++dx)
552
0
                        Convolution32fNhwcDepthwise3x3Edge4<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
553
0
                }
554
0
                for (; dy < p.dstH; ++dy)
555
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
556
0
                        Convolution32fNhwcDepthwise3x3Edge4<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
557
0
            }
558
0
            else
559
0
            {
560
0
                size_t dy = 0;
561
0
                for (; dy < p.padY; ++dy)
562
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
563
0
                        Convolution32fNhwcDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
564
0
                for (; dy < dstH; ++dy)
565
0
                {
566
0
                    size_t dx = 0;
567
0
                    for (; dx < p.padX; ++dx)
568
0
                        Convolution32fNhwcDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
569
0
                    size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC;
570
0
                    for (; dx < dstW4; dx += 4)
571
0
                        Convolution32fNhwcDepthwise3x3Main4<type>(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 4 * p.dstC, offset += 4 * srcX;
572
0
                    for (; dx < dstW2; dx += 2)
573
0
                        Convolution32fNhwcDepthwise3x3Main2<type>(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 2 * p.dstC, offset += 2 * srcX;
574
0
                    for (; dx < dstW; ++dx)
575
0
                        Convolution32fNhwcDepthwise3x3Main1<type>(src + offset, srcS, p.srcC, weight, bias, params, dst), dst += p.dstC, offset += srcX;
576
0
                    for (; dx < p.dstW; ++dx)
577
0
                        Convolution32fNhwcDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
578
0
                }
579
0
                for (; dy < p.dstH; ++dy)
580
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
581
0
                        Convolution32fNhwcDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
582
0
            }
583
0
        }
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Sse41::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
584
585
        //-------------------------------------------------------------------------------------------------
586
587
        // Picks the convolution routine for activation 'type':
        // the specialised 3x3 implementation when p.IsKernel(3) and p.IsDilation(1) both hold,
        // the generic implementation in every other case.
        template <::SimdConvolutionActivationType type> SynetConvolution32fNhwcDepthwise::ConvolutionPtr Get(const ConvParam & p)
        {
            if (!p.IsKernel(3) || !p.IsDilation(1))
                return Convolution32fNhwcDepthwiseDefault<type>;
            return Convolution32fNhwcDepthwise3x3<type>;
        }
Unexecuted instantiation: void (*Simd::Sse41::Get<(SimdConvolutionActivationType)0>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::Get<(SimdConvolutionActivationType)1>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::Get<(SimdConvolutionActivationType)2>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::Get<(SimdConvolutionActivationType)3>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::Get<(SimdConvolutionActivationType)4>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::Get<(SimdConvolutionActivationType)5>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::Get<(SimdConvolutionActivationType)6>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::Get<(SimdConvolutionActivationType)7>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::Get<(SimdConvolutionActivationType)8>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::Get<(SimdConvolutionActivationType)9>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Sse41::Get<(SimdConvolutionActivationType)10>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
594
595
        //-------------------------------------------------------------------------------------------------
596
597
        // Constructs the SSE4.1 variant on top of the Base implementation.
        // _convolution is replaced by an SSE4.1 routine only when there are at least F output
        // channels and the output is at least as large as the combined padding in each dimension;
        // otherwise it is left exactly as the Base constructor set it.
        // An activation value not listed in the switch also leaves _convolution untouched.
        SynetConvolution32fNhwcDepthwise::SynetConvolution32fNhwcDepthwise(const ConvParam& p)
            : Base::SynetConvolution32fNhwcDepthwise(p)
        {
            if (p.dstC < F || p.dstH < p.padY + p.padH || p.dstW < p.padX + p.padW)
                return;

            switch (p.activation)
            {
            case ::SimdConvolutionActivationIdentity:
                _convolution = Get<::SimdConvolutionActivationIdentity>(p);
                break;
            case ::SimdConvolutionActivationRelu:
                _convolution = Get<::SimdConvolutionActivationRelu>(p);
                break;
            case ::SimdConvolutionActivationLeakyRelu:
                _convolution = Get<::SimdConvolutionActivationLeakyRelu>(p);
                break;
            case ::SimdConvolutionActivationRestrictRange:
                _convolution = Get<::SimdConvolutionActivationRestrictRange>(p);
                break;
            case ::SimdConvolutionActivationPrelu:
                _convolution = Get<::SimdConvolutionActivationPrelu>(p);
                break;
            case ::SimdConvolutionActivationElu:
                _convolution = Get<::SimdConvolutionActivationElu>(p);
                break;
            case ::SimdConvolutionActivationHswish:
                _convolution = Get<::SimdConvolutionActivationHswish>(p);
                break;
            case ::SimdConvolutionActivationMish:
                _convolution = Get<::SimdConvolutionActivationMish>(p);
                break;
            case ::SimdConvolutionActivationHardSigmoid:
                _convolution = Get<::SimdConvolutionActivationHardSigmoid>(p);
                break;
            case ::SimdConvolutionActivationSwish:
                _convolution = Get<::SimdConvolutionActivationSwish>(p);
                break;
            case ::SimdConvolutionActivationGelu:
                _convolution = Get<::SimdConvolutionActivationGelu>(p);
                break;
            }
        }
618
    }
619
#endif
620
}