Coverage Report

Created: 2025-09-27 07:34

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdAvx512bwSynetConvolution32fNhwcDepthwise.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2025 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdSynetConvolution32f.h"
25
#include "Simd/SimdSynetConvolution32fCommon.h"
26
#include "Simd/SimdStore.h"
27
#include "Simd/SimdSynet.h"
28
#include "Simd/SimdGemm.h"
29
#include "Simd/SimdExp.h"
30
31
namespace Simd
32
{
33
#if defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_SYNET_ENABLE)   
34
    namespace Avx512bw
35
    {
36
        template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwiseDefault(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)
37
0
        {
38
0
            size_t srcW = p.srcW, strideX = p.strideX, dilationX = p.dilationX, kernelX = p.kernelX, sX = strideX * p.dstC;
39
0
            size_t dstC = p.dstC, dstCF = AlignLo(p.dstC, F), dstC2F = AlignLo(p.dstC, 2 * F), dstC4F = AlignLo(p.dstC, 4 * F);
40
0
            size_t dstW2 = AlignLo(p.dstW, 2), dstW4 = AlignLo(p.dstW, 4);
41
0
            __m512 d00, d01, d02, d03, d10, d11, d12, d13, d20, d21, d22, d23, d30, d31, d32, d33, w0;
42
0
            for (size_t dy = 0; dy < p.dstH; ++dy)
43
0
            {
44
0
                size_t dx = 0;
45
0
                for (; dx < dstW4; dx += 4)
46
0
                {
47
0
                    float* dst0 = dst + 0 * p.dstC, * dst1 = dst + 1 * p.dstC, * dst2 = dst + 2 * p.dstC, * dst3 = dst + 3 * p.dstC;
48
0
                    size_t sx0 = dx * p.strideX - p.padX;
49
0
                    size_t dc = 0;
50
0
                    for (; dc < dstC4F; dc += 4 * F)
51
0
                    {
52
0
                        if (bias)
53
0
                        {
54
0
                            d00 = _mm512_loadu_ps(bias + dc + 0 * F);
55
0
                            d01 = _mm512_loadu_ps(bias + dc + 1 * F);
56
0
                            d02 = _mm512_loadu_ps(bias + dc + 2 * F);
57
0
                            d03 = _mm512_loadu_ps(bias + dc + 3 * F);
58
0
                        }
59
0
                        else
60
0
                        {
61
0
                            d00 = _mm512_setzero_ps();
62
0
                            d01 = _mm512_setzero_ps();
63
0
                            d02 = _mm512_setzero_ps();
64
0
                            d03 = _mm512_setzero_ps();
65
0
                        }
66
0
                        d10 = d00; d11 = d01; d12 = d02; d13 = d03;
67
0
                        d20 = d00; d21 = d01; d22 = d02; d23 = d03;
68
0
                        d30 = d00; d31 = d01; d32 = d02; d33 = d03;
69
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
70
0
                        {
71
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
72
0
                            const float* psy = src + sy * p.srcW * dstC + dc;
73
0
                            const float* pwy = weight + ky * p.kernelX * dstC + dc;
74
0
                            if (sy < p.srcH)
75
0
                            {
76
0
                                for (size_t kx = 0; kx < kernelX; ++kx)
77
0
                                {
78
0
                                    size_t sx = sx0 + kx * dilationX;
79
0
                                    const float* pw = pwy + kx * dstC;
80
0
                                    __mmask16 mask0 = sx + 0 * strideX < srcW ? 0xFFFF : 0x0000;
81
0
                                    __mmask16 mask1 = sx + 1 * strideX < srcW ? 0xFFFF : 0x0000;
82
0
                                    __mmask16 mask2 = sx + 2 * strideX < srcW ? 0xFFFF : 0x0000;
83
0
                                    __mmask16 mask3 = sx + 3 * strideX < srcW ? 0xFFFF : 0x0000;
84
0
                                    const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX, * ps2 = ps0 + 2 * sX, * ps3 = ps0 + 3 * sX;
85
86
0
                                    w0 = _mm512_loadu_ps(pw + 0 * F);
87
0
                                    d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0);
88
0
                                    d10 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 0 * F), w0, d10, mask1);
89
0
                                    d20 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 0 * F), w0, d20, mask2);
90
0
                                    d30 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 0 * F), w0, d30, mask3);
91
0
                                    w0 = _mm512_loadu_ps(pw + 1 * F);
92
0
                                    d01 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 1 * F), w0, d01, mask0);
93
0
                                    d11 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 1 * F), w0, d11, mask1);
94
0
                                    d21 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 1 * F), w0, d21, mask2);
95
0
                                    d31 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 1 * F), w0, d31, mask3);
96
0
                                    w0 = _mm512_loadu_ps(pw + 2 * F);
97
0
                                    d02 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 2 * F), w0, d02, mask0);
98
0
                                    d12 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 2 * F), w0, d12, mask1);
99
0
                                    d22 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 2 * F), w0, d22, mask2);
100
0
                                    d32 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 2 * F), w0, d32, mask3);
101
0
                                    w0 = _mm512_loadu_ps(pw + 3 * F);
102
0
                                    d03 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 3 * F), w0, d03, mask0);
103
0
                                    d13 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 3 * F), w0, d13, mask1);
104
0
                                    d23 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 3 * F), w0, d23, mask2);
105
0
                                    d33 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 3 * F), w0, d33, mask3);
106
0
                                }
107
0
                            }
108
0
                        }
109
0
                        _mm512_storeu_ps(dst0 + dc + 0 * F, Activate<type>(d00, params, dc + 0 * F));
110
0
                        _mm512_storeu_ps(dst0 + dc + 1 * F, Activate<type>(d01, params, dc + 1 * F));
111
0
                        _mm512_storeu_ps(dst0 + dc + 2 * F, Activate<type>(d02, params, dc + 2 * F));
112
0
                        _mm512_storeu_ps(dst0 + dc + 3 * F, Activate<type>(d03, params, dc + 3 * F));
113
0
                        _mm512_storeu_ps(dst1 + dc + 0 * F, Activate<type>(d10, params, dc + 0 * F));
114
0
                        _mm512_storeu_ps(dst1 + dc + 1 * F, Activate<type>(d11, params, dc + 1 * F));
115
0
                        _mm512_storeu_ps(dst1 + dc + 2 * F, Activate<type>(d12, params, dc + 2 * F));
116
0
                        _mm512_storeu_ps(dst1 + dc + 3 * F, Activate<type>(d13, params, dc + 3 * F));
117
0
                        _mm512_storeu_ps(dst2 + dc + 0 * F, Activate<type>(d20, params, dc + 0 * F));
118
0
                        _mm512_storeu_ps(dst2 + dc + 1 * F, Activate<type>(d21, params, dc + 1 * F));
119
0
                        _mm512_storeu_ps(dst2 + dc + 2 * F, Activate<type>(d22, params, dc + 2 * F));
120
0
                        _mm512_storeu_ps(dst2 + dc + 3 * F, Activate<type>(d23, params, dc + 3 * F));
121
0
                        _mm512_storeu_ps(dst3 + dc + 0 * F, Activate<type>(d30, params, dc + 0 * F));
122
0
                        _mm512_storeu_ps(dst3 + dc + 1 * F, Activate<type>(d31, params, dc + 1 * F));
123
0
                        _mm512_storeu_ps(dst3 + dc + 2 * F, Activate<type>(d32, params, dc + 2 * F));
124
0
                        _mm512_storeu_ps(dst3 + dc + 3 * F, Activate<type>(d33, params, dc + 3 * F));
125
0
                    }
126
0
                    for (; dc < dstC2F; dc += 2 * F)
127
0
                    {
128
0
                        if (bias)
129
0
                        {
130
0
                            d00 = _mm512_loadu_ps(bias + dc + 0 * F);
131
0
                            d01 = _mm512_loadu_ps(bias + dc + 1 * F);
132
0
                        }
133
0
                        else
134
0
                        {
135
0
                            d00 = _mm512_setzero_ps();
136
0
                            d01 = _mm512_setzero_ps();
137
0
                        }
138
0
                        d10 = d00; d11 = d01;
139
0
                        d20 = d00; d21 = d01;
140
0
                        d30 = d00; d31 = d01;
141
142
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
143
0
                        {
144
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
145
0
                            const float* psy = src + sy * p.srcW * dstC + dc;
146
0
                            const float* pwy = weight + ky * p.kernelX * dstC + dc;
147
0
                            if (sy < p.srcH)
148
0
                            {
149
0
                                for (size_t kx = 0; kx < kernelX; ++kx)
150
0
                                {
151
0
                                    size_t sx = sx0 + kx * dilationX;
152
0
                                    const float* pw = pwy + kx * dstC;
153
0
                                    __mmask16 mask0 = sx + 0 * strideX < srcW ? 0xFFFF : 0x0000;
154
0
                                    __mmask16 mask1 = sx + 1 * strideX < srcW ? 0xFFFF : 0x0000;
155
0
                                    __mmask16 mask2 = sx + 2 * strideX < srcW ? 0xFFFF : 0x0000;
156
0
                                    __mmask16 mask3 = sx + 3 * strideX < srcW ? 0xFFFF : 0x0000;
157
0
                                    const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX, * ps2 = ps0 + 2 * sX, * ps3 = ps0 + 3 * sX;
158
159
0
                                    w0 = _mm512_loadu_ps(pw + 0 * F);
160
0
                                    d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0);
161
0
                                    d10 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 0 * F), w0, d10, mask1);
162
0
                                    d20 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 0 * F), w0, d20, mask2);
163
0
                                    d30 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 0 * F), w0, d30, mask3);
164
0
                                    w0 = _mm512_loadu_ps(pw + 1 * F);
165
0
                                    d01 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 1 * F), w0, d01, mask0);
166
0
                                    d11 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 1 * F), w0, d11, mask1);
167
0
                                    d21 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 1 * F), w0, d21, mask2);
168
0
                                    d31 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 1 * F), w0, d31, mask3);
169
0
                                }
170
0
                            }
171
0
                        }
172
0
                        _mm512_storeu_ps(dst0 + dc + 0 * F, Activate<type>(d00, params, dc + 0 * F));
173
0
                        _mm512_storeu_ps(dst0 + dc + 1 * F, Activate<type>(d01, params, dc + 1 * F));
174
0
                        _mm512_storeu_ps(dst1 + dc + 0 * F, Activate<type>(d10, params, dc + 0 * F));
175
0
                        _mm512_storeu_ps(dst1 + dc + 1 * F, Activate<type>(d11, params, dc + 1 * F));
176
0
                        _mm512_storeu_ps(dst2 + dc + 0 * F, Activate<type>(d20, params, dc + 0 * F));
177
0
                        _mm512_storeu_ps(dst2 + dc + 1 * F, Activate<type>(d21, params, dc + 1 * F));
178
0
                        _mm512_storeu_ps(dst3 + dc + 0 * F, Activate<type>(d30, params, dc + 0 * F));
179
0
                        _mm512_storeu_ps(dst3 + dc + 1 * F, Activate<type>(d31, params, dc + 1 * F));
180
0
                    }
181
0
                    for (; dc < dstC; dc += F)
182
0
                    {
183
0
                        __mmask16 tailC = dc < dstCF ? __mmask16(-1) : TailMask16(dstC - dc);
184
0
                        d00 = bias ? _mm512_maskz_loadu_ps(tailC, bias + dc) : _mm512_setzero_ps();
185
0
                        d10 = d00; d20 = d00; d30 = d00;
186
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
187
0
                        {
188
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
189
0
                            const float* psy = src + sy * p.srcW * dstC + dc;
190
0
                            const float* pwy = weight + ky * p.kernelX * dstC + dc;
191
0
                            if (sy < p.srcH)
192
0
                            {
193
0
                                for (size_t kx = 0; kx < kernelX; ++kx)
194
0
                                {
195
0
                                    size_t sx = sx0 + kx * dilationX;
196
0
                                    const float* pw = pwy + kx * dstC;
197
0
                                    __mmask16 mask0 = sx + 0 * strideX < srcW ? tailC : 0x0000;
198
0
                                    __mmask16 mask1 = sx + 1 * strideX < srcW ? tailC : 0x0000;
199
0
                                    __mmask16 mask2 = sx + 2 * strideX < srcW ? tailC : 0x0000;
200
0
                                    __mmask16 mask3 = sx + 3 * strideX < srcW ? tailC : 0x0000;
201
0
                                    const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX, * ps2 = ps0 + 2 * sX, * ps3 = ps0 + 3 * sX;
202
203
0
                                    w0 = _mm512_loadu_ps(pw + 0 * F);
204
0
                                    d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0);
205
0
                                    d10 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 0 * F), w0, d10, mask1);
206
0
                                    d20 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 0 * F), w0, d20, mask2);
207
0
                                    d30 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 0 * F), w0, d30, mask3);
208
0
                                }
209
0
                            }
210
0
                        }
211
0
                        _mm512_mask_storeu_ps(dst0 + dc, tailC, Activate<type>(d00, params, dc, tailC));
212
0
                        _mm512_mask_storeu_ps(dst1 + dc, tailC, Activate<type>(d10, params, dc, tailC));
213
0
                        _mm512_mask_storeu_ps(dst2 + dc, tailC, Activate<type>(d20, params, dc, tailC));
214
0
                        _mm512_mask_storeu_ps(dst3 + dc, tailC, Activate<type>(d30, params, dc, tailC));
215
0
                    }
216
0
                    dst += 4 * p.dstC;
217
0
                }
218
0
                for (; dx < dstW2; dx += 2)
219
0
                {
220
0
                    float* dst0 = dst + 0 * p.dstC, *dst1 = dst + 1 * p.dstC;
221
0
                    size_t sx0 = dx * p.strideX - p.padX;
222
0
                    size_t dc = 0;
223
0
                    for (; dc < dstC4F; dc += 4 * F)
224
0
                    {
225
0
                        if (bias)
226
0
                        {
227
0
                            d00 = _mm512_loadu_ps(bias + dc + 0 * F);
228
0
                            d01 = _mm512_loadu_ps(bias + dc + 1 * F);
229
0
                            d02 = _mm512_loadu_ps(bias + dc + 2 * F);
230
0
                            d03 = _mm512_loadu_ps(bias + dc + 3 * F);
231
0
                        }
232
0
                        else
233
0
                        {
234
0
                            d00 = _mm512_setzero_ps();
235
0
                            d01 = _mm512_setzero_ps();
236
0
                            d02 = _mm512_setzero_ps();
237
0
                            d03 = _mm512_setzero_ps();
238
0
                        }
239
0
                        d10 = d00; d11 = d01; d12 = d02; d13 = d03;
240
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
241
0
                        {
242
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
243
0
                            const float* psy = src + sy * p.srcW * dstC + dc;
244
0
                            const float* pwy = weight + ky * p.kernelX * dstC + dc;
245
0
                            if (sy < p.srcH)
246
0
                            {
247
0
                                for (size_t kx = 0; kx < kernelX; ++kx)
248
0
                                {
249
0
                                    size_t sx = sx0 + kx * dilationX;
250
0
                                    const float* pw = pwy + kx * dstC;
251
0
                                    __mmask16 mask0 = sx + 0 * strideX < srcW ? 0xFFFF : 0x0000;
252
0
                                    __mmask16 mask1 = sx + 1 * strideX < srcW ? 0xFFFF : 0x0000;
253
0
                                    const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX;
254
255
0
                                    w0 = _mm512_loadu_ps(pw + 0 * F);
256
0
                                    d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0);
257
0
                                    d10 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 0 * F), w0, d10, mask1);
258
0
                                    w0 = _mm512_loadu_ps(pw + 1 * F);
259
0
                                    d01 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 1 * F), w0, d01, mask0);
260
0
                                    d11 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 1 * F), w0, d11, mask1);                                    
261
0
                                    w0 = _mm512_loadu_ps(pw + 2 * F);
262
0
                                    d02 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 2 * F), w0, d02, mask0);
263
0
                                    d12 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 2 * F), w0, d12, mask1);                                    
264
0
                                    w0 = _mm512_loadu_ps(pw + 3 * F);
265
0
                                    d03 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 3 * F), w0, d03, mask0);
266
0
                                    d13 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 3 * F), w0, d13, mask1);
267
0
                                }
268
0
                            }
269
0
                        }
270
0
                        _mm512_storeu_ps(dst0 + dc + 0 * F, Activate<type>(d00, params, dc + 0 * F));
271
0
                        _mm512_storeu_ps(dst0 + dc + 1 * F, Activate<type>(d01, params, dc + 1 * F));
272
0
                        _mm512_storeu_ps(dst0 + dc + 2 * F, Activate<type>(d02, params, dc + 2 * F));
273
0
                        _mm512_storeu_ps(dst0 + dc + 3 * F, Activate<type>(d03, params, dc + 3 * F));
274
0
                        _mm512_storeu_ps(dst1 + dc + 0 * F, Activate<type>(d10, params, dc + 0 * F));
275
0
                        _mm512_storeu_ps(dst1 + dc + 1 * F, Activate<type>(d11, params, dc + 1 * F));
276
0
                        _mm512_storeu_ps(dst1 + dc + 2 * F, Activate<type>(d12, params, dc + 2 * F));
277
0
                        _mm512_storeu_ps(dst1 + dc + 3 * F, Activate<type>(d13, params, dc + 3 * F));
278
0
                    }
279
0
                    for (; dc < dstC2F; dc += 2 * F)
280
0
                    {
281
0
                        if (bias)
282
0
                        {
283
0
                            d00 = _mm512_loadu_ps(bias + dc + 0 * F);
284
0
                            d01 = _mm512_loadu_ps(bias + dc + 1 * F);
285
0
                        }
286
0
                        else
287
0
                        {
288
0
                            d00 = _mm512_setzero_ps();
289
0
                            d01 = _mm512_setzero_ps();
290
0
                        }
291
0
                        d10 = d00; d11 = d01;
292
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
293
0
                        {
294
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
295
0
                            const float* psy = src + sy * p.srcW * dstC + dc;
296
0
                            const float* pwy = weight + ky * p.kernelX * dstC + dc;
297
0
                            if (sy < p.srcH)
298
0
                            {
299
0
                                for (size_t kx = 0; kx < kernelX; ++kx)
300
0
                                {
301
0
                                    size_t sx = sx0 + kx * dilationX;
302
0
                                    const float* pw = pwy + kx * dstC;
303
0
                                    __mmask16 mask0 = sx + 0 * strideX < srcW ? 0xFFFF : 0x0000;
304
0
                                    __mmask16 mask1 = sx + 1 * strideX < srcW ? 0xFFFF : 0x0000;
305
0
                                    const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX;
306
307
0
                                    w0 = _mm512_loadu_ps(pw + 0 * F);
308
0
                                    d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0);
309
0
                                    d10 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 0 * F), w0, d10, mask1);
310
0
                                    w0 = _mm512_loadu_ps(pw + 1 * F);
311
0
                                    d01 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 1 * F), w0, d01, mask0);
312
0
                                    d11 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 1 * F), w0, d11, mask1);
313
0
                                }
314
0
                            }
315
0
                        }
316
0
                        _mm512_storeu_ps(dst0 + dc + 0 * F, Activate<type>(d00, params, dc + 0 * F));
317
0
                        _mm512_storeu_ps(dst0 + dc + 1 * F, Activate<type>(d01, params, dc + 1 * F));
318
0
                        _mm512_storeu_ps(dst1 + dc + 0 * F, Activate<type>(d10, params, dc + 0 * F));
319
0
                        _mm512_storeu_ps(dst1 + dc + 1 * F, Activate<type>(d11, params, dc + 1 * F));
320
0
                    }
321
0
                    for (; dc < dstC; dc += F)
322
0
                    {
323
0
                        __mmask16 tailC = dc < dstCF ? __mmask16(-1) : TailMask16(dstC - dc);
324
0
                        d00 = bias ? _mm512_maskz_loadu_ps(tailC, bias + dc) : _mm512_setzero_ps();
325
0
                        d10 = d00;
326
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
327
0
                        {
328
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
329
0
                            const float* psy = src + sy * p.srcW * dstC + dc;
330
0
                            const float* pwy = weight + ky * p.kernelX * dstC + dc;
331
0
                            if (sy < p.srcH)
332
0
                            {
333
0
                                for (size_t kx = 0; kx < kernelX; ++kx)
334
0
                                {
335
0
                                    size_t sx = sx0 + kx * dilationX;
336
0
                                    const float* pw = pwy + kx * dstC;
337
0
                                    __mmask16 mask0 = sx + 0 * strideX < srcW ? tailC : 0x0000;
338
0
                                    __mmask16 mask1 = sx + 1 * strideX < srcW ? tailC : 0x0000;
339
0
                                    const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX;
340
341
0
                                    w0 = _mm512_maskz_loadu_ps(tailC, pw);
342
0
                                    d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0);
343
0
                                    d10 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 0 * F), w0, d10, mask1);
344
0
                                }
345
0
                            }
346
0
                        }
347
0
                        _mm512_mask_storeu_ps(dst0 + dc, tailC, Activate<type>(d00, params, dc, tailC));
348
0
                        _mm512_mask_storeu_ps(dst1 + dc, tailC, Activate<type>(d10, params, dc, tailC));
349
0
                    }
350
0
                    dst += 2 * p.dstC;
351
0
                }
352
0
                for (; dx < p.dstW; ++dx)
353
0
                {
354
0
                    size_t dc = 0;
355
0
                    for (; dc < dstC4F; dc += 4 * F)
356
0
                    {
357
0
                        if (bias)
358
0
                        {
359
0
                            d00 = _mm512_loadu_ps(bias + dc + 0 * F);
360
0
                            d01 = _mm512_loadu_ps(bias + dc + 1 * F);
361
0
                            d02 = _mm512_loadu_ps(bias + dc + 2 * F);
362
0
                            d03 = _mm512_loadu_ps(bias + dc + 3 * F);
363
0
                        }
364
0
                        else
365
0
                        {
366
0
                            d00 = _mm512_setzero_ps();
367
0
                            d01 = _mm512_setzero_ps();
368
0
                            d02 = _mm512_setzero_ps();
369
0
                            d03 = _mm512_setzero_ps();
370
0
                        }
371
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
372
0
                        {
373
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
374
0
                            if (sy < p.srcH)
375
0
                            {
376
0
                                for (size_t kx = 0; kx < p.kernelX; ++kx)
377
0
                                {
378
0
                                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
379
0
                                    if (sx < p.srcW)
380
0
                                    {
381
0
                                        const float * pw = weight + (ky*p.kernelX + kx)* dstC + dc;
382
0
                                        const float * ps = src + (sy*p.srcW + sx)* dstC + dc;
383
0
                                        d00 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * F), _mm512_loadu_ps(pw + 0 * F), d00);
384
0
                                        d01 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * F), _mm512_loadu_ps(pw + 1 * F), d01);
385
0
                                        d02 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 2 * F), _mm512_loadu_ps(pw + 2 * F), d02);
386
0
                                        d03 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 3 * F), _mm512_loadu_ps(pw + 3 * F), d03);
387
0
                                    }
388
0
                                }
389
0
                            }
390
0
                        }
391
0
                        _mm512_storeu_ps(dst + dc + 0 * F, Activate<type>(d00, params, dc + 0 * F));
392
0
                        _mm512_storeu_ps(dst + dc + 1 * F, Activate<type>(d01, params, dc + 1 * F));
393
0
                        _mm512_storeu_ps(dst + dc + 2 * F, Activate<type>(d02, params, dc + 2 * F));
394
0
                        _mm512_storeu_ps(dst + dc + 3 * F, Activate<type>(d03, params, dc + 3 * F));
395
0
                    }
396
0
                    for (; dc < dstC2F; dc += 2 * F)
397
0
                    {
398
0
                        if (bias)
399
0
                        {
400
0
                            d00 = _mm512_loadu_ps(bias + dc + 0 * F);
401
0
                            d01 = _mm512_loadu_ps(bias + dc + 1 * F);
402
0
                        }
403
0
                        else
404
0
                        {
405
0
                            d00 = _mm512_setzero_ps();
406
0
                            d01 = _mm512_setzero_ps();
407
0
                        }
408
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
409
0
                        {
410
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
411
0
                            if (sy < p.srcH)
412
0
                            {
413
0
                                for (size_t kx = 0; kx < p.kernelX; ++kx)
414
0
                                {
415
0
                                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
416
0
                                    if (sx < p.srcW)
417
0
                                    {
418
0
                                        const float * pw = weight + (ky*p.kernelX + kx)* dstC + dc;
419
0
                                        const float * ps = src + (sy*p.srcW + sx)* dstC + dc;
420
0
                                        d00 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * F), _mm512_loadu_ps(pw + 0 * F), d00);
421
0
                                        d01 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * F), _mm512_loadu_ps(pw + 1 * F), d01);
422
0
                                    }
423
0
                                }
424
0
                            }
425
0
                        }
426
0
                        _mm512_storeu_ps(dst + dc + 0 * F, Activate<type>(d00, params, dc + 0 * F));
427
0
                        _mm512_storeu_ps(dst + dc + 1 * F, Activate<type>(d01, params, dc + 1 * F));
428
0
                    }
429
0
                    for (; dc < dstC; dc += F)
430
0
                    {
431
0
                        __mmask16 tailC = dc < dstCF ? __mmask16(-1) : TailMask16(dstC - dc);
432
0
                        d00 = bias ? _mm512_maskz_loadu_ps(tailC, bias + dc) : _mm512_setzero_ps();
433
0
                        for (size_t ky = 0; ky < p.kernelY; ++ky)
434
0
                        {
435
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
436
0
                            if (sy < p.srcH)
437
0
                            {
438
0
                                for (size_t kx = 0; kx < p.kernelX; ++kx)
439
0
                                {
440
0
                                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
441
0
                                    if (sx < p.srcW)
442
0
                                    {
443
0
                                        const float * pw = weight + (ky*p.kernelX + kx)* dstC + dc;
444
0
                                        const float * ps = src + (sy*p.srcW + sx)* dstC + dc;
445
0
                                        d00 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tailC, ps), _mm512_maskz_loadu_ps(tailC, pw), d00);
446
0
                                    }
447
0
                                }
448
0
                            }
449
0
                        }
450
0
                        _mm512_mask_storeu_ps(dst + dc, tailC, Activate<type>(d00, params, dc, tailC));
451
0
                    }
452
0
                    dst += p.dstC;
453
0
                }
454
0
            }
455
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
456
457
        //-------------------------------------------------------------------------------------------------
458
459
        // Computes one output pixel (dy, dx) of a 3x3 depthwise convolution for all p.srcC channels.
        // Every kernel tap is tested against the source bounds (p.srcH / p.srcW) and skipped when it
        // falls outside, so the accumulator only receives in-range taps.
        //   src    - source data, addressed as (sy * p.srcW + sx) * srcC + channel.
        //   weight - kernel data, addressed as (ky * 3 + kx) * srcC + channel.
        //   bias   - optional per-channel start value; when null the sum starts at zero.
        //   params - forwarded unchanged to Activate<type> together with the channel offset.
        //   dst    - receives srcC results at dst[0 .. srcC).
        // NOTE(review): dst is only offset by the channel index here, so it presumably already
        // points at the output pixel (dy, dx) - confirm against the caller.
        template<::SimdConvolutionActivationType type> 
460
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Edge(const float * src, const ConvParam & p, size_t dy, size_t dx, const float * weight, const float * bias, const float * params, float * dst)
461
0
        {
462
0
            size_t srcC = p.srcC;
463
0
            size_t srcCF = AlignLo(srcC, F);
464
0
            size_t c = 0;
465
0
            // Channel groups that fill a whole vector: F channels per iteration, unmasked loads/stores.
            for (; c < srcCF; c += F)
466
0
            {
467
0
                __m512 sum = bias ? _mm512_loadu_ps(bias + c) : _mm512_setzero_ps();
468
0
                for (size_t ky = 0; ky < 3; ++ky)
469
0
                {
470
0
                    // NOTE(review): sy is size_t; a row above the image presumably wraps to a huge value
                    // so the single '<' test below rejects both sides - assumes unsigned ConvParam fields.
                    size_t sy = dy * p.strideY + ky - p.padY;
471
0
                    if (sy < p.srcH)
472
0
                    {
473
0
                        for (size_t kx = 0; kx < 3; ++kx)
474
0
                        {
475
0
                            size_t sx = dx * p.strideX + kx - p.padX;
476
0
                            if (sx < p.srcW)
477
0
                            {
478
0
                                // No '+ c' here: src and weight themselves are advanced by F after each group.
                                const float * pw = weight + (ky * 3 + kx) * srcC;
479
0
                                const float * ps = src + (sy*p.srcW + sx) * srcC;
480
0
                                sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps), _mm512_loadu_ps(pw), sum);
481
0
                            }
482
0
                        }
483
0
                    }
484
0
                }
485
0
                _mm512_storeu_ps(dst + c, Activate<type>(sum, params, c));
486
0
                // Step the base pointers to the next channel group (bias, params and dst keep using c).
                src += F;
487
0
                weight += F;
488
0
            }
489
0
            // Remaining srcC - c channels: same computation with masked loads and a masked store.
            if (c < srcC)
490
0
            {
491
0
                // NOTE(review): TailMask16 is defined elsewhere; presumably it enables the low (srcC - c) lanes.
                __mmask16 tail = TailMask16(srcC - c);
492
0
                __m512 sum = bias ? _mm512_maskz_loadu_ps(tail, bias + c) : _mm512_setzero_ps();
493
0
                for (size_t ky = 0; ky < 3; ++ky)
494
0
                {
495
0
                    size_t sy = dy * p.strideY + ky - p.padY;
496
0
                    if (sy < p.srcH)
497
0
                    {
498
0
                        for (size_t kx = 0; kx < 3; ++kx)
499
0
                        {
500
0
                            size_t sx = dx * p.strideX + kx - p.padX;
501
0
                            if (sx < p.srcW)
502
0
                            {
503
0
                                const float * pw = weight + (ky*3 + kx) * srcC;
504
0
                                const float * ps = src + (sy*p.srcW + sx) * srcC;
505
0
                                sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps), _mm512_maskz_loadu_ps(tail, pw), sum);
506
0
                            }
507
0
                        }
508
0
                    }
509
0
                }
510
0
                _mm512_mask_storeu_ps(dst + c, tail, Activate<type>(sum, params, c, tail));
511
0
            }
512
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*)
513
514
        // Computes one output pixel of a 3x3 depthwise convolution for all srcC channels with no
        // bounds checks: all nine taps are read unconditionally.
        //   src    - first tap of the 3x3 window; taps in a row are srcC floats apart,
        //            rows of the window are srcS floats apart.
        //   weight - kernel data, tap (ky, kx) at (ky * 3 + kx) * srcC + channel.
        //   bias   - optional per-channel start value; when null the sum starts at zero.
        //   params - forwarded unchanged to Activate<type> together with the channel offset.
        //   dst    - receives srcC results at dst[0 .. srcC).
        // NOTE(review): the caller is presumably responsible for the whole window being inside the source - confirm.
        template<::SimdConvolutionActivationType type> 
515
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main1(const float * src, size_t srcS, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)
516
0
        {
517
0
            size_t srcCF = AlignLo(srcC, F);
518
0
            size_t c = 0;
519
0
            // Channel groups that fill a whole vector: F channels per iteration.
            for (; c < srcCF; c += F)
520
0
            {
521
0
                __m512 sum = bias ? _mm512_loadu_ps(bias + c) : _mm512_setzero_ps();
522
0
                for (size_t ky = 0; ky < 3; ++ky)
523
0
                {
524
0
                    // Row ky of the window and the matching row of three kernel taps.
                    const float * ps = src + ky * srcS;
525
0
                    const float * pw = weight + ky * 3 * srcC;
526
0
                    sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * srcC), _mm512_loadu_ps(pw + 0 * srcC), sum);
527
0
                    sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * srcC), _mm512_loadu_ps(pw + 1 * srcC), sum);
528
0
                    sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 2 * srcC), _mm512_loadu_ps(pw + 2 * srcC), sum);
529
0
                }
530
0
                _mm512_storeu_ps(dst + c, Activate<type>(sum, params, c));
531
0
                // src and weight are advanced per group; bias, params and dst are addressed with c instead.
                src += F;
532
0
                weight += F;
533
0
            }
534
0
            // Remaining srcC - c channels: same computation with masked loads and a masked store.
            if (c < srcC)
535
0
            {
536
0
                // NOTE(review): TailMask16 is defined elsewhere; presumably it enables the low (srcC - c) lanes.
                __mmask16 tail = TailMask16(srcC - c);
537
0
                __m512 sum = bias ? _mm512_maskz_loadu_ps(tail, bias + c) : _mm512_setzero_ps();
538
0
                for (size_t ky = 0; ky < 3; ++ky)
539
0
                {
540
0
                    const float * ps = src + ky * srcS;
541
0
                    const float * pw = weight + ky * 3 * srcC;
542
0
                    sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps + 0 * srcC), _mm512_maskz_loadu_ps(tail, pw + 0 * srcC), sum);
543
0
                    sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps + 1 * srcC), _mm512_maskz_loadu_ps(tail, pw + 1 * srcC), sum);
544
0
                    sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps + 2 * srcC), _mm512_maskz_loadu_ps(tail, pw + 2 * srcC), sum);
545
0
                }
546
0
                _mm512_mask_storeu_ps(dst + c, tail, Activate<type>(sum, params, c, tail));
547
0
            }
548
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*)
549
550
        // Computes two output pixels of a 3x3 depthwise convolution for all srcC channels with no
        // bounds checks. Each kernel tap is loaded once and applied to both pixels.
        //   src    - first tap of the first pixel's 3x3 window; taps in a row are srcC floats apart,
        //            rows of the window are srcS floats apart.
        //   srcX   - offset in floats from the first pixel's window to the second pixel's window.
        //   weight - kernel data, tap t (0..8, row-major) at t * srcC + channel.
        //   bias   - optional per-channel start value shared by both sums; null means zero.
        //   params - forwarded unchanged to Activate<type> together with the channel offset.
        //   dst    - first pixel at dst[0 .. srcC), second pixel at dst[srcC .. 2 * srcC).
        template<::SimdConvolutionActivationType type> 
551
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main2(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)
552
0
        {
553
0
            size_t srcCF = AlignLo(srcC, F);
554
0
            size_t c = 0;
555
0
            __m512 sum0, sum1, w0;
556
0
            // Channel groups that fill a whole vector: F channels per iteration.
            for (; c < srcCF; c += F)
557
0
            {
558
0
                sum0 = bias ? _mm512_loadu_ps(bias + c) : _mm512_setzero_ps();
559
0
                sum1 = sum0;
560
0
                // Weights are addressed with c (weight itself is not advanced); pw then walks the 9 taps.
                const float * pw = weight + c;
561
0
                for (size_t ky = 0; ky < 3; ++ky)
562
0
                {
563
0
                    // ps0 / ps1: row ky of the first / second pixel's window.
                    const float * ps0 = src + ky * srcS;
564
0
                    const float * ps1 = ps0 + srcX;
565
0
                    w0 = _mm512_loadu_ps(pw);
566
0
                    sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + 0 * srcC), w0, sum0);
567
0
                    sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + 0 * srcC), w0, sum1);
568
0
                    pw += srcC;
569
0
                    w0 = _mm512_loadu_ps(pw);
570
0
                    sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + 1 * srcC), w0, sum0);
571
0
                    sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + 1 * srcC), w0, sum1);
572
0
                    pw += srcC;
573
0
                    w0 = _mm512_loadu_ps(pw);
574
0
                    sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + 2 * srcC), w0, sum0);
575
0
                    sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + 2 * srcC), w0, sum1);
576
0
                    pw += srcC;
577
0
                }
578
0
                _mm512_storeu_ps(dst + c, Activate<type>(sum0, params, c));
579
0
                _mm512_storeu_ps(dst + c + srcC, Activate<type>(sum1, params, c));
580
0
                // Only src is advanced per group; weight, bias, params and dst are addressed with c.
                src += F;
581
0
            }
582
0
            // Remaining srcC - c channels: same computation with masked loads and masked stores.
            if (c < srcC)
583
0
            {
584
0
                // NOTE(review): TailMask16 is defined elsewhere; presumably it enables the low (srcC - c) lanes.
                __mmask16 tail = TailMask16(srcC - c);
585
0
                sum0 = bias ? _mm512_maskz_loadu_ps(tail, bias + c) : _mm512_setzero_ps();
586
0
                sum1 = sum0;
587
0
                const float * pw = weight + c;
588
0
                for (size_t ky = 0; ky < 3; ++ky)
589
0
                {
590
0
                    const float * ps0 = src + ky * srcS;
591
0
                    const float * ps1 = ps0 + srcX;
592
0
                    w0 = _mm512_maskz_loadu_ps(tail, pw);
593
0
                    sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + 0 * srcC), w0, sum0);
594
0
                    sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + 0 * srcC), w0, sum1);
595
0
                    pw += srcC;
596
0
                    w0 = _mm512_maskz_loadu_ps(tail, pw);
597
0
                    sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + 1 * srcC), w0, sum0);
598
0
                    sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + 1 * srcC), w0, sum1);
599
0
                    pw += srcC;
600
0
                    w0 = _mm512_maskz_loadu_ps(tail, pw);
601
0
                    sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + 2 * srcC), w0, sum0);
602
0
                    sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + 2 * srcC), w0, sum1);
603
0
                    pw += srcC;
604
0
                }
605
0
                _mm512_mask_storeu_ps(dst + c, tail, Activate<type>(sum0, params, c, tail));
606
0
                _mm512_mask_storeu_ps(dst + c + srcC, tail, Activate<type>(sum1, params, c, tail));
607
0
            }
608
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
609
610
        // Computes four output pixels of a 3x3 depthwise convolution for all srcC channels with no
        // bounds checks. Each kernel tap is loaded once and applied to all four pixels.
        //   src    - first tap of the first pixel's 3x3 window; taps in a row are srcC floats apart,
        //            rows of the window are srcS floats apart.
        //   srcX   - offset in floats between the windows of consecutive output pixels.
        //   weight - kernel data, tap t (0..8, row-major) at t * srcC + channel.
        //   bias   - optional per-channel start value shared by all four sums; null means zero.
        //   params - forwarded unchanged to Activate<type> together with the channel offset.
        //   dst    - pixel i (0..3) is written at dst[i * srcC .. (i + 1) * srcC).
        template<::SimdConvolutionActivationType type> 
611
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main4(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst)
612
0
        {
613
0
            size_t srcCF = AlignLo(srcC, F);
614
0
            size_t c = 0;
615
0
            // Channel groups that fill a whole vector: F channels per iteration.
            for (; c < srcCF; c += F)
616
0
            {
617
0
                __m512 sum0, sum1, sum2, sum3, w0;
618
0
                sum0 = bias ? _mm512_loadu_ps(bias + c) : _mm512_setzero_ps();
619
0
                sum1 = sum0;
620
0
                sum2 = sum0;
621
0
                sum3 = sum0;
622
0
                // Weights are addressed with c (weight itself is not advanced); pw then walks the 9 taps.
                const float * pw = weight + c;
623
0
                // Window origins of the four output pixels.
                const float * ps0 = src + 0 * srcX;
624
0
                const float * ps1 = src + 1 * srcX;
625
0
                const float * ps2 = src + 2 * srcX;
626
0
                const float * ps3 = src + 3 * srcX;
627
0
                for (size_t ky = 0; ky < 3; ++ky)
628
0
                {
629
0
                    // offset selects the current tap inside every window: row ky, then +srcC per column.
                    size_t offset = ky * srcS;
630
0
                    w0 = _mm512_loadu_ps(pw);
631
0
                    sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + offset), w0, sum0);
632
0
                    sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + offset), w0, sum1);
633
0
                    sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(ps2 + offset), w0, sum2);                    
634
0
                    sum3 = _mm512_fmadd_ps(_mm512_loadu_ps(ps3 + offset), w0, sum3);                    
635
0
                    pw += srcC, offset += srcC;
636
0
                    w0 = _mm512_loadu_ps(pw);
637
0
                    sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + offset), w0, sum0);
638
0
                    sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + offset), w0, sum1);
639
0
                    sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(ps2 + offset), w0, sum2);
640
0
                    sum3 = _mm512_fmadd_ps(_mm512_loadu_ps(ps3 + offset), w0, sum3);
641
0
                    pw += srcC, offset += srcC;
642
0
                    w0 = _mm512_loadu_ps(pw);
643
0
                    sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + offset), w0, sum0);
644
0
                    sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + offset), w0, sum1);
645
0
                    sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(ps2 + offset), w0, sum2);
646
0
                    sum3 = _mm512_fmadd_ps(_mm512_loadu_ps(ps3 + offset), w0, sum3);
647
0
                    pw += srcC, offset += srcC;
648
0
                }
649
0
                // dst carries the channel offset itself (advanced below), so no '+ c' in the store addresses.
                _mm512_storeu_ps(dst + 0 * srcC, Activate<type>(sum0, params, c));
650
0
                _mm512_storeu_ps(dst + 1 * srcC, Activate<type>(sum1, params, c));
651
0
                _mm512_storeu_ps(dst + 2 * srcC, Activate<type>(sum2, params, c));
652
0
                _mm512_storeu_ps(dst + 3 * srcC, Activate<type>(sum3, params, c));
653
0
                // src and dst are advanced per group; weight, bias and params are addressed with c.
                src += F;
654
0
                dst += F;
655
0
            }
656
0
            // Remaining srcC - c channels: same computation with masked loads and masked stores.
            if (c < srcC)
657
0
            {
658
0
                // NOTE(review): TailMask16 is defined elsewhere; presumably it enables the low (srcC - c) lanes.
                __mmask16 tail = TailMask16(srcC - c);
659
0
                __m512 sum0, sum1, sum2, sum3, w0;
660
0
                sum0 = bias ? _mm512_maskz_loadu_ps(tail, bias + c) : _mm512_setzero_ps();
661
0
                sum1 = sum0;
662
0
                sum2 = sum0;
663
0
                sum3 = sum0;
664
0
                const float * pw = weight + c;
665
0
                const float * ps0 = src + 0 * srcX;
666
0
                const float * ps1 = src + 1 * srcX;
667
0
                const float * ps2 = src + 2 * srcX;
668
0
                const float * ps3 = src + 3 * srcX;
669
0
                for (size_t ky = 0; ky < 3; ++ky)
670
0
                {
671
0
                    size_t offset = ky * srcS;
672
0
                    w0 = _mm512_maskz_loadu_ps(tail, pw);
673
0
                    sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + offset), w0, sum0);
674
0
                    sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + offset), w0, sum1);
675
0
                    sum2 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps2 + offset), w0, sum2);
676
0
                    sum3 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps3 + offset), w0, sum3);
677
0
                    pw += srcC, offset += srcC;
678
0
                    w0 = _mm512_maskz_loadu_ps(tail, pw);
679
0
                    sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + offset), w0, sum0);
680
0
                    sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + offset), w0, sum1);
681
0
                    sum2 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps2 + offset), w0, sum2);
682
0
                    sum3 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps3 + offset), w0, sum3);
683
0
                    pw += srcC, offset += srcC;
684
0
                    w0 = _mm512_maskz_loadu_ps(tail, pw);
685
0
                    sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + offset), w0, sum0);
686
0
                    sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + offset), w0, sum1);
687
0
                    sum2 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps2 + offset), w0, sum2);
688
0
                    sum3 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps3 + offset), w0, sum3);
689
0
                    pw += srcC, offset += srcC;
690
0
                }
691
0
                _mm512_mask_storeu_ps(dst + 0 * srcC, tail, Activate<type>(sum0, params, c, tail));
692
0
                _mm512_mask_storeu_ps(dst + 1 * srcC, tail, Activate<type>(sum1, params, c, tail));
693
0
                _mm512_mask_storeu_ps(dst + 2 * srcC, tail, Activate<type>(sum2, params, c, tail));
694
0
                _mm512_mask_storeu_ps(dst + 3 * srcC, tail, Activate<type>(sum3, params, c, tail));
695
0
            }
696
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*)
697
698
        // Computes one output pixel (dy, dx) of a 3x3 depthwise convolution for a single vector of
        // channels, testing every kernel tap against the source bounds before it is accumulated.
        //   src    - source data; pixel (sy, sx) is read at (sy * p.srcW + sx) * F, i.e. pixels are F floats apart.
        //   weight - nine kernel taps already held as vectors, indexed ky * 3 + kx.
        //   bias   - start value of the accumulator, passed as a vector.
        //   params - forwarded unchanged to Activate<type> with channel offset 0.
        //   dst    - receives one full vector of results.
        // NOTE(review): the F-float pixel stride suggests this variant is meant for srcC == F - confirm against the caller.
        template<::SimdConvolutionActivationType type> 
699
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Edge16(const float * src, const ConvParam & p, size_t dy, size_t dx, const __m512 * weight, __m512 bias, const float * params, float * dst)
700
0
        {
701
0
            __m512 sum = bias;
702
0
            for (size_t ky = 0; ky < 3; ++ky)
703
0
            {
704
0
                // NOTE(review): sy is size_t; a row above the image presumably wraps to a huge value
                // so the single '<' test below rejects both sides - assumes unsigned ConvParam fields.
                size_t sy = dy * p.strideY + ky - p.padY;
705
0
                if (sy < p.srcH)
706
0
                {
707
0
                    for (size_t kx = 0; kx < 3; ++kx)
708
0
                    {
709
0
                        size_t sx = dx * p.strideX + kx - p.padX;
710
0
                        if (sx < p.srcW)
711
0
                        {
712
0
                            const float * ps = src + (sy*p.srcW + sx) * F;
713
0
                            sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps), weight[ky * 3 + kx], sum);
714
0
                        }
715
0
                    }
716
0
                }
717
0
            }
718
0
            _mm512_storeu_ps(dst, Activate<type>(sum, params, 0));
719
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
720
721
        // Computes one output pixel of a 3x3 depthwise convolution for a single vector of channels
        // with no bounds checks: all nine taps are read unconditionally, fully unrolled.
        //   src    - first tap of the 3x3 window; taps in a row are F floats apart,
        //            rows of the window are srcS floats apart.
        //   weight - nine kernel taps already held as vectors, in row-major order (index ky * 3 + kx).
        //   bias   - start value of the accumulator, passed as a vector.
        //   params - forwarded unchanged to Activate<type> with channel offset 0.
        //   dst    - receives one full vector of results.
        // NOTE(review): the F-float tap stride suggests this variant is meant for srcC == F - confirm against the caller.
        template<::SimdConvolutionActivationType type> 
722
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main16x1(const float * src, size_t srcS, const __m512 * weight, __m512 bias, const float * params, float * dst)
723
0
        {
724
0
            __m512 sum = bias;
725
0
            // Window row 0.
            sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[0], sum);
726
0
            sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[1], sum);
727
0
            sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[2], sum);
728
0
            // Window row 1.
            src += srcS;
729
0
            sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[3], sum);
730
0
            sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[4], sum);
731
0
            sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[5], sum);
732
0
            // Window row 2.
            src += srcS;
733
0
            sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[6], sum);
734
0
            sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[7], sum);
735
0
            sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[8], sum);
736
0
            _mm512_storeu_ps(dst, Activate<type>(sum, params, 0));
737
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)0>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)1>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)2>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)3>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)4>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)5>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)6>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)7>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)8>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)9>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)10>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
738
739
        // Computes two horizontally adjacent output pixels of a 3x3 depthwise convolution
        // for a single __m512 group of channels.
        //   src    - points to the top-left tap of the first window; taps in a row are F floats apart.
        //   srcS   - step (in floats) from one window row to the next.
        //   weight - 9 preloaded weight vectors (3 per row); the local pointer is advanced by 3 per row.
        //   bias   - initial value of both accumulators.
        //   params - activation parameters forwarded to Activate<type> with offset 0.
        //   dst    - receives two vectors, at dst and dst + F.
        // Each row loads 4 source vectors once; the two windows overlap in s1 and s2.
        template<::SimdConvolutionActivationType type> 
740
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main16x2(const float * src, size_t srcS, const __m512 * weight, __m512 bias, const float * params, float * dst)
741
0
        {
742
0
            __m512 sum0 = bias;
743
0
            __m512 sum1 = bias;
744
0
            for (size_t ky = 0; ky < 3; ++ky)
745
0
            {
746
0
                __m512 s0 = _mm512_loadu_ps(src + 0 * F);
747
0
                __m512 s1 = _mm512_loadu_ps(src + 1 * F);
748
0
                __m512 s2 = _mm512_loadu_ps(src + 2 * F);
749
0
                __m512 s3 = _mm512_loadu_ps(src + 3 * F);
750
0
                sum0 = _mm512_fmadd_ps(s0, weight[0], sum0);
751
0
                sum1 = _mm512_fmadd_ps(s1, weight[0], sum1);
752
0
                sum0 = _mm512_fmadd_ps(s1, weight[1], sum0);
753
0
                sum1 = _mm512_fmadd_ps(s2, weight[1], sum1);
754
0
                sum0 = _mm512_fmadd_ps(s2, weight[2], sum0);
755
0
                sum1 = _mm512_fmadd_ps(s3, weight[2], sum1);
756
0
                src += srcS; // next source row
757
0
                weight += 3; // next row of kernel weights
758
0
            }
759
0
            _mm512_storeu_ps(dst + 0, Activate<type>(sum0, params, 0));
760
0
            _mm512_storeu_ps(dst + F, Activate<type>(sum1, params, 0));
761
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)0>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)1>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)2>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)3>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)4>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)5>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)6>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)7>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)8>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)9>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)10>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*)
762
763
        // Computes one output pixel of a 3x3 depthwise convolution for three consecutive
        // __m512 channel groups at once (the caller uses it only when p.srcC == 48).
        //   src    - points to the top-left tap of the window; each row reads 9 consecutive vectors
        //            (src + 0 * F .. src + 8 * F).
        //   srcS   - step (in floats) from one window row to the next.
        //   weight - 27 preloaded weight vectors, 9 per kernel row, in the same order as the source vectors.
        //   bias   - optional; when null the accumulators start from zero, otherwise 3 vectors are loaded.
        //   params - activation parameters; Activate<type> is called with offsets 0, F and 2 * F.
        //   dst    - receives 3 vectors (dst, dst + F, dst + 2 * F).
        // Vector k of a row goes into accumulator k % 3, so sum0/sum1/sum2 each collect one channel group
        // across the three horizontal taps (this relies on 3 vectors per pixel, presumably F == 16 - confirm).
        template<::SimdConvolutionActivationType type>
764
        SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main48(const float * src, size_t srcS, const __m512 * weight, const float * bias, const float * params, float * dst)
765
0
        {
766
0
            __m512 sum0, sum1, sum2;
767
0
            if (bias)
768
0
            {
769
0
                sum0 = _mm512_loadu_ps(bias + 0 * F);
770
0
                sum1 = _mm512_loadu_ps(bias + 1 * F);
771
0
                sum2 = _mm512_loadu_ps(bias + 2 * F);
772
0
            }
773
0
            else
774
0
            {
775
0
                sum0 = _mm512_setzero_ps();
776
0
                sum1 = _mm512_setzero_ps();
777
0
                sum2 = _mm512_setzero_ps();
778
0
            }
779
0
            sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[0], sum0);
780
0
            sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[1], sum1);
781
0
            sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[2], sum2);
782
0
            sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 3 * F), weight[3], sum0);
783
0
            sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 4 * F), weight[4], sum1);
784
0
            sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 5 * F), weight[5], sum2);
785
0
            sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 6 * F), weight[6], sum0);
786
0
            sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 7 * F), weight[7], sum1);
787
0
            sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 8 * F), weight[8], sum2);
788
0
            src += srcS; // second row of the window
789
0
            weight += 9; // weights of the second kernel row
790
0
            sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[0], sum0);
791
0
            sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[1], sum1);
792
0
            sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[2], sum2);
793
0
            sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 3 * F), weight[3], sum0);
794
0
            sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 4 * F), weight[4], sum1);
795
0
            sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 5 * F), weight[5], sum2);
796
0
            sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 6 * F), weight[6], sum0);
797
0
            sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 7 * F), weight[7], sum1);
798
0
            sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 8 * F), weight[8], sum2);
799
0
            src += srcS; // third row of the window
800
0
            weight += 9; // weights of the third kernel row
801
0
            sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[0], sum0);
802
0
            sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[1], sum1);
803
0
            sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[2], sum2);
804
0
            sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 3 * F), weight[3], sum0);
805
0
            sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 4 * F), weight[4], sum1);
806
0
            sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 5 * F), weight[5], sum2);
807
0
            sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 6 * F), weight[6], sum0);
808
0
            sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 7 * F), weight[7], sum1);
809
0
            sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 8 * F), weight[8], sum2);
810
0
            _mm512_storeu_ps(dst + 0 * F, Activate<type>(sum0, params, 0 * F));
811
0
            _mm512_storeu_ps(dst + 1 * F, Activate<type>(sum1, params, 1 * F));
812
0
            _mm512_storeu_ps(dst + 2 * F, Activate<type>(sum2, params, 2 * F));
813
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)0>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)1>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)2>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)3>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)4>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)5>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)6>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)7>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)8>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)9>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)10>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*)
814
815
        // Driver for the 3x3 depthwise convolution: walks all output pixels in row-major order and
        // dispatches each one to an Edge kernel (border pixels) or a Main kernel (interior pixels).
        //   src, dst - input and output buffers; dst is advanced pixel by pixel as results are written.
        //   p        - convolution parameters (sizes, strides, pads).
        //   weight   - kernel weights; bias - optional bias (may be null); params - activation parameters.
        // Output rows [0, p.padY) and [p.dstH - p.padH, p.dstH), and output columns [0, p.padX) and
        // [p.dstW - p.padW, p.dstW) are handled by the Edge kernels; everything else by the Main kernels.
        // Two paths exist:
        //   * p.dstC == F and p.strideX == 1: weights and bias are preloaded into registers once and the
        //     *16 kernels are used (the source offset is advanced by F per pixel, which presumably
        //     relies on srcC == dstC for a depthwise convolution - confirm);
        //   * otherwise: generic kernels, with a dedicated Main48 kernel when p.srcC == 48.
        template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwise3x3(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst)
816
0
        {
817
0
            size_t srcS = p.srcC*p.srcW; // step between source rows, in floats
818
0
            size_t srcX = p.srcC*p.strideX; // source step per output column, in floats
819
0
            size_t dstH = p.dstH - p.padH; // end of the interior output rows
820
0
            size_t dstW = p.dstW - p.padW; // end of the interior output columns
821
0
            size_t dstW2 = AlignLo(dstW - p.padX, 2) + p.padX; // end of the interior columns processed 2 at a time
822
0
            size_t dstW4 = AlignLo(dstW - p.padX, 4) + p.padX; // end of the interior columns processed 4 at a time
823
0
            if (p.dstC == F && p.strideX == 1)
824
0
            {
825
0
                __m512 _weight[9];
826
0
                for (size_t i = 0; i < 9; ++i)
827
0
                    _weight[i] = _mm512_loadu_ps(weight + i * F);
828
0
                __m512 _bias = bias ? _mm512_loadu_ps(bias) : _mm512_setzero_ps();
829
0
                size_t dy = 0;
830
0
                for (; dy < p.padY; ++dy) // top border rows
831
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
832
0
                        Convolution32fNhwcDepthwise3x3Edge16<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
833
0
                for (; dy < dstH; ++dy) // interior rows
834
0
                {
835
0
                    size_t dx = 0;
836
0
                    for (; dx < p.padX; ++dx) // left border columns
837
0
                        Convolution32fNhwcDepthwise3x3Edge16<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
838
0
                    size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC; // top-left tap of the first interior window
839
0
                    for (; dx < dstW2; dx += 2)
840
0
                        Convolution32fNhwcDepthwise3x3Main16x2<type>(src + offset, srcS, _weight, _bias, params, dst), offset += 2*F, dst += 2*F;
841
0
                    for (; dx < dstW; ++dx) // at most one leftover interior column
842
0
                        Convolution32fNhwcDepthwise3x3Main16x1<type>(src + offset, srcS, _weight, _bias, params, dst), offset += F, dst += F;
843
0
                    for (; dx < p.dstW; ++dx) // right border columns
844
0
                        Convolution32fNhwcDepthwise3x3Edge16<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
845
0
                }
846
0
                for (; dy < p.dstH; ++dy) // bottom border rows
847
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
848
0
                        Convolution32fNhwcDepthwise3x3Edge16<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F;
849
0
            }
850
0
            else
851
0
            {
852
0
                size_t dy = 0;
853
0
                for (; dy < p.padY; ++dy) // top border rows
854
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
855
0
                        Convolution32fNhwcDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
856
0
                for (; dy < dstH; ++dy) // interior rows
857
0
                {
858
0
                    size_t dx = 0;
859
0
                    for (; dx < p.padX; ++dx) // left border columns
860
0
                        Convolution32fNhwcDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
861
0
                    size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC; // top-left tap of the first interior window
862
0
                    if (p.srcC == 48)
863
0
                    {
864
0
                        __m512 _weight[27]; // reloaded for every interior row
865
0
                        for (size_t i = 0; i < 27; ++i)
866
0
                            _weight[i] = _mm512_loadu_ps(weight + i * F);
867
0
                        for (; dx < dstW; ++dx) // consumes all interior columns, so the Main2/Main1 loops below do nothing
868
0
                            Convolution32fNhwcDepthwise3x3Main48<type>(src + offset, srcS, _weight, bias, params, dst), dst += p.dstC, offset += srcX;
869
0
                    }
870
0
                    else
871
0
                        for (; dx < dstW4; dx += 4)
872
0
                            Convolution32fNhwcDepthwise3x3Main4<type>(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 4 * p.dstC, offset += 4 * srcX;
873
0
                    for (; dx < dstW2; dx += 2)
874
0
                        Convolution32fNhwcDepthwise3x3Main2<type>(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 2 * p.dstC, offset += 2 * srcX;
875
0
                    for (; dx < dstW; ++dx)
876
0
                        Convolution32fNhwcDepthwise3x3Main1<type>(src + offset, srcS, p.srcC, weight, bias, params, dst), dst += p.dstC, offset += srcX;
877
0
                    for (; dx < p.dstW; ++dx) // right border columns
878
0
                        Convolution32fNhwcDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
879
0
                }
880
0
                for (; dy < p.dstH; ++dy) // bottom border rows
881
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
882
0
                        Convolution32fNhwcDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC;
883
0
            }
884
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
885
886
        //-------------------------------------------------------------------------------------------------
887
888
        // Returns true when the convolution parameters satisfy the preconditions asserted by
        // Convolution32fNhwcDepthwise_k7p3d1s1w4: kernel 7, pad 3, stride 1, dilation 1
        // (as reported by the ConvParam::Is* helpers) and a source width of at least 7.
        static SIMD_INLINE bool Preferable_k7p3d1s1w4(const ConvParam& p)
889
0
        {
890
0
            return p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && p.srcW >= 7;
891
0
        }
892
893
        // Depthwise convolution specialised for kernel 7, pad 3, stride 1, dilation 1 (asserted below).
        // For every output row and every group of up to F channels it produces 4 adjacent output
        // columns (d0..d3) per step, sharing the source loads between them.
        //   src, dst - input and output buffers; rows are addressed with dstW and pixels with dstC
        //              (presumably srcW == dstW and srcC == dstC for these parameters - confirm).
        //   weight   - kernel weights addressed as weight[(ky * 7 + kx) * dstC + channel].
        //   bias     - optional per-channel bias (may be null, then accumulators start from zero).
        //   params   - activation parameters: params[0] is broadcast, params[1] is broadcast for
        //              RestrictRange/Hswish/HardSigmoid, and for Prelu params[dc..] is loaded per channel group.
        // Border handling:
        //   * vertical: sy is size_t, so rows above the image wrap to a huge value and, like rows below
        //     the image, fail the `sy < srcH` test and are skipped (zero padding);
        //   * left: when dx == 0 the three taps left of the window are skipped;
        //   * right: when dx == endW the three taps right of the window are skipped.
        // dx advances by 4 and the last step is clamped to endW = dstW - 4, so the final window may
        // overlap columns written by the previous step and recompute them.
        // NOTE(review): for an intermediate step with 0 < dx < endW the loads at ps + 8 * dstC and
        // ps + 9 * dstC address columns dx + 5 and dx + 6, which reach dstW when dx is endW - 1 or
        // endW - 2. The affected outputs (d2, d3) appear to be rewritten by the final dx == endW step,
        // but the loads themselves look like they can read past the row (and past the buffer on the
        // last source row) - verify.
        template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwise_k7p3d1s1w4(const float* src, const ConvParam& p, const float* weight, const float* bias, const float* params, float* dst)
894
0
        {
895
0
            assert(p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && p.srcW >= 7);
896
897
0
            size_t dstC = p.dstC, dstW = p.dstW, srcH = p.srcH, endW = dstW - 4; // endW: start column of the last 4-wide window
898
0
            __m512 s0, s1, w0, w1, w2, w3, w4, w5, w6, d0, d1, d2, d3, _params[2];
899
0
            _params[0] = _mm512_set1_ps(params[0]);
900
0
            if (type == SimdConvolutionActivationRestrictRange ||
901
0
                type == SimdConvolutionActivationHswish ||
902
0
                type == SimdConvolutionActivationHardSigmoid)
903
0
                _params[1] = _mm512_set1_ps(params[1]);
904
0
            for (size_t dy = 0; dy < p.dstH; ++dy)
905
0
            {
906
0
                for (size_t dx = 0;; dx += Min<size_t>(4, endW - dx)) // exits via the break at dx == endW
907
0
                {
908
0
                    for (size_t dc = 0; dc < dstC; dc += F)
909
0
                    {
910
0
                        __mmask16 tail = TailMask16(dstC - dc); // mask for the channels remaining in this group
911
0
                        if (type == SimdConvolutionActivationPrelu)
912
0
                            _params[0] = _mm512_maskz_loadu_ps(tail, params + dc);
913
0
                        d0 = bias ? _mm512_maskz_loadu_ps(tail, bias + dc) : _mm512_setzero_ps();
914
0
                        d1 = d0; d2 = d0; d3 = d0;
915
0
                        for (size_t ky = 0; ky < 7; ++ky)
916
0
                        {
917
0
                            size_t sy = dy + ky - 3; // wraps around for rows above the image
918
0
                            const float* ps = src + (sy * dstW + dx - 3) * dstC + dc; // tap kx = 0 of output column dx
919
0
                            const float* pw = weight + ky * 7 * dstC + dc;
920
0
                            if (sy < srcH)
921
0
                            {
922
0
                                w0 = _mm512_maskz_loadu_ps(tail, pw + 0 * dstC);
923
0
                                w1 = _mm512_maskz_loadu_ps(tail, pw + 1 * dstC);
924
0
                                w2 = _mm512_maskz_loadu_ps(tail, pw + 2 * dstC);
925
0
                                if (dx) // the three columns left of the window exist only when dx != 0
926
0
                                {
927
0
                                    s0 = _mm512_maskz_loadu_ps(tail, ps + 0 * dstC);
928
0
                                    d0 = _mm512_fmadd_ps(s0, w0, d0);
929
930
0
                                    s1 = _mm512_maskz_loadu_ps(tail, ps + 1 * dstC);
931
0
                                    d0 = _mm512_fmadd_ps(s1, w1, d0);
932
0
                                    d1 = _mm512_fmadd_ps(s1, w0, d1);
933
934
0
                                    s0 = _mm512_maskz_loadu_ps(tail, ps + 2 * dstC);
935
0
                                    d0 = _mm512_fmadd_ps(s0, w2, d0);
936
0
                                    d1 = _mm512_fmadd_ps(s0, w1, d1);
937
0
                                    d2 = _mm512_fmadd_ps(s0, w0, d2);
938
0
                                }
939
0
                                s1 = _mm512_maskz_loadu_ps(tail, ps + 3 * dstC);
940
0
                                w3 = _mm512_maskz_loadu_ps(tail, pw + 3 * dstC);
941
0
                                d0 = _mm512_fmadd_ps(s1, w3, d0);
942
0
                                d1 = _mm512_fmadd_ps(s1, w2, d1);
943
0
                                d2 = _mm512_fmadd_ps(s1, w1, d2);
944
0
                                d3 = _mm512_fmadd_ps(s1, w0, d3);
945
946
0
                                s0 = _mm512_maskz_loadu_ps(tail, ps + 4 * dstC);
947
0
                                w4 = _mm512_maskz_loadu_ps(tail, pw + 4 * dstC);
948
0
                                d0 = _mm512_fmadd_ps(s0, w4, d0);
949
0
                                d1 = _mm512_fmadd_ps(s0, w3, d1);
950
0
                                d2 = _mm512_fmadd_ps(s0, w2, d2);
951
0
                                d3 = _mm512_fmadd_ps(s0, w1, d3);
952
953
0
                                s1 = _mm512_maskz_loadu_ps(tail, ps + 5 * dstC);
954
0
                                w5 = _mm512_maskz_loadu_ps(tail, pw + 5 * dstC);
955
0
                                d0 = _mm512_fmadd_ps(s1, w5, d0);
956
0
                                d1 = _mm512_fmadd_ps(s1, w4, d1);
957
0
                                d2 = _mm512_fmadd_ps(s1, w3, d2);
958
0
                                d3 = _mm512_fmadd_ps(s1, w2, d3);
959
960
0
                                s0 = _mm512_maskz_loadu_ps(tail, ps + 6 * dstC);
961
0
                                w6 = _mm512_maskz_loadu_ps(tail, pw + 6 * dstC);
962
0
                                d0 = _mm512_fmadd_ps(s0, w6, d0);
963
0
                                d1 = _mm512_fmadd_ps(s0, w5, d1);
964
0
                                d2 = _mm512_fmadd_ps(s0, w4, d2);
965
0
                                d3 = _mm512_fmadd_ps(s0, w3, d3);
966
0
                                if (dx < endW) // the three columns right of the window are skipped for the last window
967
0
                                {
968
0
                                    s1 = _mm512_maskz_loadu_ps(tail, ps + 7 * dstC);
969
0
                                    d1 = _mm512_fmadd_ps(s1, w6, d1);
970
0
                                    d2 = _mm512_fmadd_ps(s1, w5, d2);
971
0
                                    d3 = _mm512_fmadd_ps(s1, w4, d3);
972
973
0
                                    s0 = _mm512_maskz_loadu_ps(tail, ps + 8 * dstC);
974
0
                                    d2 = _mm512_fmadd_ps(s0, w6, d2);
975
0
                                    d3 = _mm512_fmadd_ps(s0, w5, d3);
976
977
0
                                    s1 = _mm512_maskz_loadu_ps(tail, ps + 9 * dstC);
978
0
                                    d3 = _mm512_fmadd_ps(s1, w6, d3);
979
0
                                }
980
0
                            }
981
0
                        }
982
0
                        float* pd = dst + (dy * dstW + dx) * dstC + dc;
983
0
                        _mm512_mask_storeu_ps(pd + 0 * dstC, tail, Activate<type>(d0, _params, 0));
984
0
                        _mm512_mask_storeu_ps(pd + 1 * dstC, tail, Activate<type>(d1, _params, 0));
985
0
                        _mm512_mask_storeu_ps(pd + 2 * dstC, tail, Activate<type>(d2, _params, 0));
986
0
                        _mm512_mask_storeu_ps(pd + 3 * dstC, tail, Activate<type>(d3, _params, 0));
987
0
                    }
988
0
                    if (dx == endW)
989
0
                        break;
990
0
                }
991
0
            }
992
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
993
994
        //-------------------------------------------------------------------------------------------------
995
996
        template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwise_k7p3d1s1w6(const float* src, const ConvParam& p, const float* weight, const float* bias, const float* params, float* dst)
997
0
        {
998
0
            assert(p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && AlignedAny(p.srcW, 6));
999
1000
0
            size_t dstC = p.dstC, dstW = p.dstW, srcH = p.srcH, end = dstW - 6;
1001
0
            __m512 s0, s1, w0, w1, w2, w3, w4, w5, w6, d0, d1, d2, d3, d4, d5, _params[2];
1002
0
            _params[0] = _mm512_set1_ps(params[0]);
1003
0
            if (type == SimdConvolutionActivationRestrictRange ||
1004
0
                type == SimdConvolutionActivationHswish ||
1005
0
                type == SimdConvolutionActivationHardSigmoid)
1006
0
                _params[1] = _mm512_set1_ps(params[1]);
1007
0
            for (size_t dy = 0; dy < p.dstH; ++dy)
1008
0
            {
1009
0
                for (size_t dx = 0; dx < dstW; dx += 6)
1010
0
                {
1011
0
                    for (size_t dc = 0; dc < dstC; dc += F)
1012
0
                    {
1013
0
                        __mmask16 tail = TailMask16(dstC - dc);
1014
0
                        if (type == SimdConvolutionActivationPrelu)
1015
0
                            _params[0] = _mm512_maskz_loadu_ps(tail, params + dc);
1016
0
                        d0 = bias ? _mm512_maskz_loadu_ps(tail, bias + dc) : _mm512_setzero_ps();
1017
0
                        d1 = d0; d2 = d0; d3 = d0, d4 = d0, d5 = d0;
1018
0
                        for (size_t ky = 0; ky < 7; ++ky)
1019
0
                        {
1020
0
                            size_t sy = dy + ky - 3;
1021
0
                            const float* ps = src + (sy * dstW + dx - 3) * dstC + dc;
1022
0
                            const float* pw = weight + ky * 7 * dstC + dc;
1023
0
                            if (sy < srcH)
1024
0
                            {
1025
0
                                w0 = _mm512_maskz_loadu_ps(tail, pw + 0 * dstC);
1026
0
                                w1 = _mm512_maskz_loadu_ps(tail, pw + 1 * dstC);
1027
0
                                w2 = _mm512_maskz_loadu_ps(tail, pw + 2 * dstC);
1028
0
                                if (dx)
1029
0
                                {
1030
0
                                    s0 = _mm512_maskz_loadu_ps(tail, ps + 0 * dstC);
1031
0
                                    d0 = _mm512_fmadd_ps(s0, w0, d0);
1032
1033
0
                                    s1 = _mm512_maskz_loadu_ps(tail, ps + 1 * dstC);
1034
0
                                    d0 = _mm512_fmadd_ps(s1, w1, d0);
1035
0
                                    d1 = _mm512_fmadd_ps(s1, w0, d1);
1036
1037
0
                                    s0 = _mm512_maskz_loadu_ps(tail, ps + 2 * dstC);
1038
0
                                    d0 = _mm512_fmadd_ps(s0, w2, d0);
1039
0
                                    d1 = _mm512_fmadd_ps(s0, w1, d1);
1040
0
                                    d2 = _mm512_fmadd_ps(s0, w0, d2);
1041
0
                                }
1042
0
                                s1 = _mm512_maskz_loadu_ps(tail, ps + 3 * dstC);
1043
0
                                w3 = _mm512_maskz_loadu_ps(tail, pw + 3 * dstC);
1044
0
                                d0 = _mm512_fmadd_ps(s1, w3, d0);
1045
0
                                d1 = _mm512_fmadd_ps(s1, w2, d1);
1046
0
                                d2 = _mm512_fmadd_ps(s1, w1, d2);
1047
0
                                d3 = _mm512_fmadd_ps(s1, w0, d3);
1048
1049
0
                                s0 = _mm512_maskz_loadu_ps(tail, ps + 4 * dstC);
1050
0
                                w4 = _mm512_maskz_loadu_ps(tail, pw + 4 * dstC);
1051
0
                                d0 = _mm512_fmadd_ps(s0, w4, d0);
1052
0
                                d1 = _mm512_fmadd_ps(s0, w3, d1);
1053
0
                                d2 = _mm512_fmadd_ps(s0, w2, d2);
1054
0
                                d3 = _mm512_fmadd_ps(s0, w1, d3);
1055
0
                                d4 = _mm512_fmadd_ps(s0, w0, d4);
1056
1057
0
                                s1 = _mm512_maskz_loadu_ps(tail, ps + 5 * dstC);
1058
0
                                w5 = _mm512_maskz_loadu_ps(tail, pw + 5 * dstC);
1059
0
                                d0 = _mm512_fmadd_ps(s1, w5, d0);
1060
0
                                d1 = _mm512_fmadd_ps(s1, w4, d1);
1061
0
                                d2 = _mm512_fmadd_ps(s1, w3, d2);
1062
0
                                d3 = _mm512_fmadd_ps(s1, w2, d3);
1063
0
                                d4 = _mm512_fmadd_ps(s1, w1, d4);
1064
0
                                d5 = _mm512_fmadd_ps(s1, w0, d5);
1065
1066
0
                                s0 = _mm512_maskz_loadu_ps(tail, ps + 6 * dstC);
1067
0
                                w6 = _mm512_maskz_loadu_ps(tail, pw + 6 * dstC);
1068
0
                                d0 = _mm512_fmadd_ps(s0, w6, d0);
1069
0
                                d1 = _mm512_fmadd_ps(s0, w5, d1);
1070
0
                                d2 = _mm512_fmadd_ps(s0, w4, d2);
1071
0
                                d3 = _mm512_fmadd_ps(s0, w3, d3);
1072
0
                                d4 = _mm512_fmadd_ps(s0, w2, d4);
1073
0
                                d5 = _mm512_fmadd_ps(s0, w1, d5);
1074
1075
0
                                s1 = _mm512_maskz_loadu_ps(tail, ps + 7 * dstC);
1076
0
                                d1 = _mm512_fmadd_ps(s1, w6, d1);
1077
0
                                d2 = _mm512_fmadd_ps(s1, w5, d2);
1078
0
                                d3 = _mm512_fmadd_ps(s1, w4, d3);
1079
0
                                d4 = _mm512_fmadd_ps(s1, w3, d4);
1080
0
                                d5 = _mm512_fmadd_ps(s1, w2, d5);
1081
1082
0
                                s0 = _mm512_maskz_loadu_ps(tail, ps + 8 * dstC);
1083
0
                                d2 = _mm512_fmadd_ps(s0, w6, d2);
1084
0
                                d3 = _mm512_fmadd_ps(s0, w5, d3);
1085
0
                                d4 = _mm512_fmadd_ps(s0, w4, d4);
1086
0
                                d5 = _mm512_fmadd_ps(s0, w3, d5);
1087
1088
0
                                if (dx < end)
1089
0
                                {
1090
0
                                    s1 = _mm512_maskz_loadu_ps(tail, ps + 9 * dstC);
1091
0
                                    d3 = _mm512_fmadd_ps(s1, w6, d3);
1092
0
                                    d4 = _mm512_fmadd_ps(s1, w5, d4);
1093
0
                                    d5 = _mm512_fmadd_ps(s1, w4, d5);
1094
1095
0
                                    s0 = _mm512_maskz_loadu_ps(tail, ps + 10 * dstC);
1096
0
                                    d4 = _mm512_fmadd_ps(s0, w6, d4);
1097
0
                                    d5 = _mm512_fmadd_ps(s0, w5, d5);
1098
1099
0
                                    s1 = _mm512_maskz_loadu_ps(tail, ps + 11 * dstC);
1100
0
                                    d5 = _mm512_fmadd_ps(s1, w6, d5);
1101
0
                                }
1102
0
                            }
1103
0
                        }
1104
0
                        float* pd = dst + (dy * dstW + dx) * dstC + dc;
1105
0
                        _mm512_mask_storeu_ps(pd + 0 * dstC, tail, Activate<type>(d0, _params, 0));
1106
0
                        _mm512_mask_storeu_ps(pd + 1 * dstC, tail, Activate<type>(d1, _params, 0));
1107
0
                        _mm512_mask_storeu_ps(pd + 2 * dstC, tail, Activate<type>(d2, _params, 0));
1108
0
                        _mm512_mask_storeu_ps(pd + 3 * dstC, tail, Activate<type>(d3, _params, 0));
1109
0
                        _mm512_mask_storeu_ps(pd + 4 * dstC, tail, Activate<type>(d4, _params, 0));
1110
0
                        _mm512_mask_storeu_ps(pd + 5 * dstC, tail, Activate<type>(d5, _params, 0));
1111
0
                    }
1112
0
                }
1113
0
            }
1114
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
1115
1116
        //-------------------------------------------------------------------------------------------------
1117
1118
        template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwise_k7p3d1s1w8(const float* src, const ConvParam& p, const float* weight, const float* bias, const float* params, float* dst)
1119
0
        {
1120
0
            assert(p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && Aligned(p.srcW, 8));
1121
1122
0
            size_t dstC = p.dstC, dstW = p.dstW, srcH = p.srcH, end = dstW - 8;
1123
0
            __m512 s0, s1, w0, w1, w2, w3, w4, w5, w6, d0, d1, d2, d3, d4, d5, d6, d7, _params[2];
1124
0
            _params[0] = _mm512_set1_ps(params[0]);
1125
0
            if (type == SimdConvolutionActivationRestrictRange ||
1126
0
                type == SimdConvolutionActivationHswish ||
1127
0
                type == SimdConvolutionActivationHardSigmoid)
1128
0
                _params[1] = _mm512_set1_ps(params[1]);
1129
0
            for (size_t dy = 0; dy < p.dstH; ++dy)
1130
0
            {
1131
0
                for (size_t dx = 0; dx < dstW; dx += 8)
1132
0
                {
1133
0
                    for (size_t dc = 0; dc < dstC; dc += F)
1134
0
                    {
1135
0
                        __mmask16 tail = TailMask16(dstC - dc);
1136
0
                        if (type == SimdConvolutionActivationPrelu)
1137
0
                            _params[0] = _mm512_maskz_loadu_ps(tail, params + dc);
1138
0
                        d0 = bias ? _mm512_maskz_loadu_ps(tail, bias + dc) : _mm512_setzero_ps();
1139
0
                        d1 = d0; d2 = d0; d3 = d0, d4 = d0, d5 = d0, d6 = d0, d7 = d0;
1140
0
                        for (size_t ky = 0; ky < 7; ++ky)
1141
0
                        {
1142
0
                            size_t sy = dy + ky - 3;
1143
0
                            const float* ps = src + (sy * dstW + dx - 3) * dstC + dc;
1144
0
                            const float* pw = weight + ky * 7 * dstC + dc;
1145
0
                            if (sy < srcH)
1146
0
                            {
1147
0
                                w0 = _mm512_maskz_loadu_ps(tail, pw + 0 * dstC);
1148
0
                                w1 = _mm512_maskz_loadu_ps(tail, pw + 1 * dstC);
1149
0
                                w2 = _mm512_maskz_loadu_ps(tail, pw + 2 * dstC);
1150
0
                                if (dx)
1151
0
                                {
1152
0
                                    s0 = _mm512_maskz_loadu_ps(tail, ps + 0 * dstC);
1153
0
                                    d0 = _mm512_fmadd_ps(s0, w0, d0);
1154
1155
0
                                    s1 = _mm512_maskz_loadu_ps(tail, ps + 1 * dstC);
1156
0
                                    d0 = _mm512_fmadd_ps(s1, w1, d0);
1157
0
                                    d1 = _mm512_fmadd_ps(s1, w0, d1);
1158
1159
0
                                    s0 = _mm512_maskz_loadu_ps(tail, ps + 2 * dstC);
1160
0
                                    d0 = _mm512_fmadd_ps(s0, w2, d0);
1161
0
                                    d1 = _mm512_fmadd_ps(s0, w1, d1);
1162
0
                                    d2 = _mm512_fmadd_ps(s0, w0, d2);
1163
0
                                }
1164
0
                                s1 = _mm512_maskz_loadu_ps(tail, ps + 3 * dstC);
1165
0
                                w3 = _mm512_maskz_loadu_ps(tail, pw + 3 * dstC);
1166
0
                                d0 = _mm512_fmadd_ps(s1, w3, d0);
1167
0
                                d1 = _mm512_fmadd_ps(s1, w2, d1);
1168
0
                                d2 = _mm512_fmadd_ps(s1, w1, d2);
1169
0
                                d3 = _mm512_fmadd_ps(s1, w0, d3);
1170
1171
0
                                s0 = _mm512_maskz_loadu_ps(tail, ps + 4 * dstC);
1172
0
                                w4 = _mm512_maskz_loadu_ps(tail, pw + 4 * dstC);
1173
0
                                d0 = _mm512_fmadd_ps(s0, w4, d0);
1174
0
                                d1 = _mm512_fmadd_ps(s0, w3, d1);
1175
0
                                d2 = _mm512_fmadd_ps(s0, w2, d2);
1176
0
                                d3 = _mm512_fmadd_ps(s0, w1, d3);
1177
0
                                d4 = _mm512_fmadd_ps(s0, w0, d4);
1178
1179
0
                                s1 = _mm512_maskz_loadu_ps(tail, ps + 5 * dstC);
1180
0
                                w5 = _mm512_maskz_loadu_ps(tail, pw + 5 * dstC);
1181
0
                                d0 = _mm512_fmadd_ps(s1, w5, d0);
1182
0
                                d1 = _mm512_fmadd_ps(s1, w4, d1);
1183
0
                                d2 = _mm512_fmadd_ps(s1, w3, d2);
1184
0
                                d3 = _mm512_fmadd_ps(s1, w2, d3);
1185
0
                                d4 = _mm512_fmadd_ps(s1, w1, d4);
1186
0
                                d5 = _mm512_fmadd_ps(s1, w0, d5);
1187
1188
0
                                s0 = _mm512_maskz_loadu_ps(tail, ps + 6 * dstC);
1189
0
                                w6 = _mm512_maskz_loadu_ps(tail, pw + 6 * dstC);
1190
0
                                d0 = _mm512_fmadd_ps(s0, w6, d0);
1191
0
                                d1 = _mm512_fmadd_ps(s0, w5, d1);
1192
0
                                d2 = _mm512_fmadd_ps(s0, w4, d2);
1193
0
                                d3 = _mm512_fmadd_ps(s0, w3, d3);
1194
0
                                d4 = _mm512_fmadd_ps(s0, w2, d4);
1195
0
                                d5 = _mm512_fmadd_ps(s0, w1, d5);
1196
0
                                d6 = _mm512_fmadd_ps(s0, w0, d6);
1197
1198
0
                                s1 = _mm512_maskz_loadu_ps(tail, ps + 7 * dstC);
1199
0
                                d1 = _mm512_fmadd_ps(s1, w6, d1);
1200
0
                                d2 = _mm512_fmadd_ps(s1, w5, d2);
1201
0
                                d3 = _mm512_fmadd_ps(s1, w4, d3);
1202
0
                                d4 = _mm512_fmadd_ps(s1, w3, d4);
1203
0
                                d5 = _mm512_fmadd_ps(s1, w2, d5);
1204
0
                                d6 = _mm512_fmadd_ps(s1, w1, d6);
1205
0
                                d7 = _mm512_fmadd_ps(s1, w0, d7);
1206
1207
0
                                s0 = _mm512_maskz_loadu_ps(tail, ps + 8 * dstC);
1208
0
                                d2 = _mm512_fmadd_ps(s0, w6, d2);
1209
0
                                d3 = _mm512_fmadd_ps(s0, w5, d3);
1210
0
                                d4 = _mm512_fmadd_ps(s0, w4, d4);
1211
0
                                d5 = _mm512_fmadd_ps(s0, w3, d5);
1212
0
                                d6 = _mm512_fmadd_ps(s0, w2, d6);
1213
0
                                d7 = _mm512_fmadd_ps(s0, w1, d7);
1214
1215
0
                                s1 = _mm512_maskz_loadu_ps(tail, ps + 9 * dstC);
1216
0
                                d3 = _mm512_fmadd_ps(s1, w6, d3);
1217
0
                                d4 = _mm512_fmadd_ps(s1, w5, d4);
1218
0
                                d5 = _mm512_fmadd_ps(s1, w4, d5);
1219
0
                                d6 = _mm512_fmadd_ps(s1, w3, d6);
1220
0
                                d7 = _mm512_fmadd_ps(s1, w2, d7);
1221
1222
0
                                s0 = _mm512_maskz_loadu_ps(tail, ps + 10 * dstC);
1223
0
                                d4 = _mm512_fmadd_ps(s0, w6, d4);
1224
0
                                d5 = _mm512_fmadd_ps(s0, w5, d5);
1225
0
                                d6 = _mm512_fmadd_ps(s0, w4, d6);
1226
0
                                d7 = _mm512_fmadd_ps(s0, w3, d7);
1227
1228
0
                                if (dx < end)
1229
0
                                {
1230
0
                                    s1 = _mm512_maskz_loadu_ps(tail, ps + 11 * dstC);
1231
0
                                    d5 = _mm512_fmadd_ps(s1, w6, d5);
1232
0
                                    d6 = _mm512_fmadd_ps(s1, w5, d6);
1233
0
                                    d7 = _mm512_fmadd_ps(s1, w4, d7);
1234
1235
0
                                    s0 = _mm512_maskz_loadu_ps(tail, ps + 12 * dstC);
1236
0
                                    d6 = _mm512_fmadd_ps(s0, w6, d6);
1237
0
                                    d7 = _mm512_fmadd_ps(s0, w5, d7);
1238
1239
0
                                    s1 = _mm512_maskz_loadu_ps(tail, ps + 13 * dstC);
1240
0
                                    d7 = _mm512_fmadd_ps(s1, w6, d7);
1241
0
                                }
1242
0
                            }
1243
0
                        }
1244
0
                        float* pd = dst + (dy * dstW + dx) * dstC + dc;
1245
0
                        _mm512_mask_storeu_ps(pd + 0 * dstC, tail, Activate<type>(d0, _params, 0));
1246
0
                        _mm512_mask_storeu_ps(pd + 1 * dstC, tail, Activate<type>(d1, _params, 0));
1247
0
                        _mm512_mask_storeu_ps(pd + 2 * dstC, tail, Activate<type>(d2, _params, 0));
1248
0
                        _mm512_mask_storeu_ps(pd + 3 * dstC, tail, Activate<type>(d3, _params, 0));
1249
0
                        _mm512_mask_storeu_ps(pd + 4 * dstC, tail, Activate<type>(d4, _params, 0));
1250
0
                        _mm512_mask_storeu_ps(pd + 5 * dstC, tail, Activate<type>(d5, _params, 0));
1251
0
                        _mm512_mask_storeu_ps(pd + 6 * dstC, tail, Activate<type>(d6, _params, 0));
1252
0
                        _mm512_mask_storeu_ps(pd + 7 * dstC, tail, Activate<type>(d7, _params, 0));
1253
0
                    }
1254
0
                }
1255
0
            }
1256
0
        }
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
1257
1258
        //-------------------------------------------------------------------------------------------------
1259
1260
        // Picks the depthwise kernel for activation 'type' that best matches the convolution
        // geometry in 'p'. Candidates are tried from the most specialised (7x7, eight columns
        // per step) down to the generic implementation; the first match wins.
        template <::SimdConvolutionActivationType type> SynetConvolution32fNhwcDepthwise::ConvolutionPtr Get(const ConvParam& p)
        {
            // Geometry shared by the dedicated 7x7 kernels: pad 3, stride 1, dilation 1.
            const bool k7p3d1s1 = p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1);
            if (k7p3d1s1)
            {
                if (Aligned(p.srcW, 8))
                    return Convolution32fNhwcDepthwise_k7p3d1s1w8<type>;
                if (AlignedAny(p.srcW, 6))
                    return Convolution32fNhwcDepthwise_k7p3d1s1w6<type>;
            }
            if (Preferable_k7p3d1s1w4(p))
                return Convolution32fNhwcDepthwise_k7p3d1s1w4<type>;
            if (p.IsKernel(3) && p.IsDilation(1))
                return Convolution32fNhwcDepthwise3x3<type>;
            return Convolution32fNhwcDepthwiseDefault<type>;
        }
Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)0>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)1>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)2>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)3>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)4>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)5>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)6>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)7>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)8>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)9>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)10>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*)
1273
1274
        //-------------------------------------------------------------------------------------------------
1275
1276
        // Builds on the Avx2 implementation and, when the shape qualifies, replaces
        // _convolution with an AVX-512BW kernel specialised for p.activation.
        SynetConvolution32fNhwcDepthwise::SynetConvolution32fNhwcDepthwise(const ConvParam& p)
            : Avx2::SynetConvolution32fNhwcDepthwise(p)
        {
            // Shapes rejected here keep whatever _convolution the Avx2 base constructor left:
            // at most HF channels, exactly 24 channels, or an output smaller than the padding sum.
            if (p.dstC <= HF || p.dstC == 24 || p.dstH < p.padY + p.padH || p.dstW < p.padX + p.padW)
                return;

            switch (p.activation)
            {
            case ::SimdConvolutionActivationIdentity:
                _convolution = Get<::SimdConvolutionActivationIdentity>(p);
                break;
            case ::SimdConvolutionActivationRelu:
                _convolution = Get<::SimdConvolutionActivationRelu>(p);
                break;
            case ::SimdConvolutionActivationLeakyRelu:
                _convolution = Get<::SimdConvolutionActivationLeakyRelu>(p);
                break;
            case ::SimdConvolutionActivationRestrictRange:
                _convolution = Get<::SimdConvolutionActivationRestrictRange>(p);
                break;
            case ::SimdConvolutionActivationPrelu:
                _convolution = Get<::SimdConvolutionActivationPrelu>(p);
                break;
            case ::SimdConvolutionActivationElu:
                _convolution = Get<::SimdConvolutionActivationElu>(p);
                break;
            case ::SimdConvolutionActivationHswish:
                _convolution = Get<::SimdConvolutionActivationHswish>(p);
                break;
            case ::SimdConvolutionActivationMish:
                _convolution = Get<::SimdConvolutionActivationMish>(p);
                break;
            case ::SimdConvolutionActivationHardSigmoid:
                _convolution = Get<::SimdConvolutionActivationHardSigmoid>(p);
                break;
            case ::SimdConvolutionActivationSwish:
                _convolution = Get<::SimdConvolutionActivationSwish>(p);
                break;
            case ::SimdConvolutionActivationGelu:
                _convolution = Get<::SimdConvolutionActivationGelu>(p);
                break;
            }
        }
1297
    }
1298
#endif
1299
}