Coverage Report

Created: 2026-04-01 07:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2024 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdSynetConvolution32f.h"
25
#include "Simd/SimdSynetConvolution32fCommon.h"
26
#include "Simd/SimdCpu.h"
27
#include "Simd/SimdPrefetch.h"
28
29
namespace Simd
30
{
31
#if defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_SYNET_ENABLE)   
32
    namespace Avx512bw
33
    {
34
        using AlgParam = SynetConvolution32fNhwcDirect::AlgParam;
35
36
        typedef void(*ConvolutionNhwcDirect_NxM_Ptr)(const float* src0, const ConvParam& p, const AlgParam& a, size_t dy, size_t dx, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails, int first);
37
        typedef void(*ConvolutionNhwcDirect1x1_NxM_Ptr)(const float* src0, const ConvParam& p, const AlgParam& a, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails, int first);
38
39
        //---------------------------------------------------------------------
40
41
        // Direct NHWC convolution micro-kernel for a single output position (dy, dx).
        // For every kernel tap (ky, kx) that falls inside the source image, each of the srcC input
        // values at that tap is broadcast and fused-multiply-added with one weight vector per
        // accumulator. Two accumulators (d00, d01) are used when tails[1] is non-zero, otherwise
        // only d00. When 'first' is zero the accumulators start from the values already stored
        // in dst instead of zero. The result is handed to Save2 / Save1 together with bias,
        // params and tails (those helpers are defined elsewhere).
        // NOTE(review): F is defined outside this view; presumably the number of floats in a
        // __m512 (the second accumulator is loaded from dst + F) - confirm.
        template<TermType term, SimdConvolutionActivationType type> void ConvolutionNhwcDirect_2x1(const float* src0, const ConvParam& p,
42
            const AlgParam& a, size_t dy, size_t dx, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails, int first)
43
0
        {
44
0
            __m512 d00, d01, s0, w0, w1;
45
0
            size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX;
46
0
            // dY / dX: offsets (in floats) between source rows / source pixels; dW: weights consumed per kernel tap.
            // NOTE(review): dS is computed here but never read in this function.
            size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F;
47
0
            // sy / sx are unsigned: a tap position before the start of the image presumably wraps to a
            // very large value, which the `< srcH` / `< srcW` tests below then reject - confirm against
            // the ConvParam field types.
            size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX;
48
0
            // ky / kx below step by the dilation, so they visit kernelY / kernelX taps in total.
            size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX;
49
0
            // Weights for the second accumulator start a.stepW floats after weight0.
            const float* weight1 = weight0 + a.stepW;
50
0
            // Non-zero tails[1]: compute both accumulators.
            if (tails[1])
51
0
            {
52
0
                if (first)
53
0
                    d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps();
54
0
                else
55
0
                    d00 = _mm512_loadu_ps(dst + 0), d01 = _mm512_maskz_loadu_ps(tails[1], dst + F);
56
0
                for (size_t ky = 0; ky < kY; ky += dilY)
57
0
                {
58
0
                    size_t beg = (sy + ky) * dY + sx * dX;
59
0
                    for (size_t kx = 0; kx < kX; kx += dilX)
60
0
                    {
61
0
                        if (sy + ky < srcH && sx + kx < srcW)
62
0
                        {
63
0
                            size_t offs = beg + kx * dX, end = offs + srcC, offw = 0;
64
0
                            for (; offs < end; ++offs, offw += F)
65
0
                            {
66
0
                                w0 = _mm512_loadu_ps(weight0 + offw);
67
0
                                w1 = _mm512_loadu_ps(weight1 + offw);
68
0
                                s0 = _mm512_set1_ps(src0[offs]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01);
69
0
                            }
70
0
                        }
71
0
                        // The weight pointers advance for every tap, including taps skipped by the bounds test.
                        weight0 += dW, weight1 += dW;
72
0
                    }
73
0
                }
74
0
                Save2<term, type>(dst, d00, d01, bias, params, tails);
75
0
            }
76
0
            else
77
0
            {
78
0
                // Single-accumulator path: dst is read through the tails[0] mask when accumulating.
                if (first)
79
0
                    d00 = _mm512_setzero_ps();
80
0
                else
81
0
                    d00 = _mm512_maskz_loadu_ps(tails[0], dst + 0);
82
0
                for (size_t ky = 0; ky < kY; ky += dilY)
83
0
                {
84
0
                    size_t beg = (sy + ky) * dY + sx * dX;
85
0
                    for (size_t kx = 0; kx < kX; kx += dilX)
86
0
                    {
87
0
                        if (sy + ky < srcH && sx + kx < srcW)
88
0
                        {
89
0
                            size_t offs = beg + kx * dX, end = offs + srcC, offw = 0;
90
0
                            for (; offs < end; ++offs, offw += F)
91
0
                            {
92
0
                                w0 = _mm512_loadu_ps(weight0 + offw);
93
0
                                s0 = _mm512_set1_ps(src0[offs]), d00 = _mm512_fmadd_ps(s0, w0, d00);
94
0
                            }
95
0
                        }
96
0
                        weight0 += dW;
97
0
                    }
98
0
                }
99
0
                Save1<term, type>(dst, d00, bias, params, tails);
100
0
            }
101
0
        }
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2x1<(Simd::TermType)0, (SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2x1<(Simd::TermType)1, (SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2x1<(Simd::TermType)0, (SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2x1<(Simd::TermType)0, (SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2x1<(Simd::TermType)0, (SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2x1<(Simd::TermType)0, (SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2x1<(Simd::TermType)0, (SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2x1<(Simd::TermType)0, (SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2x1<(Simd::TermType)0, (SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
102
103
        // Direct NHWC convolution micro-kernel that accumulates M output positions at once
        // (M is a template constant; the accumulators d00..dd1 cover M up to 14).
        // Position i reads its input dS = srcC * strideX floats after position i - 1 and writes its
        // result dD = dstC floats after the previous one. Each loaded weight vector is reused for
        // all M positions. Two accumulators per position are used when tails[1] is non-zero,
        // otherwise one. When 'first' is zero the accumulators start from the values already in dst.
        // Only source rows are bounds-checked here; source columns are covered by an assert.
        // Save2 / Save1 are defined elsewhere and receive the accumulators plus bias, params and tails.
        template<TermType term, SimdConvolutionActivationType type, int M> void ConvolutionNhwcDirect_2xM(const float* src0, const ConvParam& p,
104
            const AlgParam& a, size_t dy, size_t dx, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails, int first)
105
0
        {
106
0
            __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, dc0, dc1, dd0, dd1, s0, w0, w1;
107
0
            size_t srcH = p.srcH, srcW = p.srcW, dilY = p.dilationY, dilX = p.dilationX;
108
0
            // dW: weights consumed per kernel tap; dWz: weights consumed per whole kernel row (used when a row is skipped).
            size_t dY = p.srcW * p.srcC, dX = p.srcC, dS = p.srcC * p.strideX, dW = p.srcC * F, dWz = p.kernelX * p.srcC * F, dD = p.dstC;
109
0
            // sy / sx are unsigned: a position before the start of the image presumably wraps to a very
            // large value and fails the `< srcH` test below - confirm against the ConvParam field types.
            size_t sy = dy * p.strideY - p.padY, sx = dx * p.strideX - p.padX;
110
0
            size_t kY = p.kernelY * p.dilationY, kX = p.kernelX * p.dilationX;
111
0
            // Weights for the second accumulator of each position start a.stepW floats after weight0.
            const float* weight1 = weight0 + a.stepW;
112
0
            // src1..src6 are src0 shifted by 1..6 steps of dS floats; positions 7..13 reuse
            // src0..src6 with the off7 index (off0 + 7 * dS) computed in the loops below.
            const float* src1 = src0 + 1 * dS;
113
0
            const float* src2 = src0 + 2 * dS;
114
0
            const float* src3 = src0 + 3 * dS;
115
0
            const float* src4 = src0 + 4 * dS;
116
0
            const float* src5 = src0 + 5 * dS;
117
0
            const float* src6 = src0 + 6 * dS;
118
0
            // Non-zero tails[1]: two accumulators per position.
            if (tails[1])
119
0
            {
120
0
                if (first)
121
0
                {
122
0
                    if (M > 0x0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps();
123
0
                    if (M > 0x1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps();
124
0
                    if (M > 0x2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps();
125
0
                    if (M > 0x3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps();
126
0
                    if (M > 0x4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps();
127
0
                    if (M > 0x5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps();
128
0
                    if (M > 0x6) d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps();
129
0
                    if (M > 0x7) d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps();
130
0
                    if (M > 0x8) d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps();
131
0
                    if (M > 0x9) d90 = _mm512_setzero_ps(), d91 = _mm512_setzero_ps();
132
0
                    if (M > 0xa) da0 = _mm512_setzero_ps(), da1 = _mm512_setzero_ps();
133
0
                    if (M > 0xb) db0 = _mm512_setzero_ps(), db1 = _mm512_setzero_ps();
134
0
                    if (M > 0xc) dc0 = _mm512_setzero_ps(), dc1 = _mm512_setzero_ps();
135
0
                    if (M > 0xd) dd0 = _mm512_setzero_ps(), dd1 = _mm512_setzero_ps();
136
0
                }
137
0
                else
138
0
                {
139
0
                    // Resume from dst: the first vector is loaded in full, the second through the tails[1] mask.
                    if (M > 0x0) d00 = _mm512_loadu_ps(dst + 0x0 * dD + 0), d01 = _mm512_maskz_loadu_ps(tails[1], dst + 0x0 * dD + F);
140
0
                    if (M > 0x1) d10 = _mm512_loadu_ps(dst + 0x1 * dD + 0), d11 = _mm512_maskz_loadu_ps(tails[1], dst + 0x1 * dD + F);
141
0
                    if (M > 0x2) d20 = _mm512_loadu_ps(dst + 0x2 * dD + 0), d21 = _mm512_maskz_loadu_ps(tails[1], dst + 0x2 * dD + F);
142
0
                    if (M > 0x3) d30 = _mm512_loadu_ps(dst + 0x3 * dD + 0), d31 = _mm512_maskz_loadu_ps(tails[1], dst + 0x3 * dD + F);
143
0
                    if (M > 0x4) d40 = _mm512_loadu_ps(dst + 0x4 * dD + 0), d41 = _mm512_maskz_loadu_ps(tails[1], dst + 0x4 * dD + F);
144
0
                    if (M > 0x5) d50 = _mm512_loadu_ps(dst + 0x5 * dD + 0), d51 = _mm512_maskz_loadu_ps(tails[1], dst + 0x5 * dD + F);
145
0
                    if (M > 0x6) d60 = _mm512_loadu_ps(dst + 0x6 * dD + 0), d61 = _mm512_maskz_loadu_ps(tails[1], dst + 0x6 * dD + F);
146
0
                    if (M > 0x7) d70 = _mm512_loadu_ps(dst + 0x7 * dD + 0), d71 = _mm512_maskz_loadu_ps(tails[1], dst + 0x7 * dD + F);
147
0
                    if (M > 0x8) d80 = _mm512_loadu_ps(dst + 0x8 * dD + 0), d81 = _mm512_maskz_loadu_ps(tails[1], dst + 0x8 * dD + F);
148
0
                    if (M > 0x9) d90 = _mm512_loadu_ps(dst + 0x9 * dD + 0), d91 = _mm512_maskz_loadu_ps(tails[1], dst + 0x9 * dD + F);
149
0
                    if (M > 0xa) da0 = _mm512_loadu_ps(dst + 0xa * dD + 0), da1 = _mm512_maskz_loadu_ps(tails[1], dst + 0xa * dD + F);
150
0
                    if (M > 0xb) db0 = _mm512_loadu_ps(dst + 0xb * dD + 0), db1 = _mm512_maskz_loadu_ps(tails[1], dst + 0xb * dD + F);
151
0
                    if (M > 0xc) dc0 = _mm512_loadu_ps(dst + 0xc * dD + 0), dc1 = _mm512_maskz_loadu_ps(tails[1], dst + 0xc * dD + F);
152
0
                    if (M > 0xd) dd0 = _mm512_loadu_ps(dst + 0xd * dD + 0), dd1 = _mm512_maskz_loadu_ps(tails[1], dst + 0xd * dD + F);
153
0
                }
154
0
                // Compares the byte size of kernelY * kernelX * srcC * F floats with PREFETCH_SIZE.
                // The two branches run the same loops; the first one additionally issues PrefetchL1
                // on both weight pointers before loading them.
                if (p.kernelY * p.kernelX * srcC * F * sizeof(float) > PREFETCH_SIZE)
155
0
                {
156
0
                    for (size_t ky = 0; ky < kY; ky += dilY)
157
0
                    {
158
0
                        if (sy + ky < srcH)
159
0
                        {
160
0
                            size_t beg = (sy + ky) * dY + sx * dX;
161
0
                            for (size_t kx = 0; kx < kX; kx += dilX)
162
0
                            {
163
0
                                // Columns are not range-checked at run time in this kernel, only asserted.
                                assert(sx + kx < srcW&& sx + kx + M <= srcW);
164
0
                                size_t off0 = beg + kx * dX, end = off0 + srcC, off7 = off0 + 7 * dS, offw = 0;
165
0
                                for (; off0 < end; ++off0, ++off7, offw += F)
166
0
                                {
167
0
                                    PrefetchL1(weight0 + offw);
168
0
                                    PrefetchL1(weight1 + offw);
169
0
                                    w0 = _mm512_loadu_ps(weight0 + offw);
170
0
                                    w1 = _mm512_loadu_ps(weight1 + offw);
171
0
                                    if (M > 0x0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01);
172
0
                                    if (M > 0x1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11);
173
0
                                    if (M > 0x2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21);
174
0
                                    if (M > 0x3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31);
175
0
                                    if (M > 0x4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41);
176
0
                                    if (M > 0x5) s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51);
177
0
                                    if (M > 0x6) s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61);
178
0
                                    if (M > 0x7) s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71);
179
0
                                    if (M > 0x8) s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81);
180
0
                                    if (M > 0x9) s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90), d91 = _mm512_fmadd_ps(s0, w1, d91);
181
0
                                    if (M > 0xa) s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0), da1 = _mm512_fmadd_ps(s0, w1, da1);
182
0
                                    if (M > 0xb) s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0), db1 = _mm512_fmadd_ps(s0, w1, db1);
183
0
                                    if (M > 0xc) s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0), dc1 = _mm512_fmadd_ps(s0, w1, dc1);
184
0
                                    if (M > 0xd) s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0), dd1 = _mm512_fmadd_ps(s0, w1, dd1);
185
0
                                }
186
0
                                weight0 += dW, weight1 += dW;
187
0
                            }
188
0
                        }
189
0
                        else
190
0
                            // Source row fails the bounds test: skip the weights of the whole kernel row.
                            weight0 += dWz, weight1 += dWz;
191
0
                    }
192
0
                }
193
0
                else
194
0
                {
195
0
                    for (size_t ky = 0; ky < kY; ky += dilY)
196
0
                    {
197
0
                        if (sy + ky < srcH)
198
0
                        {
199
0
                            size_t beg = (sy + ky) * dY + sx * dX;
200
0
                            for (size_t kx = 0; kx < kX; kx += dilX)
201
0
                            {
202
0
                                assert(sx + kx < srcW&& sx + kx + M <= srcW);
203
0
                                size_t off0 = beg + kx * dX, end = off0 + srcC, off7 = off0 + 7 * dS, offw = 0;
204
0
                                for (; off0 < end; ++off0, ++off7, offw += F)
205
0
                                {
206
0
                                    w0 = _mm512_loadu_ps(weight0 + offw);
207
0
                                    w1 = _mm512_loadu_ps(weight1 + offw);
208
0
                                    if (M > 0x0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01);
209
0
                                    if (M > 0x1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11);
210
0
                                    if (M > 0x2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21);
211
0
                                    if (M > 0x3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31);
212
0
                                    if (M > 0x4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41);
213
0
                                    if (M > 0x5) s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51);
214
0
                                    if (M > 0x6) s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61);
215
0
                                    if (M > 0x7) s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71);
216
0
                                    if (M > 0x8) s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81);
217
0
                                    if (M > 0x9) s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90), d91 = _mm512_fmadd_ps(s0, w1, d91);
218
0
                                    if (M > 0xa) s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0), da1 = _mm512_fmadd_ps(s0, w1, da1);
219
0
                                    if (M > 0xb) s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0), db1 = _mm512_fmadd_ps(s0, w1, db1);
220
0
                                    if (M > 0xc) s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0), dc1 = _mm512_fmadd_ps(s0, w1, dc1);
221
0
                                    if (M > 0xd) s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0), dd1 = _mm512_fmadd_ps(s0, w1, dd1);
222
0
                                }
223
0
                                weight0 += dW, weight1 += dW;
224
0
                            }
225
0
                        }
226
0
                        else
227
0
                            weight0 += dWz, weight1 += dWz;
228
0
                    }
229
0
                }
230
0
                // Each executed Save2 is followed by advancing dst by dD to the next output position.
                if (M > 0x0) Save2<term, type>(dst, d00, d01, bias, params, tails), dst += dD;
231
0
                if (M > 0x1) Save2<term, type>(dst, d10, d11, bias, params, tails), dst += dD;
232
0
                if (M > 0x2) Save2<term, type>(dst, d20, d21, bias, params, tails), dst += dD;
233
0
                if (M > 0x3) Save2<term, type>(dst, d30, d31, bias, params, tails), dst += dD;
234
0
                if (M > 0x4) Save2<term, type>(dst, d40, d41, bias, params, tails), dst += dD;
235
0
                if (M > 0x5) Save2<term, type>(dst, d50, d51, bias, params, tails), dst += dD;
236
0
                if (M > 0x6) Save2<term, type>(dst, d60, d61, bias, params, tails), dst += dD;
237
0
                if (M > 0x7) Save2<term, type>(dst, d70, d71, bias, params, tails), dst += dD;
238
0
                if (M > 0x8) Save2<term, type>(dst, d80, d81, bias, params, tails), dst += dD;
239
0
                if (M > 0x9) Save2<term, type>(dst, d90, d91, bias, params, tails), dst += dD;
240
0
                if (M > 0xa) Save2<term, type>(dst, da0, da1, bias, params, tails), dst += dD;
241
0
                if (M > 0xb) Save2<term, type>(dst, db0, db1, bias, params, tails), dst += dD;
242
0
                if (M > 0xc) Save2<term, type>(dst, dc0, dc1, bias, params, tails), dst += dD;
243
0
                if (M > 0xd) Save2<term, type>(dst, dd0, dd1, bias, params, tails), dst += dD;
244
0
            }
245
0
            else
246
0
            {
247
0
                // Single-accumulator path: one accumulator per position, no prefetch variant.
                if (first)
248
0
                {
249
0
                    if (M > 0x0) d00 = _mm512_setzero_ps();
250
0
                    if (M > 0x1) d10 = _mm512_setzero_ps();
251
0
                    if (M > 0x2) d20 = _mm512_setzero_ps();
252
0
                    if (M > 0x3) d30 = _mm512_setzero_ps();
253
0
                    if (M > 0x4) d40 = _mm512_setzero_ps();
254
0
                    if (M > 0x5) d50 = _mm512_setzero_ps();
255
0
                    if (M > 0x6) d60 = _mm512_setzero_ps();
256
0
                    if (M > 0x7) d70 = _mm512_setzero_ps();
257
0
                    if (M > 0x8) d80 = _mm512_setzero_ps();
258
0
                    if (M > 0x9) d90 = _mm512_setzero_ps();
259
0
                    if (M > 0xa) da0 = _mm512_setzero_ps();
260
0
                    if (M > 0xb) db0 = _mm512_setzero_ps();
261
0
                    if (M > 0xc) dc0 = _mm512_setzero_ps();
262
0
                    if (M > 0xd) dd0 = _mm512_setzero_ps();
263
0
                }
264
0
                else
265
0
                {
266
0
                    // Resume from dst through the tails[0] mask.
                    if (M > 0x0) d00 = _mm512_maskz_loadu_ps(tails[0], dst + 0x0 * dD + 0);
267
0
                    if (M > 0x1) d10 = _mm512_maskz_loadu_ps(tails[0], dst + 0x1 * dD + 0);
268
0
                    if (M > 0x2) d20 = _mm512_maskz_loadu_ps(tails[0], dst + 0x2 * dD + 0);
269
0
                    if (M > 0x3) d30 = _mm512_maskz_loadu_ps(tails[0], dst + 0x3 * dD + 0);
270
0
                    if (M > 0x4) d40 = _mm512_maskz_loadu_ps(tails[0], dst + 0x4 * dD + 0);
271
0
                    if (M > 0x5) d50 = _mm512_maskz_loadu_ps(tails[0], dst + 0x5 * dD + 0);
272
0
                    if (M > 0x6) d60 = _mm512_maskz_loadu_ps(tails[0], dst + 0x6 * dD + 0);
273
0
                    if (M > 0x7) d70 = _mm512_maskz_loadu_ps(tails[0], dst + 0x7 * dD + 0);
274
0
                    if (M > 0x8) d80 = _mm512_maskz_loadu_ps(tails[0], dst + 0x8 * dD + 0);
275
0
                    if (M > 0x9) d90 = _mm512_maskz_loadu_ps(tails[0], dst + 0x9 * dD + 0);
276
0
                    if (M > 0xa) da0 = _mm512_maskz_loadu_ps(tails[0], dst + 0xa * dD + 0);
277
0
                    if (M > 0xb) db0 = _mm512_maskz_loadu_ps(tails[0], dst + 0xb * dD + 0);
278
0
                    if (M > 0xc) dc0 = _mm512_maskz_loadu_ps(tails[0], dst + 0xc * dD + 0);
279
0
                    if (M > 0xd) dd0 = _mm512_maskz_loadu_ps(tails[0], dst + 0xd * dD + 0);
280
0
                }
281
0
                for (size_t ky = 0; ky < kY; ky += dilY)
282
0
                {
283
0
                    if (sy + ky < srcH)
284
0
                    {
285
0
                        size_t beg = (sy + ky) * dY + sx * dX;
286
0
                        for (size_t kx = 0; kx < kX; kx += dilX)
287
0
                        {
288
0
                            assert(sx + kx < srcW && sx + kx + M <= srcW);
289
0
                            size_t off0 = beg + kx * dX, end = off0 + srcC, off7 = off0 + 7 * dS, offw = 0;
290
0
                            for (; off0 < end; ++off0, ++off7, offw += F)
291
0
                            {
292
0
                                w0 = _mm512_loadu_ps(weight0 + offw);
293
0
                                if (M > 0x0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00);
294
0
                                if (M > 0x1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10);
295
0
                                if (M > 0x2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20);
296
0
                                if (M > 0x3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30);
297
0
                                if (M > 0x4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40);
298
0
                                if (M > 0x5) s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50);
299
0
                                if (M > 0x6) s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60);
300
0
                                if (M > 0x7) s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70);
301
0
                                if (M > 0x8) s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80);
302
0
                                if (M > 0x9) s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90);
303
0
                                if (M > 0xa) s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0);
304
0
                                if (M > 0xb) s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0);
305
0
                                if (M > 0xc) s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0);
306
0
                                if (M > 0xd) s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0);
307
0
                            }
308
0
                            weight0 += dW;
309
0
                        }
310
0
                    }
311
0
                    else
312
0
                        // Source row fails the bounds test: skip the weights of the whole kernel row.
                        weight0 += dWz;
313
0
                }
314
0
                // Each executed Save1 is followed by advancing dst by dD to the next output position.
                if (M > 0x0) Save1<term, type>(dst, d00, bias, params, tails), dst += dD;
315
0
                if (M > 0x1) Save1<term, type>(dst, d10, bias, params, tails), dst += dD;
316
0
                if (M > 0x2) Save1<term, type>(dst, d20, bias, params, tails), dst += dD;
317
0
                if (M > 0x3) Save1<term, type>(dst, d30, bias, params, tails), dst += dD;
318
0
                if (M > 0x4) Save1<term, type>(dst, d40, bias, params, tails), dst += dD;
319
0
                if (M > 0x5) Save1<term, type>(dst, d50, bias, params, tails), dst += dD;
320
0
                if (M > 0x6) Save1<term, type>(dst, d60, bias, params, tails), dst += dD;
321
0
                if (M > 0x7) Save1<term, type>(dst, d70, bias, params, tails), dst += dD;
322
0
                if (M > 0x8) Save1<term, type>(dst, d80, bias, params, tails), dst += dD;
323
0
                if (M > 0x9) Save1<term, type>(dst, d90, bias, params, tails), dst += dD;
324
0
                if (M > 0xa) Save1<term, type>(dst, da0, bias, params, tails), dst += dD;
325
0
                if (M > 0xb) Save1<term, type>(dst, db0, bias, params, tails), dst += dD;
326
0
                if (M > 0xc) Save1<term, type>(dst, dc0, bias, params, tails), dst += dD;
327
0
                if (M > 0xd) Save1<term, type>(dst, dd0, bias, params, tails), dst += dD;
328
0
            }
329
0
        }
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
330
331
        // Maps a run-time column count M onto the matching compile-time
        // instantiation ConvolutionNhwcDirect_2xM<term, type, M>.
        //
        // M == 0      -> NULL (there is no kernel for an empty column group).
        // M in 1..14  -> pointer to the kernel specialised for exactly M columns.
        // otherwise   -> assert(0) fires in debug builds, NULL is returned.
        template<TermType term, SimdConvolutionActivationType type> ConvolutionNhwcDirect_NxM_Ptr GetConvolutionNhwcDirect_2xM(size_t M)
        {
            // Slot i holds the kernel for i columns; slot 0 is intentionally empty.
            static const ConvolutionNhwcDirect_NxM_Ptr kernels[] =
            {
                NULL,
                ConvolutionNhwcDirect_2xM<term, type, 0x1>,
                ConvolutionNhwcDirect_2xM<term, type, 0x2>,
                ConvolutionNhwcDirect_2xM<term, type, 0x3>,
                ConvolutionNhwcDirect_2xM<term, type, 0x4>,
                ConvolutionNhwcDirect_2xM<term, type, 0x5>,
                ConvolutionNhwcDirect_2xM<term, type, 0x6>,
                ConvolutionNhwcDirect_2xM<term, type, 0x7>,
                ConvolutionNhwcDirect_2xM<term, type, 0x8>,
                ConvolutionNhwcDirect_2xM<term, type, 0x9>,
                ConvolutionNhwcDirect_2xM<term, type, 0xa>,
                ConvolutionNhwcDirect_2xM<term, type, 0xb>,
                ConvolutionNhwcDirect_2xM<term, type, 0xc>,
                ConvolutionNhwcDirect_2xM<term, type, 0xd>,
                ConvolutionNhwcDirect_2xM<term, type, 0xe>
            };
            const size_t count = sizeof(kernels) / sizeof(kernels[0]);
            if (M < count)
                return kernels[M];
            // Callers are expected to pass at most 14 columns.
            assert(0);
            return NULL;
        }
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
354
355
        template<TermType term, SimdConvolutionActivationType type> void ConvolutionNhwcDirect_2(const float* src, const ConvParam& p, const AlgParam& a,
356
            size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst, int first)
357
0
        {
358
0
            size_t noseH = p.NoseH(), noseW = p.NoseW(), bodyH = p.BodyH(), bodyW = p.BodyW();
359
0
            size_t n = 14, bodyWn = AlignLoAny(bodyW - noseW, n) + noseW, m = bodyW - bodyWn;
360
0
            ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2x1 = ConvolutionNhwcDirect_2x1<term, type>;
361
0
            ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2xN = GetConvolutionNhwcDirect_2xM<term, type>(n);
362
0
            ConvolutionNhwcDirect_NxM_Ptr convolutionNhwcDirect_2xM = GetConvolutionNhwcDirect_2xM<term, type>(m);
363
0
            size_t tailH = p.dstH, tailW = p.dstW;
364
0
            size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1;
365
366
0
            __m512 _params[2], _bias[2];
367
0
            _params[0] = _mm512_set1_ps(params[0]);
368
0
            if (type == SimdConvolutionActivationRestrictRange ||
369
0
                type == SimdConvolutionActivationHswish ||
370
0
                type == SimdConvolutionActivationHardSigmoid)
371
0
                _params[1] = _mm512_set1_ps(params[1]);
372
373
0
            for (size_t dc = 0; dc < dstC; dc += a.microD)
374
0
            {
375
0
                size_t dC = Simd::Min(a.microD, dstC - dc);
376
0
                __mmask16 tails[2] = { TailMask16(dC), TailMask16(dC - F) };
377
0
                if (dC > 0 * F) _bias[0] = _mm512_loadu_ps(bias + dc + 0 * F);
378
0
                if (dC > 1 * F) _bias[1] = _mm512_loadu_ps(bias + dc + 1 * F);
379
0
                if (type == ::SimdConvolutionActivationPrelu)
380
0
                {
381
0
                    if (dC > 0 * F) _params[0] = _mm512_loadu_ps(params + dc + 0 * F);
382
0
                    if (dC > 1 * F) _params[1] = _mm512_loadu_ps(params + dc + 1 * F);
383
0
                }
384
0
                float* d = dst + dc + yBeg * p.dstW * p.dstC;
385
0
                for (size_t dy = yBeg; dy < yEnd; dy++)
386
0
                {
387
0
                    size_t dx = 0;
388
0
                    for (; dx < noseW; dx++, d += p.dstC)
389
0
                        convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails, first);
390
0
                    for (; dx < bodyWn; dx += n, d += p.dstC * n)
391
0
                        convolutionNhwcDirect_2xN(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails, first);
392
0
                    for (; dx < bodyW; dx += m, d += p.dstC * m)
393
0
                        convolutionNhwcDirect_2xM(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails, first);
394
0
                    for (; dx < tailW; dx++, d += p.dstC)
395
0
                        convolutionNhwcDirect_2x1(src, p, a, dy, dx, srcC, weight, _bias, _params, d, tails, first);
396
0
                }
397
0
                weight += p.kernelY * p.kernelX * p.srcC * a.microD;
398
0
            }
399
0
        }
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2<(Simd::TermType)0, (SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2<(Simd::TermType)1, (SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2<(Simd::TermType)0, (SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2<(Simd::TermType)0, (SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2<(Simd::TermType)0, (SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2<(Simd::TermType)0, (SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2<(Simd::TermType)0, (SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2<(Simd::TermType)0, (SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect_2<(Simd::TermType)0, (SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
400
401
        //---------------------------------------------------------------------
402
403
        template<TermType term, SimdConvolutionActivationType type, int M> void ConvolutionNhwcDirect1x1_2xM(const float* src0, const ConvParam& p,
404
            const AlgParam& a, size_t srcC, const float* weight0, const __m512* bias, const __m512* params, float* dst, const __mmask16* tails, int first)
405
0
        {
406
0
            __m512 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, d60, d61, d70, d71, d80, d81, d90, d91, da0, da1, db0, db1, dc0, dc1, dd0, dd1, s0, w0, w1;
407
0
            size_t dS = p.srcC, dD = p.dstC;
408
0
            const float* weight1 = weight0 + a.stepW;
409
0
            const float* src1 = src0 + 1 * dS;
410
0
            const float* src2 = src0 + 2 * dS;
411
0
            const float* src3 = src0 + 3 * dS;
412
0
            const float* src4 = src0 + 4 * dS;
413
0
            const float* src5 = src0 + 5 * dS;
414
0
            const float* src6 = src0 + 6 * dS;
415
0
            if (tails[1])
416
0
            {
417
0
                if (first)
418
0
                {
419
0
                    if (M > 0x0) d00 = _mm512_setzero_ps(), d01 = _mm512_setzero_ps();
420
0
                    if (M > 0x1) d10 = _mm512_setzero_ps(), d11 = _mm512_setzero_ps();
421
0
                    if (M > 0x2) d20 = _mm512_setzero_ps(), d21 = _mm512_setzero_ps();
422
0
                    if (M > 0x3) d30 = _mm512_setzero_ps(), d31 = _mm512_setzero_ps();
423
0
                    if (M > 0x4) d40 = _mm512_setzero_ps(), d41 = _mm512_setzero_ps();
424
0
                    if (M > 0x5) d50 = _mm512_setzero_ps(), d51 = _mm512_setzero_ps();
425
0
                    if (M > 0x6) d60 = _mm512_setzero_ps(), d61 = _mm512_setzero_ps();
426
0
                    if (M > 0x7) d70 = _mm512_setzero_ps(), d71 = _mm512_setzero_ps();
427
0
                    if (M > 0x8) d80 = _mm512_setzero_ps(), d81 = _mm512_setzero_ps();
428
0
                    if (M > 0x9) d90 = _mm512_setzero_ps(), d91 = _mm512_setzero_ps();
429
0
                    if (M > 0xa) da0 = _mm512_setzero_ps(), da1 = _mm512_setzero_ps();
430
0
                    if (M > 0xb) db0 = _mm512_setzero_ps(), db1 = _mm512_setzero_ps();
431
0
                    if (M > 0xc) dc0 = _mm512_setzero_ps(), dc1 = _mm512_setzero_ps();
432
0
                    if (M > 0xd) dd0 = _mm512_setzero_ps(), dd1 = _mm512_setzero_ps();
433
0
                }
434
0
                else
435
0
                {
436
0
                    if (M > 0x0) d00 = _mm512_loadu_ps(dst + 0x0 * dD + 0), d01 = _mm512_maskz_loadu_ps(tails[1], dst + 0x0 * dD + F);
437
0
                    if (M > 0x1) d10 = _mm512_loadu_ps(dst + 0x1 * dD + 0), d11 = _mm512_maskz_loadu_ps(tails[1], dst + 0x1 * dD + F);
438
0
                    if (M > 0x2) d20 = _mm512_loadu_ps(dst + 0x2 * dD + 0), d21 = _mm512_maskz_loadu_ps(tails[1], dst + 0x2 * dD + F);
439
0
                    if (M > 0x3) d30 = _mm512_loadu_ps(dst + 0x3 * dD + 0), d31 = _mm512_maskz_loadu_ps(tails[1], dst + 0x3 * dD + F);
440
0
                    if (M > 0x4) d40 = _mm512_loadu_ps(dst + 0x4 * dD + 0), d41 = _mm512_maskz_loadu_ps(tails[1], dst + 0x4 * dD + F);
441
0
                    if (M > 0x5) d50 = _mm512_loadu_ps(dst + 0x5 * dD + 0), d51 = _mm512_maskz_loadu_ps(tails[1], dst + 0x5 * dD + F);
442
0
                    if (M > 0x6) d60 = _mm512_loadu_ps(dst + 0x6 * dD + 0), d61 = _mm512_maskz_loadu_ps(tails[1], dst + 0x6 * dD + F);
443
0
                    if (M > 0x7) d70 = _mm512_loadu_ps(dst + 0x7 * dD + 0), d71 = _mm512_maskz_loadu_ps(tails[1], dst + 0x7 * dD + F);
444
0
                    if (M > 0x8) d80 = _mm512_loadu_ps(dst + 0x8 * dD + 0), d81 = _mm512_maskz_loadu_ps(tails[1], dst + 0x8 * dD + F);
445
0
                    if (M > 0x9) d90 = _mm512_loadu_ps(dst + 0x9 * dD + 0), d91 = _mm512_maskz_loadu_ps(tails[1], dst + 0x9 * dD + F);
446
0
                    if (M > 0xa) da0 = _mm512_loadu_ps(dst + 0xa * dD + 0), da1 = _mm512_maskz_loadu_ps(tails[1], dst + 0xa * dD + F);
447
0
                    if (M > 0xb) db0 = _mm512_loadu_ps(dst + 0xb * dD + 0), db1 = _mm512_maskz_loadu_ps(tails[1], dst + 0xb * dD + F);
448
0
                    if (M > 0xc) dc0 = _mm512_loadu_ps(dst + 0xc * dD + 0), dc1 = _mm512_maskz_loadu_ps(tails[1], dst + 0xc * dD + F);
449
0
                    if (M > 0xd) dd0 = _mm512_loadu_ps(dst + 0xd * dD + 0), dd1 = _mm512_maskz_loadu_ps(tails[1], dst + 0xd * dD + F);
450
0
                }
451
0
                if (srcC * F * sizeof(float) > PREFETCH_SIZE)
452
0
                {
453
0
                    for (size_t off0 = 0, off7 = 7 * dS, offw = 0; off0 < srcC; ++off0, ++off7, offw += F)
454
0
                    {
455
0
                        PrefetchL1(weight0 + offw);
456
0
                        PrefetchL1(weight1 + offw);
457
0
                        w0 = _mm512_loadu_ps(weight0 + offw);
458
0
                        w1 = _mm512_loadu_ps(weight1 + offw);
459
0
                        if (M > 0x0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01);
460
0
                        if (M > 0x1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11);
461
0
                        if (M > 0x2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21);
462
0
                        if (M > 0x3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31);
463
0
                        if (M > 0x4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41);
464
0
                        if (M > 0x5) s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51);
465
0
                        if (M > 0x6) s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61);
466
0
                        if (M > 0x7) s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71);
467
0
                        if (M > 0x8) s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81);
468
0
                        if (M > 0x9) s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90), d91 = _mm512_fmadd_ps(s0, w1, d91);
469
0
                        if (M > 0xa) s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0), da1 = _mm512_fmadd_ps(s0, w1, da1);
470
0
                        if (M > 0xb) s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0), db1 = _mm512_fmadd_ps(s0, w1, db1);
471
0
                        if (M > 0xc) s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0), dc1 = _mm512_fmadd_ps(s0, w1, dc1);
472
0
                        if (M > 0xd) s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0), dd1 = _mm512_fmadd_ps(s0, w1, dd1);
473
0
                    }
474
0
                }
475
0
                else
476
0
                {
477
0
                    for (size_t off0 = 0, off7 = 7 * dS, offw = 0; off0 < srcC; ++off0, ++off7, offw += F)
478
0
                    {
479
0
                        w0 = _mm512_loadu_ps(weight0 + offw);
480
0
                        w1 = _mm512_loadu_ps(weight1 + offw);
481
0
                        if (M > 0x0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00), d01 = _mm512_fmadd_ps(s0, w1, d01);
482
0
                        if (M > 0x1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10), d11 = _mm512_fmadd_ps(s0, w1, d11);
483
0
                        if (M > 0x2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20), d21 = _mm512_fmadd_ps(s0, w1, d21);
484
0
                        if (M > 0x3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30), d31 = _mm512_fmadd_ps(s0, w1, d31);
485
0
                        if (M > 0x4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40), d41 = _mm512_fmadd_ps(s0, w1, d41);
486
0
                        if (M > 0x5) s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50), d51 = _mm512_fmadd_ps(s0, w1, d51);
487
0
                        if (M > 0x6) s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60), d61 = _mm512_fmadd_ps(s0, w1, d61);
488
0
                        if (M > 0x7) s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70), d71 = _mm512_fmadd_ps(s0, w1, d71);
489
0
                        if (M > 0x8) s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80), d81 = _mm512_fmadd_ps(s0, w1, d81);
490
0
                        if (M > 0x9) s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90), d91 = _mm512_fmadd_ps(s0, w1, d91);
491
0
                        if (M > 0xa) s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0), da1 = _mm512_fmadd_ps(s0, w1, da1);
492
0
                        if (M > 0xb) s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0), db1 = _mm512_fmadd_ps(s0, w1, db1);
493
0
                        if (M > 0xc) s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0), dc1 = _mm512_fmadd_ps(s0, w1, dc1);
494
0
                        if (M > 0xd) s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0), dd1 = _mm512_fmadd_ps(s0, w1, dd1);
495
0
                    }
496
0
                }
497
0
                if (M > 0x0) Save2<term, type>(dst, d00, d01, bias, params, tails), dst += dD;
498
0
                if (M > 0x1) Save2<term, type>(dst, d10, d11, bias, params, tails), dst += dD;
499
0
                if (M > 0x2) Save2<term, type>(dst, d20, d21, bias, params, tails), dst += dD;
500
0
                if (M > 0x3) Save2<term, type>(dst, d30, d31, bias, params, tails), dst += dD;
501
0
                if (M > 0x4) Save2<term, type>(dst, d40, d41, bias, params, tails), dst += dD;
502
0
                if (M > 0x5) Save2<term, type>(dst, d50, d51, bias, params, tails), dst += dD;
503
0
                if (M > 0x6) Save2<term, type>(dst, d60, d61, bias, params, tails), dst += dD;
504
0
                if (M > 0x7) Save2<term, type>(dst, d70, d71, bias, params, tails), dst += dD;
505
0
                if (M > 0x8) Save2<term, type>(dst, d80, d81, bias, params, tails), dst += dD;
506
0
                if (M > 0x9) Save2<term, type>(dst, d90, d91, bias, params, tails), dst += dD;
507
0
                if (M > 0xa) Save2<term, type>(dst, da0, da1, bias, params, tails), dst += dD;
508
0
                if (M > 0xb) Save2<term, type>(dst, db0, db1, bias, params, tails), dst += dD;
509
0
                if (M > 0xc) Save2<term, type>(dst, dc0, dc1, bias, params, tails), dst += dD;
510
0
                if (M > 0xd) Save2<term, type>(dst, dd0, dd1, bias, params, tails), dst += dD;
511
0
            }
512
0
            else
513
0
            {
514
0
                if (first)
515
0
                {
516
0
                    if (M > 0x0) d00 = _mm512_setzero_ps();
517
0
                    if (M > 0x1) d10 = _mm512_setzero_ps();
518
0
                    if (M > 0x2) d20 = _mm512_setzero_ps();
519
0
                    if (M > 0x3) d30 = _mm512_setzero_ps();
520
0
                    if (M > 0x4) d40 = _mm512_setzero_ps();
521
0
                    if (M > 0x5) d50 = _mm512_setzero_ps();
522
0
                    if (M > 0x6) d60 = _mm512_setzero_ps();
523
0
                    if (M > 0x7) d70 = _mm512_setzero_ps();
524
0
                    if (M > 0x8) d80 = _mm512_setzero_ps();
525
0
                    if (M > 0x9) d90 = _mm512_setzero_ps();
526
0
                    if (M > 0xa) da0 = _mm512_setzero_ps();
527
0
                    if (M > 0xb) db0 = _mm512_setzero_ps();
528
0
                    if (M > 0xc) dc0 = _mm512_setzero_ps();
529
0
                    if (M > 0xd) dd0 = _mm512_setzero_ps();
530
0
                }
531
0
                else
532
0
                {
533
0
                    if (M > 0x0) d00 = _mm512_maskz_loadu_ps(tails[0], dst + 0x0 * dD + 0);
534
0
                    if (M > 0x1) d10 = _mm512_maskz_loadu_ps(tails[0], dst + 0x1 * dD + 0);
535
0
                    if (M > 0x2) d20 = _mm512_maskz_loadu_ps(tails[0], dst + 0x2 * dD + 0);
536
0
                    if (M > 0x3) d30 = _mm512_maskz_loadu_ps(tails[0], dst + 0x3 * dD + 0);
537
0
                    if (M > 0x4) d40 = _mm512_maskz_loadu_ps(tails[0], dst + 0x4 * dD + 0);
538
0
                    if (M > 0x5) d50 = _mm512_maskz_loadu_ps(tails[0], dst + 0x5 * dD + 0);
539
0
                    if (M > 0x6) d60 = _mm512_maskz_loadu_ps(tails[0], dst + 0x6 * dD + 0);
540
0
                    if (M > 0x7) d70 = _mm512_maskz_loadu_ps(tails[0], dst + 0x7 * dD + 0);
541
0
                    if (M > 0x8) d80 = _mm512_maskz_loadu_ps(tails[0], dst + 0x8 * dD + 0);
542
0
                    if (M > 0x9) d90 = _mm512_maskz_loadu_ps(tails[0], dst + 0x9 * dD + 0);
543
0
                    if (M > 0xa) da0 = _mm512_maskz_loadu_ps(tails[0], dst + 0xa * dD + 0);
544
0
                    if (M > 0xb) db0 = _mm512_maskz_loadu_ps(tails[0], dst + 0xb * dD + 0);
545
0
                    if (M > 0xc) dc0 = _mm512_maskz_loadu_ps(tails[0], dst + 0xc * dD + 0);
546
0
                    if (M > 0xd) dd0 = _mm512_maskz_loadu_ps(tails[0], dst + 0xd * dD + 0);
547
0
                }
548
0
                for (size_t off0 = 0, off7 = 7 * dS, offw = 0; off0 < srcC; ++off0, ++off7, offw += F)
549
0
                {
550
0
                    w0 = _mm512_loadu_ps(weight0 + offw);
551
0
                    if (M > 0x0) s0 = _mm512_set1_ps(src0[off0]), d00 = _mm512_fmadd_ps(s0, w0, d00);
552
0
                    if (M > 0x1) s0 = _mm512_set1_ps(src1[off0]), d10 = _mm512_fmadd_ps(s0, w0, d10);
553
0
                    if (M > 0x2) s0 = _mm512_set1_ps(src2[off0]), d20 = _mm512_fmadd_ps(s0, w0, d20);
554
0
                    if (M > 0x3) s0 = _mm512_set1_ps(src3[off0]), d30 = _mm512_fmadd_ps(s0, w0, d30);
555
0
                    if (M > 0x4) s0 = _mm512_set1_ps(src4[off0]), d40 = _mm512_fmadd_ps(s0, w0, d40);
556
0
                    if (M > 0x5) s0 = _mm512_set1_ps(src5[off0]), d50 = _mm512_fmadd_ps(s0, w0, d50);
557
0
                    if (M > 0x6) s0 = _mm512_set1_ps(src6[off0]), d60 = _mm512_fmadd_ps(s0, w0, d60);
558
0
                    if (M > 0x7) s0 = _mm512_set1_ps(src0[off7]), d70 = _mm512_fmadd_ps(s0, w0, d70);
559
0
                    if (M > 0x8) s0 = _mm512_set1_ps(src1[off7]), d80 = _mm512_fmadd_ps(s0, w0, d80);
560
0
                    if (M > 0x9) s0 = _mm512_set1_ps(src2[off7]), d90 = _mm512_fmadd_ps(s0, w0, d90);
561
0
                    if (M > 0xa) s0 = _mm512_set1_ps(src3[off7]), da0 = _mm512_fmadd_ps(s0, w0, da0);
562
0
                    if (M > 0xb) s0 = _mm512_set1_ps(src4[off7]), db0 = _mm512_fmadd_ps(s0, w0, db0);
563
0
                    if (M > 0xc) s0 = _mm512_set1_ps(src5[off7]), dc0 = _mm512_fmadd_ps(s0, w0, dc0);
564
0
                    if (M > 0xd) s0 = _mm512_set1_ps(src6[off7]), dd0 = _mm512_fmadd_ps(s0, w0, dd0);
565
0
                }
566
0
                if (M > 0x0) Save1<term, type>(dst, d00, bias, params, tails), dst += dD;
567
0
                if (M > 0x1) Save1<term, type>(dst, d10, bias, params, tails), dst += dD;
568
0
                if (M > 0x2) Save1<term, type>(dst, d20, bias, params, tails), dst += dD;
569
0
                if (M > 0x3) Save1<term, type>(dst, d30, bias, params, tails), dst += dD;
570
0
                if (M > 0x4) Save1<term, type>(dst, d40, bias, params, tails), dst += dD;
571
0
                if (M > 0x5) Save1<term, type>(dst, d50, bias, params, tails), dst += dD;
572
0
                if (M > 0x6) Save1<term, type>(dst, d60, bias, params, tails), dst += dD;
573
0
                if (M > 0x7) Save1<term, type>(dst, d70, bias, params, tails), dst += dD;
574
0
                if (M > 0x8) Save1<term, type>(dst, d80, bias, params, tails), dst += dD;
575
0
                if (M > 0x9) Save1<term, type>(dst, d90, bias, params, tails), dst += dD;
576
0
                if (M > 0xa) Save1<term, type>(dst, da0, bias, params, tails), dst += dD;
577
0
                if (M > 0xb) Save1<term, type>(dst, db0, bias, params, tails), dst += dD;
578
0
                if (M > 0xc) Save1<term, type>(dst, dc0, bias, params, tails), dst += dD;
579
0
                if (M > 0xd) Save1<term, type>(dst, dd0, bias, params, tails), dst += dD;
580
0
            }
581
0
        }
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 1>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 2>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 11>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 12>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 13>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10, 14>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
582
583
        // Selects the 1x1 micro-kernel instantiation for a runtime block size M.
        // Returns the address of ConvolutionNhwcDirect1x1_2xM<term, type, M> for M in [1, 14],
        // and NULL for M == 0. Any other M trips assert(0); when asserts are compiled out
        // the function falls through and returns NULL as well.
        template<TermType term, SimdConvolutionActivationType type> ConvolutionNhwcDirect1x1_NxM_Ptr GetConvolutionNhwcDirect1x1_2xM(size_t M)
584
0
        {
585
0
            switch (M)
586
0
            {
587
0
            case 0: return NULL; // no kernel for an empty block
588
0
            case 0x1: return ConvolutionNhwcDirect1x1_2xM<term, type, 0x1>;
589
0
            case 0x2: return ConvolutionNhwcDirect1x1_2xM<term, type, 0x2>;
590
0
            case 0x3: return ConvolutionNhwcDirect1x1_2xM<term, type, 0x3>;
591
0
            case 0x4: return ConvolutionNhwcDirect1x1_2xM<term, type, 0x4>;
592
0
            case 0x5: return ConvolutionNhwcDirect1x1_2xM<term, type, 0x5>;
593
0
            case 0x6: return ConvolutionNhwcDirect1x1_2xM<term, type, 0x6>;
594
0
            case 0x7: return ConvolutionNhwcDirect1x1_2xM<term, type, 0x7>;
595
0
            case 0x8: return ConvolutionNhwcDirect1x1_2xM<term, type, 0x8>;
596
0
            case 0x9: return ConvolutionNhwcDirect1x1_2xM<term, type, 0x9>;
597
0
            case 0xa: return ConvolutionNhwcDirect1x1_2xM<term, type, 0xa>;
598
0
            case 0xb: return ConvolutionNhwcDirect1x1_2xM<term, type, 0xb>;
599
0
            case 0xc: return ConvolutionNhwcDirect1x1_2xM<term, type, 0xc>;
600
0
            case 0xd: return ConvolutionNhwcDirect1x1_2xM<term, type, 0xd>;
601
0
            case 0xe: return ConvolutionNhwcDirect1x1_2xM<term, type, 0xe>;
602
0
            }
603
0
            // Reached only for M > 14, which no case above handles.
            assert(0);
604
0
            return NULL;
605
0
        }
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)3>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect1x1_2xM<(Simd::TermType)1, (SimdConvolutionActivationType)0>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)4>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)5>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)6>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)7>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)8>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)9>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
Unexecuted instantiation: void (*Simd::Avx512bw::GetConvolutionNhwcDirect1x1_2xM<(Simd::TermType)0, (SimdConvolutionActivationType)10>(unsigned long))(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, float const*, float __vector(16) const*, float __vector(16) const*, float*, unsigned short const*, int)
606
607
        // Driver for the 1x1 convolution over output rows [yBeg, yEnd).
        // The (yEnd - yBeg) * p.dstW output points are processed in blocks of n = 14 points,
        // followed by one block of m = n1 - nn remaining points. Output channels are walked
        // in steps of a.microD; for each step the bias (and, for Prelu, the per-channel
        // params) are loaded into vectors and handed to the selected micro-kernels.
        // srcC, weight, and first are forwarded to the micro-kernels unchanged, except that
        // weight is advanced by p.srcC * a.microD after each output-channel step.
        template<TermType term, SimdConvolutionActivationType type> void ConvolutionNhwcDirect1x1_2(const float* src, const ConvParam& p, const AlgParam& a,
608
            size_t dstC, size_t yBeg, size_t yEnd, size_t srcC, const float* weight, const float* bias, const float* params, float* dst, int first)
609
0
        {
610
0
            // n1: total number of output points; nn: part covered by full blocks of n; m: remainder.
            // NOTE(review): presumably AlignLoAny(n1, n) rounds n1 down to a multiple of n - confirm.
            size_t n = 14, n1 = (yEnd - yBeg) * p.dstW, nn = AlignLoAny(n1, n), m = n1 - nn;
611
0
            ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_2xN = GetConvolutionNhwcDirect1x1_2xM<term, type>(n);
612
0
            // This pointer is NULL when m == 0; see the note on the tail loop below.
            ConvolutionNhwcDirect1x1_NxM_Ptr convolutionNhwcDirect1x1_2xM = GetConvolutionNhwcDirect1x1_2xM<term, type>(m);
613
614
0
            __m512 _params[2], _bias[2];
615
0
            // params[0] is broadcast for every activation type; params[1] only for the types listed below.
            _params[0] = _mm512_set1_ps(params[0]);
616
0
            if (type == SimdConvolutionActivationRestrictRange ||
617
0
                type == SimdConvolutionActivationHswish ||
618
0
                type == SimdConvolutionActivationHardSigmoid)
619
0
                _params[1] = _mm512_set1_ps(params[1]);
620
621
0
            for (size_t dc = 0; dc < dstC; dc += a.microD)
622
0
            {
623
0
                // dC: number of output channels handled in this step (at most a.microD).
                size_t dC = Simd::Min(a.microD, dstC - dc);
624
0
                // NOTE(review): dC - F is evaluated in size_t; this relies on TailMask16 yielding
                // an empty mask for the converted value when dC <= F - confirm against its definition.
                __mmask16 tails[2] = { TailMask16(dC), TailMask16(dC - F) };
625
0
                // Unmasked loads of bias for the first and, when dC > F, the second vector of channels.
                if (dC > 0 * F) _bias[0] = _mm512_loadu_ps(bias + dc + 0 * F);
626
0
                if (dC > 1 * F) _bias[1] = _mm512_loadu_ps(bias + dc + 1 * F);
627
0
                if (type == ::SimdConvolutionActivationPrelu)
628
0
                {
629
0
                    // Prelu reads params at offset dc, replacing the broadcast value set above.
                    if (dC > 0 * F) _params[0] = _mm512_loadu_ps(params + dc + 0 * F);
630
0
                    if (dC > 1 * F) _params[1] = _mm512_loadu_ps(params + dc + 1 * F);
631
0
                }
632
0
                // Source and destination pointers at the first point of row yBeg.
                const float* ps = src + yBeg * p.srcW * p.srcC;
633
0
                float* pd = dst + dc + yBeg * p.dstW * p.dstC;
634
0
                size_t i = 0;
635
0
                // Full blocks of n points.
                for (; i < nn; i += n, ps += n * p.srcC, pd += n * p.dstC)
636
0
                    convolutionNhwcDirect1x1_2xN(ps, p, a, srcC, weight, _bias, _params, pd, tails, first);
637
0
                // Remaining m points. If nn is a multiple of n, i equals nn here, so for m == 0
                // (nn == n1) the body is never entered and the NULL kernel pointer is not called.
                for (; i < n1; i += m, ps += m * p.srcC, pd += m * p.dstC)
638
0
                    convolutionNhwcDirect1x1_2xM(ps, p, a, srcC, weight, _bias, _params, pd, tails, first);
639
0
                // Advance to the weights of the next a.microD output channels.
                weight += p.srcC * a.microD;
640
0
            }
641
0
        }
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2<(Simd::TermType)0, (SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2<(Simd::TermType)1, (SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2<(Simd::TermType)0, (SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2<(Simd::TermType)0, (SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2<(Simd::TermType)0, (SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2<(Simd::TermType)0, (SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2<(Simd::TermType)0, (SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2<(Simd::TermType)0, (SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
Unexecuted instantiation: void Simd::Avx512bw::ConvolutionNhwcDirect1x1_2<(Simd::TermType)0, (SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam const&, unsigned long, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*, int)
642
643
        //---------------------------------------------------------------------
644
645
        // Stores the convolution routine for the given term in a.convolutions[term]:
        // ConvolutionNhwcDirect1x1_2 when p.Is1x1() is true, ConvolutionNhwcDirect_2 otherwise.
        template <TermType term, SimdConvolutionActivationType type> static SIMD_INLINE void Set(const ConvParam& p, AlgParam& a)
646
0
        {
647
0
            a.convolutions[term] = p.Is1x1() ? ConvolutionNhwcDirect1x1_2<term, type> : ConvolutionNhwcDirect_2<term, type>;
648
0
        }
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(Simd::TermType)0, (SimdConvolutionActivationType)3>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(Simd::TermType)1, (SimdConvolutionActivationType)0>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(Simd::TermType)0, (SimdConvolutionActivationType)4>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(Simd::TermType)0, (SimdConvolutionActivationType)5>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(Simd::TermType)0, (SimdConvolutionActivationType)6>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(Simd::TermType)0, (SimdConvolutionActivationType)7>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(Simd::TermType)0, (SimdConvolutionActivationType)8>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(Simd::TermType)0, (SimdConvolutionActivationType)9>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(Simd::TermType)0, (SimdConvolutionActivationType)10>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
649
650
        // Registers both term variants for one activation type: the TermLast routine uses
        // the requested activation, while the TermInterim routine is always instantiated
        // with SimdConvolutionActivationIdentity.
        template <SimdConvolutionActivationType type> static SIMD_INLINE void Set(const ConvParam& p, AlgParam& a)
651
0
        {
652
0
            Set<TermLast, type>(p, a);
653
0
            Set<TermInterim, SimdConvolutionActivationIdentity>(p, a);
654
0
        }
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(SimdConvolutionActivationType)3>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(SimdConvolutionActivationType)4>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(SimdConvolutionActivationType)5>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(SimdConvolutionActivationType)6>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(SimdConvolutionActivationType)7>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(SimdConvolutionActivationType)8>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(SimdConvolutionActivationType)9>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
Unexecuted instantiation: SimdAvx512bwSynetConvolution32fNhwcDirect2r.cpp:void Simd::Avx512bw::Set<(SimdConvolutionActivationType)10>(Simd::ConvParam const&, Simd::Base::SynetConvolution32fNhwcDirect::AlgParam&)
655
656
        // Fills a.convolutions with the routines matching p.activation.
        // Asserts that a.microD == 2 * F. Always returns true; an activation value not
        // listed in the switch only trips assert(0) and leaves a.convolutions untouched.
        bool SynetConvolution32fNhwcDirect::Set2r(const ConvParam& p, AlgParam& a)
657
0
        {
658
0
            assert(a.microD == 2 * F);
659
0
            switch (p.activation)
660
0
            {
661
0
            // Identity and Relu are dispatched to the RestrictRange instantiation, and LeakyRelu
            // to the Prelu instantiation.
            // NOTE(review): presumably the caller supplies params that make these substitutions
            // equivalent - confirm where params are prepared.
            case SimdConvolutionActivationIdentity: Set<SimdConvolutionActivationRestrictRange>(p, a); break;
662
0
            case SimdConvolutionActivationRelu: Set<SimdConvolutionActivationRestrictRange>(p, a); break;
663
0
            case SimdConvolutionActivationLeakyRelu: Set<SimdConvolutionActivationPrelu>(p, a); break;
664
0
            case SimdConvolutionActivationRestrictRange: Set<SimdConvolutionActivationRestrictRange>(p, a); break;
665
0
            case SimdConvolutionActivationPrelu: Set<SimdConvolutionActivationPrelu>(p, a); break;
666
0
            case SimdConvolutionActivationElu: Set<SimdConvolutionActivationElu>(p, a); break;
667
0
            case SimdConvolutionActivationHswish: Set<SimdConvolutionActivationHswish>(p, a); break;
668
0
            case SimdConvolutionActivationMish: Set<SimdConvolutionActivationMish>(p, a); break;
669
0
            case SimdConvolutionActivationHardSigmoid: Set<SimdConvolutionActivationHardSigmoid>(p, a); break;
670
0
            case SimdConvolutionActivationSwish: Set<SimdConvolutionActivationSwish>(p, a); break;
671
0
            case SimdConvolutionActivationGelu: Set<SimdConvolutionActivationGelu>(p, a); break;
672
0
            default: assert(0);
673
0
            }
674
0
            return true;
675
0
        }
676
    }
677
#endif
678
}