/src/Simd/src/Simd/SimdAvx512bwSynetConvolution32fNhwcDepthwise.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2025 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdSynetConvolution32f.h" |
25 | | #include "Simd/SimdSynetConvolution32fCommon.h" |
26 | | #include "Simd/SimdStore.h" |
27 | | #include "Simd/SimdSynet.h" |
28 | | #include "Simd/SimdGemm.h" |
29 | | #include "Simd/SimdExp.h" |
30 | | |
31 | | namespace Simd |
32 | | { |
33 | | #if defined(SIMD_AVX512BW_ENABLE) && defined(SIMD_SYNET_ENABLE) |
34 | | namespace Avx512bw |
35 | | { |
36 | | template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwiseDefault(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst) |
37 | 0 | { |
38 | 0 | size_t srcW = p.srcW, strideX = p.strideX, dilationX = p.dilationX, kernelX = p.kernelX, sX = strideX * p.dstC; |
39 | 0 | size_t dstC = p.dstC, dstCF = AlignLo(p.dstC, F), dstC2F = AlignLo(p.dstC, 2 * F), dstC4F = AlignLo(p.dstC, 4 * F); |
40 | 0 | size_t dstW2 = AlignLo(p.dstW, 2), dstW4 = AlignLo(p.dstW, 4); |
41 | 0 | __m512 d00, d01, d02, d03, d10, d11, d12, d13, d20, d21, d22, d23, d30, d31, d32, d33, w0; |
42 | 0 | for (size_t dy = 0; dy < p.dstH; ++dy) |
43 | 0 | { |
44 | 0 | size_t dx = 0; |
45 | 0 | for (; dx < dstW4; dx += 4) |
46 | 0 | { |
47 | 0 | float* dst0 = dst + 0 * p.dstC, * dst1 = dst + 1 * p.dstC, * dst2 = dst + 2 * p.dstC, * dst3 = dst + 3 * p.dstC; |
48 | 0 | size_t sx0 = dx * p.strideX - p.padX; |
49 | 0 | size_t dc = 0; |
50 | 0 | for (; dc < dstC4F; dc += 4 * F) |
51 | 0 | { |
52 | 0 | if (bias) |
53 | 0 | { |
54 | 0 | d00 = _mm512_loadu_ps(bias + dc + 0 * F); |
55 | 0 | d01 = _mm512_loadu_ps(bias + dc + 1 * F); |
56 | 0 | d02 = _mm512_loadu_ps(bias + dc + 2 * F); |
57 | 0 | d03 = _mm512_loadu_ps(bias + dc + 3 * F); |
58 | 0 | } |
59 | 0 | else |
60 | 0 | { |
61 | 0 | d00 = _mm512_setzero_ps(); |
62 | 0 | d01 = _mm512_setzero_ps(); |
63 | 0 | d02 = _mm512_setzero_ps(); |
64 | 0 | d03 = _mm512_setzero_ps(); |
65 | 0 | } |
66 | 0 | d10 = d00; d11 = d01; d12 = d02; d13 = d03; |
67 | 0 | d20 = d00; d21 = d01; d22 = d02; d23 = d03; |
68 | 0 | d30 = d00; d31 = d01; d32 = d02; d33 = d03; |
69 | 0 | for (size_t ky = 0; ky < p.kernelY; ++ky) |
70 | 0 | { |
71 | 0 | size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; |
72 | 0 | const float* psy = src + sy * p.srcW * dstC + dc; |
73 | 0 | const float* pwy = weight + ky * p.kernelX * dstC + dc; |
74 | 0 | if (sy < p.srcH) |
75 | 0 | { |
76 | 0 | for (size_t kx = 0; kx < kernelX; ++kx) |
77 | 0 | { |
78 | 0 | size_t sx = sx0 + kx * dilationX; |
79 | 0 | const float* pw = pwy + kx * dstC; |
80 | 0 | __mmask16 mask0 = sx + 0 * strideX < srcW ? 0xFFFF : 0x0000; |
81 | 0 | __mmask16 mask1 = sx + 1 * strideX < srcW ? 0xFFFF : 0x0000; |
82 | 0 | __mmask16 mask2 = sx + 2 * strideX < srcW ? 0xFFFF : 0x0000; |
83 | 0 | __mmask16 mask3 = sx + 3 * strideX < srcW ? 0xFFFF : 0x0000; |
84 | 0 | const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX, * ps2 = ps0 + 2 * sX, * ps3 = ps0 + 3 * sX; |
85 | |
|
86 | 0 | w0 = _mm512_loadu_ps(pw + 0 * F); |
87 | 0 | d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0); |
88 | 0 | d10 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 0 * F), w0, d10, mask1); |
89 | 0 | d20 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 0 * F), w0, d20, mask2); |
90 | 0 | d30 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 0 * F), w0, d30, mask3); |
91 | 0 | w0 = _mm512_loadu_ps(pw + 1 * F); |
92 | 0 | d01 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 1 * F), w0, d01, mask0); |
93 | 0 | d11 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 1 * F), w0, d11, mask1); |
94 | 0 | d21 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 1 * F), w0, d21, mask2); |
95 | 0 | d31 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 1 * F), w0, d31, mask3); |
96 | 0 | w0 = _mm512_loadu_ps(pw + 2 * F); |
97 | 0 | d02 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 2 * F), w0, d02, mask0); |
98 | 0 | d12 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 2 * F), w0, d12, mask1); |
99 | 0 | d22 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 2 * F), w0, d22, mask2); |
100 | 0 | d32 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 2 * F), w0, d32, mask3); |
101 | 0 | w0 = _mm512_loadu_ps(pw + 3 * F); |
102 | 0 | d03 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 3 * F), w0, d03, mask0); |
103 | 0 | d13 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 3 * F), w0, d13, mask1); |
104 | 0 | d23 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 3 * F), w0, d23, mask2); |
105 | 0 | d33 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 3 * F), w0, d33, mask3); |
106 | 0 | } |
107 | 0 | } |
108 | 0 | } |
109 | 0 | _mm512_storeu_ps(dst0 + dc + 0 * F, Activate<type>(d00, params, dc + 0 * F)); |
110 | 0 | _mm512_storeu_ps(dst0 + dc + 1 * F, Activate<type>(d01, params, dc + 1 * F)); |
111 | 0 | _mm512_storeu_ps(dst0 + dc + 2 * F, Activate<type>(d02, params, dc + 2 * F)); |
112 | 0 | _mm512_storeu_ps(dst0 + dc + 3 * F, Activate<type>(d03, params, dc + 3 * F)); |
113 | 0 | _mm512_storeu_ps(dst1 + dc + 0 * F, Activate<type>(d10, params, dc + 0 * F)); |
114 | 0 | _mm512_storeu_ps(dst1 + dc + 1 * F, Activate<type>(d11, params, dc + 1 * F)); |
115 | 0 | _mm512_storeu_ps(dst1 + dc + 2 * F, Activate<type>(d12, params, dc + 2 * F)); |
116 | 0 | _mm512_storeu_ps(dst1 + dc + 3 * F, Activate<type>(d13, params, dc + 3 * F)); |
117 | 0 | _mm512_storeu_ps(dst2 + dc + 0 * F, Activate<type>(d20, params, dc + 0 * F)); |
118 | 0 | _mm512_storeu_ps(dst2 + dc + 1 * F, Activate<type>(d21, params, dc + 1 * F)); |
119 | 0 | _mm512_storeu_ps(dst2 + dc + 2 * F, Activate<type>(d22, params, dc + 2 * F)); |
120 | 0 | _mm512_storeu_ps(dst2 + dc + 3 * F, Activate<type>(d23, params, dc + 3 * F)); |
121 | 0 | _mm512_storeu_ps(dst3 + dc + 0 * F, Activate<type>(d30, params, dc + 0 * F)); |
122 | 0 | _mm512_storeu_ps(dst3 + dc + 1 * F, Activate<type>(d31, params, dc + 1 * F)); |
123 | 0 | _mm512_storeu_ps(dst3 + dc + 2 * F, Activate<type>(d32, params, dc + 2 * F)); |
124 | 0 | _mm512_storeu_ps(dst3 + dc + 3 * F, Activate<type>(d33, params, dc + 3 * F)); |
125 | 0 | } |
126 | 0 | for (; dc < dstC2F; dc += 2 * F) |
127 | 0 | { |
128 | 0 | if (bias) |
129 | 0 | { |
130 | 0 | d00 = _mm512_loadu_ps(bias + dc + 0 * F); |
131 | 0 | d01 = _mm512_loadu_ps(bias + dc + 1 * F); |
132 | 0 | } |
133 | 0 | else |
134 | 0 | { |
135 | 0 | d00 = _mm512_setzero_ps(); |
136 | 0 | d01 = _mm512_setzero_ps(); |
137 | 0 | } |
138 | 0 | d10 = d00; d11 = d01; |
139 | 0 | d20 = d00; d21 = d01; |
140 | 0 | d30 = d00; d31 = d01; |
141 | |
|
142 | 0 | for (size_t ky = 0; ky < p.kernelY; ++ky) |
143 | 0 | { |
144 | 0 | size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; |
145 | 0 | const float* psy = src + sy * p.srcW * dstC + dc; |
146 | 0 | const float* pwy = weight + ky * p.kernelX * dstC + dc; |
147 | 0 | if (sy < p.srcH) |
148 | 0 | { |
149 | 0 | for (size_t kx = 0; kx < kernelX; ++kx) |
150 | 0 | { |
151 | 0 | size_t sx = sx0 + kx * dilationX; |
152 | 0 | const float* pw = pwy + kx * dstC; |
153 | 0 | __mmask16 mask0 = sx + 0 * strideX < srcW ? 0xFFFF : 0x0000; |
154 | 0 | __mmask16 mask1 = sx + 1 * strideX < srcW ? 0xFFFF : 0x0000; |
155 | 0 | __mmask16 mask2 = sx + 2 * strideX < srcW ? 0xFFFF : 0x0000; |
156 | 0 | __mmask16 mask3 = sx + 3 * strideX < srcW ? 0xFFFF : 0x0000; |
157 | 0 | const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX, * ps2 = ps0 + 2 * sX, * ps3 = ps0 + 3 * sX; |
158 | |
|
159 | 0 | w0 = _mm512_loadu_ps(pw + 0 * F); |
160 | 0 | d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0); |
161 | 0 | d10 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 0 * F), w0, d10, mask1); |
162 | 0 | d20 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 0 * F), w0, d20, mask2); |
163 | 0 | d30 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 0 * F), w0, d30, mask3); |
164 | 0 | w0 = _mm512_loadu_ps(pw + 1 * F); |
165 | 0 | d01 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 1 * F), w0, d01, mask0); |
166 | 0 | d11 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 1 * F), w0, d11, mask1); |
167 | 0 | d21 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 1 * F), w0, d21, mask2); |
168 | 0 | d31 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 1 * F), w0, d31, mask3); |
169 | 0 | } |
170 | 0 | } |
171 | 0 | } |
172 | 0 | _mm512_storeu_ps(dst0 + dc + 0 * F, Activate<type>(d00, params, dc + 0 * F)); |
173 | 0 | _mm512_storeu_ps(dst0 + dc + 1 * F, Activate<type>(d01, params, dc + 1 * F)); |
174 | 0 | _mm512_storeu_ps(dst1 + dc + 0 * F, Activate<type>(d10, params, dc + 0 * F)); |
175 | 0 | _mm512_storeu_ps(dst1 + dc + 1 * F, Activate<type>(d11, params, dc + 1 * F)); |
176 | 0 | _mm512_storeu_ps(dst2 + dc + 0 * F, Activate<type>(d20, params, dc + 0 * F)); |
177 | 0 | _mm512_storeu_ps(dst2 + dc + 1 * F, Activate<type>(d21, params, dc + 1 * F)); |
178 | 0 | _mm512_storeu_ps(dst3 + dc + 0 * F, Activate<type>(d30, params, dc + 0 * F)); |
179 | 0 | _mm512_storeu_ps(dst3 + dc + 1 * F, Activate<type>(d31, params, dc + 1 * F)); |
180 | 0 | } |
181 | 0 | for (; dc < dstC; dc += F) |
182 | 0 | { |
183 | 0 | __mmask16 tailC = dc < dstCF ? __mmask16(-1) : TailMask16(dstC - dc); |
184 | 0 | d00 = bias ? _mm512_maskz_loadu_ps(tailC, bias + dc) : _mm512_setzero_ps(); |
185 | 0 | d10 = d00; d20 = d00; d30 = d00; |
186 | 0 | for (size_t ky = 0; ky < p.kernelY; ++ky) |
187 | 0 | { |
188 | 0 | size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; |
189 | 0 | const float* psy = src + sy * p.srcW * dstC + dc; |
190 | 0 | const float* pwy = weight + ky * p.kernelX * dstC + dc; |
191 | 0 | if (sy < p.srcH) |
192 | 0 | { |
193 | 0 | for (size_t kx = 0; kx < kernelX; ++kx) |
194 | 0 | { |
195 | 0 | size_t sx = sx0 + kx * dilationX; |
196 | 0 | const float* pw = pwy + kx * dstC; |
197 | 0 | __mmask16 mask0 = sx + 0 * strideX < srcW ? tailC : 0x0000; |
198 | 0 | __mmask16 mask1 = sx + 1 * strideX < srcW ? tailC : 0x0000; |
199 | 0 | __mmask16 mask2 = sx + 2 * strideX < srcW ? tailC : 0x0000; |
200 | 0 | __mmask16 mask3 = sx + 3 * strideX < srcW ? tailC : 0x0000; |
201 | 0 | const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX, * ps2 = ps0 + 2 * sX, * ps3 = ps0 + 3 * sX; |
202 | |
|
203 | 0 | w0 = _mm512_loadu_ps(pw + 0 * F); |
204 | 0 | d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0); |
205 | 0 | d10 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 0 * F), w0, d10, mask1); |
206 | 0 | d20 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask2, ps2 + 0 * F), w0, d20, mask2); |
207 | 0 | d30 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask3, ps3 + 0 * F), w0, d30, mask3); |
208 | 0 | } |
209 | 0 | } |
210 | 0 | } |
211 | 0 | _mm512_mask_storeu_ps(dst0 + dc, tailC, Activate<type>(d00, params, dc, tailC)); |
212 | 0 | _mm512_mask_storeu_ps(dst1 + dc, tailC, Activate<type>(d10, params, dc, tailC)); |
213 | 0 | _mm512_mask_storeu_ps(dst2 + dc, tailC, Activate<type>(d20, params, dc, tailC)); |
214 | 0 | _mm512_mask_storeu_ps(dst3 + dc, tailC, Activate<type>(d30, params, dc, tailC)); |
215 | 0 | } |
216 | 0 | dst += 4 * p.dstC; |
217 | 0 | } |
218 | 0 | for (; dx < dstW2; dx += 2) |
219 | 0 | { |
220 | 0 | float* dst0 = dst + 0 * p.dstC, *dst1 = dst + 1 * p.dstC; |
221 | 0 | size_t sx0 = dx * p.strideX - p.padX; |
222 | 0 | size_t dc = 0; |
223 | 0 | for (; dc < dstC4F; dc += 4 * F) |
224 | 0 | { |
225 | 0 | if (bias) |
226 | 0 | { |
227 | 0 | d00 = _mm512_loadu_ps(bias + dc + 0 * F); |
228 | 0 | d01 = _mm512_loadu_ps(bias + dc + 1 * F); |
229 | 0 | d02 = _mm512_loadu_ps(bias + dc + 2 * F); |
230 | 0 | d03 = _mm512_loadu_ps(bias + dc + 3 * F); |
231 | 0 | } |
232 | 0 | else |
233 | 0 | { |
234 | 0 | d00 = _mm512_setzero_ps(); |
235 | 0 | d01 = _mm512_setzero_ps(); |
236 | 0 | d02 = _mm512_setzero_ps(); |
237 | 0 | d03 = _mm512_setzero_ps(); |
238 | 0 | } |
239 | 0 | d10 = d00; d11 = d01; d12 = d02; d13 = d03; |
240 | 0 | for (size_t ky = 0; ky < p.kernelY; ++ky) |
241 | 0 | { |
242 | 0 | size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; |
243 | 0 | const float* psy = src + sy * p.srcW * dstC + dc; |
244 | 0 | const float* pwy = weight + ky * p.kernelX * dstC + dc; |
245 | 0 | if (sy < p.srcH) |
246 | 0 | { |
247 | 0 | for (size_t kx = 0; kx < kernelX; ++kx) |
248 | 0 | { |
249 | 0 | size_t sx = sx0 + kx * dilationX; |
250 | 0 | const float* pw = pwy + kx * dstC; |
251 | 0 | __mmask16 mask0 = sx + 0 * strideX < srcW ? 0xFFFF : 0x0000; |
252 | 0 | __mmask16 mask1 = sx + 1 * strideX < srcW ? 0xFFFF : 0x0000; |
253 | 0 | const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX; |
254 | |
|
255 | 0 | w0 = _mm512_loadu_ps(pw + 0 * F); |
256 | 0 | d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0); |
257 | 0 | d10 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 0 * F), w0, d10, mask1); |
258 | 0 | w0 = _mm512_loadu_ps(pw + 1 * F); |
259 | 0 | d01 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 1 * F), w0, d01, mask0); |
260 | 0 | d11 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 1 * F), w0, d11, mask1); |
261 | 0 | w0 = _mm512_loadu_ps(pw + 2 * F); |
262 | 0 | d02 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 2 * F), w0, d02, mask0); |
263 | 0 | d12 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 2 * F), w0, d12, mask1); |
264 | 0 | w0 = _mm512_loadu_ps(pw + 3 * F); |
265 | 0 | d03 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 3 * F), w0, d03, mask0); |
266 | 0 | d13 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 3 * F), w0, d13, mask1); |
267 | 0 | } |
268 | 0 | } |
269 | 0 | } |
270 | 0 | _mm512_storeu_ps(dst0 + dc + 0 * F, Activate<type>(d00, params, dc + 0 * F)); |
271 | 0 | _mm512_storeu_ps(dst0 + dc + 1 * F, Activate<type>(d01, params, dc + 1 * F)); |
272 | 0 | _mm512_storeu_ps(dst0 + dc + 2 * F, Activate<type>(d02, params, dc + 2 * F)); |
273 | 0 | _mm512_storeu_ps(dst0 + dc + 3 * F, Activate<type>(d03, params, dc + 3 * F)); |
274 | 0 | _mm512_storeu_ps(dst1 + dc + 0 * F, Activate<type>(d10, params, dc + 0 * F)); |
275 | 0 | _mm512_storeu_ps(dst1 + dc + 1 * F, Activate<type>(d11, params, dc + 1 * F)); |
276 | 0 | _mm512_storeu_ps(dst1 + dc + 2 * F, Activate<type>(d12, params, dc + 2 * F)); |
277 | 0 | _mm512_storeu_ps(dst1 + dc + 3 * F, Activate<type>(d13, params, dc + 3 * F)); |
278 | 0 | } |
279 | 0 | for (; dc < dstC2F; dc += 2 * F) |
280 | 0 | { |
281 | 0 | if (bias) |
282 | 0 | { |
283 | 0 | d00 = _mm512_loadu_ps(bias + dc + 0 * F); |
284 | 0 | d01 = _mm512_loadu_ps(bias + dc + 1 * F); |
285 | 0 | } |
286 | 0 | else |
287 | 0 | { |
288 | 0 | d00 = _mm512_setzero_ps(); |
289 | 0 | d01 = _mm512_setzero_ps(); |
290 | 0 | } |
291 | 0 | d10 = d00; d11 = d01; |
292 | 0 | for (size_t ky = 0; ky < p.kernelY; ++ky) |
293 | 0 | { |
294 | 0 | size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; |
295 | 0 | const float* psy = src + sy * p.srcW * dstC + dc; |
296 | 0 | const float* pwy = weight + ky * p.kernelX * dstC + dc; |
297 | 0 | if (sy < p.srcH) |
298 | 0 | { |
299 | 0 | for (size_t kx = 0; kx < kernelX; ++kx) |
300 | 0 | { |
301 | 0 | size_t sx = sx0 + kx * dilationX; |
302 | 0 | const float* pw = pwy + kx * dstC; |
303 | 0 | __mmask16 mask0 = sx + 0 * strideX < srcW ? 0xFFFF : 0x0000; |
304 | 0 | __mmask16 mask1 = sx + 1 * strideX < srcW ? 0xFFFF : 0x0000; |
305 | 0 | const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX; |
306 | |
|
307 | 0 | w0 = _mm512_loadu_ps(pw + 0 * F); |
308 | 0 | d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0); |
309 | 0 | d10 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 0 * F), w0, d10, mask1); |
310 | 0 | w0 = _mm512_loadu_ps(pw + 1 * F); |
311 | 0 | d01 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 1 * F), w0, d01, mask0); |
312 | 0 | d11 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 1 * F), w0, d11, mask1); |
313 | 0 | } |
314 | 0 | } |
315 | 0 | } |
316 | 0 | _mm512_storeu_ps(dst0 + dc + 0 * F, Activate<type>(d00, params, dc + 0 * F)); |
317 | 0 | _mm512_storeu_ps(dst0 + dc + 1 * F, Activate<type>(d01, params, dc + 1 * F)); |
318 | 0 | _mm512_storeu_ps(dst1 + dc + 0 * F, Activate<type>(d10, params, dc + 0 * F)); |
319 | 0 | _mm512_storeu_ps(dst1 + dc + 1 * F, Activate<type>(d11, params, dc + 1 * F)); |
320 | 0 | } |
321 | 0 | for (; dc < dstC; dc += F) |
322 | 0 | { |
323 | 0 | __mmask16 tailC = dc < dstCF ? __mmask16(-1) : TailMask16(dstC - dc); |
324 | 0 | d00 = bias ? _mm512_maskz_loadu_ps(tailC, bias + dc) : _mm512_setzero_ps(); |
325 | 0 | d10 = d00; |
326 | 0 | for (size_t ky = 0; ky < p.kernelY; ++ky) |
327 | 0 | { |
328 | 0 | size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; |
329 | 0 | const float* psy = src + sy * p.srcW * dstC + dc; |
330 | 0 | const float* pwy = weight + ky * p.kernelX * dstC + dc; |
331 | 0 | if (sy < p.srcH) |
332 | 0 | { |
333 | 0 | for (size_t kx = 0; kx < kernelX; ++kx) |
334 | 0 | { |
335 | 0 | size_t sx = sx0 + kx * dilationX; |
336 | 0 | const float* pw = pwy + kx * dstC; |
337 | 0 | __mmask16 mask0 = sx + 0 * strideX < srcW ? tailC : 0x0000; |
338 | 0 | __mmask16 mask1 = sx + 1 * strideX < srcW ? tailC : 0x0000; |
339 | 0 | const float* ps0 = psy + sx * dstC, * ps1 = ps0 + 1 * sX; |
340 | |
|
341 | 0 | w0 = _mm512_maskz_loadu_ps(tailC, pw); |
342 | 0 | d00 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask0, ps0 + 0 * F), w0, d00, mask0); |
343 | 0 | d10 = _mm512_mask3_fmadd_ps(_mm512_maskz_loadu_ps(mask1, ps1 + 0 * F), w0, d10, mask1); |
344 | 0 | } |
345 | 0 | } |
346 | 0 | } |
347 | 0 | _mm512_mask_storeu_ps(dst0 + dc, tailC, Activate<type>(d00, params, dc, tailC)); |
348 | 0 | _mm512_mask_storeu_ps(dst1 + dc, tailC, Activate<type>(d10, params, dc, tailC)); |
349 | 0 | } |
350 | 0 | dst += 2 * p.dstC; |
351 | 0 | } |
352 | 0 | for (; dx < p.dstW; ++dx) |
353 | 0 | { |
354 | 0 | size_t dc = 0; |
355 | 0 | for (; dc < dstC4F; dc += 4 * F) |
356 | 0 | { |
357 | 0 | if (bias) |
358 | 0 | { |
359 | 0 | d00 = _mm512_loadu_ps(bias + dc + 0 * F); |
360 | 0 | d01 = _mm512_loadu_ps(bias + dc + 1 * F); |
361 | 0 | d02 = _mm512_loadu_ps(bias + dc + 2 * F); |
362 | 0 | d03 = _mm512_loadu_ps(bias + dc + 3 * F); |
363 | 0 | } |
364 | 0 | else |
365 | 0 | { |
366 | 0 | d00 = _mm512_setzero_ps(); |
367 | 0 | d01 = _mm512_setzero_ps(); |
368 | 0 | d02 = _mm512_setzero_ps(); |
369 | 0 | d03 = _mm512_setzero_ps(); |
370 | 0 | } |
371 | 0 | for (size_t ky = 0; ky < p.kernelY; ++ky) |
372 | 0 | { |
373 | 0 | size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; |
374 | 0 | if (sy < p.srcH) |
375 | 0 | { |
376 | 0 | for (size_t kx = 0; kx < p.kernelX; ++kx) |
377 | 0 | { |
378 | 0 | size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; |
379 | 0 | if (sx < p.srcW) |
380 | 0 | { |
381 | 0 | const float * pw = weight + (ky*p.kernelX + kx)* dstC + dc; |
382 | 0 | const float * ps = src + (sy*p.srcW + sx)* dstC + dc; |
383 | 0 | d00 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * F), _mm512_loadu_ps(pw + 0 * F), d00); |
384 | 0 | d01 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * F), _mm512_loadu_ps(pw + 1 * F), d01); |
385 | 0 | d02 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 2 * F), _mm512_loadu_ps(pw + 2 * F), d02); |
386 | 0 | d03 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 3 * F), _mm512_loadu_ps(pw + 3 * F), d03); |
387 | 0 | } |
388 | 0 | } |
389 | 0 | } |
390 | 0 | } |
391 | 0 | _mm512_storeu_ps(dst + dc + 0 * F, Activate<type>(d00, params, dc + 0 * F)); |
392 | 0 | _mm512_storeu_ps(dst + dc + 1 * F, Activate<type>(d01, params, dc + 1 * F)); |
393 | 0 | _mm512_storeu_ps(dst + dc + 2 * F, Activate<type>(d02, params, dc + 2 * F)); |
394 | 0 | _mm512_storeu_ps(dst + dc + 3 * F, Activate<type>(d03, params, dc + 3 * F)); |
395 | 0 | } |
396 | 0 | for (; dc < dstC2F; dc += 2 * F) |
397 | 0 | { |
398 | 0 | if (bias) |
399 | 0 | { |
400 | 0 | d00 = _mm512_loadu_ps(bias + dc + 0 * F); |
401 | 0 | d01 = _mm512_loadu_ps(bias + dc + 1 * F); |
402 | 0 | } |
403 | 0 | else |
404 | 0 | { |
405 | 0 | d00 = _mm512_setzero_ps(); |
406 | 0 | d01 = _mm512_setzero_ps(); |
407 | 0 | } |
408 | 0 | for (size_t ky = 0; ky < p.kernelY; ++ky) |
409 | 0 | { |
410 | 0 | size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; |
411 | 0 | if (sy < p.srcH) |
412 | 0 | { |
413 | 0 | for (size_t kx = 0; kx < p.kernelX; ++kx) |
414 | 0 | { |
415 | 0 | size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; |
416 | 0 | if (sx < p.srcW) |
417 | 0 | { |
418 | 0 | const float * pw = weight + (ky*p.kernelX + kx)* dstC + dc; |
419 | 0 | const float * ps = src + (sy*p.srcW + sx)* dstC + dc; |
420 | 0 | d00 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * F), _mm512_loadu_ps(pw + 0 * F), d00); |
421 | 0 | d01 = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * F), _mm512_loadu_ps(pw + 1 * F), d01); |
422 | 0 | } |
423 | 0 | } |
424 | 0 | } |
425 | 0 | } |
426 | 0 | _mm512_storeu_ps(dst + dc + 0 * F, Activate<type>(d00, params, dc + 0 * F)); |
427 | 0 | _mm512_storeu_ps(dst + dc + 1 * F, Activate<type>(d01, params, dc + 1 * F)); |
428 | 0 | } |
429 | 0 | for (; dc < dstC; dc += F) |
430 | 0 | { |
431 | 0 | __mmask16 tailC = dc < dstCF ? __mmask16(-1) : TailMask16(dstC - dc); |
432 | 0 | d00 = bias ? _mm512_maskz_loadu_ps(tailC, bias + dc) : _mm512_setzero_ps(); |
433 | 0 | for (size_t ky = 0; ky < p.kernelY; ++ky) |
434 | 0 | { |
435 | 0 | size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; |
436 | 0 | if (sy < p.srcH) |
437 | 0 | { |
438 | 0 | for (size_t kx = 0; kx < p.kernelX; ++kx) |
439 | 0 | { |
440 | 0 | size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; |
441 | 0 | if (sx < p.srcW) |
442 | 0 | { |
443 | 0 | const float * pw = weight + (ky*p.kernelX + kx)* dstC + dc; |
444 | 0 | const float * ps = src + (sy*p.srcW + sx)* dstC + dc; |
445 | 0 | d00 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tailC, ps), _mm512_maskz_loadu_ps(tailC, pw), d00); |
446 | 0 | } |
447 | 0 | } |
448 | 0 | } |
449 | 0 | } |
450 | 0 | _mm512_mask_storeu_ps(dst + dc, tailC, Activate<type>(d00, params, dc, tailC)); |
451 | 0 | } |
452 | 0 | dst += p.dstC; |
453 | 0 | } |
454 | 0 | } |
455 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam 
const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwiseDefault<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) |
456 | | |
457 | | //------------------------------------------------------------------------------------------------- |
458 | | |
459 | | template<::SimdConvolutionActivationType type> |
460 | | SIMD_INLINE void Convolution32fNhwcDepthwise3x3Edge(const float * src, const ConvParam & p, size_t dy, size_t dx, const float * weight, const float * bias, const float * params, float * dst) |
461 | 0 | { |
462 | 0 | size_t srcC = p.srcC; |
463 | 0 | size_t srcCF = AlignLo(srcC, F); |
464 | 0 | size_t c = 0; |
465 | 0 | for (; c < srcCF; c += F) |
466 | 0 | { |
467 | 0 | __m512 sum = bias ? _mm512_loadu_ps(bias + c) : _mm512_setzero_ps(); |
468 | 0 | for (size_t ky = 0; ky < 3; ++ky) |
469 | 0 | { |
470 | 0 | size_t sy = dy * p.strideY + ky - p.padY; |
471 | 0 | if (sy < p.srcH) |
472 | 0 | { |
473 | 0 | for (size_t kx = 0; kx < 3; ++kx) |
474 | 0 | { |
475 | 0 | size_t sx = dx * p.strideX + kx - p.padX; |
476 | 0 | if (sx < p.srcW) |
477 | 0 | { |
478 | 0 | const float * pw = weight + (ky * 3 + kx) * srcC; |
479 | 0 | const float * ps = src + (sy*p.srcW + sx) * srcC; |
480 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps), _mm512_loadu_ps(pw), sum); |
481 | 0 | } |
482 | 0 | } |
483 | 0 | } |
484 | 0 | } |
485 | 0 | _mm512_storeu_ps(dst + c, Activate<type>(sum, params, c)); |
486 | 0 | src += F; |
487 | 0 | weight += F; |
488 | 0 | } |
489 | 0 | if (c < srcC) |
490 | 0 | { |
491 | 0 | __mmask16 tail = TailMask16(srcC - c); |
492 | 0 | __m512 sum = bias ? _mm512_maskz_loadu_ps(tail, bias + c) : _mm512_setzero_ps(); |
493 | 0 | for (size_t ky = 0; ky < 3; ++ky) |
494 | 0 | { |
495 | 0 | size_t sy = dy * p.strideY + ky - p.padY; |
496 | 0 | if (sy < p.srcH) |
497 | 0 | { |
498 | 0 | for (size_t kx = 0; kx < 3; ++kx) |
499 | 0 | { |
500 | 0 | size_t sx = dx * p.strideX + kx - p.padX; |
501 | 0 | if (sx < p.srcW) |
502 | 0 | { |
503 | 0 | const float * pw = weight + (ky*3 + kx) * srcC; |
504 | 0 | const float * ps = src + (sy*p.srcW + sx) * srcC; |
505 | 0 | sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps), _mm512_maskz_loadu_ps(tail, pw), sum); |
506 | 0 | } |
507 | 0 | } |
508 | 0 | } |
509 | 0 | } |
510 | 0 | _mm512_mask_storeu_ps(dst + c, tail, Activate<type>(sum, params, c, tail)); |
511 | 0 | } |
512 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void 
Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float const*, float const*, float const*, float*) |
513 | | |
514 | | template<::SimdConvolutionActivationType type> |
515 | | SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main1(const float * src, size_t srcS, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) |
516 | 0 | { |
517 | 0 | size_t srcCF = AlignLo(srcC, F); |
518 | 0 | size_t c = 0; |
519 | 0 | for (; c < srcCF; c += F) |
520 | 0 | { |
521 | 0 | __m512 sum = bias ? _mm512_loadu_ps(bias + c) : _mm512_setzero_ps(); |
522 | 0 | for (size_t ky = 0; ky < 3; ++ky) |
523 | 0 | { |
524 | 0 | const float * ps = src + ky * srcS; |
525 | 0 | const float * pw = weight + ky * 3 * srcC; |
526 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 0 * srcC), _mm512_loadu_ps(pw + 0 * srcC), sum); |
527 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 1 * srcC), _mm512_loadu_ps(pw + 1 * srcC), sum); |
528 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps + 2 * srcC), _mm512_loadu_ps(pw + 2 * srcC), sum); |
529 | 0 | } |
530 | 0 | _mm512_storeu_ps(dst + c, Activate<type>(sum, params, c)); |
531 | 0 | src += F; |
532 | 0 | weight += F; |
533 | 0 | } |
534 | 0 | if (c < srcC) |
535 | 0 | { |
536 | 0 | __mmask16 tail = TailMask16(srcC - c); |
537 | 0 | __m512 sum = bias ? _mm512_maskz_loadu_ps(tail, bias + c) : _mm512_setzero_ps(); |
538 | 0 | for (size_t ky = 0; ky < 3; ++ky) |
539 | 0 | { |
540 | 0 | const float * ps = src + ky * srcS; |
541 | 0 | const float * pw = weight + ky * 3 * srcC; |
542 | 0 | sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps + 0 * srcC), _mm512_maskz_loadu_ps(tail, pw + 0 * srcC), sum); |
543 | 0 | sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps + 1 * srcC), _mm512_maskz_loadu_ps(tail, pw + 1 * srcC), sum); |
544 | 0 | sum = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps + 2 * srcC), _mm512_maskz_loadu_ps(tail, pw + 2 * srcC), sum); |
545 | 0 | } |
546 | 0 | _mm512_mask_storeu_ps(dst + c, tail, Activate<type>(sum, params, c, tail)); |
547 | 0 | } |
548 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void 
Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main1<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, float const*, float const*, float const*, float*) |
549 | | |
550 | | template<::SimdConvolutionActivationType type> |
551 | | SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main2(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) |
552 | 0 | { |
553 | 0 | size_t srcCF = AlignLo(srcC, F); |
554 | 0 | size_t c = 0; |
555 | 0 | __m512 sum0, sum1, w0; |
556 | 0 | for (; c < srcCF; c += F) |
557 | 0 | { |
558 | 0 | sum0 = bias ? _mm512_loadu_ps(bias + c) : _mm512_setzero_ps(); |
559 | 0 | sum1 = sum0; |
560 | 0 | const float * pw = weight + c; |
561 | 0 | for (size_t ky = 0; ky < 3; ++ky) |
562 | 0 | { |
563 | 0 | const float * ps0 = src + ky * srcS; |
564 | 0 | const float * ps1 = ps0 + srcX; |
565 | 0 | w0 = _mm512_loadu_ps(pw); |
566 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + 0 * srcC), w0, sum0); |
567 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + 0 * srcC), w0, sum1); |
568 | 0 | pw += srcC; |
569 | 0 | w0 = _mm512_loadu_ps(pw); |
570 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + 1 * srcC), w0, sum0); |
571 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + 1 * srcC), w0, sum1); |
572 | 0 | pw += srcC; |
573 | 0 | w0 = _mm512_loadu_ps(pw); |
574 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + 2 * srcC), w0, sum0); |
575 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + 2 * srcC), w0, sum1); |
576 | 0 | pw += srcC; |
577 | 0 | } |
578 | 0 | _mm512_storeu_ps(dst + c, Activate<type>(sum0, params, c)); |
579 | 0 | _mm512_storeu_ps(dst + c + srcC, Activate<type>(sum1, params, c)); |
580 | 0 | src += F; |
581 | 0 | } |
582 | 0 | if (c < srcC) |
583 | 0 | { |
584 | 0 | __mmask16 tail = TailMask16(srcC - c); |
585 | 0 | sum0 = bias ? _mm512_maskz_loadu_ps(tail, bias + c) : _mm512_setzero_ps(); |
586 | 0 | sum1 = sum0; |
587 | 0 | const float * pw = weight + c; |
588 | 0 | for (size_t ky = 0; ky < 3; ++ky) |
589 | 0 | { |
590 | 0 | const float * ps0 = src + ky * srcS; |
591 | 0 | const float * ps1 = ps0 + srcX; |
592 | 0 | w0 = _mm512_maskz_loadu_ps(tail, pw); |
593 | 0 | sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + 0 * srcC), w0, sum0); |
594 | 0 | sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + 0 * srcC), w0, sum1); |
595 | 0 | pw += srcC; |
596 | 0 | w0 = _mm512_maskz_loadu_ps(tail, pw); |
597 | 0 | sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + 1 * srcC), w0, sum0); |
598 | 0 | sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + 1 * srcC), w0, sum1); |
599 | 0 | pw += srcC; |
600 | 0 | w0 = _mm512_maskz_loadu_ps(tail, pw); |
601 | 0 | sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + 2 * srcC), w0, sum0); |
602 | 0 | sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + 2 * srcC), w0, sum1); |
603 | 0 | pw += srcC; |
604 | 0 | } |
605 | 0 | _mm512_mask_storeu_ps(dst + c, tail, Activate<type>(sum0, params, c, tail)); |
606 | 0 | _mm512_mask_storeu_ps(dst + c + srcC, tail, Activate<type>(sum1, params, c, tail)); |
607 | 0 | } |
608 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, unsigned long, 
float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main2<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) |
609 | | |
610 | | template<::SimdConvolutionActivationType type> |
611 | | SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main4(const float * src, size_t srcS, size_t srcX, size_t srcC, const float * weight, const float * bias, const float * params, float * dst) |
612 | 0 | { |
613 | 0 | size_t srcCF = AlignLo(srcC, F); |
614 | 0 | size_t c = 0; |
615 | 0 | for (; c < srcCF; c += F) |
616 | 0 | { |
617 | 0 | __m512 sum0, sum1, sum2, sum3, w0; |
618 | 0 | sum0 = bias ? _mm512_loadu_ps(bias + c) : _mm512_setzero_ps(); |
619 | 0 | sum1 = sum0; |
620 | 0 | sum2 = sum0; |
621 | 0 | sum3 = sum0; |
622 | 0 | const float * pw = weight + c; |
623 | 0 | const float * ps0 = src + 0 * srcX; |
624 | 0 | const float * ps1 = src + 1 * srcX; |
625 | 0 | const float * ps2 = src + 2 * srcX; |
626 | 0 | const float * ps3 = src + 3 * srcX; |
627 | 0 | for (size_t ky = 0; ky < 3; ++ky) |
628 | 0 | { |
629 | 0 | size_t offset = ky * srcS; |
630 | 0 | w0 = _mm512_loadu_ps(pw); |
631 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + offset), w0, sum0); |
632 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + offset), w0, sum1); |
633 | 0 | sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(ps2 + offset), w0, sum2); |
634 | 0 | sum3 = _mm512_fmadd_ps(_mm512_loadu_ps(ps3 + offset), w0, sum3); |
635 | 0 | pw += srcC, offset += srcC; |
636 | 0 | w0 = _mm512_loadu_ps(pw); |
637 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + offset), w0, sum0); |
638 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + offset), w0, sum1); |
639 | 0 | sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(ps2 + offset), w0, sum2); |
640 | 0 | sum3 = _mm512_fmadd_ps(_mm512_loadu_ps(ps3 + offset), w0, sum3); |
641 | 0 | pw += srcC, offset += srcC; |
642 | 0 | w0 = _mm512_loadu_ps(pw); |
643 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(ps0 + offset), w0, sum0); |
644 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(ps1 + offset), w0, sum1); |
645 | 0 | sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(ps2 + offset), w0, sum2); |
646 | 0 | sum3 = _mm512_fmadd_ps(_mm512_loadu_ps(ps3 + offset), w0, sum3); |
647 | 0 | pw += srcC, offset += srcC; |
648 | 0 | } |
649 | 0 | _mm512_storeu_ps(dst + 0 * srcC, Activate<type>(sum0, params, c)); |
650 | 0 | _mm512_storeu_ps(dst + 1 * srcC, Activate<type>(sum1, params, c)); |
651 | 0 | _mm512_storeu_ps(dst + 2 * srcC, Activate<type>(sum2, params, c)); |
652 | 0 | _mm512_storeu_ps(dst + 3 * srcC, Activate<type>(sum3, params, c)); |
653 | 0 | src += F; |
654 | 0 | dst += F; |
655 | 0 | } |
656 | 0 | if (c < srcC) |
657 | 0 | { |
658 | 0 | __mmask16 tail = TailMask16(srcC - c); |
659 | 0 | __m512 sum0, sum1, sum2, sum3, w0; |
660 | 0 | sum0 = bias ? _mm512_maskz_loadu_ps(tail, bias + c) : _mm512_setzero_ps(); |
661 | 0 | sum1 = sum0; |
662 | 0 | sum2 = sum0; |
663 | 0 | sum3 = sum0; |
664 | 0 | const float * pw = weight + c; |
665 | 0 | const float * ps0 = src + 0 * srcX; |
666 | 0 | const float * ps1 = src + 1 * srcX; |
667 | 0 | const float * ps2 = src + 2 * srcX; |
668 | 0 | const float * ps3 = src + 3 * srcX; |
669 | 0 | for (size_t ky = 0; ky < 3; ++ky) |
670 | 0 | { |
671 | 0 | size_t offset = ky * srcS; |
672 | 0 | w0 = _mm512_maskz_loadu_ps(tail, pw); |
673 | 0 | sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + offset), w0, sum0); |
674 | 0 | sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + offset), w0, sum1); |
675 | 0 | sum2 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps2 + offset), w0, sum2); |
676 | 0 | sum3 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps3 + offset), w0, sum3); |
677 | 0 | pw += srcC, offset += srcC; |
678 | 0 | w0 = _mm512_maskz_loadu_ps(tail, pw); |
679 | 0 | sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + offset), w0, sum0); |
680 | 0 | sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + offset), w0, sum1); |
681 | 0 | sum2 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps2 + offset), w0, sum2); |
682 | 0 | sum3 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps3 + offset), w0, sum3); |
683 | 0 | pw += srcC, offset += srcC; |
684 | 0 | w0 = _mm512_maskz_loadu_ps(tail, pw); |
685 | 0 | sum0 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps0 + offset), w0, sum0); |
686 | 0 | sum1 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps1 + offset), w0, sum1); |
687 | 0 | sum2 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps2 + offset), w0, sum2); |
688 | 0 | sum3 = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(tail, ps3 + offset), w0, sum3); |
689 | 0 | pw += srcC, offset += srcC; |
690 | 0 | } |
691 | 0 | _mm512_mask_storeu_ps(dst + 0 * srcC, tail, Activate<type>(sum0, params, c, tail)); |
692 | 0 | _mm512_mask_storeu_ps(dst + 1 * srcC, tail, Activate<type>(sum1, params, c, tail)); |
693 | 0 | _mm512_mask_storeu_ps(dst + 2 * srcC, tail, Activate<type>(sum2, params, c, tail)); |
694 | 0 | _mm512_mask_storeu_ps(dst + 3 * srcC, tail, Activate<type>(sum3, params, c, tail)); |
695 | 0 | } |
696 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)0>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)1>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)2>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)3>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)4>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)5>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)6>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)7>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)8>(float const*, unsigned long, unsigned long, unsigned long, 
float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)9>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main4<(SimdConvolutionActivationType)10>(float const*, unsigned long, unsigned long, unsigned long, float const*, float const*, float const*, float*) |
697 | | |
698 | | template<::SimdConvolutionActivationType type> |
699 | | SIMD_INLINE void Convolution32fNhwcDepthwise3x3Edge16(const float * src, const ConvParam & p, size_t dy, size_t dx, const __m512 * weight, __m512 bias, const float * params, float * dst) |
700 | 0 | { |
701 | 0 | __m512 sum = bias; |
702 | 0 | for (size_t ky = 0; ky < 3; ++ky) |
703 | 0 | { |
704 | 0 | size_t sy = dy * p.strideY + ky - p.padY; |
705 | 0 | if (sy < p.srcH) |
706 | 0 | { |
707 | 0 | for (size_t kx = 0; kx < 3; ++kx) |
708 | 0 | { |
709 | 0 | size_t sx = dx * p.strideX + kx - p.padX; |
710 | 0 | if (sx < p.srcW) |
711 | 0 | { |
712 | 0 | const float * ps = src + (sy*p.srcW + sx) * F; |
713 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(ps), weight[ky * 3 + kx], sum); |
714 | 0 | } |
715 | 0 | } |
716 | 0 | } |
717 | 0 | } |
718 | 0 | _mm512_storeu_ps(dst, Activate<type>(sum, params, 0)); |
719 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float 
__vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Edge16<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, unsigned long, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) |
720 | | |
721 | | template<::SimdConvolutionActivationType type> |
722 | | SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main16x1(const float * src, size_t srcS, const __m512 * weight, __m512 bias, const float * params, float * dst) |
723 | 0 | { |
724 | 0 | __m512 sum = bias; |
725 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[0], sum); |
726 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[1], sum); |
727 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[2], sum); |
728 | 0 | src += srcS; |
729 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[3], sum); |
730 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[4], sum); |
731 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[5], sum); |
732 | 0 | src += srcS; |
733 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[6], sum); |
734 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[7], sum); |
735 | 0 | sum = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[8], sum); |
736 | 0 | _mm512_storeu_ps(dst, Activate<type>(sum, params, 0)); |
737 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)0>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)1>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)2>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)3>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)4>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)5>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)6>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)7>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)8>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: 
void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)9>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x1<(SimdConvolutionActivationType)10>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) |
738 | | |
739 | | template<::SimdConvolutionActivationType type> |
740 | | SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main16x2(const float * src, size_t srcS, const __m512 * weight, __m512 bias, const float * params, float * dst) |
741 | 0 | { |
742 | 0 | __m512 sum0 = bias; |
743 | 0 | __m512 sum1 = bias; |
744 | 0 | for (size_t ky = 0; ky < 3; ++ky) |
745 | 0 | { |
746 | 0 | __m512 s0 = _mm512_loadu_ps(src + 0 * F); |
747 | 0 | __m512 s1 = _mm512_loadu_ps(src + 1 * F); |
748 | 0 | __m512 s2 = _mm512_loadu_ps(src + 2 * F); |
749 | 0 | __m512 s3 = _mm512_loadu_ps(src + 3 * F); |
750 | 0 | sum0 = _mm512_fmadd_ps(s0, weight[0], sum0); |
751 | 0 | sum1 = _mm512_fmadd_ps(s1, weight[0], sum1); |
752 | 0 | sum0 = _mm512_fmadd_ps(s1, weight[1], sum0); |
753 | 0 | sum1 = _mm512_fmadd_ps(s2, weight[1], sum1); |
754 | 0 | sum0 = _mm512_fmadd_ps(s2, weight[2], sum0); |
755 | 0 | sum1 = _mm512_fmadd_ps(s3, weight[2], sum1); |
756 | 0 | src += srcS; |
757 | 0 | weight += 3; |
758 | 0 | } |
759 | 0 | _mm512_storeu_ps(dst + 0, Activate<type>(sum0, params, 0)); |
760 | 0 | _mm512_storeu_ps(dst + F, Activate<type>(sum1, params, 0)); |
761 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)0>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)1>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)2>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)3>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)4>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)5>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)6>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)7>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)8>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: 
void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)9>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main16x2<(SimdConvolutionActivationType)10>(float const*, unsigned long, float __vector(16) const*, float __vector(16), float const*, float*) |
762 | | |
763 | | template<::SimdConvolutionActivationType type> |
764 | | SIMD_INLINE void Convolution32fNhwcDepthwise3x3Main48(const float * src, size_t srcS, const __m512 * weight, const float * bias, const float * params, float * dst) |
765 | 0 | { |
766 | 0 | __m512 sum0, sum1, sum2; |
767 | 0 | if (bias) |
768 | 0 | { |
769 | 0 | sum0 = _mm512_loadu_ps(bias + 0 * F); |
770 | 0 | sum1 = _mm512_loadu_ps(bias + 1 * F); |
771 | 0 | sum2 = _mm512_loadu_ps(bias + 2 * F); |
772 | 0 | } |
773 | 0 | else |
774 | 0 | { |
775 | 0 | sum0 = _mm512_setzero_ps(); |
776 | 0 | sum1 = _mm512_setzero_ps(); |
777 | 0 | sum2 = _mm512_setzero_ps(); |
778 | 0 | } |
779 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[0], sum0); |
780 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[1], sum1); |
781 | 0 | sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[2], sum2); |
782 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 3 * F), weight[3], sum0); |
783 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 4 * F), weight[4], sum1); |
784 | 0 | sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 5 * F), weight[5], sum2); |
785 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 6 * F), weight[6], sum0); |
786 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 7 * F), weight[7], sum1); |
787 | 0 | sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 8 * F), weight[8], sum2); |
788 | 0 | src += srcS; |
789 | 0 | weight += 9; |
790 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[0], sum0); |
791 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[1], sum1); |
792 | 0 | sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[2], sum2); |
793 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 3 * F), weight[3], sum0); |
794 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 4 * F), weight[4], sum1); |
795 | 0 | sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 5 * F), weight[5], sum2); |
796 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 6 * F), weight[6], sum0); |
797 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 7 * F), weight[7], sum1); |
798 | 0 | sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 8 * F), weight[8], sum2); |
799 | 0 | src += srcS; |
800 | 0 | weight += 9; |
801 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 0 * F), weight[0], sum0); |
802 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 1 * F), weight[1], sum1); |
803 | 0 | sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 2 * F), weight[2], sum2); |
804 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 3 * F), weight[3], sum0); |
805 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 4 * F), weight[4], sum1); |
806 | 0 | sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 5 * F), weight[5], sum2); |
807 | 0 | sum0 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 6 * F), weight[6], sum0); |
808 | 0 | sum1 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 7 * F), weight[7], sum1); |
809 | 0 | sum2 = _mm512_fmadd_ps(_mm512_loadu_ps(src + 8 * F), weight[8], sum2); |
810 | 0 | _mm512_storeu_ps(dst + 0 * F, Activate<type>(sum0, params, 0 * F)); |
811 | 0 | _mm512_storeu_ps(dst + 1 * F, Activate<type>(sum1, params, 1 * F)); |
812 | 0 | _mm512_storeu_ps(dst + 2 * F, Activate<type>(sum2, params, 2 * F)); |
813 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)0>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)1>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)2>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)3>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)4>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)5>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)6>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)7>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)8>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*) Unexecuted instantiation: void 
Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)9>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3Main48<(SimdConvolutionActivationType)10>(float const*, unsigned long, float __vector(16) const*, float const*, float const*, float*) |
814 | | |
815 | | template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwise3x3(const float * src, const ConvParam & p, const float * weight, const float * bias, const float * params, float * dst) |
816 | 0 | { |
817 | 0 | size_t srcS = p.srcC*p.srcW; |
818 | 0 | size_t srcX = p.srcC*p.strideX; |
819 | 0 | size_t dstH = p.dstH - p.padH; |
820 | 0 | size_t dstW = p.dstW - p.padW; |
821 | 0 | size_t dstW2 = AlignLo(dstW - p.padX, 2) + p.padX; |
822 | 0 | size_t dstW4 = AlignLo(dstW - p.padX, 4) + p.padX; |
823 | 0 | if (p.dstC == F && p.strideX == 1) |
824 | 0 | { |
825 | 0 | __m512 _weight[9]; |
826 | 0 | for (size_t i = 0; i < 9; ++i) |
827 | 0 | _weight[i] = _mm512_loadu_ps(weight + i * F); |
828 | 0 | __m512 _bias = bias ? _mm512_loadu_ps(bias) : _mm512_setzero_ps(); |
829 | 0 | size_t dy = 0; |
830 | 0 | for (; dy < p.padY; ++dy) |
831 | 0 | for (size_t dx = 0; dx < p.dstW; ++dx) |
832 | 0 | Convolution32fNhwcDepthwise3x3Edge16<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F; |
833 | 0 | for (; dy < dstH; ++dy) |
834 | 0 | { |
835 | 0 | size_t dx = 0; |
836 | 0 | for (; dx < p.padX; ++dx) |
837 | 0 | Convolution32fNhwcDepthwise3x3Edge16<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F; |
838 | 0 | size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC; |
839 | 0 | for (; dx < dstW2; dx += 2) |
840 | 0 | Convolution32fNhwcDepthwise3x3Main16x2<type>(src + offset, srcS, _weight, _bias, params, dst), offset += 2*F, dst += 2*F; |
841 | 0 | for (; dx < dstW; ++dx) |
842 | 0 | Convolution32fNhwcDepthwise3x3Main16x1<type>(src + offset, srcS, _weight, _bias, params, dst), offset += F, dst += F; |
843 | 0 | for (; dx < p.dstW; ++dx) |
844 | 0 | Convolution32fNhwcDepthwise3x3Edge16<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F; |
845 | 0 | } |
846 | 0 | for (; dy < p.dstH; ++dy) |
847 | 0 | for (size_t dx = 0; dx < p.dstW; ++dx) |
848 | 0 | Convolution32fNhwcDepthwise3x3Edge16<type>(src, p, dy, dx, _weight, _bias, params, dst), dst += F; |
849 | 0 | } |
850 | 0 | else |
851 | 0 | { |
852 | 0 | size_t dy = 0; |
853 | 0 | for (; dy < p.padY; ++dy) |
854 | 0 | for (size_t dx = 0; dx < p.dstW; ++dx) |
855 | 0 | Convolution32fNhwcDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; |
856 | 0 | for (; dy < dstH; ++dy) |
857 | 0 | { |
858 | 0 | size_t dx = 0; |
859 | 0 | for (; dx < p.padX; ++dx) |
860 | 0 | Convolution32fNhwcDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; |
861 | 0 | size_t offset = ((dy * p.strideY - p.padY)*p.srcW + dx * p.strideX - p.padX)*p.srcC; |
862 | 0 | if (p.srcC == 48) |
863 | 0 | { |
864 | 0 | __m512 _weight[27]; |
865 | 0 | for (size_t i = 0; i < 27; ++i) |
866 | 0 | _weight[i] = _mm512_loadu_ps(weight + i * F); |
867 | 0 | for (; dx < dstW; ++dx) |
868 | 0 | Convolution32fNhwcDepthwise3x3Main48<type>(src + offset, srcS, _weight, bias, params, dst), dst += p.dstC, offset += srcX; |
869 | 0 | } |
870 | 0 | else |
871 | 0 | for (; dx < dstW4; dx += 4) |
872 | 0 | Convolution32fNhwcDepthwise3x3Main4<type>(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 4 * p.dstC, offset += 4 * srcX; |
873 | 0 | for (; dx < dstW2; dx += 2) |
874 | 0 | Convolution32fNhwcDepthwise3x3Main2<type>(src + offset, srcS, srcX, p.srcC, weight, bias, params, dst), dst += 2 * p.dstC, offset += 2 * srcX; |
875 | 0 | for (; dx < dstW; ++dx) |
876 | 0 | Convolution32fNhwcDepthwise3x3Main1<type>(src + offset, srcS, p.srcC, weight, bias, params, dst), dst += p.dstC, offset += srcX; |
877 | 0 | for (; dx < p.dstW; ++dx) |
878 | 0 | Convolution32fNhwcDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; |
879 | 0 | } |
880 | 0 | for (; dy < p.dstH; ++dy) |
881 | 0 | for (size_t dx = 0; dx < p.dstW; ++dx) |
882 | 0 | Convolution32fNhwcDepthwise3x3Edge<type>(src, p, dy, dx, weight, bias, params, dst), dst += p.dstC; |
883 | 0 | } |
884 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float 
const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise3x3<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) |
885 | | |
886 | | //------------------------------------------------------------------------------------------------- |
887 | | |
888 | | static SIMD_INLINE bool Preferable_k7p3d1s1w4(const ConvParam& p) |
889 | 0 | { |
890 | 0 | return p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && p.srcW >= 7; |
891 | 0 | } |
892 | | |
893 | | template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwise_k7p3d1s1w4(const float* src, const ConvParam& p, const float* weight, const float* bias, const float* params, float* dst) |
894 | 0 | { |
895 | 0 | assert(p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && p.srcW >= 7); |
896 | |
|
897 | 0 | size_t dstC = p.dstC, dstW = p.dstW, srcH = p.srcH, endW = dstW - 4; |
898 | 0 | __m512 s0, s1, w0, w1, w2, w3, w4, w5, w6, d0, d1, d2, d3, _params[2]; |
899 | 0 | _params[0] = _mm512_set1_ps(params[0]); |
900 | 0 | if (type == SimdConvolutionActivationRestrictRange || |
901 | 0 | type == SimdConvolutionActivationHswish || |
902 | 0 | type == SimdConvolutionActivationHardSigmoid) |
903 | 0 | _params[1] = _mm512_set1_ps(params[1]); |
904 | 0 | for (size_t dy = 0; dy < p.dstH; ++dy) |
905 | 0 | { |
906 | 0 | for (size_t dx = 0;; dx += Min<size_t>(4, endW - dx)) |
907 | 0 | { |
908 | 0 | for (size_t dc = 0; dc < dstC; dc += F) |
909 | 0 | { |
910 | 0 | __mmask16 tail = TailMask16(dstC - dc); |
911 | 0 | if (type == SimdConvolutionActivationPrelu) |
912 | 0 | _params[0] = _mm512_maskz_loadu_ps(tail, params + dc); |
913 | 0 | d0 = bias ? _mm512_maskz_loadu_ps(tail, bias + dc) : _mm512_setzero_ps(); |
914 | 0 | d1 = d0; d2 = d0; d3 = d0; |
915 | 0 | for (size_t ky = 0; ky < 7; ++ky) |
916 | 0 | { |
917 | 0 | size_t sy = dy + ky - 3; |
918 | 0 | const float* ps = src + (sy * dstW + dx - 3) * dstC + dc; |
919 | 0 | const float* pw = weight + ky * 7 * dstC + dc; |
920 | 0 | if (sy < srcH) |
921 | 0 | { |
922 | 0 | w0 = _mm512_maskz_loadu_ps(tail, pw + 0 * dstC); |
923 | 0 | w1 = _mm512_maskz_loadu_ps(tail, pw + 1 * dstC); |
924 | 0 | w2 = _mm512_maskz_loadu_ps(tail, pw + 2 * dstC); |
925 | 0 | if (dx) |
926 | 0 | { |
927 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 0 * dstC); |
928 | 0 | d0 = _mm512_fmadd_ps(s0, w0, d0); |
929 | |
|
930 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 1 * dstC); |
931 | 0 | d0 = _mm512_fmadd_ps(s1, w1, d0); |
932 | 0 | d1 = _mm512_fmadd_ps(s1, w0, d1); |
933 | |
|
934 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 2 * dstC); |
935 | 0 | d0 = _mm512_fmadd_ps(s0, w2, d0); |
936 | 0 | d1 = _mm512_fmadd_ps(s0, w1, d1); |
937 | 0 | d2 = _mm512_fmadd_ps(s0, w0, d2); |
938 | 0 | } |
939 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 3 * dstC); |
940 | 0 | w3 = _mm512_maskz_loadu_ps(tail, pw + 3 * dstC); |
941 | 0 | d0 = _mm512_fmadd_ps(s1, w3, d0); |
942 | 0 | d1 = _mm512_fmadd_ps(s1, w2, d1); |
943 | 0 | d2 = _mm512_fmadd_ps(s1, w1, d2); |
944 | 0 | d3 = _mm512_fmadd_ps(s1, w0, d3); |
945 | |
|
946 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 4 * dstC); |
947 | 0 | w4 = _mm512_maskz_loadu_ps(tail, pw + 4 * dstC); |
948 | 0 | d0 = _mm512_fmadd_ps(s0, w4, d0); |
949 | 0 | d1 = _mm512_fmadd_ps(s0, w3, d1); |
950 | 0 | d2 = _mm512_fmadd_ps(s0, w2, d2); |
951 | 0 | d3 = _mm512_fmadd_ps(s0, w1, d3); |
952 | |
|
953 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 5 * dstC); |
954 | 0 | w5 = _mm512_maskz_loadu_ps(tail, pw + 5 * dstC); |
955 | 0 | d0 = _mm512_fmadd_ps(s1, w5, d0); |
956 | 0 | d1 = _mm512_fmadd_ps(s1, w4, d1); |
957 | 0 | d2 = _mm512_fmadd_ps(s1, w3, d2); |
958 | 0 | d3 = _mm512_fmadd_ps(s1, w2, d3); |
959 | |
|
960 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 6 * dstC); |
961 | 0 | w6 = _mm512_maskz_loadu_ps(tail, pw + 6 * dstC); |
962 | 0 | d0 = _mm512_fmadd_ps(s0, w6, d0); |
963 | 0 | d1 = _mm512_fmadd_ps(s0, w5, d1); |
964 | 0 | d2 = _mm512_fmadd_ps(s0, w4, d2); |
965 | 0 | d3 = _mm512_fmadd_ps(s0, w3, d3); |
966 | 0 | if (dx < endW) |
967 | 0 | { |
968 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 7 * dstC); |
969 | 0 | d1 = _mm512_fmadd_ps(s1, w6, d1); |
970 | 0 | d2 = _mm512_fmadd_ps(s1, w5, d2); |
971 | 0 | d3 = _mm512_fmadd_ps(s1, w4, d3); |
972 | |
|
973 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 8 * dstC); |
974 | 0 | d2 = _mm512_fmadd_ps(s0, w6, d2); |
975 | 0 | d3 = _mm512_fmadd_ps(s0, w5, d3); |
976 | |
|
977 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 9 * dstC); |
978 | 0 | d3 = _mm512_fmadd_ps(s1, w6, d3); |
979 | 0 | } |
980 | 0 | } |
981 | 0 | } |
982 | 0 | float* pd = dst + (dy * dstW + dx) * dstC + dc; |
983 | 0 | _mm512_mask_storeu_ps(pd + 0 * dstC, tail, Activate<type>(d0, _params, 0)); |
984 | 0 | _mm512_mask_storeu_ps(pd + 1 * dstC, tail, Activate<type>(d1, _params, 0)); |
985 | 0 | _mm512_mask_storeu_ps(pd + 2 * dstC, tail, Activate<type>(d2, _params, 0)); |
986 | 0 | _mm512_mask_storeu_ps(pd + 3 * dstC, tail, Activate<type>(d3, _params, 0)); |
987 | 0 | } |
988 | 0 | if (dx == endW) |
989 | 0 | break; |
990 | 0 | } |
991 | 0 | } |
992 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void 
Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w4<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) |
993 | | |
994 | | //------------------------------------------------------------------------------------------------- |
995 | | |
996 | | template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwise_k7p3d1s1w6(const float* src, const ConvParam& p, const float* weight, const float* bias, const float* params, float* dst) |
997 | 0 | { |
998 | 0 | assert(p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && AlignedAny(p.srcW, 6)); |
999 | |
|
1000 | 0 | size_t dstC = p.dstC, dstW = p.dstW, srcH = p.srcH, end = dstW - 6; |
1001 | 0 | __m512 s0, s1, w0, w1, w2, w3, w4, w5, w6, d0, d1, d2, d3, d4, d5, _params[2]; |
1002 | 0 | _params[0] = _mm512_set1_ps(params[0]); |
1003 | 0 | if (type == SimdConvolutionActivationRestrictRange || |
1004 | 0 | type == SimdConvolutionActivationHswish || |
1005 | 0 | type == SimdConvolutionActivationHardSigmoid) |
1006 | 0 | _params[1] = _mm512_set1_ps(params[1]); |
1007 | 0 | for (size_t dy = 0; dy < p.dstH; ++dy) |
1008 | 0 | { |
1009 | 0 | for (size_t dx = 0; dx < dstW; dx += 6) |
1010 | 0 | { |
1011 | 0 | for (size_t dc = 0; dc < dstC; dc += F) |
1012 | 0 | { |
1013 | 0 | __mmask16 tail = TailMask16(dstC - dc); |
1014 | 0 | if (type == SimdConvolutionActivationPrelu) |
1015 | 0 | _params[0] = _mm512_maskz_loadu_ps(tail, params + dc); |
1016 | 0 | d0 = bias ? _mm512_maskz_loadu_ps(tail, bias + dc) : _mm512_setzero_ps(); |
1017 | 0 | d1 = d0; d2 = d0; d3 = d0, d4 = d0, d5 = d0; |
1018 | 0 | for (size_t ky = 0; ky < 7; ++ky) |
1019 | 0 | { |
1020 | 0 | size_t sy = dy + ky - 3; |
1021 | 0 | const float* ps = src + (sy * dstW + dx - 3) * dstC + dc; |
1022 | 0 | const float* pw = weight + ky * 7 * dstC + dc; |
1023 | 0 | if (sy < srcH) |
1024 | 0 | { |
1025 | 0 | w0 = _mm512_maskz_loadu_ps(tail, pw + 0 * dstC); |
1026 | 0 | w1 = _mm512_maskz_loadu_ps(tail, pw + 1 * dstC); |
1027 | 0 | w2 = _mm512_maskz_loadu_ps(tail, pw + 2 * dstC); |
1028 | 0 | if (dx) |
1029 | 0 | { |
1030 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 0 * dstC); |
1031 | 0 | d0 = _mm512_fmadd_ps(s0, w0, d0); |
1032 | |
|
1033 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 1 * dstC); |
1034 | 0 | d0 = _mm512_fmadd_ps(s1, w1, d0); |
1035 | 0 | d1 = _mm512_fmadd_ps(s1, w0, d1); |
1036 | |
|
1037 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 2 * dstC); |
1038 | 0 | d0 = _mm512_fmadd_ps(s0, w2, d0); |
1039 | 0 | d1 = _mm512_fmadd_ps(s0, w1, d1); |
1040 | 0 | d2 = _mm512_fmadd_ps(s0, w0, d2); |
1041 | 0 | } |
1042 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 3 * dstC); |
1043 | 0 | w3 = _mm512_maskz_loadu_ps(tail, pw + 3 * dstC); |
1044 | 0 | d0 = _mm512_fmadd_ps(s1, w3, d0); |
1045 | 0 | d1 = _mm512_fmadd_ps(s1, w2, d1); |
1046 | 0 | d2 = _mm512_fmadd_ps(s1, w1, d2); |
1047 | 0 | d3 = _mm512_fmadd_ps(s1, w0, d3); |
1048 | |
|
1049 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 4 * dstC); |
1050 | 0 | w4 = _mm512_maskz_loadu_ps(tail, pw + 4 * dstC); |
1051 | 0 | d0 = _mm512_fmadd_ps(s0, w4, d0); |
1052 | 0 | d1 = _mm512_fmadd_ps(s0, w3, d1); |
1053 | 0 | d2 = _mm512_fmadd_ps(s0, w2, d2); |
1054 | 0 | d3 = _mm512_fmadd_ps(s0, w1, d3); |
1055 | 0 | d4 = _mm512_fmadd_ps(s0, w0, d4); |
1056 | |
|
1057 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 5 * dstC); |
1058 | 0 | w5 = _mm512_maskz_loadu_ps(tail, pw + 5 * dstC); |
1059 | 0 | d0 = _mm512_fmadd_ps(s1, w5, d0); |
1060 | 0 | d1 = _mm512_fmadd_ps(s1, w4, d1); |
1061 | 0 | d2 = _mm512_fmadd_ps(s1, w3, d2); |
1062 | 0 | d3 = _mm512_fmadd_ps(s1, w2, d3); |
1063 | 0 | d4 = _mm512_fmadd_ps(s1, w1, d4); |
1064 | 0 | d5 = _mm512_fmadd_ps(s1, w0, d5); |
1065 | |
|
1066 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 6 * dstC); |
1067 | 0 | w6 = _mm512_maskz_loadu_ps(tail, pw + 6 * dstC); |
1068 | 0 | d0 = _mm512_fmadd_ps(s0, w6, d0); |
1069 | 0 | d1 = _mm512_fmadd_ps(s0, w5, d1); |
1070 | 0 | d2 = _mm512_fmadd_ps(s0, w4, d2); |
1071 | 0 | d3 = _mm512_fmadd_ps(s0, w3, d3); |
1072 | 0 | d4 = _mm512_fmadd_ps(s0, w2, d4); |
1073 | 0 | d5 = _mm512_fmadd_ps(s0, w1, d5); |
1074 | |
|
1075 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 7 * dstC); |
1076 | 0 | d1 = _mm512_fmadd_ps(s1, w6, d1); |
1077 | 0 | d2 = _mm512_fmadd_ps(s1, w5, d2); |
1078 | 0 | d3 = _mm512_fmadd_ps(s1, w4, d3); |
1079 | 0 | d4 = _mm512_fmadd_ps(s1, w3, d4); |
1080 | 0 | d5 = _mm512_fmadd_ps(s1, w2, d5); |
1081 | |
|
1082 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 8 * dstC); |
1083 | 0 | d2 = _mm512_fmadd_ps(s0, w6, d2); |
1084 | 0 | d3 = _mm512_fmadd_ps(s0, w5, d3); |
1085 | 0 | d4 = _mm512_fmadd_ps(s0, w4, d4); |
1086 | 0 | d5 = _mm512_fmadd_ps(s0, w3, d5); |
1087 | |
|
1088 | 0 | if (dx < end) |
1089 | 0 | { |
1090 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 9 * dstC); |
1091 | 0 | d3 = _mm512_fmadd_ps(s1, w6, d3); |
1092 | 0 | d4 = _mm512_fmadd_ps(s1, w5, d4); |
1093 | 0 | d5 = _mm512_fmadd_ps(s1, w4, d5); |
1094 | |
|
1095 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 10 * dstC); |
1096 | 0 | d4 = _mm512_fmadd_ps(s0, w6, d4); |
1097 | 0 | d5 = _mm512_fmadd_ps(s0, w5, d5); |
1098 | |
|
1099 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 11 * dstC); |
1100 | 0 | d5 = _mm512_fmadd_ps(s1, w6, d5); |
1101 | 0 | } |
1102 | 0 | } |
1103 | 0 | } |
1104 | 0 | float* pd = dst + (dy * dstW + dx) * dstC + dc; |
1105 | 0 | _mm512_mask_storeu_ps(pd + 0 * dstC, tail, Activate<type>(d0, _params, 0)); |
1106 | 0 | _mm512_mask_storeu_ps(pd + 1 * dstC, tail, Activate<type>(d1, _params, 0)); |
1107 | 0 | _mm512_mask_storeu_ps(pd + 2 * dstC, tail, Activate<type>(d2, _params, 0)); |
1108 | 0 | _mm512_mask_storeu_ps(pd + 3 * dstC, tail, Activate<type>(d3, _params, 0)); |
1109 | 0 | _mm512_mask_storeu_ps(pd + 4 * dstC, tail, Activate<type>(d4, _params, 0)); |
1110 | 0 | _mm512_mask_storeu_ps(pd + 5 * dstC, tail, Activate<type>(d5, _params, 0)); |
1111 | 0 | } |
1112 | 0 | } |
1113 | 0 | } |
1114 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void 
Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w6<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) |
1115 | | |
1116 | | //------------------------------------------------------------------------------------------------- |
1117 | | |
1118 | | template<::SimdConvolutionActivationType type> void Convolution32fNhwcDepthwise_k7p3d1s1w8(const float* src, const ConvParam& p, const float* weight, const float* bias, const float* params, float* dst) |
1119 | 0 | { |
1120 | 0 | assert(p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && Aligned(p.srcW, 8)); |
1121 | |
|
1122 | 0 | size_t dstC = p.dstC, dstW = p.dstW, srcH = p.srcH, end = dstW - 8; |
1123 | 0 | __m512 s0, s1, w0, w1, w2, w3, w4, w5, w6, d0, d1, d2, d3, d4, d5, d6, d7, _params[2]; |
1124 | 0 | _params[0] = _mm512_set1_ps(params[0]); |
1125 | 0 | if (type == SimdConvolutionActivationRestrictRange || |
1126 | 0 | type == SimdConvolutionActivationHswish || |
1127 | 0 | type == SimdConvolutionActivationHardSigmoid) |
1128 | 0 | _params[1] = _mm512_set1_ps(params[1]); |
1129 | 0 | for (size_t dy = 0; dy < p.dstH; ++dy) |
1130 | 0 | { |
1131 | 0 | for (size_t dx = 0; dx < dstW; dx += 8) |
1132 | 0 | { |
1133 | 0 | for (size_t dc = 0; dc < dstC; dc += F) |
1134 | 0 | { |
1135 | 0 | __mmask16 tail = TailMask16(dstC - dc); |
1136 | 0 | if (type == SimdConvolutionActivationPrelu) |
1137 | 0 | _params[0] = _mm512_maskz_loadu_ps(tail, params + dc); |
1138 | 0 | d0 = bias ? _mm512_maskz_loadu_ps(tail, bias + dc) : _mm512_setzero_ps(); |
1139 | 0 | d1 = d0; d2 = d0; d3 = d0, d4 = d0, d5 = d0, d6 = d0, d7 = d0; |
1140 | 0 | for (size_t ky = 0; ky < 7; ++ky) |
1141 | 0 | { |
1142 | 0 | size_t sy = dy + ky - 3; |
1143 | 0 | const float* ps = src + (sy * dstW + dx - 3) * dstC + dc; |
1144 | 0 | const float* pw = weight + ky * 7 * dstC + dc; |
1145 | 0 | if (sy < srcH) |
1146 | 0 | { |
1147 | 0 | w0 = _mm512_maskz_loadu_ps(tail, pw + 0 * dstC); |
1148 | 0 | w1 = _mm512_maskz_loadu_ps(tail, pw + 1 * dstC); |
1149 | 0 | w2 = _mm512_maskz_loadu_ps(tail, pw + 2 * dstC); |
1150 | 0 | if (dx) |
1151 | 0 | { |
1152 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 0 * dstC); |
1153 | 0 | d0 = _mm512_fmadd_ps(s0, w0, d0); |
1154 | |
|
1155 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 1 * dstC); |
1156 | 0 | d0 = _mm512_fmadd_ps(s1, w1, d0); |
1157 | 0 | d1 = _mm512_fmadd_ps(s1, w0, d1); |
1158 | |
|
1159 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 2 * dstC); |
1160 | 0 | d0 = _mm512_fmadd_ps(s0, w2, d0); |
1161 | 0 | d1 = _mm512_fmadd_ps(s0, w1, d1); |
1162 | 0 | d2 = _mm512_fmadd_ps(s0, w0, d2); |
1163 | 0 | } |
1164 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 3 * dstC); |
1165 | 0 | w3 = _mm512_maskz_loadu_ps(tail, pw + 3 * dstC); |
1166 | 0 | d0 = _mm512_fmadd_ps(s1, w3, d0); |
1167 | 0 | d1 = _mm512_fmadd_ps(s1, w2, d1); |
1168 | 0 | d2 = _mm512_fmadd_ps(s1, w1, d2); |
1169 | 0 | d3 = _mm512_fmadd_ps(s1, w0, d3); |
1170 | |
|
1171 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 4 * dstC); |
1172 | 0 | w4 = _mm512_maskz_loadu_ps(tail, pw + 4 * dstC); |
1173 | 0 | d0 = _mm512_fmadd_ps(s0, w4, d0); |
1174 | 0 | d1 = _mm512_fmadd_ps(s0, w3, d1); |
1175 | 0 | d2 = _mm512_fmadd_ps(s0, w2, d2); |
1176 | 0 | d3 = _mm512_fmadd_ps(s0, w1, d3); |
1177 | 0 | d4 = _mm512_fmadd_ps(s0, w0, d4); |
1178 | |
|
1179 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 5 * dstC); |
1180 | 0 | w5 = _mm512_maskz_loadu_ps(tail, pw + 5 * dstC); |
1181 | 0 | d0 = _mm512_fmadd_ps(s1, w5, d0); |
1182 | 0 | d1 = _mm512_fmadd_ps(s1, w4, d1); |
1183 | 0 | d2 = _mm512_fmadd_ps(s1, w3, d2); |
1184 | 0 | d3 = _mm512_fmadd_ps(s1, w2, d3); |
1185 | 0 | d4 = _mm512_fmadd_ps(s1, w1, d4); |
1186 | 0 | d5 = _mm512_fmadd_ps(s1, w0, d5); |
1187 | |
|
1188 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 6 * dstC); |
1189 | 0 | w6 = _mm512_maskz_loadu_ps(tail, pw + 6 * dstC); |
1190 | 0 | d0 = _mm512_fmadd_ps(s0, w6, d0); |
1191 | 0 | d1 = _mm512_fmadd_ps(s0, w5, d1); |
1192 | 0 | d2 = _mm512_fmadd_ps(s0, w4, d2); |
1193 | 0 | d3 = _mm512_fmadd_ps(s0, w3, d3); |
1194 | 0 | d4 = _mm512_fmadd_ps(s0, w2, d4); |
1195 | 0 | d5 = _mm512_fmadd_ps(s0, w1, d5); |
1196 | 0 | d6 = _mm512_fmadd_ps(s0, w0, d6); |
1197 | |
|
1198 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 7 * dstC); |
1199 | 0 | d1 = _mm512_fmadd_ps(s1, w6, d1); |
1200 | 0 | d2 = _mm512_fmadd_ps(s1, w5, d2); |
1201 | 0 | d3 = _mm512_fmadd_ps(s1, w4, d3); |
1202 | 0 | d4 = _mm512_fmadd_ps(s1, w3, d4); |
1203 | 0 | d5 = _mm512_fmadd_ps(s1, w2, d5); |
1204 | 0 | d6 = _mm512_fmadd_ps(s1, w1, d6); |
1205 | 0 | d7 = _mm512_fmadd_ps(s1, w0, d7); |
1206 | |
|
1207 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 8 * dstC); |
1208 | 0 | d2 = _mm512_fmadd_ps(s0, w6, d2); |
1209 | 0 | d3 = _mm512_fmadd_ps(s0, w5, d3); |
1210 | 0 | d4 = _mm512_fmadd_ps(s0, w4, d4); |
1211 | 0 | d5 = _mm512_fmadd_ps(s0, w3, d5); |
1212 | 0 | d6 = _mm512_fmadd_ps(s0, w2, d6); |
1213 | 0 | d7 = _mm512_fmadd_ps(s0, w1, d7); |
1214 | |
|
1215 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 9 * dstC); |
1216 | 0 | d3 = _mm512_fmadd_ps(s1, w6, d3); |
1217 | 0 | d4 = _mm512_fmadd_ps(s1, w5, d4); |
1218 | 0 | d5 = _mm512_fmadd_ps(s1, w4, d5); |
1219 | 0 | d6 = _mm512_fmadd_ps(s1, w3, d6); |
1220 | 0 | d7 = _mm512_fmadd_ps(s1, w2, d7); |
1221 | |
|
1222 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 10 * dstC); |
1223 | 0 | d4 = _mm512_fmadd_ps(s0, w6, d4); |
1224 | 0 | d5 = _mm512_fmadd_ps(s0, w5, d5); |
1225 | 0 | d6 = _mm512_fmadd_ps(s0, w4, d6); |
1226 | 0 | d7 = _mm512_fmadd_ps(s0, w3, d7); |
1227 | |
|
1228 | 0 | if (dx < end) |
1229 | 0 | { |
1230 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 11 * dstC); |
1231 | 0 | d5 = _mm512_fmadd_ps(s1, w6, d5); |
1232 | 0 | d6 = _mm512_fmadd_ps(s1, w5, d6); |
1233 | 0 | d7 = _mm512_fmadd_ps(s1, w4, d7); |
1234 | |
|
1235 | 0 | s0 = _mm512_maskz_loadu_ps(tail, ps + 12 * dstC); |
1236 | 0 | d6 = _mm512_fmadd_ps(s0, w6, d6); |
1237 | 0 | d7 = _mm512_fmadd_ps(s0, w5, d7); |
1238 | |
|
1239 | 0 | s1 = _mm512_maskz_loadu_ps(tail, ps + 13 * dstC); |
1240 | 0 | d7 = _mm512_fmadd_ps(s1, w6, d7); |
1241 | 0 | } |
1242 | 0 | } |
1243 | 0 | } |
1244 | 0 | float* pd = dst + (dy * dstW + dx) * dstC + dc; |
1245 | 0 | _mm512_mask_storeu_ps(pd + 0 * dstC, tail, Activate<type>(d0, _params, 0)); |
1246 | 0 | _mm512_mask_storeu_ps(pd + 1 * dstC, tail, Activate<type>(d1, _params, 0)); |
1247 | 0 | _mm512_mask_storeu_ps(pd + 2 * dstC, tail, Activate<type>(d2, _params, 0)); |
1248 | 0 | _mm512_mask_storeu_ps(pd + 3 * dstC, tail, Activate<type>(d3, _params, 0)); |
1249 | 0 | _mm512_mask_storeu_ps(pd + 4 * dstC, tail, Activate<type>(d4, _params, 0)); |
1250 | 0 | _mm512_mask_storeu_ps(pd + 5 * dstC, tail, Activate<type>(d5, _params, 0)); |
1251 | 0 | _mm512_mask_storeu_ps(pd + 6 * dstC, tail, Activate<type>(d6, _params, 0)); |
1252 | 0 | _mm512_mask_storeu_ps(pd + 7 * dstC, tail, Activate<type>(d7, _params, 0)); |
1253 | 0 | } |
1254 | 0 | } |
1255 | 0 | } |
1256 | 0 | } Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)0>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)1>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)2>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)3>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)4>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)5>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)6>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)7>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)8>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void 
Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)9>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void Simd::Avx512bw::Convolution32fNhwcDepthwise_k7p3d1s1w8<(SimdConvolutionActivationType)10>(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) |
1257 | | |
1258 | | //------------------------------------------------------------------------------------------------- |
1259 | | |
// Selects the depthwise convolution routine for activation `type` from the properties of `p`.
// The checks run top to bottom and the first one that holds decides the result:
//   1. p.IsKernel(7), p.IsPad(3), p.IsStride(1), p.IsDilation(1) and Aligned(p.srcW, 8)
//        -> Convolution32fNhwcDepthwise_k7p3d1s1w8<type>
//   2. the same four predicates and AlignedAny(p.srcW, 6)
//        -> Convolution32fNhwcDepthwise_k7p3d1s1w6<type>
//   3. Preferable_k7p3d1s1w4(p)
//        -> Convolution32fNhwcDepthwise_k7p3d1s1w4<type>
//   4. p.IsKernel(3) and p.IsDilation(1)
//        -> Convolution32fNhwcDepthwise3x3<type>
//   5. otherwise
//        -> Convolution32fNhwcDepthwiseDefault<type>
// Because of this ordering the w8 variant takes priority over w6 when both width checks pass.
// NOTE(review): the k7/p3/d1/s1/wN suffixes presumably mean kernel 7, pad 3, dilation 1, stride 1
// and N output columns per step - confirm against the kernel definitions.
1260 | | template <::SimdConvolutionActivationType type> SynetConvolution32fNhwcDepthwise::ConvolutionPtr Get(const ConvParam& p) |
1261 | 0 | { |
1262 | 0 | if (p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && Aligned(p.srcW, 8)) |
1263 | 0 | return Convolution32fNhwcDepthwise_k7p3d1s1w8<type>; |
1264 | 0 | else if (p.IsKernel(7) && p.IsPad(3) && p.IsStride(1) && p.IsDilation(1) && AlignedAny(p.srcW, 6)) |
1265 | 0 | return Convolution32fNhwcDepthwise_k7p3d1s1w6<type>; |
// The eligibility test for the w4 variant is delegated entirely to Preferable_k7p3d1s1w4 (defined elsewhere).
1266 | 0 | else if (Preferable_k7p3d1s1w4(p)) |
1267 | 0 | return Convolution32fNhwcDepthwise_k7p3d1s1w4<type>; |
1268 | 0 | else if (p.IsKernel(3) && p.IsDilation(1)) |
1269 | 0 | return Convolution32fNhwcDepthwise3x3<type>; |
// Fallback for every parameter combination not matched above.
1270 | 0 | else |
1271 | 0 | return Convolution32fNhwcDepthwiseDefault<type>; |
1272 | 0 | } Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)0>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)1>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)2>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)3>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)4>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)5>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)6>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)7>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)8>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)9>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float 
const*, float*) Unexecuted instantiation: void (*Simd::Avx512bw::Get<(SimdConvolutionActivationType)10>(Simd::ConvParam const&))(float const*, Simd::ConvParam const&, float const*, float const*, float const*, float*) |
1273 | | |
1274 | | //------------------------------------------------------------------------------------------------- |
1275 | | |
// Constructs the Avx512bw depthwise NHWC convolution object from the convolution parameters `p`.
// The Avx2::SynetConvolution32fNhwcDepthwise constructor runs first with the same `p`.
// _convolution is then assigned the routine returned by Get<activation>(p), but only when all of
// the following hold:
//   p.dstC > HF, p.dstC != 24, p.dstH >= p.padY + p.padH, p.dstW >= p.padX + p.padW.
// If the guard fails, or p.activation matches none of the listed cases (the switch has no
// default), _convolution is not assigned in this constructor.
// NOTE(review): presumably _convolution then keeps whatever the Avx2 base constructor selected,
// and HF is half of the AVX-512 float vector width - confirm both. The reason for excluding
// dstC == 24 is not visible in this chunk.
1276 | | SynetConvolution32fNhwcDepthwise::SynetConvolution32fNhwcDepthwise(const ConvParam& p) |
1277 | 0 | : Avx2::SynetConvolution32fNhwcDepthwise(p) |
1278 | 0 | { |
1279 | 0 | if (p.dstC > HF && p.dstC != 24 && p.dstH >= p.padY + p.padH && p.dstW >= p.padX + p.padW) |
1280 | 0 | { |
// The activation is a template argument of Get, so the runtime value p.activation is mapped
// case by case onto the matching instantiation.
1281 | 0 | switch (p.activation) |
1282 | 0 | { |
1283 | 0 | case ::SimdConvolutionActivationIdentity: _convolution = Get<::SimdConvolutionActivationIdentity>(p); break; |
1284 | 0 | case ::SimdConvolutionActivationRelu: _convolution = Get<::SimdConvolutionActivationRelu>(p); break; |
1285 | 0 | case ::SimdConvolutionActivationLeakyRelu: _convolution = Get<::SimdConvolutionActivationLeakyRelu>(p); break; |
1286 | 0 | case ::SimdConvolutionActivationRestrictRange: _convolution = Get<::SimdConvolutionActivationRestrictRange>(p); break; |
1287 | 0 | case ::SimdConvolutionActivationPrelu: _convolution = Get<::SimdConvolutionActivationPrelu>(p); break; |
1288 | 0 | case ::SimdConvolutionActivationElu: _convolution = Get<::SimdConvolutionActivationElu>(p); break; |
1289 | 0 | case ::SimdConvolutionActivationHswish: _convolution = Get<::SimdConvolutionActivationHswish>(p); break; |
1290 | 0 | case ::SimdConvolutionActivationMish: _convolution = Get<::SimdConvolutionActivationMish>(p); break; |
1291 | 0 | case ::SimdConvolutionActivationHardSigmoid: _convolution = Get<::SimdConvolutionActivationHardSigmoid>(p); break; |
1292 | 0 | case ::SimdConvolutionActivationSwish: _convolution = Get<::SimdConvolutionActivationSwish>(p); break; |
1293 | 0 | case ::SimdConvolutionActivationGelu: _convolution = Get<::SimdConvolutionActivationGelu>(p); break; |
1294 | 0 | } |
1295 | 0 | } |
1296 | 0 | } |
1297 | | } |
1298 | | #endif |
1299 | | } |