/src/Simd/src/Simd/SimdBaseSynetConvolution16b.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2024 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdSynetConvolution16b.h" |
25 | | #include "Simd/SimdSynetConvolution32f.h" |
26 | | #include "Simd/SimdSynetConvolution32fCommon.h" |
27 | | #include "Simd/SimdSynet.h" |
28 | | #include "Simd/SimdBase.h" |
29 | | #include "Simd/SimdBFloat16.h" |
30 | | #include "Simd/SimdAlignment.h" |
31 | | |
32 | | namespace Simd |
33 | | { |
34 | | #if defined(SIMD_SYNET_ENABLE) |
35 | | |
36 | | SynetConvolution16b::SynetConvolution16b(const ConvParam& p) /* Initializes _param from p and caches a few flags derived from it. */ |
37 | 0 | : _param(p) |
38 | | #if defined(SIMD_PERFORMANCE_STATISTIC) && (defined(NDEBUG) || defined(SIMD_PERF_STAT_IN_DEBUG)) |
39 | | , _perf(NULL) |
40 | | #endif |
41 | 0 | { |
42 | 0 | _src16b = p.srcT == SimdTensorData16b; /* true when the source tensor type is the 16-bit one */ |
43 | 0 | _dst16b = p.dstT == SimdTensorData16b; /* true when the destination tensor type is the 16-bit one */ |
44 | 0 | _elemS = _src16b ? 2 : 4; /* source element size: 2 for 16-bit data, otherwise 4; SynetConvolution16bGemm multiplies it into its uint8_t pointer step */ |
45 | 0 | _elemD = _dst16b ? 2 : 4; /* destination element size, same rule as _elemS */ |
46 | 0 | _is1x1 = p.Is1x1(); /* cached result of ConvParam::Is1x1() */ |
47 | 0 | } |
48 | | |
49 | | #if defined(SIMD_PERFORMANCE_STATISTIC) && (defined(NDEBUG) || defined(SIMD_PERF_STAT_IN_DEBUG)) |
50 | | Base::PerformanceMeasurer * SynetConvolution16b::Perf(const char* func) /* Returns the cached performance measurer, fetching it on the first call. */ |
51 | | { |
52 | | if (_perf == NULL) /* nothing cached yet (the constructor sets _perf to NULL) */ |
53 | | _perf = Simd::Base::PerformanceMeasurerStorage::s_storage.Get(func, Param().Info(true) + " " + Desc(), Param().Flop()); /* arguments: func, a description built from Param().Info(true) and Desc(), and Param().Flop() */ |
54 | | return _perf; /* later calls return the same pointer, whatever func they pass */ |
55 | | } |
56 | | #endif |
57 | | |
58 | | void SynetConvolution16b::SetBias(const float* bias, size_t align) /* Sizes _bias for the output channels and copies the caller's bias values into it when a bias is given. */ |
59 | 0 | { |
60 | 0 | const ConvParam& p = _param; |
61 | 0 | _bias.Resize(AlignHi(p.dstC, align), true); /* length is dstC passed through AlignHi with align; the `true` flag presumably zero-fills the array - TODO confirm against the array type */ |
62 | 0 | if (bias) /* a null bias is allowed: nothing is copied in that case */ |
63 | 0 | memcpy(_bias.data, bias, p.dstC * sizeof(float)); /* only the first dstC floats are written; the alignment tail is left as Resize produced it */ |
64 | 0 | } |
65 | | |
66 | | void SynetConvolution16b::SetParams(const float* params, size_t align) /* Fills _params with the activation parameters in the layout selected by p.activation. */ |
67 | 0 | { |
68 | 0 | const ConvParam& p = _param; |
69 | 0 | if (p.activation == SimdConvolutionActivationLeakyRelu || p.activation == SimdConvolutionActivationPrelu) /* these two activations store one value per output channel */ |
70 | 0 | _params.Resize(AlignHi(p.dstC, align), true); |
71 | 0 | else |
72 | 0 | _params.Resize(2, true); /* every other activation stores at most two scalars */ |
73 | 0 | switch (p.activation) /* NOTE(review): params is read without a null check in every case except Identity, Relu and Gelu - presumably callers guarantee it is non-null there; confirm */ |
74 | 0 | { |
75 | 0 | case SimdConvolutionActivationIdentity: /* stored pair is [-FLT_MAX, FLT_MAX] */ |
76 | 0 | _params.data[0] = -FLT_MAX; |
77 | 0 | _params.data[1] = FLT_MAX; |
78 | 0 | break; |
79 | 0 | case SimdConvolutionActivationRelu: /* stored pair is [0, FLT_MAX] */ |
80 | 0 | _params.data[0] = 0; |
81 | 0 | _params.data[1] = FLT_MAX; |
82 | 0 | break; |
83 | 0 | case SimdConvolutionActivationLeakyRelu: /* the single value params[0] is replicated to all dstC channels */ |
84 | 0 | for (size_t d = 0; d < p.dstC; ++d) |
85 | 0 | _params.data[d] = params[0]; |
86 | 0 | break; |
87 | 0 | case SimdConvolutionActivationRestrictRange: /* two scalars copied as given */ |
88 | 0 | _params.data[0] = params[0]; |
89 | 0 | _params.data[1] = params[1]; |
90 | 0 | break; |
91 | 0 | case SimdConvolutionActivationPrelu: /* one value per channel, so params must hold at least dstC floats */ |
92 | 0 | for (size_t d = 0; d < p.dstC; ++d) |
93 | 0 | _params.data[d] = params[d]; |
94 | 0 | break; |
95 | 0 | case SimdConvolutionActivationElu: /* one scalar */ |
96 | 0 | _params.data[0] = params[0]; |
97 | 0 | break; |
98 | 0 | case SimdConvolutionActivationHswish: /* two scalars */ |
99 | 0 | _params.data[0] = params[0]; |
100 | 0 | _params.data[1] = params[1]; |
101 | 0 | break; |
102 | 0 | case SimdConvolutionActivationMish: /* one scalar */ |
103 | 0 | _params.data[0] = params[0]; |
104 | 0 | break; |
105 | 0 | case SimdConvolutionActivationHardSigmoid: /* two scalars */ |
106 | 0 | _params.data[0] = params[0]; |
107 | 0 | _params.data[1] = params[1]; |
108 | 0 | break; |
109 | 0 | case SimdConvolutionActivationSwish: /* one scalar */ |
110 | 0 | _params.data[0] = params[0]; |
111 | 0 | break; |
112 | 0 | case SimdConvolutionActivationGelu: /* nothing is read from params; _params keeps what Resize produced */ |
113 | 0 | break; |
114 | 0 | default: /* activation value not handled above */ |
115 | 0 | assert(0); |
116 | 0 | } |
117 | 0 | } |
118 | | |
119 | | //------------------------------------------------------------------------------------------------- |
120 | | |
121 | | namespace Base |
122 | | { |
123 | | SynetConvolution16bGemm::SynetConvolution16bGemm(const ConvParam& p) /* Precomputes the matrix sizes, leading dimensions, per-group offsets and buffer sizes that Forward uses. */ |
124 | 0 | : SynetConvolution16b(p) |
125 | 0 | { |
126 | 0 | if (p.trans) /* in this layout Forward multiplies the image matrix (left) by the weight matrix (right) */ |
127 | 0 | { |
128 | 0 | _M = p.dstH * p.dstW; /* one row per output position */ |
129 | 0 | _N = p.dstC / p.group; /* output channels of one group */ |
130 | 0 | _K = p.srcC * p.kernelY * p.kernelX / p.group; /* kernel window elements of one group */ |
131 | 0 | _ldS = _K; |
132 | 0 | _ldW = p.dstC; /* a weight row spans the channels of all groups, so the row stride is dstC */ |
133 | 0 | _ldD = p.dstC; /* same for an output row */ |
134 | 0 | _grW = _N; /* a group is therefore selected by a column offset of _N in the weights */ |
135 | 0 | _grS = _K * _M; |
136 | 0 | _grD = _N; /* and by a column offset of _N in the output */ |
137 | 0 | _weight.Resize(_K * _N * p.group); |
138 | 0 | } |
139 | 0 | else /* in this layout Forward multiplies the weight matrix (left) by the image matrix (right) */ |
140 | 0 | { |
141 | 0 | _M = p.dstC / p.group; /* output channels of one group */ |
142 | 0 | _N = p.dstH * p.dstW; /* one column per output position */ |
143 | 0 | _K = p.srcC * p.kernelY * p.kernelX / p.group; |
144 | 0 | _ldW = _K; |
145 | 0 | _ldS = _N; |
146 | 0 | _ldD = _N; |
147 | 0 | _grW = _M * _K; /* each group owns a contiguous block of weights, */ |
148 | 0 | _grS = _K * _N; /* of image-matrix rows, */ |
149 | 0 | _grD = _M * _N; /* and of output rows */ |
150 | 0 | _weight.Resize(_K * _M * p.group); |
151 | 0 | } |
152 | 0 | _batch = p.batch; |
153 | 0 | _sizeS = p.srcC * p.srcH * p.srcW; /* element count of one input image */ |
154 | 0 | _sizeB = p.srcC * p.kernelY * p.kernelX * p.dstH * p.dstW; /* element count of the buffer filled by ImgToCol or ImgToRow */ |
155 | 0 | _sizeD = p.dstC * p.dstH * p.dstW; /* element count of one output image */ |
156 | 0 | _stepS = _sizeS * _elemS; /* amount added to the uint8_t source pointer per batch item in Forward */ |
157 | 0 | _stepD = _sizeD * _elemD; /* amount added to the uint8_t destination pointer per batch item in Forward */ |
158 | 0 | } |
159 | | |
160 | | size_t SynetConvolution16bGemm::ExternalBufferSize() const /* Returns the byte total of the temporary arrays that Forward allocates from its buffer. */ |
161 | 0 | { |
162 | 0 | size_t size = 0; /* NOTE(review): no alignment padding is added here; whether Allocate rounds its sub-buffers is not visible in this file - confirm */ |
163 | 0 | if (!_src16b) /* a non-16-bit source is converted into a 16-bit staging array */ |
164 | 0 | size += _sizeS * sizeof(uint16_t); |
165 | 0 | if (!_is1x1) /* ImgToCol and ImgToRow output; skipped for 1x1 where Forward reads the source directly */ |
166 | 0 | size += _sizeB * sizeof(uint16_t); |
167 | 0 | if (_dst16b) /* float array that holds the result before it is converted to 16-bit output */ |
168 | 0 | size += _sizeD * sizeof(float); |
169 | 0 | return size; |
170 | 0 | } |
171 | | |
172 | | void SynetConvolution16bGemm::SetParams(const float* weight, const float* bias, const float* params) /* Converts the float weights to 16-bit storage and stores bias and activation parameters. */ |
173 | 0 | { |
174 | 0 | const ConvParam& p = _param; /* NOTE(review): p is not used anywhere in this function */ |
175 | 0 | Float32ToBFloat16(weight, _weight.size, _weight.data); /* converts _weight.size values; the array was sized in the constructor */ |
176 | 0 | SynetConvolution16b::SetBias(bias, Alignment()); |
177 | 0 | SynetConvolution16b::SetParams(params, Alignment()); /* qualified call to the two-argument base version, not a recursive call */ |
178 | 0 | } |
179 | | |
180 | | void SynetConvolution16bGemm::Forward(const uint8_t* src, uint8_t* buf, uint8_t* dst) /* Runs the convolution for every batch item: optional conversion to 16-bit, optional image unrolling, one matrix product per group, bias and activation, optional conversion of the output. */ |
181 | 0 | { |
182 | 0 | const ConvParam& p = _param; |
183 | 0 | buf = Buffer(buf); /* Buffer is defined elsewhere; presumably it resolves which working buffer to use - TODO confirm */ |
184 | 0 | uint16_t* bufS = _src16b ? NULL : Allocate<uint16_t>(buf, _sizeS); /* staging array for the converted source, needed only when the source is not 16-bit */ |
185 | 0 | uint16_t* bufB = _is1x1 ? NULL : Allocate<uint16_t>(buf, _sizeB); /* unrolled image matrix, not needed for 1x1 */ |
186 | 0 | float* bufD = _dst16b ? Allocate<float>(buf, _sizeD) : NULL; /* float result array, needed only when the output is 16-bit */ |
187 | 0 | const uint16_t* wgt = _weight.data; |
188 | 0 | for (size_t b = 0; b < _batch; ++b) |
189 | 0 | { |
190 | 0 | const uint16_t* src16b = _src16b ? (uint16_t*)src : bufS; /* 16-bit view of the current input */ |
191 | 0 | const uint16_t* buf16b = _is1x1 ? src16b : bufB; /* image-side matrix operand: the input itself for 1x1, otherwise the unrolled buffer */ |
192 | 0 | float* dst32f = _dst16b ? bufD : (float*)dst; /* the product is always computed in float */ |
193 | 0 | if (!_src16b) |
194 | 0 | Float32ToBFloat16((float*)src, _sizeS, bufS); /* non-16-bit source is treated as float and converted */ |
195 | 0 | if (_param.trans) |
196 | 0 | { |
197 | 0 | if(!_is1x1) |
198 | 0 | ImgToRow(src16b, bufB); |
199 | 0 | for (size_t g = 0; g < p.group; ++g) /* image matrix is the left operand, weights the right */ |
200 | 0 | GemmNN(_M, _N, _K, buf16b + _grS * g, _ldS, wgt + _grW * g, _ldW, dst32f + _grD * g, _ldD); |
201 | 0 | } |
202 | 0 | else |
203 | 0 | { |
204 | 0 | if (!_is1x1) |
205 | 0 | ImgToCol(src16b, bufB); |
206 | 0 | for (size_t g = 0; g < p.group; ++g) /* weights are the left operand, image matrix the right */ |
207 | 0 | GemmNN(_M, _N, _K, wgt + _grW * g, _ldW, buf16b + _grS * g, _ldS, dst32f + _grD * g, _ldD); |
208 | 0 | } |
209 | 0 | ConvolutionBiasAndActivation(_bias.data, p.dstC, p.dstH * p.dstW, p.activation, _params.data, p.trans, dst32f); /* applied to the float result, before any conversion of the output */ |
210 | 0 | if(_dst16b) |
211 | 0 | Float32ToBFloat16(bufD, _sizeD, (uint16_t*)dst); |
212 | 0 | src += _stepS; /* move both pointers to the next batch item */ |
213 | 0 | dst += _stepD; |
214 | 0 | } |
215 | 0 | } |
216 | | |
217 | | void SynetConvolution16bGemm::ImgToCol(const uint16_t* src, uint16_t* dst) /* Unrolls a channel-planar image into dst, ordered as channel, kernel row, kernel column, output row, output column; positions outside the image are written as 0. */ |
218 | 0 | { |
219 | 0 | const ConvParam& p = _param; |
220 | 0 | assert(!p.trans); |
221 | 0 | size_t srcSize = p.srcW * p.srcH; /* elements in one channel plane */ |
222 | 0 | for (size_t c = 0; c < p.srcC; ++c) |
223 | 0 | { |
224 | 0 | for (size_t ky = 0; ky < p.kernelY; ky++) |
225 | 0 | { |
226 | 0 | for (size_t kx = 0; kx < p.kernelX; kx++) |
227 | 0 | { |
228 | 0 | size_t sy = ky * p.dilationY - p.padY; /* source row for output row 0; a position above the image wraps to a huge unsigned value */ |
229 | 0 | for (size_t dy = 0; dy < p.dstH; ++dy) |
230 | 0 | { |
231 | 0 | if (sy < p.srcH) /* because sy is unsigned, this one compare rejects rows both above and below the image */ |
232 | 0 | { |
233 | 0 | size_t sx = kx * p.dilationX - p.padX; /* source column for output column 0, same wrap-around convention */ |
234 | 0 | for (size_t dx = 0; dx < p.dstW; ++dx) |
235 | 0 | { |
236 | 0 | if (sx < p.srcW) |
237 | 0 | *(dst++) = src[sy * p.srcW + sx]; |
238 | 0 | else |
239 | 0 | *(dst++) = 0; /* horizontal padding */ |
240 | 0 | sx += p.strideX; |
241 | 0 | } |
242 | 0 | } |
243 | 0 | else |
244 | 0 | { |
245 | 0 | for (size_t dx = 0; dx < p.dstW; ++dx) /* whole output row falls in vertical padding */ |
246 | 0 | *(dst++) = 0; |
247 | 0 | } |
248 | 0 | sy += p.strideY; |
249 | 0 | } |
250 | 0 | } |
251 | 0 | } |
252 | 0 | src += srcSize; /* next channel plane */ |
253 | 0 | } |
254 | 0 | } |
255 | | |
256 | | void SynetConvolution16bGemm::ImgToRow(const uint16_t* src, uint16_t* dst) /* Unrolls a channel-interleaved image into dst, ordered as group, output row, output column, kernel row, kernel column, channels of the group; positions outside the image are zero-filled. */ |
257 | 0 | { |
258 | 0 | const ConvParam& p = _param; |
259 | 0 | assert(p.trans); |
260 | 0 | size_t size = p.srcC / p.group; /* channels belonging to one group */ |
261 | 0 | for (size_t g = 0; g < p.group; ++g) |
262 | 0 | { |
263 | 0 | for (size_t dy = 0; dy < p.dstH; ++dy) |
264 | 0 | { |
265 | 0 | for (size_t dx = 0; dx < p.dstW; ++dx) |
266 | 0 | { |
267 | 0 | for (size_t ky = 0; ky < p.kernelY; ky++) |
268 | 0 | { |
269 | 0 | size_t sy = dy * p.strideY + ky * p.dilationY - p.padY; /* unsigned: a row above the image wraps to a huge value */ |
270 | 0 | if (sy < p.srcH) /* so one compare covers both the top and bottom edge */ |
271 | 0 | { |
272 | 0 | for (size_t kx = 0; kx < p.kernelX; kx++) |
273 | 0 | { |
274 | 0 | size_t sx = dx * p.strideX + kx * p.dilationX - p.padX; /* same wrap-around convention for columns */ |
275 | 0 | if (sx < p.srcW) |
276 | 0 | { |
277 | 0 | memcpy(dst, src + (sy * p.srcW + sx) * p.srcC, size * sizeof(uint16_t)); /* each pixel holds srcC consecutive channels; src already points at this group's first channel */ |
278 | 0 | dst += size; |
279 | 0 | } |
280 | 0 | else |
281 | 0 | { |
282 | 0 | memset(dst, 0, size * sizeof(uint16_t)); /* horizontal padding */ |
283 | 0 | dst += size; |
284 | 0 | } |
285 | 0 | } |
286 | 0 | } |
287 | 0 | else |
288 | 0 | { |
289 | 0 | memset(dst, 0, p.kernelX * size * sizeof(uint16_t)); /* the whole kernel row falls in vertical padding */ |
290 | 0 | dst += p.kernelX * size; |
291 | 0 | } |
292 | 0 | } |
293 | 0 | } |
294 | 0 | } |
295 | 0 | src += size; /* shift to the first channel of the next group */ |
296 | 0 | } |
297 | 0 | } |
298 | | |
299 | | void SynetConvolution16bGemm::GemmNN(size_t M, size_t N, size_t K, const uint16_t* A, size_t lda, const uint16_t* B, size_t ldb, float* C, size_t ldc) /* Reference matrix product C = A * B: A is M x K and B is K x N in 16-bit storage, C is M x N float; lda, ldb and ldc are the row strides in elements. */ |
300 | 0 | { |
301 | 0 | for (size_t i = 0; i < M; ++i) |
302 | 0 | { |
303 | 0 | float* pC = C + i * ldc; |
304 | 0 | for (size_t j = 0; j < N; ++j) /* C is overwritten: the row is cleared before accumulation, prior contents are not added to */ |
305 | 0 | pC[j] = 0.0f; |
306 | 0 | for (size_t k = 0; k < K; ++k) |
307 | 0 | { |
308 | 0 | const uint16_t* pB = B + k * ldb; |
309 | 0 | float a = BFloat16ToFloat32(A[i * lda + k]); /* converted once per (i, k) and reused across the whole row of B */ |
310 | 0 | for (size_t j = 0; j < N; ++j) |
311 | 0 | pC[j] += a * BFloat16ToFloat32(pB[j]); /* accumulation is done in float */ |
312 | 0 | } |
313 | 0 | } |
314 | 0 | } |
315 | | |
316 | | //------------------------------------------------------------------------------------------------- |
317 | | |
318 | | void * SynetConvolution16bInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility) /* Creates a convolution object for the given parameters, or returns NULL when they are not valid. */ |
319 | 0 | { |
320 | 0 | ConvParam param(batch, conv, compatibility); |
321 | 0 | if (!param.Valid(SimdTensorData32f, SimdTensorData16b)) /* validated against the 32f and 16b tensor types */ |
322 | 0 | return NULL; |
323 | 0 | if (Base::SynetConvolution16bNhwcDepthwise::Preferable(param)) /* the depthwise implementation is chosen when it reports itself preferable */ |
324 | 0 | return new Base::SynetConvolution16bNhwcDepthwise(param); |
325 | 0 | return new SynetConvolution16bGemm(param); /* otherwise the GEMM implementation; the object is heap-allocated and presumably released by the caller - confirm */ |
326 | 0 | } |
327 | | } |
328 | | #endif |
329 | | } |