Coverage Report

Created: 2025-12-31 07:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdBaseSynetConvolution16b.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2024 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdSynetConvolution16b.h"
25
#include "Simd/SimdSynetConvolution32f.h"
26
#include "Simd/SimdSynetConvolution32fCommon.h"
27
#include "Simd/SimdSynet.h"
28
#include "Simd/SimdBase.h"
29
#include "Simd/SimdBFloat16.h"
30
#include "Simd/SimdAlignment.h"
31
32
namespace Simd
33
{
34
#if defined(SIMD_SYNET_ENABLE)
35
36
    // Captures the convolution parameters and derives the flags shared by all
    // 16-bit convolution implementations: whether source / destination tensors
    // are 16-bit, the per-element size selected from that (2 or 4), and whether
    // the parameters describe a 1x1 convolution.
    SynetConvolution16b::SynetConvolution16b(const ConvParam& p)
        : _param(p)
#if defined(SIMD_PERFORMANCE_STATISTIC) && (defined(NDEBUG) || defined(SIMD_PERF_STAT_IN_DEBUG))
        , _perf(NULL)
#endif
    {
        _is1x1 = p.Is1x1();

        _src16b = (p.srcT == SimdTensorData16b);
        if (_src16b)
            _elemS = 2;
        else
            _elemS = 4;

        _dst16b = (p.dstT == SimdTensorData16b);
        if (_dst16b)
            _elemD = 2;
        else
            _elemD = 4;
    }
48
49
#if defined(SIMD_PERFORMANCE_STATISTIC) && (defined(NDEBUG) || defined(SIMD_PERF_STAT_IN_DEBUG))
50
    // Returns the performance measurer bound to this convolution. It is looked up
    // in the global storage on the first call (keyed by func, the parameter info
    // string and Desc()) and the cached pointer is returned on later calls.
    Base::PerformanceMeasurer * SynetConvolution16b::Perf(const char* func)
    {
        if (_perf != NULL)
            return _perf;
        _perf = Simd::Base::PerformanceMeasurerStorage::s_storage.Get(func, Param().Info(true) + " " + Desc(), Param().Flop());
        return _perf;
    }
56
#endif
57
58
    void SynetConvolution16b::SetBias(const float* bias, size_t align)
59
0
    {
60
0
        const ConvParam& p = _param;
61
0
        _bias.Resize(AlignHi(p.dstC, align), true);
62
0
        if (bias)
63
0
            memcpy(_bias.data, bias, p.dstC * sizeof(float));
64
0
    }
65
66
    void SynetConvolution16b::SetParams(const float* params, size_t align)
67
0
    {
68
0
        const ConvParam& p = _param;
69
0
        if (p.activation == SimdConvolutionActivationLeakyRelu || p.activation == SimdConvolutionActivationPrelu)
70
0
            _params.Resize(AlignHi(p.dstC, align), true);
71
0
        else
72
0
            _params.Resize(2, true);
73
0
        switch (p.activation)
74
0
        {
75
0
        case SimdConvolutionActivationIdentity:
76
0
            _params.data[0] = -FLT_MAX;
77
0
            _params.data[1] = FLT_MAX;
78
0
            break;
79
0
        case SimdConvolutionActivationRelu:
80
0
            _params.data[0] = 0;
81
0
            _params.data[1] = FLT_MAX;
82
0
            break;
83
0
        case SimdConvolutionActivationLeakyRelu:
84
0
            for (size_t d = 0; d < p.dstC; ++d)
85
0
                _params.data[d] = params[0];
86
0
            break;
87
0
        case SimdConvolutionActivationRestrictRange:
88
0
            _params.data[0] = params[0];
89
0
            _params.data[1] = params[1];
90
0
            break;
91
0
        case SimdConvolutionActivationPrelu:
92
0
            for (size_t d = 0; d < p.dstC; ++d)
93
0
                _params.data[d] = params[d];
94
0
            break;
95
0
        case SimdConvolutionActivationElu:
96
0
            _params.data[0] = params[0];
97
0
            break;
98
0
        case SimdConvolutionActivationHswish:
99
0
            _params.data[0] = params[0];
100
0
            _params.data[1] = params[1];
101
0
            break;
102
0
        case SimdConvolutionActivationMish:
103
0
            _params.data[0] = params[0];
104
0
            break;
105
0
        case SimdConvolutionActivationHardSigmoid:
106
0
            _params.data[0] = params[0];
107
0
            _params.data[1] = params[1];
108
0
            break;
109
0
        case SimdConvolutionActivationSwish:
110
0
            _params.data[0] = params[0];
111
0
            break;
112
0
        case SimdConvolutionActivationGelu:
113
0
            break;
114
0
        default:
115
0
            assert(0);
116
0
        }
117
0
    }
118
119
    //-------------------------------------------------------------------------------------------------
120
121
    namespace Base
122
    {
123
        // Precomputes the GEMM geometry used by Forward().
        //
        // For every group one matrix product is performed:
        //   trans  (channels-last):  dst[M x N] = src[M x K] * weight[K x N],
        //                            M = dstH * dstW, N = dstC / group;
        //   !trans (channels-first): dst[M x N] = weight[M x K] * src[K x N],
        //                            M = dstC / group, N = dstH * dstW;
        // with K = srcC * kernelY * kernelX / group in both cases.
        // _ldX are the row strides and _grX the per-group offsets (in elements) of
        // the source (S), weight (W) and destination (D) matrices.
        SynetConvolution16bGemm::SynetConvolution16bGemm(const ConvParam& p)
            : SynetConvolution16b(p)
        {
            if (p.trans)
            {
                _M = p.dstH * p.dstW;
                _N = p.dstC / p.group;
                _K = p.srcC * p.kernelY * p.kernelX / p.group;
                // For a 1x1 convolution Forward() skips ImgToRow() and multiplies the
                // source tensor in place. There every pixel is a row of srcC values and
                // group g starts at channel g * _K inside that row, so the row stride is
                // srcC and the group step is _K. Otherwise the source matrix is the
                // ImgToRow() buffer, which stores one dense [_M x _K] matrix per group.
                // (With group == 1 both variants give the same addressing.)
                _ldS = _is1x1 ? p.srcC : _K;
                _grS = _is1x1 ? _K : _K * _M;
                _ldW = p.dstC;
                _ldD = p.dstC;
                _grW = _N;
                _grD = _N;
                _weight.Resize(_K * _N * p.group);
            }
            else
            {
                _M = p.dstC / p.group;
                _N = p.dstH * p.dstW;
                _K = p.srcC * p.kernelY * p.kernelX / p.group;
                _ldW = _K;
                _ldS = _N;
                _ldD = _N;
                _grW = _M * _K;
                _grS = _K * _N;
                _grD = _M * _N;
                _weight.Resize(_K * _M * p.group);
            }
            _batch = p.batch;
            // Element counts of one image of source, im2col buffer and destination.
            _sizeS = p.srcC * p.srcH * p.srcW;
            _sizeB = p.srcC * p.kernelY * p.kernelX * p.dstH * p.dstW;
            _sizeD = p.dstC * p.dstH * p.dstW;
            // Distance between consecutive images of the batch, scaled by the element size.
            _stepS = _sizeS * _elemS;
            _stepD = _sizeD * _elemD;
        }
159
160
        size_t SynetConvolution16bGemm::ExternalBufferSize() const
161
0
        {
162
0
            size_t size = 0;
163
0
            if (!_src16b)
164
0
                size += _sizeS * sizeof(uint16_t);
165
0
            if (!_is1x1)
166
0
                size += _sizeB * sizeof(uint16_t);
167
0
            if (_dst16b)
168
0
                size += _sizeD * sizeof(float);
169
0
            return size;
170
0
        }
171
172
        void SynetConvolution16bGemm::SetParams(const float* weight, const float* bias, const float* params)
173
0
        {
174
0
            const ConvParam& p = _param;
175
0
            Float32ToBFloat16(weight, _weight.size, _weight.data);
176
0
            SynetConvolution16b::SetBias(bias, Alignment());
177
0
            SynetConvolution16b::SetParams(params, Alignment());
178
0
        }
179
180
        void SynetConvolution16bGemm::Forward(const uint8_t* src, uint8_t* buf, uint8_t* dst)
181
0
        {
182
0
            const ConvParam& p = _param;
183
0
            buf = Buffer(buf);
184
0
            uint16_t* bufS = _src16b ? NULL : Allocate<uint16_t>(buf, _sizeS);
185
0
            uint16_t* bufB = _is1x1 ? NULL : Allocate<uint16_t>(buf, _sizeB);
186
0
            float* bufD = _dst16b ? Allocate<float>(buf, _sizeD) : NULL;
187
0
            const uint16_t* wgt = _weight.data;
188
0
            for (size_t b = 0; b < _batch; ++b)
189
0
            {
190
0
                const uint16_t* src16b = _src16b ? (uint16_t*)src : bufS;
191
0
                const uint16_t* buf16b = _is1x1 ? src16b : bufB;
192
0
                float* dst32f = _dst16b ? bufD : (float*)dst;
193
0
                if (!_src16b)
194
0
                    Float32ToBFloat16((float*)src, _sizeS, bufS);
195
0
                if (_param.trans)
196
0
                {
197
0
                    if(!_is1x1)
198
0
                        ImgToRow(src16b, bufB);
199
0
                    for (size_t g = 0; g < p.group; ++g)
200
0
                        GemmNN(_M, _N, _K, buf16b + _grS * g, _ldS, wgt + _grW * g, _ldW, dst32f + _grD * g, _ldD);
201
0
                }
202
0
                else
203
0
                {
204
0
                    if (!_is1x1)
205
0
                        ImgToCol(src16b, bufB);
206
0
                    for (size_t g = 0; g < p.group; ++g)
207
0
                        GemmNN(_M, _N, _K, wgt + _grW * g, _ldW, buf16b + _grS * g, _ldS, dst32f + _grD * g, _ldD);
208
0
                }
209
0
                ConvolutionBiasAndActivation(_bias.data, p.dstC, p.dstH * p.dstW, p.activation, _params.data, p.trans, dst32f);
210
0
                if(_dst16b)
211
0
                    Float32ToBFloat16(bufD, _sizeD, (uint16_t*)dst);
212
0
                src += _stepS;
213
0
                dst += _stepD;
214
0
            }
215
0
        }
216
217
        void SynetConvolution16bGemm::ImgToCol(const uint16_t* src, uint16_t* dst)
218
0
        {
219
0
            const ConvParam& p = _param;
220
0
            assert(!p.trans);
221
0
            size_t srcSize = p.srcW * p.srcH;
222
0
            for (size_t c = 0; c < p.srcC; ++c)
223
0
            {
224
0
                for (size_t ky = 0; ky < p.kernelY; ky++)
225
0
                {
226
0
                    for (size_t kx = 0; kx < p.kernelX; kx++)
227
0
                    {
228
0
                        size_t sy = ky * p.dilationY - p.padY;
229
0
                        for (size_t dy = 0; dy < p.dstH; ++dy)
230
0
                        {
231
0
                            if (sy < p.srcH)
232
0
                            {
233
0
                                size_t sx = kx * p.dilationX - p.padX;
234
0
                                for (size_t dx = 0; dx < p.dstW; ++dx)
235
0
                                {
236
0
                                    if (sx < p.srcW)
237
0
                                        *(dst++) = src[sy * p.srcW + sx];
238
0
                                    else
239
0
                                        *(dst++) = 0;
240
0
                                    sx += p.strideX;
241
0
                                }
242
0
                            }
243
0
                            else
244
0
                            {
245
0
                                for (size_t dx = 0; dx < p.dstW; ++dx)
246
0
                                    *(dst++) = 0;
247
0
                            }
248
0
                            sy += p.strideY;
249
0
                        }
250
0
                    }
251
0
                }
252
0
                src += srcSize;
253
0
            }
254
0
        }
255
256
        void SynetConvolution16bGemm::ImgToRow(const uint16_t* src, uint16_t* dst)
257
0
        {
258
0
            const ConvParam& p = _param;
259
0
            assert(p.trans);
260
0
            size_t size = p.srcC / p.group;
261
0
            for (size_t g = 0; g < p.group; ++g)
262
0
            {
263
0
                for (size_t dy = 0; dy < p.dstH; ++dy)
264
0
                {
265
0
                    for (size_t dx = 0; dx < p.dstW; ++dx)
266
0
                    {
267
0
                        for (size_t ky = 0; ky < p.kernelY; ky++)
268
0
                        {
269
0
                            size_t sy = dy * p.strideY + ky * p.dilationY - p.padY;
270
0
                            if (sy < p.srcH)
271
0
                            {
272
0
                                for (size_t kx = 0; kx < p.kernelX; kx++)
273
0
                                {
274
0
                                    size_t sx = dx * p.strideX + kx * p.dilationX - p.padX;
275
0
                                    if (sx < p.srcW)
276
0
                                    {
277
0
                                        memcpy(dst, src + (sy * p.srcW + sx) * p.srcC, size * sizeof(uint16_t));
278
0
                                        dst += size;
279
0
                                    }
280
0
                                    else
281
0
                                    {
282
0
                                        memset(dst, 0, size * sizeof(uint16_t));
283
0
                                        dst += size;
284
0
                                    }
285
0
                                }
286
0
                            }
287
0
                            else
288
0
                            {
289
0
                                memset(dst, 0, p.kernelX * size * sizeof(uint16_t));
290
0
                                dst += p.kernelX * size;
291
0
                            }
292
0
                        }
293
0
                    }
294
0
                }
295
0
                src += size;
296
0
            }
297
0
        }
298
299
        void SynetConvolution16bGemm::GemmNN(size_t M, size_t N, size_t K, const uint16_t* A, size_t lda, const uint16_t* B, size_t ldb, float* C, size_t ldc)
300
0
        {
301
0
            for (size_t i = 0; i < M; ++i)
302
0
            {
303
0
                float* pC = C + i * ldc;
304
0
                for (size_t j = 0; j < N; ++j)
305
0
                    pC[j] = 0.0f;
306
0
                for (size_t k = 0; k < K; ++k)
307
0
                {
308
0
                    const uint16_t* pB = B + k * ldb;
309
0
                    float a = BFloat16ToFloat32(A[i * lda + k]);
310
0
                    for (size_t j = 0; j < N; ++j)
311
0
                        pC[j] += a * BFloat16ToFloat32(pB[j]);
312
0
                }
313
0
            }
314
0
        }
315
316
        //-------------------------------------------------------------------------------------------------
317
318
        void * SynetConvolution16bInit(size_t batch, const SimdConvolutionParameters * conv, SimdSynetCompatibilityType compatibility)
319
0
        {
320
0
            ConvParam param(batch, conv, compatibility);
321
0
            if (!param.Valid(SimdTensorData32f, SimdTensorData16b))
322
0
                return NULL;
323
0
            if (Base::SynetConvolution16bNhwcDepthwise::Preferable(param))
324
0
                return new Base::SynetConvolution16bNhwcDepthwise(param);
325
0
            return new SynetConvolution16bGemm(param);
326
0
        }
327
    }
328
#endif
329
}