/src/Simd/src/Simd/SimdBaseSynetInnerProduct32f.cpp
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /*  | 
2  |  | * Simd Library (http://ermig1979.github.io/Simd).  | 
3  |  | *  | 
4  |  | * Copyright (c) 2011-2024 Yermalayeu Ihar.  | 
5  |  | *  | 
6  |  | * Permission is hereby granted, free of charge, to any person obtaining a copy  | 
7  |  | * of this software and associated documentation files (the "Software"), to deal  | 
8  |  | * in the Software without restriction, including without limitation the rights  | 
9  |  | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell  | 
10  |  | * copies of the Software, and to permit persons to whom the Software is  | 
11  |  | * furnished to do so, subject to the following conditions:  | 
12  |  | *  | 
13  |  | * The above copyright notice and this permission notice shall be included in  | 
14  |  | * all copies or substantial portions of the Software.  | 
15  |  | *  | 
16  |  | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR  | 
17  |  | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,  | 
18  |  | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE  | 
19  |  | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER  | 
20  |  | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,  | 
21  |  | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE  | 
22  |  | * SOFTWARE.  | 
23  |  | */  | 
24  |  | #include "Simd/SimdSynetInnerProduct32f.h"  | 
25  |  | #include "Simd/SimdSynetConvolution32f.h"  | 
26  |  | #include "Simd/SimdCpu.h"  | 
27  |  | #include "Simd/SimdBase.h"  | 
28  |  |  | 
29  |  | namespace Simd  | 
30  |  | { | 
31  |  | #if defined(SIMD_SYNET_ENABLE)  | 
32  |  |  | 
33  |  | #if defined(SIMD_PERFORMANCE_STATISTIC) && (defined(NDEBUG) || defined(SIMD_PERF_STAT_IN_DEBUG))  | 
34  |  |     Base::PerformanceMeasurer * SynetInnerProduct32f::Perf(const char* func)  | 
35  |  |     { | 
36  |  |         if (_perf == NULL)  | 
37  |  |             _perf = Simd::Base::PerformanceMeasurerStorage::s_storage.Get(func, Param().Info() + " " + Desc(), Param().Flop());  | 
38  |  |         return _perf;  | 
39  |  |     }  | 
40  |  | #endif  | 
41  |  |  | 
42  |  |     namespace Base  | 
43  |  |     { | 
// Scalar inner product (fully connected) layer for one input vector.
//   src    - input vector of 'size' values.
//   weight - 'count' consecutive rows of 'size' values each.
//   bias   - optional vector of 'count' values; may be NULL (nothing is added).
//   count  - number of outputs (rows of weight).
//   size   - length of the input vector and of every weight row.
//   dst    - receives 'count' values: dst[i] = dot(src, row i) + bias[i].
void SynetInnerProductLayerForward(const float* src, const float* weight, const float* bias, size_t count, size_t size, float* dst)
{
    // Largest multiple of 4 that does not exceed size: the part of every row
    // processed by the 4-way unrolled loop.
    const size_t size4 = size & ~size_t(3);
    const float* row = weight;
    for (size_t o = 0; o < count; ++o, row += size)
    {
        // Four independent partial sums; the order of additions is kept fixed
        // so that the floating point result does not change.
        float s0 = 0, s1 = 0, s2 = 0, s3 = 0;
        size_t k = 0;
        while (k < size4)
        {
            s0 += src[k + 0] * row[k + 0];
            s1 += src[k + 1] * row[k + 1];
            s2 += src[k + 2] * row[k + 2];
            s3 += src[k + 3] * row[k + 3];
            k += 4;
        }
        // Remaining (size % 4) elements go into the first partial sum.
        while (k < size)
        {
            s0 += src[k] * row[k];
            ++k;
        }
        dst[o] = s0 + s1 + s2 + s3 + (bias ? bias[o] : 0);
    }
}
64  |  |  | 
65  |  |         //---------------------------------------------------------------------  | 
66  |  |  | 
67  |  |         SynetInnerProduct32fGemm::SynetInnerProduct32fGemm(const InnerProductParam32f & p)  | 
68  |  |             : SynetInnerProduct32f(p)  | 
69  |  |             , _0(0.0f)  | 
70  |  |             , _1(1.0f)  | 
71  | 0  |         { | 
72  | 0  |             _M = _param.batch;  | 
73  | 0  |             _N = _param.output;  | 
74  | 0  |             _K = _param.input;  | 
75  | 0  |             _ldS = _K;  | 
76  | 0  |             _ldD = _N;  | 
77  | 0  |             _biasAndActivation = Base::ConvolutionBiasAndActivation;  | 
78  | 0  |             _prod = NULL;  | 
79  | 0  |             if (_param.transpose)  | 
80  | 0  |             { | 
81  | 0  |                 _gemm = Base::Gemm32fNT;  | 
82  | 0  |                 _ldW = _K;  | 
83  | 0  |                 if (_M == 1 && _param.activation == SimdConvolutionActivationIdentity)  | 
84  | 0  |                     _prod = Base::SynetInnerProductLayerForward;  | 
85  | 0  |             }  | 
86  | 0  |             else  | 
87  | 0  |             { | 
88  | 0  |                 _gemm = Base::Gemm32fNN;  | 
89  | 0  |                 _ldW = _N;  | 
90  | 0  |             }  | 
91  | 0  |         }  | 
92  |  |  | 
93  |  |         String SynetInnerProduct32fGemm::Desc() const   | 
94  | 0  |         {  | 
95  | 0  |             return Ext() + "::Gemm" + (_prod ? "Prod" :   | 
96  | 0  |                 String("N") + (_cbWeight.size ? "Ncb" : (_param.transpose == SimdTrue ? "T" : "N"))); | 
97  | 0  |         }  | 
98  |  |  | 
99  |  |         void SynetInnerProduct32fGemm::SetParams(const float* weight, SimdBool* internal, const float* bias, const float* params)  | 
100  | 0  |         { | 
101  | 0  |             Simd::SynetInnerProduct32f::SetParams(weight, internal, bias, params);  | 
102  | 0  |             if (_cbWeight.data)  | 
103  | 0  |             { | 
104  | 0  |                 Array32f buffer;  | 
105  | 0  |                 if (_param.transpose)  | 
106  | 0  |                 { | 
107  | 0  |                     buffer.Resize(_N * _K);  | 
108  | 0  |                     for (size_t k = 0; k < _K; ++k)  | 
109  | 0  |                         for (size_t j = 0; j < _N; ++j)  | 
110  | 0  |                             buffer[k*_N + j] = weight[j * _K + k];  | 
111  | 0  |                     weight = buffer.data;  | 
112  | 0  |                 }  | 
113  | 0  |                 _cbPack(_M, _N, _K, weight, _cbWeight.data, GemmKernelAny, NHWC_GEMM_COMPATIBLE);  | 
114  | 0  |                 if (internal)  | 
115  | 0  |                     *internal = SimdTrue;  | 
116  | 0  |             }  | 
117  | 0  |         }  | 
118  |  |  | 
119  |  |         void SynetInnerProduct32fGemm::Forward(const float * src, float * dst)  | 
120  | 0  |         { | 
121  | 0  |             if (_prod)  | 
122  | 0  |                 _prod(src, _weight, _bias, _N, _K, dst);  | 
123  | 0  |             else  | 
124  | 0  |             { | 
125  | 0  |                 if (_cbWeight.data)  | 
126  | 0  |                     _cbRun(_M, _N, _K, src, _cbWeight.data, dst, GemmKernelAny, NHWC_GEMM_COMPATIBLE);  | 
127  | 0  |                 else  | 
128  | 0  |                     _gemm(_M, _N, _K, &_1, src, _ldS, _weight, _ldW, &_0, dst, _ldD);  | 
129  | 0  |                 _biasAndActivation(_bias, _N, _M, _param.activation, _params, SimdTrue, dst);  | 
130  | 0  |             }  | 
131  | 0  |         }  | 
132  |  |  | 
133  |  |         //---------------------------------------------------------------------  | 
134  |  |  | 
135  |  |         SynetInnerProduct32fProd::SynetInnerProduct32fProd(const InnerProductParam32f& p)  | 
136  |  |             : SynetInnerProduct32f(p)  | 
137  | 0  |         { | 
138  | 0  |             _N = _param.output;  | 
139  | 0  |             _K = _param.input;  | 
140  | 0  |         }  | 
141  |  |  | 
142  |  |         void SynetInnerProduct32fProd::SetParams(const float* weight, SimdBool* internal, const float* bias, const float* params)  | 
143  | 0  |         { | 
144  | 0  |             SynetInnerProduct32f::SetParams(weight, internal, bias, params);  | 
145  | 0  |             ReorderWeight(_weight, _rWeight.data);  | 
146  | 0  |             if (internal)  | 
147  | 0  |                 *internal = SimdTrue;  | 
148  | 0  |             if (bias)  | 
149  | 0  |                 memcpy(_rBias.data, bias, _param.output * sizeof(float));  | 
150  | 0  |         }  | 
151  |  |  | 
152  |  |         void SynetInnerProduct32fProd::Forward(const float* src, float* dst)  | 
153  | 0  |         { | 
154  | 0  |             _prod(src, _rWeight.data, _rBias.data, _K, _N, dst);  | 
155  | 0  |         }  | 
156  |  |  | 
157  |  |         bool SynetInnerProduct32fProd::Preferable(const InnerProductParam32f& p)  | 
158  | 0  |         { | 
159  | 0  |             return  | 
160  | 0  |                 p.activation == SimdConvolutionActivationIdentity &&  | 
161  | 0  |                 p.batch == 1 &&  | 
162  | 0  |                 p.output >= 4 &&  | 
163  | 0  |                 Base::AlgCacheL3() > p.input * p.output * sizeof(float);  | 
164  | 0  |         }  | 
165  |  |  | 
166  |  |         void SynetInnerProduct32fProd::SetSize(size_t F)  | 
167  | 0  |         { | 
168  | 0  |             _F = F;  | 
169  | 0  |             _rWeight.Resize(AlignHi(_N, _F) * _K);  | 
170  | 0  |             _rBias.Resize(AlignHi(_N, _F), true);  | 
171  | 0  |         }  | 
172  |  |  | 
173  |  |         void SynetInnerProduct32fProd::ReorderWeight(const float* src, float* dst)  | 
174  | 0  |         { | 
175  | 0  |             if (_param.transpose)  | 
176  | 0  |             { | 
177  | 0  |                 for (size_t n = 0; n < _N; n += _F)  | 
178  | 0  |                 { | 
179  | 0  |                     size_t F = Simd::Min(_N, n + _F) - n;  | 
180  | 0  |                     const float* psrc = src + n * _K;  | 
181  | 0  |                     for (size_t k = 0; k < _K; ++k)  | 
182  | 0  |                     { | 
183  | 0  |                         size_t f = 0;  | 
184  | 0  |                         for (; f < F; ++f)  | 
185  | 0  |                             *(dst++) = psrc[f * _K];  | 
186  | 0  |                         for (; f < _F; ++f)  | 
187  | 0  |                             *(dst++) = 0.0f;  | 
188  | 0  |                         psrc++;  | 
189  | 0  |                     }  | 
190  | 0  |                 }              | 
191  | 0  |             }  | 
192  | 0  |             else  | 
193  | 0  |             { | 
194  | 0  |                 for (size_t n = 0; n < _N; n += _F)  | 
195  | 0  |                 { | 
196  | 0  |                     size_t F = Simd::Min(_N, n + _F) - n;  | 
197  | 0  |                     const float* psrc = src + n;  | 
198  | 0  |                     for (size_t k = 0; k < _K; ++k)  | 
199  | 0  |                     { | 
200  | 0  |                         size_t f = 0;  | 
201  | 0  |                         for (; f < F; ++f)  | 
202  | 0  |                             *(dst++) = psrc[f];  | 
203  | 0  |                         for (; f < _F; ++f)  | 
204  | 0  |                             *(dst++) = 0.0f;  | 
205  | 0  |                         psrc += _N;  | 
206  | 0  |                     }  | 
207  | 0  |                 }  | 
208  | 0  |             }  | 
209  | 0  |         }  | 
210  |  |  | 
211  |  |         //---------------------------------------------------------------------  | 
212  |  |  | 
213  |  |         void * SynetInnerProduct32fInit(size_t batch, size_t input, size_t output, SimdBool transpose, SimdConvolutionActivationType activation)  | 
214  | 0  |         { | 
215  | 0  |             InnerProductParam32f param(batch, input, output, transpose, activation);  | 
216  | 0  |             if (!param.Valid())  | 
217  | 0  |                 return NULL;  | 
218  | 0  |             return new SynetInnerProduct32fGemm(param);  | 
219  | 0  |         }  | 
220  |  |     }  | 
221  |  | #endif  | 
222  |  | }  |