/src/Simd/src/Simd/SimdBaseSynetActivation.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2024 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdArray.h" |
25 | | #include "Simd/SimdExp.h" |
26 | | #include "Simd/SimdErf.h" |
27 | | #include "Simd/SimdSynet.h" |
28 | | #include "Simd/SimdBFloat16.h" |
29 | | |
30 | | namespace Simd |
31 | | { |
32 | | #if defined(SIMD_SYNET_ENABLE) |
33 | | namespace Base |
34 | | { |
35 | | void SynetElu32f(const float * src, size_t size, const float * alpha, float * dst) |
36 | 0 | { |
37 | 0 | float _alpha = alpha[0]; |
38 | 0 | size_t size4 = Simd::AlignLo(size, 4); |
39 | 0 | size_t i = 0; |
40 | 0 | for (; i < size4; i += 4) |
41 | 0 | { |
42 | 0 | dst[i + 0] = SynetElu32f(src[i + 0], _alpha); |
43 | 0 | dst[i + 1] = SynetElu32f(src[i + 1], _alpha); |
44 | 0 | dst[i + 2] = SynetElu32f(src[i + 2], _alpha); |
45 | 0 | dst[i + 3] = SynetElu32f(src[i + 3], _alpha); |
46 | 0 | } |
47 | 0 | for (; i < size; ++i) |
48 | 0 | dst[i] = SynetElu32f(src[i], _alpha); |
49 | 0 | } |
50 | | |
51 | | //------------------------------------------------------------------------------------------------- |
52 | | |
53 | | void SynetGelu32f(const float* src, size_t size, float* dst) |
54 | 0 | { |
55 | 0 | size_t size4 = Simd::AlignLo(size, 4); |
56 | 0 | size_t i = 0; |
57 | 0 | for (; i < size4; i += 4) |
58 | 0 | { |
59 | 0 | dst[i + 0] = Gelu(src[i + 0]); |
60 | 0 | dst[i + 1] = Gelu(src[i + 1]); |
61 | 0 | dst[i + 2] = Gelu(src[i + 2]); |
62 | 0 | dst[i + 3] = Gelu(src[i + 3]); |
63 | 0 | } |
64 | 0 | for (; i < size; ++i) |
65 | 0 | dst[i] = Gelu(src[i]); |
66 | 0 | } |
67 | | |
68 | | //------------------------------------------------------------------------------------------------- |
69 | | |
70 | | void SynetHardSigmoid32f(const float* src, size_t size, const float* scale, const float* shift, float* dst) |
71 | 0 | { |
72 | 0 | float _scale = scale[0]; |
73 | 0 | float _shift = shift[0]; |
74 | 0 | size_t size4 = Simd::AlignLo(size, 4); |
75 | 0 | size_t i = 0; |
76 | 0 | for (; i < size4; i += 4) |
77 | 0 | { |
78 | 0 | dst[i + 0] = SynetHardSigmoid32f(src[i + 0], _scale, _shift); |
79 | 0 | dst[i + 1] = SynetHardSigmoid32f(src[i + 1], _scale, _shift); |
80 | 0 | dst[i + 2] = SynetHardSigmoid32f(src[i + 2], _scale, _shift); |
81 | 0 | dst[i + 3] = SynetHardSigmoid32f(src[i + 3], _scale, _shift); |
82 | 0 | } |
83 | 0 | for (; i < size; ++i) |
84 | 0 | dst[i] = SynetHardSigmoid32f(src[i], _scale, _shift); |
85 | 0 | } |
86 | | |
87 | | //------------------------------------------------------------------------------------------------- |
88 | | |
89 | | void SynetHswish32f(const float * src, size_t size, const float * shift, const float * scale, float * dst) |
90 | 0 | { |
91 | 0 | float _shift = shift[0]; |
92 | 0 | float _scale = scale[0]; |
93 | 0 | size_t size4 = Simd::AlignLo(size, 4); |
94 | 0 | size_t i = 0; |
95 | 0 | for (; i < size4; i += 4) |
96 | 0 | { |
97 | 0 | dst[i + 0] = SynetHswish32f(src[i + 0], _shift, _scale); |
98 | 0 | dst[i + 1] = SynetHswish32f(src[i + 1], _shift, _scale); |
99 | 0 | dst[i + 2] = SynetHswish32f(src[i + 2], _shift, _scale); |
100 | 0 | dst[i + 3] = SynetHswish32f(src[i + 3], _shift, _scale); |
101 | 0 | } |
102 | 0 | for (; i < size; ++i) |
103 | 0 | dst[i] = SynetHswish32f(src[i], _shift, _scale); |
104 | 0 | } |
105 | | |
106 | | //------------------------------------------------------------------------------------------------- |
107 | | |
108 | | void SynetMish32f(const float* src, size_t size, const float* threshold, float* dst) |
109 | 0 | { |
110 | 0 | float _threshold = threshold[0]; |
111 | 0 | size_t size4 = Simd::AlignLo(size, 4); |
112 | 0 | size_t i = 0; |
113 | 0 | for (; i < size4; i += 4) |
114 | 0 | { |
115 | 0 | dst[i + 0] = SynetMish32f(src[i + 0], _threshold); |
116 | 0 | dst[i + 1] = SynetMish32f(src[i + 1], _threshold); |
117 | 0 | dst[i + 2] = SynetMish32f(src[i + 2], _threshold); |
118 | 0 | dst[i + 3] = SynetMish32f(src[i + 3], _threshold); |
119 | 0 | } |
120 | 0 | for (; i < size; ++i) |
121 | 0 | dst[i] = SynetMish32f(src[i], _threshold); |
122 | 0 | } |
123 | | |
124 | | //------------------------------------------------------------------------------------------------- |
125 | | |
126 | | void SynetPreluLayerForwardNchw(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) |
127 | 0 | { |
128 | 0 | size_t aligned = Simd::AlignLo(spatial, 4); |
129 | 0 | for (size_t c = 0; c < channels; ++c) |
130 | 0 | { |
131 | 0 | float _slope = slope[c]; |
132 | 0 | size_t s = 0; |
133 | 0 | for (; s < aligned; s += 4) |
134 | 0 | { |
135 | 0 | dst[s + 0] = SynetRelu32f(src[s + 0], _slope); |
136 | 0 | dst[s + 1] = SynetRelu32f(src[s + 1], _slope); |
137 | 0 | dst[s + 2] = SynetRelu32f(src[s + 2], _slope); |
138 | 0 | dst[s + 3] = SynetRelu32f(src[s + 3], _slope); |
139 | 0 | } |
140 | 0 | for (; s < spatial; ++s) |
141 | 0 | dst[s] = SynetRelu32f(src[s], _slope); |
142 | 0 | src += spatial; |
143 | 0 | dst += spatial; |
144 | 0 | } |
145 | 0 | } |
146 | | |
147 | | void SynetPreluLayerForwardNhwc(const float* src, const float* slope, size_t channels, size_t spatial, float* dst) |
148 | 0 | { |
149 | 0 | size_t aligned = Simd::AlignLo(channels, 4); |
150 | 0 | for (size_t s = 0; s < spatial; ++s) |
151 | 0 | { |
152 | 0 | size_t c = 0; |
153 | 0 | for (; c < aligned; c += 4) |
154 | 0 | { |
155 | 0 | dst[c + 0] = SynetRelu32f(src[c + 0], slope[c + 0]); |
156 | 0 | dst[c + 1] = SynetRelu32f(src[c + 1], slope[c + 1]); |
157 | 0 | dst[c + 2] = SynetRelu32f(src[c + 2], slope[c + 2]); |
158 | 0 | dst[c + 3] = SynetRelu32f(src[c + 3], slope[c + 3]); |
159 | 0 | } |
160 | 0 | for (; c < channels; ++c) |
161 | 0 | dst[c] = SynetRelu32f(src[c], slope[c]); |
162 | 0 | src += channels; |
163 | 0 | dst += channels; |
164 | |
|
165 | 0 | } |
166 | 0 | } |
167 | | |
168 | | void SynetPreluLayerForward(const float* src, const float* slope, size_t channels, size_t spatial, float* dst, SimdTensorFormatType format) |
169 | 0 | { |
170 | 0 | if (Base::NchwCompatible(channels, spatial, format)) |
171 | 0 | SynetPreluLayerForwardNchw(src, slope, channels, spatial, dst); |
172 | 0 | else if (Base::NhwcCompatible(channels, spatial, format)) |
173 | 0 | SynetPreluLayerForwardNhwc(src, slope, channels, spatial, dst); |
174 | 0 | else |
175 | 0 | assert(0); |
176 | 0 | } |
177 | | |
178 | | //------------------------------------------------------------------------------------------------- |
179 | | |
180 | | void SynetRelu32f(const float* src, size_t size, const float* slope, float* dst) |
181 | 0 | { |
182 | 0 | float _slope = slope[0]; |
183 | 0 | size_t size4 = Simd::AlignLo(size, 4); |
184 | 0 | size_t i = 0; |
185 | 0 | for (; i < size4; i += 4) |
186 | 0 | { |
187 | 0 | dst[i + 0] = SynetRelu32f(src[i + 0], _slope); |
188 | 0 | dst[i + 1] = SynetRelu32f(src[i + 1], _slope); |
189 | 0 | dst[i + 2] = SynetRelu32f(src[i + 2], _slope); |
190 | 0 | dst[i + 3] = SynetRelu32f(src[i + 3], _slope); |
191 | 0 | } |
192 | 0 | for (; i < size; ++i) |
193 | 0 | dst[i] = SynetRelu32f(src[i], _slope); |
194 | 0 | } |
195 | | |
196 | | //------------------------------------------------------------------------------------------------- |
197 | | |
198 | | void SynetRelu16b(const uint16_t* src, size_t size, const float* slope, uint16_t* dst) |
199 | 0 | { |
200 | 0 | float _slope = slope[0]; |
201 | 0 | size_t size4 = Simd::AlignLo(size, 4); |
202 | 0 | size_t i = 0; |
203 | 0 | for (; i < size4; i += 4) |
204 | 0 | { |
205 | 0 | dst[i + 0] = SynetRelu16b(src[i + 0], _slope); |
206 | 0 | dst[i + 1] = SynetRelu16b(src[i + 1], _slope); |
207 | 0 | dst[i + 2] = SynetRelu16b(src[i + 2], _slope); |
208 | 0 | dst[i + 3] = SynetRelu16b(src[i + 3], _slope); |
209 | 0 | } |
210 | 0 | for (; i < size; ++i) |
211 | 0 | dst[i] = SynetRelu16b(src[i], _slope); |
212 | 0 | } |
213 | | |
214 | | //------------------------------------------------------------------------------------------------- |
215 | | |
216 | | void SynetRestrictRange32f(const float * src, size_t size, const float * lower, const float * upper, float * dst) |
217 | 0 | { |
218 | 0 | float min = *lower; |
219 | 0 | float max = *upper; |
220 | 0 | size_t size4 = Simd::AlignLo(size, 4); |
221 | 0 | size_t i = 0; |
222 | 0 | for (; i < size4; i += 4) |
223 | 0 | { |
224 | 0 | dst[i + 0] = Simd::RestrictRange(src[i + 0], min, max); |
225 | 0 | dst[i + 1] = Simd::RestrictRange(src[i + 1], min, max); |
226 | 0 | dst[i + 2] = Simd::RestrictRange(src[i + 2], min, max); |
227 | 0 | dst[i + 3] = Simd::RestrictRange(src[i + 3], min, max); |
228 | 0 | } |
229 | 0 | for (; i < size; ++i) |
230 | 0 | dst[i] = Simd::RestrictRange(src[i], min, max); |
231 | 0 | } |
232 | | |
233 | | //------------------------------------------------------------------------------------------------- |
234 | | |
235 | | void SynetSigmoid32f(const float* src, size_t size, const float* slope, float* dst) |
236 | 0 | { |
237 | 0 | float _slope = slope[0]; |
238 | 0 | size_t size4 = Simd::AlignLo(size, 4); |
239 | 0 | size_t i = 0; |
240 | 0 | for (; i < size4; i += 4) |
241 | 0 | { |
242 | 0 | dst[i + 0] = SynetSigmoid32f(src[i + 0], _slope); |
243 | 0 | dst[i + 1] = SynetSigmoid32f(src[i + 1], _slope); |
244 | 0 | dst[i + 2] = SynetSigmoid32f(src[i + 2], _slope); |
245 | 0 | dst[i + 3] = SynetSigmoid32f(src[i + 3], _slope); |
246 | 0 | } |
247 | 0 | for (; i < size; ++i) |
248 | 0 | dst[i] = SynetSigmoid32f(src[i], _slope); |
249 | 0 | } |
250 | | |
251 | | //------------------------------------------------------------------------------------------------- |
252 | | |
253 | | void SynetSwish32f(const float* src, size_t size, const float* slope, float* dst) |
254 | 0 | { |
255 | 0 | float _slope = slope[0]; |
256 | 0 | size_t size4 = Simd::AlignLo(size, 4); |
257 | 0 | size_t i = 0; |
258 | 0 | for (; i < size4; i += 4) |
259 | 0 | { |
260 | 0 | dst[i + 0] = SynetSwish32f(src[i + 0], _slope); |
261 | 0 | dst[i + 1] = SynetSwish32f(src[i + 1], _slope); |
262 | 0 | dst[i + 2] = SynetSwish32f(src[i + 2], _slope); |
263 | 0 | dst[i + 3] = SynetSwish32f(src[i + 3], _slope); |
264 | 0 | } |
265 | 0 | for (; i < size; ++i) |
266 | 0 | dst[i] = SynetSwish32f(src[i], _slope); |
267 | 0 | } |
268 | | |
269 | | //------------------------------------------------------------------------------------------------- |
270 | | |
271 | | void SynetSoftplus32f(const float* src, size_t size, const float * beta, const float * threshold, float* dst) |
272 | 0 | { |
273 | 0 | float _beta = beta[0]; |
274 | 0 | float _threshold = threshold[0]; |
275 | 0 | size_t size4 = Simd::AlignLo(size, 4); |
276 | 0 | size_t i = 0; |
277 | 0 | for (; i < size4; i += 4) |
278 | 0 | { |
279 | 0 | dst[i + 0] = SynetSoftplus32f(src[i + 0], _beta, _threshold); |
280 | 0 | dst[i + 1] = SynetSoftplus32f(src[i + 1], _beta, _threshold); |
281 | 0 | dst[i + 2] = SynetSoftplus32f(src[i + 2], _beta, _threshold); |
282 | 0 | dst[i + 3] = SynetSoftplus32f(src[i + 3], _beta, _threshold); |
283 | 0 | } |
284 | 0 | for (; i < size; ++i) |
285 | 0 | dst[i] = SynetSoftplus32f(src[i], _beta, _threshold); |
286 | 0 | } |
287 | | |
288 | | //------------------------------------------------------------------------------------------------- |
289 | | |
290 | | void SynetTanh32f(const float* src, size_t size, const float* slope, float* dst) |
291 | 0 | { |
292 | 0 | float _slope = slope[0]; |
293 | 0 | size_t size4 = Simd::AlignLo(size, 4); |
294 | 0 | size_t i = 0; |
295 | 0 | for (; i < size4; i += 4) |
296 | 0 | { |
297 | 0 | dst[i + 0] = SynetTanh32f(src[i + 0], _slope); |
298 | 0 | dst[i + 1] = SynetTanh32f(src[i + 1], _slope); |
299 | 0 | dst[i + 2] = SynetTanh32f(src[i + 2], _slope); |
300 | 0 | dst[i + 3] = SynetTanh32f(src[i + 3], _slope); |
301 | 0 | } |
302 | 0 | for (; i < size; ++i) |
303 | 0 | dst[i] = SynetTanh32f(src[i], _slope); |
304 | 0 | } |
305 | | } |
306 | | #endif |
307 | | } |