/src/Simd/src/Simd/SimdAvx2SynetNormalize32f.cpp
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2026 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdSynet.h" |
25 | | #include "Simd/SimdArray.h" |
26 | | #include "Simd/SimdMath.h" |
27 | | #include "Simd/SimdExtract.h" |
28 | | |
29 | | namespace Simd |
30 | | { |
31 | | #if defined(SIMD_AVX2_ENABLE) && defined(SIMD_SYNET_ENABLE) |
32 | | namespace Avx2 |
33 | | { |
        // L2 normalization for NCHW layout with acrossSpatial == 1: for each batch item
        // a single factor 1/sqrt(sum(src^2) + eps) is computed over the whole
        // channels*spatial volume, then every channel is additionally multiplied by its
        // per-channel scale[c]. src and dst are advanced through the batch in place.
        void NormalizeNchw1(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, float eps, float* dst)
        {
            size_t size = channels * spatial;
            size_t sizeF = AlignLo(size, F);
            size_t spatialF = AlignLo(spatial, F);
            for (size_t b = 0; b < batch; ++b)
            {
                // Accumulate the sum of squares over the whole item, 8 floats at a time via FMA.
                __m256 _sum = _mm256_setzero_ps();
                size_t i = 0;
                for (; i < sizeF; i += F)
                {
                    __m256 _src = _mm256_loadu_ps(src + i);
                    _sum = _mm256_fmadd_ps(_src, _src, _sum);
                }
                float sum = ExtractSum(_sum);
                for (; i < size; ++i) // scalar tail
                    sum += Simd::Square(src[i]);
                float k0 = 1.0f / ::sqrt(sum + eps);
                for (size_t c = 0; c < channels; ++c)
                {
                    // Fold the per-channel scale into the shared normalization factor.
                    __m256 _k = _mm256_set1_ps(scale[c] * k0);
                    size_t s = 0;
                    for (; s < spatialF; s += F)
                        _mm256_storeu_ps(dst + s, _mm256_mul_ps(_mm256_loadu_ps(src + s), _k));
                    for (; s < spatial; ++s) // scalar tail
                        _mm_store_ss(dst + s, _mm_mul_ss(_mm_load_ss(src + s), _mm256_castps256_ps128(_k)));
                    dst += spatial;
                    src += spatial;
                }
            }
        }
65 | | |
        // L2 normalization for NCHW layout with acrossSpatial == 0: at every spatial
        // position s a factor 1/sqrt(sum over channels of src^2 + eps) is computed,
        // then dst[c][s] = src[c][s] * factor[s] * scale[c].
        // buf holds one float per spatial position; allocated internally when NULL.
        void NormalizeNchw0(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, float eps, float* buf, float* dst)
        {
            Array32f _buf;
            if (buf == NULL)
            {
                _buf.Resize(spatial);
                buf = _buf.data;
            }
            size_t spatialF = AlignLo(spatial, F);
            __m256 _eps = _mm256_set1_ps(eps), _1 = _mm256_set1_ps(1.0f);
            for (size_t b = 0; b < batch; ++b)
            {
                // buf <- 0.
                size_t s = 0;
                for (; s < spatialF; s += F)
                    _mm256_storeu_ps(buf + s, _mm256_setzero_ps());
                for (; s < spatial; ++s)
                    _mm_store_ss(buf + s, _mm_setzero_ps());
                // buf[s] <- sum over channels of src[c][s]^2.
                for (size_t c = 0; c < channels; ++c)
                {
                    const float* ps = src + c * spatial;
                    for (s = 0; s < spatialF; s += F)
                    {
                        __m256 _src = _mm256_loadu_ps(ps + s);
                        _mm256_storeu_ps(buf + s, _mm256_fmadd_ps(_src, _src, _mm256_loadu_ps(buf + s)));
                    }
                    for (; s < spatial; ++s) // scalar tail
                    {
                        __m128 _src = _mm_load_ss(ps + s);
                        __m128 _sum = _mm_load_ss(buf + s);
                        _mm_store_ss(buf + s, _mm_add_ss(_sum, _mm_mul_ps(_src, _src)));
                    }
                }
                // buf[s] <- 1 / sqrt(buf[s] + eps).
                for (s = 0; s < spatialF; s += F)
                    _mm256_storeu_ps(buf + s, _mm256_div_ps(_1, _mm256_sqrt_ps(_mm256_add_ps(_mm256_loadu_ps(buf + s), _eps))));
                for (; s < spatial; ++s)
                    _mm_store_ss(buf + s, _mm_div_ss(_mm256_castps256_ps128(_1), _mm_sqrt_ss(_mm_add_ss(_mm_load_ss(buf + s), _mm256_castps256_ps128(_eps)))));
                // dst[c][s] <- src[c][s] * buf[s] * scale[c].
                for (size_t c = 0; c < channels; ++c)
                {
                    float k = scale[c];
                    __m256 _k = _mm256_set1_ps(k);
                    for (s = 0; s < spatialF; s += F)
                        _mm256_storeu_ps(dst + s, _mm256_mul_ps(_mm256_mul_ps(_mm256_loadu_ps(src + s), _mm256_loadu_ps(buf + s)), _k));
                    for (; s < spatial; ++s) // scalar tail
                        _mm_store_ss(dst + s, _mm_mul_ss(_mm_mul_ps(_mm_load_ss(src + s), _mm_load_ss(buf + s)), _mm256_castps256_ps128(_k)));
                    dst += spatial;
                    src += spatial;
                }
            }
        }
115 | | |
        // L2 normalization for NHWC layout with acrossSpatial == 1: one factor
        // 1/sqrt(sum(src^2) + eps) per batch item over the whole volume, combined with
        // the per-channel scale vector (scale is loaded contiguously since channels are innermost).
        void NormalizeNhwc1(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, float eps, float* dst)
        {
            size_t size = channels * spatial;
            size_t sizeF = AlignLo(size, F);
            size_t channelsF = AlignLo(channels, F);
            for (size_t b = 0; b < batch; ++b)
            {
                // Sum of squares over the whole item.
                __m256 _sum = _mm256_setzero_ps();
                size_t i = 0;
                for (; i < sizeF; i += F)
                {
                    __m256 _src = _mm256_loadu_ps(src + i);
                    _sum = _mm256_fmadd_ps(_src, _src, _sum);
                }
                float sum = ExtractSum(_sum);
                for (; i < size; ++i) // scalar tail
                    sum += Simd::Square(src[i]);
                __m256 _k = _mm256_set1_ps(1.0f / ::sqrt(sum + eps));
                // dst[s][c] <- src[s][c] * scale[c] * k.
                for (size_t s = 0; s < spatial; ++s)
                {
                    size_t c = 0;
                    for (; c < channelsF; c += F)
                        _mm256_storeu_ps(dst + c, _mm256_mul_ps(_mm256_mul_ps(_mm256_loadu_ps(src + c), _mm256_loadu_ps(scale + c)), _k));
                    for (; c < channels; ++c) // scalar tail
                        _mm_store_ss(dst + c, _mm_mul_ss(_mm_mul_ps(_mm_load_ss(src + c), _mm_load_ss(scale + c)), _mm256_castps256_ps128(_k)));
                    dst += channels;
                    src += channels;
                }
            }
        }
146 | | |
        // L2 normalization for NHWC layout with acrossSpatial == 0: at every spatial
        // position the factor 1/sqrt(sum over channels of src^2 + eps) is computed and
        // applied together with the per-channel scale vector. No temporary buffer is
        // needed because a whole channel row is processed at once.
        void NormalizeNhwc0(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, float eps, float* dst)
        {
            size_t channelsF = AlignLo(channels, F);
            for (size_t b = 0; b < batch; ++b)
            {
                for (size_t s = 0; s < spatial; ++s)
                {
                    // Sum of squares across the channel row at position s.
                    __m256 _sum = _mm256_setzero_ps();
                    size_t c = 0;
                    for (; c < channelsF; c += F)
                    {
                        __m256 _src = _mm256_loadu_ps(src + c);
                        _sum = _mm256_fmadd_ps(_src, _src, _sum);
                    }
                    float sum = ExtractSum(_sum);
                    for (; c < channels; ++c) // scalar tail
                        sum += Simd::Square(src[c]);
                    __m256 _k = _mm256_set1_ps(1.0f / ::sqrt(sum + eps));
                    // dst[c] <- src[c] * scale[c] * k.
                    for (c = 0; c < channelsF; c += F)
                        _mm256_storeu_ps(dst + c, _mm256_mul_ps(_mm256_mul_ps(_mm256_loadu_ps(src + c), _mm256_loadu_ps(scale + c)), _k));
                    for (; c < channels; ++c) // scalar tail
                        _mm_store_ss(dst + c, _mm_mul_ss(_mm_mul_ps(_mm_load_ss(src + c), _mm_load_ss(scale + c)), _mm256_castps256_ps128(_k)));
                    dst += channels;
                    src += channels;
                }
            }
        }
174 | | |
175 | | void SynetNormalizeLayerForward(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, |
176 | | const float* eps, SimdBool acrossSpatial, SimdTensorFormatType format, float* buf, float* dst) |
177 | 0 | { |
178 | 0 | if (format == SimdTensorFormatNchw) |
179 | 0 | { |
180 | 0 | if (acrossSpatial) |
181 | 0 | NormalizeNchw1(src, batch, channels, spatial, scale, eps[0], dst); |
182 | 0 | else |
183 | 0 | NormalizeNchw0(src, batch, channels, spatial, scale, eps[0], buf, dst); |
184 | 0 | } |
185 | 0 | else if (format == SimdTensorFormatNhwc) |
186 | 0 | { |
187 | 0 | if (acrossSpatial) |
188 | 0 | NormalizeNhwc1(src, batch, channels, spatial, scale, eps[0], dst); |
189 | 0 | else |
190 | 0 | NormalizeNhwc0(src, batch, channels, spatial, scale, eps[0], dst); |
191 | 0 | } |
192 | 0 | else |
193 | 0 | assert(0); |
194 | 0 | } |
195 | | |
196 | | //------------------------------------------------------------------------------------------------- |
197 | | |
        // Layer normalization (V2) for NCHW: at every spatial position the mean and
        // variance are computed across channels, then
        // dst = (src - mean) / sqrt(var + eps) * scale[c] + shift[c].
        // buf holds one float per spatial position; allocated internally when NULL.
        void NormalizeNchwV2(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, float* dst)
        {
            float k = 1.0f / float(channels);
            Array32f _buf;
            if (buf == NULL)
            {
                _buf.Resize(spatial);
                buf = _buf.data;
            }
            size_t spatialF = AlignLo(spatial, F);
            __m256 _eps = _mm256_set1_ps(eps), _k = _mm256_set1_ps(k), _1 = _mm256_set1_ps(1.0f);
            for (size_t b = 0, s; b < batch; ++b)
            {
                // buf <- 0.
                for (s = 0; s < spatialF; s += F)
                    _mm256_storeu_ps(buf + s, _mm256_setzero_ps());
                for (; s < spatial; ++s)
                    _mm_store_ss(buf + s, _mm_setzero_ps());
                // buf[s] <- sum over channels of src[c][s].
                for (size_t c = 0; c < channels; ++c)
                {
                    const float* ps = src + c * spatial;
                    for (s = 0; s < spatialF; s += F)
                    {
                        __m256 _src = _mm256_loadu_ps(ps + s);
                        __m256 _sum = _mm256_loadu_ps(buf + s);
                        _mm256_storeu_ps(buf + s, _mm256_add_ps(_sum, _src));
                    }
                    for (; s < spatial; ++s) // scalar tail
                    {
                        __m128 _src = _mm_load_ss(ps + s);
                        __m128 _sum = _mm_load_ss(buf + s);
                        _mm_store_ss(buf + s, _mm_add_ss(_sum, _src));
                    }
                }
                // buf[s] <- mean (multiply sum by 1/channels).
                for (s = 0; s < spatialF; s += F)
                    _mm256_storeu_ps(buf + s, _mm256_mul_ps(_mm256_loadu_ps(buf + s), _k));
                for (; s < spatial; ++s)
                    _mm_store_ss(buf + s, _mm_mul_ss(_mm_load_ss(buf + s), _mm256_castps256_ps128(_k)));
                // dst <- src - mean; the centered values are kept in dst for the variance pass.
                for (size_t c = 0; c < channels; ++c)
                {
                    const float* ps = src + c * spatial;
                    float* pd = dst + c * spatial;
                    for (s = 0; s < spatialF; s += F)
                    {
                        __m256 _src = _mm256_loadu_ps(ps + s);
                        __m256 mean = _mm256_loadu_ps(buf + s);
                        _mm256_storeu_ps(pd + s, _mm256_sub_ps(_src, mean));
                    }
                    for (; s < spatial; ++s) // scalar tail
                    {
                        __m128 _src = _mm_load_ss(ps + s);
                        __m128 mean = _mm_load_ss(buf + s);
                        _mm_store_ss(pd + s, _mm_sub_ps(_src, mean));
                    }
                }

                // buf <- 0, then buf[s] <- sum over channels of (src - mean)^2.
                for (s = 0; s < spatialF; s += F)
                    _mm256_storeu_ps(buf + s, _mm256_setzero_ps());
                for (; s < spatial; ++s)
                    _mm_store_ss(buf + s, _mm_setzero_ps());
                for (size_t c = 0; c < channels; ++c)
                {
                    const float* pd = dst + c * spatial;
                    for (s = 0; s < spatialF; s += F)
                    {
                        __m256 _dst = _mm256_loadu_ps(pd + s);
                        __m256 _sum = _mm256_loadu_ps(buf + s);
                        _mm256_storeu_ps(buf + s, _mm256_fmadd_ps(_dst, _dst, _sum));
                    }
                    for (; s < spatial; ++s) // scalar tail
                    {
                        __m128 _dst = _mm_load_ss(pd + s);
                        __m128 _sum = _mm_load_ss(buf + s);
                        _mm_store_ss(buf + s, _mm_fmadd_ss(_dst, _dst, _sum));
                    }
                }
                // buf[s] <- 1 / sqrt(variance + eps), variance = buf[s] * (1/channels).
                for (s = 0; s < spatialF; s += F)
                    _mm256_storeu_ps(buf + s, _mm256_div_ps(_1, _mm256_sqrt_ps(_mm256_fmadd_ps(_mm256_loadu_ps(buf + s), _k, _eps))));
                for (; s < spatial; ++s)
                    _mm_store_ss(buf + s, _mm_div_ss(_mm256_castps256_ps128(_1), _mm_sqrt_ss(_mm_fmadd_ss(_mm_load_ss(buf + s), _mm256_castps256_ps128(_k), _mm256_castps256_ps128(_eps)))));
                // dst <- centered * inv-stddev * scale[c] + shift[c].
                for (size_t c = 0; c < channels; ++c)
                {
                    __m256 _scale = _mm256_set1_ps(scale[c]);
                    __m256 _shift = _mm256_set1_ps(shift[c]);
                    float* pd = dst + c * spatial;
                    for (s = 0; s < spatialF; s += F)
                    {
                        __m256 _dst = _mm256_loadu_ps(pd + s);
                        __m256 norm = _mm256_loadu_ps(buf + s);
                        _mm256_storeu_ps(pd + s, _mm256_fmadd_ps(_mm256_mul_ps(_dst, norm), _scale, _shift));
                    }
                    for (; s < spatial; ++s) // scalar tail
                    {
                        __m128 _dst = _mm_load_ss(pd + s);
                        __m128 norm = _mm_load_ss(buf + s);
                        _mm_store_ss(pd + s, _mm_fmadd_ss(_mm_mul_ss(_dst, norm), _mm256_castps256_ps128(_scale), _mm256_castps256_ps128(_shift)));
                    }
                }

                // Advance to the next image in the batch.
                src += channels * spatial;
                dst += channels * spatial;
            }
        }
300 | | |
        // Layer normalization (V2) for NHWC: mean/variance are computed across the
        // channel row at each spatial position, then
        // dst = (src - mean) / sqrt(var + eps) * scale[c] + shift[c].
        // dst doubles as scratch for the centered values, so no extra buffer is needed.
        void NormalizeNhwcV2(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* dst)
        {
            float k = 1.0f / float(channels);
            size_t channelsF = AlignLo(channels, F), c;
            for (size_t b = 0; b < batch; ++b)
            {
                for (size_t s = 0; s < spatial; ++s)
                {
                    // Mean over the channel row.
                    __m256 _sum = _mm256_setzero_ps();
                    for (c = 0; c < channelsF; c += F)
                        _sum = _mm256_add_ps(_mm256_loadu_ps(src + c), _sum);
                    float sum = ExtractSum(_sum);
                    for (; c < channels; ++c) // scalar tail
                        sum += src[c];
                    __m256 mean = _mm256_set1_ps(sum * k);
                    // dst <- src - mean (centered values reused for the variance pass).
                    for (c = 0; c < channelsF; c += F)
                        _mm256_storeu_ps(dst + c, _mm256_sub_ps(_mm256_loadu_ps(src + c), mean));
                    for (; c < channels; ++c)
                        _mm_store_ss(dst + c, _mm_sub_ss(_mm_load_ss(src + c), _mm256_castps256_ps128(mean)));

                    // Variance: sum of squares of the centered row.
                    __m256 _sqsum = _mm256_setzero_ps();
                    for (c = 0; c < channelsF; c += F)
                    {
                        __m256 _dst = _mm256_loadu_ps(dst + c);
                        _sqsum = _mm256_fmadd_ps(_dst, _dst, _sqsum);
                    }
                    float sqsum = ExtractSum(_sqsum);
                    for (; c < channels; ++c) // scalar tail
                        sqsum += Simd::Square(dst[c]);
                    // dst <- centered * 1/sqrt(var + eps) * scale[c] + shift[c].
                    __m256 norm = _mm256_set1_ps(1.0f / ::sqrt(sqsum * k + eps));
                    for (c = 0; c < channelsF; c += F)
                        _mm256_storeu_ps(dst + c, _mm256_fmadd_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + c), norm), _mm256_loadu_ps(scale + c), _mm256_loadu_ps(shift + c)));
                    for (; c < channels; ++c)
                        _mm_store_ss(dst + c, _mm_fmadd_ss(_mm_mul_ss(_mm_load_ss(dst + c), _mm256_castps256_ps128(norm)), _mm_load_ss(scale + c), _mm_load_ss(shift + c)));

                    dst += channels;
                    src += channels;
                }
            }
        }
341 | | |
342 | | void SynetNormalizeLayerForwardV2(const float* src, size_t batch, size_t channels, size_t spatial, |
343 | | const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst) |
344 | 0 | { |
345 | 0 | if (format == SimdTensorFormatNchw) |
346 | 0 | NormalizeNchwV2(src, batch, channels, spatial, scale, shift, *eps, buf, dst); |
347 | 0 | else if (format == SimdTensorFormatNhwc) |
348 | 0 | NormalizeNhwcV2(src, batch, channels, spatial, scale, shift, *eps, dst); |
349 | 0 | else |
350 | 0 | assert(0); |
351 | 0 | } |
352 | | |
353 | | //------------------------------------------------------------------------------------------------- |
354 | | |
        // Normalization V3 for NCHW: for every channel the mean and variance are
        // computed over the spatial positions, then
        // dst = (src - mean) / sqrt(var + eps) * scale[c] + shift[c].
        // dst doubles as scratch for the centered values, so no extra buffer is needed.
        void NormalizeNchwV3(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* dst)
        {
            float k = 1.0f / float(spatial);
            size_t spatialF = AlignLo(spatial, F), s;
            for (size_t b = 0; b < batch; ++b)
            {
                for (size_t c = 0; c < channels; ++c)
                {
                    // Mean over the channel's spatial plane.
                    __m256 _sum = _mm256_setzero_ps();
                    for (s = 0; s < spatialF; s += F)
                        _sum = _mm256_add_ps(_mm256_loadu_ps(src + s), _sum);
                    float sum = ExtractSum(_sum);
                    for (; s < spatial; ++s) // scalar tail
                        sum += src[s];
                    __m256 mean = _mm256_set1_ps(sum * k);
                    // dst <- src - mean (centered values reused for the variance pass).
                    for (s = 0; s < spatialF; s += F)
                        _mm256_storeu_ps(dst + s, _mm256_sub_ps(_mm256_loadu_ps(src + s), mean));
                    for (; s < spatial; ++s)
                        _mm_store_ss(dst + s, _mm_sub_ss(_mm_load_ss(src + s), _mm256_castps256_ps128(mean)));

                    // Variance: sum of squares of the centered plane.
                    __m256 _sqsum = _mm256_setzero_ps();
                    for (s = 0; s < spatialF; s += F)
                        _sqsum = _mm256_add_ps(Square(_mm256_loadu_ps(dst + s)), _sqsum);
                    float sqsum = ExtractSum(_sqsum);
                    for (; s < spatial; ++s) // scalar tail
                        sqsum += Simd::Square(dst[s]);
                    // dst <- centered * 1/sqrt(var + eps) * scale[c] + shift[c].
                    __m256 norm = _mm256_set1_ps(1.0f / ::sqrt(sqsum * k + eps));
                    __m256 _scale = _mm256_set1_ps(scale[c]);
                    __m256 _shift = _mm256_set1_ps(shift[c]);
                    for (s = 0; s < spatialF; s += F)
                        _mm256_storeu_ps(dst + s, _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + s), norm), _scale), _shift));
                    for (; s < spatial; ++s)
                        _mm_store_ss(dst + s, _mm_add_ss(_mm_mul_ss(_mm_mul_ss(_mm_load_ss(dst + s), _mm256_castps256_ps128(norm)), _mm256_castps256_ps128(_scale)), _mm256_castps256_ps128(_shift)));

                    dst += spatial;
                    src += spatial;
                }
            }
        }
394 | | |
395 | | void NormalizeNhwcV3(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, float* dst) |
396 | 0 | { |
397 | 0 | float k = 1.0f / float(spatial); |
398 | 0 | Array32f _buf; |
399 | 0 | if (buf == NULL) |
400 | 0 | { |
401 | 0 | _buf.Resize(spatial); |
402 | 0 | buf = _buf.data; |
403 | 0 | } |
404 | 0 | size_t channelsF = AlignLo(channels, F); |
405 | 0 | __m256 _eps = _mm256_set1_ps(eps), _k = _mm256_set1_ps(k), _1 = _mm256_set1_ps(1.0f); |
406 | 0 | for (size_t b = 0, c; b < batch; ++b) |
407 | 0 | { |
408 | 0 | for (c = 0; c < channelsF; c += F) |
409 | 0 | _mm256_storeu_ps(buf + c, _mm256_setzero_ps()); |
410 | 0 | for (; c < channels; ++c) |
411 | 0 | _mm_store_ss(buf + c, _mm_setzero_ps()); |
412 | 0 | for (size_t s = 0; s < spatial; ++s) |
413 | 0 | { |
414 | 0 | const float* ps = src + s * channels; |
415 | 0 | for (c = 0; c < channelsF; c += F) |
416 | 0 | { |
417 | 0 | __m256 _src = _mm256_loadu_ps(ps + c); |
418 | 0 | __m256 _sum = _mm256_loadu_ps(buf + c); |
419 | 0 | _mm256_storeu_ps(buf + c, _mm256_add_ps(_sum, _src)); |
420 | 0 | } |
421 | 0 | for (; c < channels; ++c) |
422 | 0 | { |
423 | 0 | __m128 _src = _mm_load_ss(ps + c); |
424 | 0 | __m128 _sum = _mm_load_ss(buf + c); |
425 | 0 | _mm_store_ss(buf + c, _mm_add_ss(_sum, _src)); |
426 | 0 | } |
427 | 0 | } |
428 | 0 | for (c = 0; c < channelsF; c += F) |
429 | 0 | _mm256_storeu_ps(buf + c, _mm256_mul_ps(_mm256_loadu_ps(buf + c), _k)); |
430 | 0 | for (; c < channels; ++c) |
431 | 0 | _mm_store_ss(buf + c, _mm_mul_ss(_mm_load_ss(buf + c), _mm256_castps256_ps128(_k))); |
432 | 0 | for (size_t s = 0; s < spatial; ++s) |
433 | 0 | { |
434 | 0 | const float* ps = src + s * channels; |
435 | 0 | float* pd = dst + s * channels; |
436 | 0 | for (c = 0; c < channelsF; c += F) |
437 | 0 | { |
438 | 0 | __m256 _src = _mm256_loadu_ps(ps + c); |
439 | 0 | __m256 mean = _mm256_loadu_ps(buf + c); |
440 | 0 | _mm256_storeu_ps(pd + c, _mm256_sub_ps(_src, mean)); |
441 | 0 | } |
442 | 0 | for (; c < channels; ++c) |
443 | 0 | { |
444 | 0 | __m128 _src = _mm_load_ss(ps + c); |
445 | 0 | __m128 mean = _mm_load_ss(buf + c); |
446 | 0 | _mm_store_ss(pd + c, _mm_sub_ps(_src, mean)); |
447 | 0 | } |
448 | 0 | } |
449 | |
|
450 | 0 | for (c = 0; c < channelsF; c += F) |
451 | 0 | _mm256_storeu_ps(buf + c, _mm256_setzero_ps()); |
452 | 0 | for (; c < channels; ++c) |
453 | 0 | _mm_store_ss(buf + c, _mm_setzero_ps()); |
454 | 0 | for (size_t s = 0; s < spatial; ++s) |
455 | 0 | { |
456 | 0 | const float* pd = dst + s * channels; |
457 | 0 | for (c = 0; c < channelsF; c += F) |
458 | 0 | { |
459 | 0 | __m256 _dst = _mm256_loadu_ps(pd + c); |
460 | 0 | __m256 _sum = _mm256_loadu_ps(buf + c); |
461 | 0 | _mm256_storeu_ps(buf + c, _mm256_add_ps(_sum, _mm256_mul_ps(_dst, _dst))); |
462 | 0 | } |
463 | 0 | for (; c < channels; ++c) |
464 | 0 | { |
465 | 0 | __m128 _dst = _mm_load_ss(pd + c); |
466 | 0 | __m128 _sum = _mm_load_ss(buf + c); |
467 | 0 | _mm_store_ss(buf + c, _mm_add_ss(_sum, _mm_mul_ss(_dst, _dst))); |
468 | 0 | } |
469 | 0 | } |
470 | 0 | for (c = 0; c < channelsF; c += F) |
471 | 0 | _mm256_storeu_ps(buf + c, _mm256_div_ps(_1, _mm256_sqrt_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(buf + c), _k), _eps)))); |
472 | 0 | for (; c < channels; ++c) |
473 | 0 | _mm_store_ss(buf + c, _mm_div_ss(_mm256_castps256_ps128(_1), _mm_sqrt_ss(_mm_add_ss(_mm_mul_ss(_mm_load_ss(buf + c), _mm256_castps256_ps128(_k)), _mm256_castps256_ps128(_eps))))); |
474 | 0 | for (size_t s = 0; s < spatial; ++s) |
475 | 0 | { |
476 | 0 | float* pd = dst + s * channels; |
477 | 0 | for (c = 0; c < channelsF; c += F) |
478 | 0 | { |
479 | 0 | __m256 _dst = _mm256_loadu_ps(pd + c); |
480 | 0 | __m256 norm = _mm256_loadu_ps(buf + c); |
481 | 0 | _mm256_storeu_ps(pd + c, _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_dst, norm), _mm256_loadu_ps(scale + c)), _mm256_loadu_ps(shift + c))); |
482 | 0 | } |
483 | 0 | for (; c < channels; ++c) |
484 | 0 | { |
485 | 0 | __m128 _dst = _mm_load_ss(pd + c); |
486 | 0 | __m128 norm = _mm_load_ss(buf + c); |
487 | 0 | _mm_store_ss(pd + c, _mm_add_ss(_mm_mul_ss(_mm_mul_ss(_dst, norm), _mm_load_ss(scale + c)), _mm_load_ss(shift + c))); |
488 | 0 | } |
489 | 0 | } |
490 | |
|
491 | 0 | src += channels * spatial; |
492 | 0 | dst += channels * spatial; |
493 | 0 | } |
494 | 0 | } |
495 | | |
496 | | void SynetNormalizeLayerForwardV3(const float* src, size_t batch, size_t channels, size_t spatial, |
497 | | const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst) |
498 | 0 | { |
499 | 0 | if (format == SimdTensorFormatNchw) |
500 | 0 | NormalizeNchwV3(src, batch, channels, spatial, scale, shift, *eps, dst); |
501 | 0 | else if (format == SimdTensorFormatNhwc) |
502 | 0 | NormalizeNhwcV3(src, batch, channels, spatial, scale, shift, *eps, buf, dst); |
503 | 0 | else |
504 | 0 | assert(0); |
505 | 0 | } |
506 | | |
507 | | //------------------------------------------------------------------------------------------------- |
508 | | |
        // Normalization V4 for NCHW: the L2 norm of every channel plane is computed,
        // norm-of-norms scaling is applied as a per-channel affine transform:
        // dst[c] = src[c] * (1 + scale[c] * ||src_c|| / (mean-of-norms + eps)) + shift[c].
        // buf must hold 'channels' floats (allocated by the caller).
        void NormalizeNchwV4(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, float* dst)
        {
            float k = 1.0f / float(channels);
            size_t spatialF = AlignLo(spatial, F), s;
            for (size_t b = 0; b < batch; ++b)
            {
                // buf[c] <- L2 norm of channel plane c; sum <- sum of all the norms.
                float sum = 0;
                for (size_t c = 0, o = 0; c < channels; ++c)
                {
                    __m256 _sqsum = _mm256_setzero_ps();
                    for (s = 0; s < spatialF; s += F, o += F)
                        _sqsum = _mm256_add_ps(Square(_mm256_loadu_ps(src + o)), _sqsum);
                    float sqsum = ExtractSum(_sqsum);
                    for (; s < spatial; ++s, ++o) // scalar tail
                        sqsum += Simd::Square(src[o]);
                    buf[c] = sqrt(sqsum);
                    sum += buf[c];
                }
                float norm = 1.0f / (sum * k + eps);
                // dst <- src * (1 + scale[c] * buf[c] * norm) + shift[c].
                for (size_t c = 0; c < channels; ++c)
                {
                    __m256 _alpha = _mm256_set1_ps(1.0f + scale[c] * buf[c] * norm);
                    __m256 _shift = _mm256_set1_ps(shift[c]);
                    for (s = 0; s < spatialF; s += F)
                        _mm256_storeu_ps(dst + s, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + s), _alpha), _shift));
                    for (; s < spatial; ++s) // scalar tail
                        _mm_store_ss(dst + s, _mm_add_ss(_mm_mul_ss(_mm_load_ss(src + s), _mm256_castps256_ps128(_alpha)), _mm256_castps256_ps128(_shift)));
                    dst += spatial;
                    src += spatial;
                }
            }
        }
541 | | |
542 | | void NormalizeNhwcV4(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, float* dst) |
543 | 0 | { |
544 | 0 | float k = 1.0f / float(channels); |
545 | 0 | size_t channelsF = AlignLo(channels, F), c; |
546 | 0 | __m256 _eps = _mm256_set1_ps(eps), _k = _mm256_set1_ps(k), _1 = _mm256_set1_ps(1.0f); |
547 | 0 | for (size_t b = 0; b < batch; ++b) |
548 | 0 | { |
549 | 0 | for (c = 0; c < channelsF; c += F) |
550 | 0 | _mm256_storeu_ps(buf + c, _mm256_setzero_ps()); |
551 | 0 | for (; c < channels; ++c) |
552 | 0 | _mm_store_ss(buf + c, _mm_setzero_ps()); |
553 | 0 | for (size_t s = 0, o = 0; s < spatial; ++s) |
554 | 0 | { |
555 | 0 | for (c = 0; c < channelsF; c += F, o += F) |
556 | 0 | _mm256_storeu_ps(buf + c, _mm256_add_ps(Square(_mm256_loadu_ps(src + o)), _mm256_loadu_ps(buf + c))); |
557 | 0 | for (; c < channels; c += 1, o += 1) |
558 | 0 | _mm_store_ss(buf + c, _mm_add_ss(Sse41::Square(_mm_load_ss(src + o)), _mm_load_ss(buf + c))); |
559 | 0 | } |
560 | 0 | float sum = 0; |
561 | 0 | for (size_t c = 0; c < channels; ++c) |
562 | 0 | { |
563 | 0 | buf[c] = sqrt(buf[c]); |
564 | 0 | sum += buf[c]; |
565 | 0 | } |
566 | 0 | float norm = 1.0f / (sum * k + eps); |
567 | 0 | for (size_t c = 0; c < channels; ++c) |
568 | 0 | buf[c] = 1.0f + scale[c] * buf[c] * norm; |
569 | 0 | for (size_t s = 0, o = 0; s < spatial; ++s) |
570 | 0 | { |
571 | 0 | for (c = 0; c < channelsF; c += F) |
572 | 0 | _mm256_storeu_ps(dst + c, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + c), _mm256_loadu_ps(buf + c)), _mm256_loadu_ps(shift + c))); |
573 | 0 | for (; c < channels; c += 1, o += 1) |
574 | 0 | _mm_store_ss(dst + c, _mm_add_ss(_mm_mul_ss(_mm_load_ss(src + c), _mm_load_ss(buf + c)), _mm_load_ss(shift + c))); |
575 | 0 | src += channels; |
576 | 0 | dst += channels; |
577 | |
|
578 | 0 | } |
579 | 0 | } |
580 | 0 | } |
581 | | |
582 | | void SynetNormalizeLayerForwardV4(const float* src, size_t batch, size_t channels, size_t spatial, |
583 | | const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst) |
584 | 0 | { |
585 | 0 | Array32f _buf; |
586 | 0 | if (buf == NULL) |
587 | 0 | { |
588 | 0 | _buf.Resize(channels); |
589 | 0 | buf = _buf.data; |
590 | 0 | } |
591 | 0 | if (format == SimdTensorFormatNchw) |
592 | 0 | NormalizeNchwV4(src, batch, channels, spatial, scale, shift, *eps, buf, dst); |
593 | 0 | else if (format == SimdTensorFormatNhwc) |
594 | 0 | NormalizeNhwcV4(src, batch, channels, spatial, scale, shift, *eps, buf, dst); |
595 | 0 | else |
596 | | assert(0); |
597 | 0 | } |
598 | | } |
599 | | #endif |
600 | | } |