Coverage Report

Created: 2026-04-01 07:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdAvx2SynetNormalize32f.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2026 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdSynet.h"
25
#include "Simd/SimdArray.h"
26
#include "Simd/SimdMath.h"
27
#include "Simd/SimdExtract.h"
28
29
namespace Simd
30
{
31
#if defined(SIMD_AVX2_ENABLE) && defined(SIMD_SYNET_ENABLE)   
32
    namespace Avx2
33
    {
34
        void NormalizeNchw1(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, float eps, float* dst)
35
0
        {
36
0
            size_t size = channels * spatial;
37
0
            size_t sizeF = AlignLo(size, F);
38
0
            size_t spatialF = AlignLo(spatial, F);
39
0
            for (size_t b = 0; b < batch; ++b)
40
0
            {
41
0
                __m256 _sum = _mm256_setzero_ps();
42
0
                size_t i = 0;
43
0
                for (; i < sizeF; i += F)
44
0
                {
45
0
                    __m256 _src = _mm256_loadu_ps(src + i);
46
0
                    _sum = _mm256_fmadd_ps(_src, _src, _sum);
47
0
                }
48
0
                float sum = ExtractSum(_sum);
49
0
                for (; i < size; ++i)
50
0
                    sum += Simd::Square(src[i]);
51
0
                float k0 = 1.0f / ::sqrt(sum + eps);
52
0
                for (size_t c = 0; c < channels; ++c)
53
0
                {
54
0
                    __m256 _k = _mm256_set1_ps(scale[c] * k0);
55
0
                    size_t s = 0;
56
0
                    for (; s < spatialF; s += F)
57
0
                        _mm256_storeu_ps(dst + s, _mm256_mul_ps(_mm256_loadu_ps(src + s), _k));
58
0
                    for (; s < spatial; ++s)
59
0
                        _mm_store_ss(dst + s, _mm_mul_ss(_mm_load_ss(src + s), _mm256_castps256_ps128(_k)));
60
0
                    dst += spatial;
61
0
                    src += spatial;
62
0
                }
63
0
            }
64
0
        }
65
66
        void NormalizeNchw0(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, float eps, float* buf, float* dst)
67
0
        {
68
0
            Array32f _buf;
69
0
            if (buf == NULL)
70
0
            {
71
0
                _buf.Resize(spatial);
72
0
                buf = _buf.data;
73
0
            }
74
0
            size_t spatialF = AlignLo(spatial, F);
75
0
            __m256 _eps = _mm256_set1_ps(eps), _1 = _mm256_set1_ps(1.0f);
76
0
            for (size_t b = 0; b < batch; ++b)
77
0
            {
78
0
                size_t s = 0;
79
0
                for (; s < spatialF; s += F)
80
0
                    _mm256_storeu_ps(buf + s, _mm256_setzero_ps());
81
0
                for (; s < spatial; ++s)
82
0
                    _mm_store_ss(buf + s, _mm_setzero_ps());
83
0
                for (size_t c = 0; c < channels; ++c)
84
0
                {
85
0
                    const float* ps = src + c * spatial;
86
0
                    for (s = 0; s < spatialF; s += F)
87
0
                    {
88
0
                        __m256 _src = _mm256_loadu_ps(ps + s);
89
0
                        _mm256_storeu_ps(buf + s, _mm256_fmadd_ps(_src, _src, _mm256_loadu_ps(buf + s)));
90
0
                    }
91
0
                    for (; s < spatial; ++s)
92
0
                    {
93
0
                        __m128 _src = _mm_load_ss(ps + s);
94
0
                        __m128 _sum = _mm_load_ss(buf + s);
95
0
                        _mm_store_ss(buf + s, _mm_add_ss(_sum, _mm_mul_ps(_src, _src)));
96
0
                    }
97
0
                }
98
0
                for (s = 0; s < spatialF; s += F)
99
0
                    _mm256_storeu_ps(buf + s, _mm256_div_ps(_1, _mm256_sqrt_ps(_mm256_add_ps(_mm256_loadu_ps(buf + s), _eps))));
100
0
                for (; s < spatial; ++s)
101
0
                    _mm_store_ss(buf + s, _mm_div_ss(_mm256_castps256_ps128(_1), _mm_sqrt_ss(_mm_add_ss(_mm_load_ss(buf + s), _mm256_castps256_ps128(_eps)))));
102
0
                for (size_t c = 0; c < channels; ++c)
103
0
                {
104
0
                    float k = scale[c];
105
0
                    __m256 _k = _mm256_set1_ps(k);
106
0
                    for (s = 0; s < spatialF; s += F)
107
0
                        _mm256_storeu_ps(dst + s, _mm256_mul_ps(_mm256_mul_ps(_mm256_loadu_ps(src + s), _mm256_loadu_ps(buf + s)), _k));
108
0
                    for (; s < spatial; ++s)
109
0
                        _mm_store_ss(dst + s, _mm_mul_ss(_mm_mul_ps(_mm_load_ss(src + s), _mm_load_ss(buf + s)), _mm256_castps256_ps128(_k)));
110
0
                    dst += spatial;
111
0
                    src += spatial;
112
0
                }
113
0
            }
114
0
        }
115
116
        void NormalizeNhwc1(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, float eps, float* dst)
117
0
        {
118
0
            size_t size = channels * spatial;
119
0
            size_t sizeF = AlignLo(size, F);
120
0
            size_t channelsF = AlignLo(channels, F);
121
0
            for (size_t b = 0; b < batch; ++b)
122
0
            {
123
0
                __m256 _sum = _mm256_setzero_ps();
124
0
                size_t i = 0;
125
0
                for (; i < sizeF; i += F)
126
0
                {
127
0
                    __m256 _src = _mm256_loadu_ps(src + i);
128
0
                    _sum = _mm256_fmadd_ps(_src, _src, _sum);
129
0
                }
130
0
                float sum = ExtractSum(_sum);
131
0
                for (; i < size; ++i)
132
0
                    sum += Simd::Square(src[i]);
133
0
                __m256 _k = _mm256_set1_ps(1.0f / ::sqrt(sum + eps));
134
0
                for (size_t s = 0; s < spatial; ++s)
135
0
                {
136
0
                    size_t c = 0;
137
0
                    for (; c < channelsF; c += F)
138
0
                        _mm256_storeu_ps(dst + c, _mm256_mul_ps(_mm256_mul_ps(_mm256_loadu_ps(src + c), _mm256_loadu_ps(scale + c)), _k));
139
0
                    for (; c < channels; ++c)
140
0
                        _mm_store_ss(dst + c, _mm_mul_ss(_mm_mul_ps(_mm_load_ss(src + c), _mm_load_ss(scale + c)), _mm256_castps256_ps128(_k)));
141
0
                    dst += channels;
142
0
                    src += channels;
143
0
                }
144
0
            }
145
0
        }
146
147
        void NormalizeNhwc0(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, float eps, float* dst)
148
0
        {
149
0
            size_t channelsF = AlignLo(channels, F);
150
0
            for (size_t b = 0; b < batch; ++b)
151
0
            {
152
0
                for (size_t s = 0; s < spatial; ++s)
153
0
                {
154
0
                    __m256 _sum = _mm256_setzero_ps();
155
0
                    size_t c = 0;
156
0
                    for (; c < channelsF; c += F)
157
0
                    {
158
0
                        __m256 _src = _mm256_loadu_ps(src + c);
159
0
                        _sum = _mm256_fmadd_ps(_src, _src, _sum);
160
0
                    }
161
0
                    float sum = ExtractSum(_sum);
162
0
                    for (; c < channels; ++c)
163
0
                        sum += Simd::Square(src[c]);
164
0
                    __m256 _k = _mm256_set1_ps(1.0f / ::sqrt(sum + eps));
165
0
                    for (c = 0; c < channelsF; c += F)
166
0
                        _mm256_storeu_ps(dst + c, _mm256_mul_ps(_mm256_mul_ps(_mm256_loadu_ps(src + c), _mm256_loadu_ps(scale + c)), _k));
167
0
                    for (; c < channels; ++c)
168
0
                        _mm_store_ss(dst + c, _mm_mul_ss(_mm_mul_ps(_mm_load_ss(src + c), _mm_load_ss(scale + c)), _mm256_castps256_ps128(_k)));
169
0
                    dst += channels;
170
0
                    src += channels;
171
0
                }
172
0
            }
173
0
        }
174
175
        void SynetNormalizeLayerForward(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale,
176
            const float* eps, SimdBool acrossSpatial, SimdTensorFormatType format, float* buf, float* dst)
177
0
        {
178
0
            if (format == SimdTensorFormatNchw)
179
0
            {
180
0
                if (acrossSpatial)
181
0
                    NormalizeNchw1(src, batch, channels, spatial, scale, eps[0], dst);
182
0
                else
183
0
                    NormalizeNchw0(src, batch, channels, spatial, scale, eps[0], buf, dst);
184
0
            }
185
0
            else if (format == SimdTensorFormatNhwc)
186
0
            {
187
0
                if (acrossSpatial)
188
0
                    NormalizeNhwc1(src, batch, channels, spatial, scale, eps[0], dst);
189
0
                else
190
0
                    NormalizeNhwc0(src, batch, channels, spatial, scale, eps[0], dst);
191
0
            }
192
0
            else
193
0
                assert(0);
194
0
        }
195
196
        //-------------------------------------------------------------------------------------------------
197
198
        void NormalizeNchwV2(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, float* dst)
199
0
        {
200
0
            float k = 1.0f / float(channels);
201
0
            Array32f _buf;
202
0
            if (buf == NULL)
203
0
            {
204
0
                _buf.Resize(spatial);
205
0
                buf = _buf.data;
206
0
            }
207
0
            size_t spatialF = AlignLo(spatial, F);
208
0
            __m256 _eps = _mm256_set1_ps(eps), _k = _mm256_set1_ps(k), _1 = _mm256_set1_ps(1.0f);
209
0
            for (size_t b = 0, s; b < batch; ++b)
210
0
            {
211
0
                for (s = 0; s < spatialF; s += F)
212
0
                    _mm256_storeu_ps(buf + s, _mm256_setzero_ps());
213
0
                for (; s < spatial; ++s)
214
0
                    _mm_store_ss(buf + s, _mm_setzero_ps());
215
0
                for (size_t c = 0; c < channels; ++c)
216
0
                {
217
0
                    const float* ps = src + c * spatial;
218
0
                    for (s = 0; s < spatialF; s += F)
219
0
                    {
220
0
                        __m256 _src = _mm256_loadu_ps(ps + s);
221
0
                        __m256 _sum = _mm256_loadu_ps(buf + s);
222
0
                        _mm256_storeu_ps(buf + s, _mm256_add_ps(_sum, _src));
223
0
                    }
224
0
                    for (; s < spatial; ++s)
225
0
                    {
226
0
                        __m128 _src = _mm_load_ss(ps + s);
227
0
                        __m128 _sum = _mm_load_ss(buf + s);
228
0
                        _mm_store_ss(buf + s, _mm_add_ss(_sum, _src));
229
0
                    }
230
0
                }
231
0
                for (s = 0; s < spatialF; s += F)
232
0
                    _mm256_storeu_ps(buf + s, _mm256_mul_ps(_mm256_loadu_ps(buf + s), _k));
233
0
                for (; s < spatial; ++s)
234
0
                    _mm_store_ss(buf + s, _mm_mul_ss(_mm_load_ss(buf + s), _mm256_castps256_ps128(_k)));
235
0
                for (size_t c = 0; c < channels; ++c)
236
0
                {
237
0
                    const float* ps = src + c * spatial;
238
0
                    float* pd = dst + c * spatial;
239
0
                    for (s = 0; s < spatialF; s += F)
240
0
                    {
241
0
                        __m256 _src = _mm256_loadu_ps(ps + s);
242
0
                        __m256 mean = _mm256_loadu_ps(buf + s);
243
0
                        _mm256_storeu_ps(pd + s, _mm256_sub_ps(_src, mean));
244
0
                    }
245
0
                    for (; s < spatial; ++s)
246
0
                    {
247
0
                        __m128 _src = _mm_load_ss(ps + s);
248
0
                        __m128 mean = _mm_load_ss(buf + s);
249
0
                        _mm_store_ss(pd + s, _mm_sub_ps(_src, mean));
250
0
                    }
251
0
                }
252
253
0
                for (s = 0; s < spatialF; s += F)
254
0
                    _mm256_storeu_ps(buf + s, _mm256_setzero_ps());
255
0
                for (; s < spatial; ++s)
256
0
                    _mm_store_ss(buf + s, _mm_setzero_ps());
257
0
                for (size_t c = 0; c < channels; ++c)
258
0
                {
259
0
                    const float* pd = dst + c * spatial;
260
0
                    for (s = 0; s < spatialF; s += F)
261
0
                    {
262
0
                        __m256 _dst = _mm256_loadu_ps(pd + s);
263
0
                        __m256 _sum = _mm256_loadu_ps(buf + s);
264
0
                        _mm256_storeu_ps(buf + s, _mm256_fmadd_ps(_dst, _dst, _sum));
265
0
                    }
266
0
                    for (; s < spatial; ++s)
267
0
                    {
268
0
                        __m128 _dst = _mm_load_ss(pd + s);
269
0
                        __m128 _sum = _mm_load_ss(buf + s);
270
0
                        _mm_store_ss(buf + s, _mm_fmadd_ss(_dst, _dst, _sum));
271
0
                    }
272
0
                }
273
0
                for (s = 0; s < spatialF; s += F)
274
0
                    _mm256_storeu_ps(buf + s, _mm256_div_ps(_1, _mm256_sqrt_ps(_mm256_fmadd_ps(_mm256_loadu_ps(buf + s), _k, _eps))));
275
0
                for (; s < spatial; ++s)
276
0
                    _mm_store_ss(buf + s, _mm_div_ss(_mm256_castps256_ps128(_1), _mm_sqrt_ss(_mm_fmadd_ss(_mm_load_ss(buf + s), _mm256_castps256_ps128(_k), _mm256_castps256_ps128(_eps)))));
277
0
                for (size_t c = 0; c < channels; ++c)
278
0
                {
279
0
                    __m256 _scale = _mm256_set1_ps(scale[c]);
280
0
                    __m256 _shift = _mm256_set1_ps(shift[c]);
281
0
                    float* pd = dst + c * spatial;
282
0
                    for (s = 0; s < spatialF; s += F)
283
0
                    {
284
0
                        __m256 _dst = _mm256_loadu_ps(pd + s);
285
0
                        __m256 norm = _mm256_loadu_ps(buf + s);
286
0
                        _mm256_storeu_ps(pd + s, _mm256_fmadd_ps(_mm256_mul_ps(_dst, norm), _scale, _shift));
287
0
                    }
288
0
                    for (; s < spatial; ++s)
289
0
                    {
290
0
                        __m128 _dst = _mm_load_ss(pd + s);
291
0
                        __m128 norm = _mm_load_ss(buf + s);
292
0
                        _mm_store_ss(pd + s, _mm_fmadd_ss(_mm_mul_ss(_dst, norm), _mm256_castps256_ps128(_scale), _mm256_castps256_ps128(_shift)));
293
0
                    }
294
0
                }
295
296
0
                src += channels * spatial;
297
0
                dst += channels * spatial;
298
0
            }
299
0
        }
300
301
        void NormalizeNhwcV2(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* dst)
302
0
        {
303
0
            float k = 1.0f / float(channels);
304
0
            size_t channelsF = AlignLo(channels, F), c;
305
0
            for (size_t b = 0; b < batch; ++b)
306
0
            {
307
0
                for (size_t s = 0; s < spatial; ++s)
308
0
                {
309
0
                    __m256 _sum = _mm256_setzero_ps();
310
0
                    for (c = 0; c < channelsF; c += F)
311
0
                        _sum = _mm256_add_ps(_mm256_loadu_ps(src + c), _sum);
312
0
                    float sum = ExtractSum(_sum);
313
0
                    for (; c < channels; ++c)
314
0
                        sum += src[c];
315
0
                    __m256 mean = _mm256_set1_ps(sum * k);
316
0
                    for (c = 0; c < channelsF; c += F)
317
0
                        _mm256_storeu_ps(dst + c, _mm256_sub_ps(_mm256_loadu_ps(src + c), mean));
318
0
                    for (; c < channels; ++c)
319
0
                        _mm_store_ss(dst + c, _mm_sub_ss(_mm_load_ss(src + c), _mm256_castps256_ps128(mean)));
320
321
0
                    __m256 _sqsum = _mm256_setzero_ps();
322
0
                    for (c = 0; c < channelsF; c += F)
323
0
                    {
324
0
                        __m256 _dst = _mm256_loadu_ps(dst + c);
325
0
                        _sqsum = _mm256_fmadd_ps(_dst, _dst, _sqsum);
326
0
                    }
327
0
                    float sqsum = ExtractSum(_sqsum);
328
0
                    for (; c < channels; ++c)
329
0
                        sqsum += Simd::Square(dst[c]);
330
0
                    __m256 norm = _mm256_set1_ps(1.0f / ::sqrt(sqsum * k + eps));
331
0
                    for (c = 0; c < channelsF; c += F)
332
0
                        _mm256_storeu_ps(dst + c, _mm256_fmadd_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + c), norm), _mm256_loadu_ps(scale + c), _mm256_loadu_ps(shift + c)));
333
0
                    for (; c < channels; ++c)
334
0
                        _mm_store_ss(dst + c, _mm_fmadd_ss(_mm_mul_ss(_mm_load_ss(dst + c), _mm256_castps256_ps128(norm)), _mm_load_ss(scale + c), _mm_load_ss(shift + c)));
335
336
0
                    dst += channels;
337
0
                    src += channels;
338
0
                }
339
0
            }
340
0
        }
341
342
        void SynetNormalizeLayerForwardV2(const float* src, size_t batch, size_t channels, size_t spatial,
343
            const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst)
344
0
        {
345
0
            if (format == SimdTensorFormatNchw)
346
0
                NormalizeNchwV2(src, batch, channels, spatial, scale, shift, *eps, buf, dst);
347
0
            else if (format == SimdTensorFormatNhwc)
348
0
                NormalizeNhwcV2(src, batch, channels, spatial, scale, shift, *eps, dst);
349
0
            else
350
0
                assert(0);
351
0
        }
352
353
        //-------------------------------------------------------------------------------------------------
354
355
        void NormalizeNchwV3(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* dst)
356
0
        {
357
0
            float k = 1.0f / float(spatial);
358
0
            size_t spatialF = AlignLo(spatial, F), s;
359
0
            for (size_t b = 0; b < batch; ++b)
360
0
            {
361
0
                for (size_t c = 0; c < channels; ++c)
362
0
                {
363
0
                    __m256 _sum = _mm256_setzero_ps();
364
0
                    for (s = 0; s < spatialF; s += F)
365
0
                        _sum = _mm256_add_ps(_mm256_loadu_ps(src + s), _sum);
366
0
                    float sum = ExtractSum(_sum);
367
0
                    for (; s < spatial; ++s)
368
0
                        sum += src[s];
369
0
                    __m256 mean = _mm256_set1_ps(sum * k);
370
0
                    for (s = 0; s < spatialF; s += F)
371
0
                        _mm256_storeu_ps(dst + s, _mm256_sub_ps(_mm256_loadu_ps(src + s), mean));
372
0
                    for (; s < spatial; ++s)
373
0
                        _mm_store_ss(dst + s, _mm_sub_ss(_mm_load_ss(src + s), _mm256_castps256_ps128(mean)));
374
375
0
                    __m256 _sqsum = _mm256_setzero_ps();
376
0
                    for (s = 0; s < spatialF; s += F)
377
0
                        _sqsum = _mm256_add_ps(Square(_mm256_loadu_ps(dst + s)), _sqsum);
378
0
                    float sqsum = ExtractSum(_sqsum);
379
0
                    for (; s < spatial; ++s)
380
0
                        sqsum += Simd::Square(dst[s]);
381
0
                    __m256 norm = _mm256_set1_ps(1.0f / ::sqrt(sqsum * k + eps));
382
0
                    __m256 _scale = _mm256_set1_ps(scale[c]);
383
0
                    __m256 _shift = _mm256_set1_ps(shift[c]);
384
0
                    for (s = 0; s < spatialF; s += F)
385
0
                        _mm256_storeu_ps(dst + s, _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_loadu_ps(dst + s), norm), _scale), _shift));
386
0
                    for (; s < spatial; ++s)
387
0
                        _mm_store_ss(dst + s, _mm_add_ss(_mm_mul_ss(_mm_mul_ss(_mm_load_ss(dst + s), _mm256_castps256_ps128(norm)), _mm256_castps256_ps128(_scale)), _mm256_castps256_ps128(_shift)));
388
389
0
                    dst += spatial;
390
0
                    src += spatial;
391
0
                }
392
0
            }
393
0
        }
394
395
        void NormalizeNhwcV3(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, float* dst)
396
0
        {
397
0
            float k = 1.0f / float(spatial);
398
0
            Array32f _buf;
399
0
            if (buf == NULL)
400
0
            {
401
0
                _buf.Resize(spatial);
402
0
                buf = _buf.data;
403
0
            }
404
0
            size_t channelsF = AlignLo(channels, F);
405
0
            __m256 _eps = _mm256_set1_ps(eps), _k = _mm256_set1_ps(k), _1 = _mm256_set1_ps(1.0f);
406
0
            for (size_t b = 0, c; b < batch; ++b)
407
0
            {
408
0
                for (c = 0; c < channelsF; c += F)
409
0
                    _mm256_storeu_ps(buf + c, _mm256_setzero_ps());
410
0
                for (; c < channels; ++c)
411
0
                    _mm_store_ss(buf + c, _mm_setzero_ps());
412
0
                for (size_t s = 0; s < spatial; ++s)
413
0
                {
414
0
                    const float* ps = src + s * channels;
415
0
                    for (c = 0; c < channelsF; c += F)
416
0
                    {
417
0
                        __m256 _src = _mm256_loadu_ps(ps + c);
418
0
                        __m256 _sum = _mm256_loadu_ps(buf + c);
419
0
                        _mm256_storeu_ps(buf + c, _mm256_add_ps(_sum, _src));
420
0
                    }
421
0
                    for (; c < channels; ++c)
422
0
                    {
423
0
                        __m128 _src = _mm_load_ss(ps + c);
424
0
                        __m128 _sum = _mm_load_ss(buf + c);
425
0
                        _mm_store_ss(buf + c, _mm_add_ss(_sum, _src));
426
0
                    }
427
0
                }
428
0
                for (c = 0; c < channelsF; c += F)
429
0
                    _mm256_storeu_ps(buf + c, _mm256_mul_ps(_mm256_loadu_ps(buf + c), _k));
430
0
                for (; c < channels; ++c)
431
0
                    _mm_store_ss(buf + c, _mm_mul_ss(_mm_load_ss(buf + c), _mm256_castps256_ps128(_k)));
432
0
                for (size_t s = 0; s < spatial; ++s)
433
0
                {
434
0
                    const float* ps = src + s * channels;
435
0
                    float* pd = dst + s * channels;
436
0
                    for (c = 0; c < channelsF; c += F)
437
0
                    {
438
0
                        __m256 _src = _mm256_loadu_ps(ps + c);
439
0
                        __m256 mean = _mm256_loadu_ps(buf + c);
440
0
                        _mm256_storeu_ps(pd + c, _mm256_sub_ps(_src, mean));
441
0
                    }
442
0
                    for (; c < channels; ++c)
443
0
                    {
444
0
                        __m128 _src = _mm_load_ss(ps + c);
445
0
                        __m128 mean = _mm_load_ss(buf + c);
446
0
                        _mm_store_ss(pd + c, _mm_sub_ps(_src, mean));
447
0
                    }
448
0
                }
449
450
0
                for (c = 0; c < channelsF; c += F)
451
0
                    _mm256_storeu_ps(buf + c, _mm256_setzero_ps());
452
0
                for (; c < channels; ++c)
453
0
                    _mm_store_ss(buf + c, _mm_setzero_ps());
454
0
                for (size_t s = 0; s < spatial; ++s)
455
0
                {
456
0
                    const float* pd = dst + s * channels;
457
0
                    for (c = 0; c < channelsF; c += F)
458
0
                    {
459
0
                        __m256 _dst = _mm256_loadu_ps(pd + c);
460
0
                        __m256 _sum = _mm256_loadu_ps(buf + c);
461
0
                        _mm256_storeu_ps(buf + c, _mm256_add_ps(_sum, _mm256_mul_ps(_dst, _dst)));
462
0
                    }
463
0
                    for (; c < channels; ++c)
464
0
                    {
465
0
                        __m128 _dst = _mm_load_ss(pd + c);
466
0
                        __m128 _sum = _mm_load_ss(buf + c);
467
0
                        _mm_store_ss(buf + c, _mm_add_ss(_sum, _mm_mul_ss(_dst, _dst)));
468
0
                    }
469
0
                }
470
0
                for (c = 0; c < channelsF; c += F)
471
0
                    _mm256_storeu_ps(buf + c, _mm256_div_ps(_1, _mm256_sqrt_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(buf + c), _k), _eps))));
472
0
                for (; c < channels; ++c)
473
0
                    _mm_store_ss(buf + c, _mm_div_ss(_mm256_castps256_ps128(_1), _mm_sqrt_ss(_mm_add_ss(_mm_mul_ss(_mm_load_ss(buf + c), _mm256_castps256_ps128(_k)), _mm256_castps256_ps128(_eps)))));
474
0
                for (size_t s = 0; s < spatial; ++s)
475
0
                {
476
0
                    float* pd = dst + s * channels;
477
0
                    for (c = 0; c < channelsF; c += F)
478
0
                    {
479
0
                        __m256 _dst = _mm256_loadu_ps(pd + c);
480
0
                        __m256 norm = _mm256_loadu_ps(buf + c);
481
0
                        _mm256_storeu_ps(pd + c, _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_dst, norm), _mm256_loadu_ps(scale + c)), _mm256_loadu_ps(shift + c)));
482
0
                    }
483
0
                    for (; c < channels; ++c)
484
0
                    {
485
0
                        __m128 _dst = _mm_load_ss(pd + c);
486
0
                        __m128 norm = _mm_load_ss(buf + c);
487
0
                        _mm_store_ss(pd + c, _mm_add_ss(_mm_mul_ss(_mm_mul_ss(_dst, norm), _mm_load_ss(scale + c)), _mm_load_ss(shift + c)));
488
0
                    }
489
0
                }
490
491
0
                src += channels * spatial;
492
0
                dst += channels * spatial;
493
0
            }
494
0
        }
495
496
        void SynetNormalizeLayerForwardV3(const float* src, size_t batch, size_t channels, size_t spatial,
497
            const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst)
498
0
        {
499
0
            if (format == SimdTensorFormatNchw)
500
0
                NormalizeNchwV3(src, batch, channels, spatial, scale, shift, *eps, dst);
501
0
            else if (format == SimdTensorFormatNhwc)
502
0
                NormalizeNhwcV3(src, batch, channels, spatial, scale, shift, *eps, buf, dst);
503
0
            else
504
0
                assert(0);
505
0
        }
506
507
        //-------------------------------------------------------------------------------------------------
508
509
        void NormalizeNchwV4(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, float* dst)
510
0
        {
511
0
            float k = 1.0f / float(channels);
512
0
            size_t spatialF = AlignLo(spatial, F), s;
513
0
            for (size_t b = 0; b < batch; ++b)
514
0
            {
515
0
                float sum = 0;
516
0
                for (size_t c = 0, o = 0; c < channels; ++c)
517
0
                {
518
0
                    __m256 _sqsum = _mm256_setzero_ps();
519
0
                    for (s = 0; s < spatialF; s += F, o += F)
520
0
                        _sqsum = _mm256_add_ps(Square(_mm256_loadu_ps(src + o)), _sqsum);
521
0
                    float sqsum = ExtractSum(_sqsum);
522
0
                    for (; s < spatial; ++s, ++o)
523
0
                        sqsum += Simd::Square(src[o]);
524
0
                    buf[c] = sqrt(sqsum);
525
0
                    sum += buf[c];
526
0
                }
527
0
                float norm = 1.0f / (sum * k + eps);
528
0
                for (size_t c = 0; c < channels; ++c)
529
0
                {
530
0
                    __m256 _alpha = _mm256_set1_ps(1.0f + scale[c] * buf[c] * norm);
531
0
                    __m256 _shift = _mm256_set1_ps(shift[c]);
532
0
                    for (s = 0; s < spatialF; s += F)
533
0
                        _mm256_storeu_ps(dst + s, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + s), _alpha), _shift));
534
0
                    for (; s < spatial; ++s)
535
0
                        _mm_store_ss(dst + s, _mm_add_ss(_mm_mul_ss(_mm_load_ss(src + s), _mm256_castps256_ps128(_alpha)), _mm256_castps256_ps128(_shift)));
536
0
                    dst += spatial;
537
0
                    src += spatial;
538
0
                }
539
0
            }
540
0
        }
541
542
        void NormalizeNhwcV4(const float* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, float* dst)
543
0
        {
544
0
            float k = 1.0f / float(channels);
545
0
            size_t channelsF = AlignLo(channels, F), c;
546
0
            __m256 _eps = _mm256_set1_ps(eps), _k = _mm256_set1_ps(k), _1 = _mm256_set1_ps(1.0f);
547
0
            for (size_t b = 0; b < batch; ++b)
548
0
            {
549
0
                for (c = 0; c < channelsF; c += F)
550
0
                    _mm256_storeu_ps(buf + c, _mm256_setzero_ps());
551
0
                for (; c < channels; ++c)
552
0
                    _mm_store_ss(buf + c, _mm_setzero_ps());
553
0
                for (size_t s = 0, o = 0; s < spatial; ++s)
554
0
                {
555
0
                    for (c = 0; c < channelsF; c += F, o += F)
556
0
                        _mm256_storeu_ps(buf + c, _mm256_add_ps(Square(_mm256_loadu_ps(src + o)), _mm256_loadu_ps(buf + c)));
557
0
                    for (; c < channels; c += 1, o += 1)
558
0
                        _mm_store_ss(buf + c, _mm_add_ss(Sse41::Square(_mm_load_ss(src + o)), _mm_load_ss(buf + c)));
559
0
                }
560
0
                float sum = 0;
561
0
                for (size_t c = 0; c < channels; ++c)
562
0
                {
563
0
                    buf[c] = sqrt(buf[c]);
564
0
                    sum += buf[c];
565
0
                }
566
0
                float norm = 1.0f / (sum * k + eps);
567
0
                for (size_t c = 0; c < channels; ++c)
568
0
                    buf[c] = 1.0f + scale[c] * buf[c] * norm;
569
0
                for (size_t s = 0, o = 0; s < spatial; ++s)
570
0
                {
571
0
                    for (c = 0; c < channelsF; c += F)
572
0
                        _mm256_storeu_ps(dst + c, _mm256_add_ps(_mm256_mul_ps(_mm256_loadu_ps(src + c), _mm256_loadu_ps(buf + c)), _mm256_loadu_ps(shift + c)));
573
0
                    for (; c < channels; c += 1, o += 1)
574
0
                        _mm_store_ss(dst + c, _mm_add_ss(_mm_mul_ss(_mm_load_ss(src + c), _mm_load_ss(buf + c)), _mm_load_ss(shift + c)));
575
0
                    src += channels;
576
0
                    dst += channels;
577
578
0
                }
579
0
            }
580
0
        }
581
582
        void SynetNormalizeLayerForwardV4(const float* src, size_t batch, size_t channels, size_t spatial,
583
            const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst)
584
0
        {
585
0
            Array32f _buf;
586
0
            if (buf == NULL)
587
0
            {
588
0
                _buf.Resize(channels);
589
0
                buf = _buf.data;
590
0
            }
591
0
            if (format == SimdTensorFormatNchw)
592
0
                NormalizeNchwV4(src, batch, channels, spatial, scale, shift, *eps, buf, dst);
593
0
            else if (format == SimdTensorFormatNhwc)
594
0
                NormalizeNhwcV4(src, batch, channels, spatial, scale, shift, *eps, buf, dst);
595
0
            else
596
                assert(0);
597
0
        }
598
    }
599
#endif
600
}