Coverage Report

Created: 2026-04-09 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/Simd/src/Simd/SimdSse41SynetSoftmax32f.cpp
Line
Count
Source
1
/*
2
* Simd Library (http://ermig1979.github.io/Simd).
3
*
4
* Copyright (c) 2011-2026 Yermalayeu Ihar.
5
*
6
* Permission is hereby granted, free of charge, to any person obtaining a copy
7
* of this software and associated documentation files (the "Software"), to deal
8
* in the Software without restriction, including without limitation the rights
9
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
* copies of the Software, and to permit persons to whom the Software is
11
* furnished to do so, subject to the following conditions:
12
*
13
* The above copyright notice and this permission notice shall be included in
14
* all copies or substantial portions of the Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
* SOFTWARE.
23
*/
24
#include "Simd/SimdMemory.h"
25
#include "Simd/SimdArray.h"
26
#include "Simd/SimdStore.h"
27
#include "Simd/SimdExtract.h"
28
#include "Simd/SimdSynet.h"
29
#include "Simd/SimdBase.h"
30
#include "Simd/SimdSse41.h"
31
#include "Simd/SimdExp.h"
32
#include "Simd/SimdGather.h"
33
#include "Simd/SimdPow.h"
34
35
namespace Simd
36
{
37
#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)   
38
    namespace Sse41
39
    {
40
        void SynetSoftmax32f21(const float* src, size_t outer, float* dst)
41
0
        {
42
0
            Exp exp;
43
0
            size_t aligned = Simd::AlignLo(outer, F);
44
0
            size_t o = 0;
45
0
            for (; o < aligned; o += F)
46
0
            {
47
0
                __m128 s0 = _mm_loadu_ps(src + 0);
48
0
                __m128 s1 = _mm_loadu_ps(src + F);
49
0
                __m128 ss0 = _mm_shuffle_ps(s0, s1, 0x88);
50
0
                __m128 ss1 = _mm_shuffle_ps(s0, s1, 0xDD);
51
0
                __m128 max = _mm_max_ps(ss0, ss1);
52
0
                __m128 exp0 = exp.Exponent(_mm_sub_ps(ss0, max));
53
0
                __m128 exp1 = exp.Exponent(_mm_sub_ps(ss1, max));
54
0
                __m128 sum = _mm_add_ps(exp0, exp1);
55
0
                __m128 d0 = _mm_div_ps(exp0, sum);
56
0
                __m128 d1 = _mm_div_ps(exp1, sum);
57
0
                _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(d0, d1));
58
0
                _mm_storeu_ps(dst + F, _mm_unpackhi_ps(d0, d1));
59
0
                src += DF;
60
0
                dst += DF;
61
0
            }
62
0
            for (; o < outer; ++o)
63
0
            {
64
0
                float max = Simd::Max(src[0], src[1]);
65
0
                float exp0 = ::exp(src[0] - max);
66
0
                float exp1 = ::exp(src[1] - max);
67
0
                float sum = exp0 + exp1;
68
0
                dst[0] = exp0 / sum;
69
0
                dst[1] = exp1 / sum;
70
0
                src += 2;
71
0
                dst += 2;
72
0
            }
73
0
        }
74
75
        SIMD_INLINE void SynetSoftmax32f31(const Exp& exp, __m128 buf[3])
76
0
        {
77
0
            __m128 max = _mm_max_ps(buf[0], _mm_max_ps(buf[1], buf[2]));
78
0
            buf[0] = exp.Exponent(_mm_sub_ps(buf[0], max));
79
0
            buf[1] = exp.Exponent(_mm_sub_ps(buf[1], max));
80
0
            buf[2] = exp.Exponent(_mm_sub_ps(buf[2], max));
81
0
            __m128 sum = _mm_add_ps(buf[0], _mm_add_ps(buf[1], buf[2]));
82
0
            buf[0] = _mm_div_ps(buf[0], sum);
83
0
            buf[1] = _mm_div_ps(buf[1], sum);
84
0
            buf[2] = _mm_div_ps(buf[2], sum);
85
0
        }
86
87
        void SynetSoftmax32f31(const float* src, size_t outer, float* dst)
88
0
        {
89
0
            Exp exp;
90
0
            __m128 buf[3];
91
0
            size_t aligned = Simd::AlignLo(outer, F);
92
0
            for (size_t o = 0; o < aligned; o += F)
93
0
            {
94
0
                buf[0] = Gather<3>(src + 0);
95
0
                buf[1] = Gather<3>(src + 1);
96
0
                buf[2] = Gather<3>(src + 2);
97
0
                SynetSoftmax32f31(exp, buf);
98
0
                Scater<3>(dst + 0, buf[0]);
99
0
                Scater<3>(dst + 1, buf[1]);
100
0
                Scater<3>(dst + 2, buf[2]);
101
0
                src += 3 * F;
102
0
                dst += 3 * F;
103
0
            }
104
0
            if (aligned < outer)
105
0
            {
106
0
                size_t tail = outer - aligned;
107
0
                buf[0] = Gather<3>(src + 0, tail);
108
0
                buf[1] = Gather<3>(src + 1, tail);
109
0
                buf[2] = Gather<3>(src + 2, tail);
110
0
                SynetSoftmax32f31(exp, buf);
111
0
                Scater<3>(dst + 0, buf[0], tail);
112
0
                Scater<3>(dst + 1, buf[1], tail);
113
0
                Scater<3>(dst + 2, buf[2], tail);
114
0
            }
115
0
        }
116
117
        SIMD_INLINE void LoadTansp4x4(const float* src, size_t count, float* dst, __m128& max)
118
0
        {
119
0
            __m128 a0 = _mm_loadu_ps(src + 0 * count);
120
0
            __m128 a1 = _mm_loadu_ps(src + 1 * count);
121
0
            __m128 a2 = _mm_loadu_ps(src + 2 * count);
122
0
            __m128 a3 = _mm_loadu_ps(src + 3 * count);
123
0
            __m128 b0 = _mm_unpacklo_ps(a0, a2);
124
0
            __m128 b1 = _mm_unpacklo_ps(a1, a3);
125
0
            __m128 b2 = _mm_unpackhi_ps(a0, a2);
126
0
            __m128 b3 = _mm_unpackhi_ps(a1, a3);
127
0
            a0 = _mm_unpacklo_ps(b0, b1);
128
0
            max = _mm_max_ps(max, a0);
129
0
            _mm_storeu_ps(dst + 0 * F, a0);
130
0
            a1 = _mm_unpackhi_ps(b0, b1);
131
0
            max = _mm_max_ps(max, a1);
132
0
            _mm_storeu_ps(dst + 1 * F, a1);
133
0
            a2 = _mm_unpacklo_ps(b2, b3);
134
0
            max = _mm_max_ps(max, a2);
135
0
            _mm_storeu_ps(dst + 2 * F, a2);
136
0
            a3 = _mm_unpackhi_ps(b2, b3);
137
0
            max = _mm_max_ps(max, a3);
138
0
            _mm_storeu_ps(dst + 3 * F, a3);
139
0
        }
140
141
        SIMD_INLINE void StoreTansp4x4(const float* src, __m128 k, float* dst, size_t count)
142
0
        {
143
0
            __m128 a0 = _mm_mul_ps(_mm_loadu_ps(src + 0 * F), k);
144
0
            __m128 a1 = _mm_mul_ps(_mm_loadu_ps(src + 1 * F), k);
145
0
            __m128 a2 = _mm_mul_ps(_mm_loadu_ps(src + 2 * F), k);
146
0
            __m128 a3 = _mm_mul_ps(_mm_loadu_ps(src + 3 * F), k);
147
0
            __m128 b0 = _mm_unpacklo_ps(a0, a2);
148
0
            __m128 b1 = _mm_unpacklo_ps(a1, a3);
149
0
            __m128 b2 = _mm_unpackhi_ps(a0, a2);
150
0
            __m128 b3 = _mm_unpackhi_ps(a1, a3);
151
0
            _mm_storeu_ps(dst + 0 * count, _mm_unpacklo_ps(b0, b1));
152
0
            _mm_storeu_ps(dst + 1 * count, _mm_unpackhi_ps(b0, b1));
153
0
            _mm_storeu_ps(dst + 2 * count, _mm_unpacklo_ps(b2, b3));
154
0
            _mm_storeu_ps(dst + 3 * count, _mm_unpackhi_ps(b2, b3));
155
0
        }
156
157
        void SynetSoftmax32fX1(const float* src, size_t outer, size_t count, float* dst)
158
0
        {
159
0
            size_t o = 0, c = 0, outerF = AlignLo(outer, F), countF = AlignLo(count, F);
160
0
            Array32f buf(AlignHi(count, F) * F);
161
0
            Exp exp;
162
0
            for (; o < outerF; o += F)
163
0
            {
164
0
                __m128 _max = _mm_set1_ps(-FLT_MAX);
165
0
                for (c = 0; c < countF; c += F)
166
0
                    LoadTansp4x4(src + c, count, buf.data + c * F, _max);
167
0
                if (c < count)
168
0
                {
169
0
                    c = count - F;
170
0
                    LoadTansp4x4(src + c, count, buf.data + c * F, _max);
171
0
                }
172
0
                __m128 _sum = _mm_setzero_ps();
173
0
                for (size_t c = 0; c < count; ++c)
174
0
                {
175
0
                    __m128 _exp = exp.Exponent(_mm_sub_ps(_mm_loadu_ps(buf.data + c * F), _max));
176
0
                    _sum = _mm_add_ps(_sum, _exp);
177
0
                    _mm_storeu_ps(buf.data + c * F, _exp);
178
0
                }
179
0
                __m128 _k = _mm_div_ps(_mm_set1_ps(1.0f), _sum);
180
0
                for (c = 0; c < countF; c += F)
181
0
                    StoreTansp4x4(buf.data + c * F, _k, dst + c, count);
182
0
                if (c < count)
183
0
                {
184
0
                    c = count - F;
185
0
                    StoreTansp4x4(buf.data + c * F, _k, dst + c, count);
186
0
                }
187
0
                src += count * F;
188
0
                dst += count * F;
189
0
            }
190
0
            for (; o < outer; ++o)
191
0
            {
192
0
                float max = src[0];
193
0
                for (size_t c = 1; c < count; ++c)
194
0
                    max = Simd::Max(max, src[c]);
195
0
                float sum = 0;
196
0
                for (size_t c = 0; c < count; ++c)
197
0
                {
198
0
                    dst[c] = ::exp(src[c] - max);
199
0
                    sum += dst[c];
200
0
                }
201
0
                float k = 1.0f / sum;
202
0
                for (size_t c = 0; c < count; ++c)
203
0
                    dst[c] *= k;
204
0
                src += count;
205
0
                dst += count;
206
0
            }
207
0
        }
208
209
        void SynetSoftmax32f(const float* src, size_t outer, size_t count, size_t inner, float* dst)
210
0
        {
211
0
            if (inner == 1)
212
0
            {
213
0
                if (count == 2)
214
0
                    SynetSoftmax32f21(src, outer, dst);
215
0
                else if (count == 3)
216
0
                    SynetSoftmax32f31(src, outer, dst);
217
0
                else
218
0
                    SynetSoftmax32fX1(src, outer, count, dst);
219
0
            }
220
0
            else
221
0
            {
222
0
                Exp exp;
223
0
                size_t aligned = Simd::AlignLo(inner, F);
224
0
                Array32f tmp(inner * 2);
225
0
                const float* s;
226
0
                float* max = tmp.data, * sum = tmp.data + inner, * d;
227
0
                for (size_t o = 0; o < outer; ++o)
228
0
                {
229
0
                    memcpy(max, src, inner * sizeof(float));
230
0
                    s = src + inner;
231
0
                    for (size_t c = 1; c < count; ++c)
232
0
                    {
233
0
                        size_t i = 0;
234
0
                        for (; i < aligned; i += F)
235
0
                            _mm_storeu_ps(max + i, _mm_max_ps(_mm_loadu_ps(s + i), _mm_loadu_ps(max + i)));
236
0
                        for (; i < inner; ++i)
237
0
                            max[i] = Simd::Max(max[i], s[i]);
238
0
                        s += inner;
239
0
                    }
240
241
0
                    s = src;
242
0
                    d = dst;
243
0
                    memset(sum, 0, inner * sizeof(float));
244
0
                    for (size_t c = 0; c < count; ++c)
245
0
                    {
246
0
                        size_t i = 0;
247
0
                        for (; i < aligned; i += F)
248
0
                        {
249
0
                            __m128 _d = exp.Exponent(_mm_sub_ps(_mm_loadu_ps(s + i), _mm_loadu_ps(max + i)));
250
0
                            _mm_storeu_ps(d + i, _d);
251
0
                            _mm_storeu_ps(sum + i, _mm_add_ps(_d, _mm_loadu_ps(sum + i)));
252
0
                        }
253
0
                        for (; i < inner; ++i)
254
0
                        {
255
0
                            d[i] = ::exp(s[i] - max[i]);
256
0
                            sum[i] += d[i];
257
0
                        }
258
0
                        s += inner;
259
0
                        d += inner;
260
0
                    }
261
262
0
                    d = dst;
263
0
                    for (size_t c = 0; c < count; ++c)
264
0
                    {
265
0
                        size_t i = 0;
266
0
                        for (; i < aligned; i += F)
267
0
                            _mm_storeu_ps(d + i, _mm_div_ps(_mm_loadu_ps(d + i), _mm_loadu_ps(sum + i)));
268
0
                        for (; i < inner; ++i)
269
0
                            d[i] /= sum[i];
270
0
                        d += inner;
271
0
                    }
272
0
                    src += count * inner;
273
0
                    dst += count * inner;
274
0
                }
275
0
            }
276
0
        }
277
    }
278
#endif
279
}