/src/Simd/src/Simd/SimdSse41SynetSoftmax32f.cpp

Source
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2026 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "Simd/SimdMemory.h"
#include "Simd/SimdArray.h"
#include "Simd/SimdStore.h"
#include "Simd/SimdExtract.h"
#include "Simd/SimdSynet.h"
#include "Simd/SimdBase.h"
#include "Simd/SimdSse41.h"
#include "Simd/SimdExp.h"
#include "Simd/SimdGather.h"
#include "Simd/SimdPow.h"

namespace Simd
{
#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)   
    namespace Sse41
    {
        void SynetSoftmax32f21(const float* src, size_t outer, float* dst)
        {
            Exp exp;
            size_t aligned = Simd::AlignLo(outer, F);
            size_t o = 0;
            for (; o < aligned; o += F)
            {
                __m128 s0 = _mm_loadu_ps(src + 0);
                __m128 s1 = _mm_loadu_ps(src + F);
                __m128 ss0 = _mm_shuffle_ps(s0, s1, 0x88);
                __m128 ss1 = _mm_shuffle_ps(s0, s1, 0xDD);
                __m128 max = _mm_max_ps(ss0, ss1);
                __m128 exp0 = exp.Exponent(_mm_sub_ps(ss0, max));
                __m128 exp1 = exp.Exponent(_mm_sub_ps(ss1, max));
                __m128 sum = _mm_add_ps(exp0, exp1);
                __m128 d0 = _mm_div_ps(exp0, sum);
                __m128 d1 = _mm_div_ps(exp1, sum);
                _mm_storeu_ps(dst + 0, _mm_unpacklo_ps(d0, d1));
                _mm_storeu_ps(dst + F, _mm_unpackhi_ps(d0, d1));
                src += DF;
                dst += DF;
            }
            for (; o < outer; ++o)
            {
                float max = Simd::Max(src[0], src[1]);
                float exp0 = ::exp(src[0] - max);
                float exp1 = ::exp(src[1] - max);
                float sum = exp0 + exp1;
                dst[0] = exp0 / sum;
                dst[1] = exp1 / sum;
                src += 2;
                dst += 2;
            }
        }

        SIMD_INLINE void SynetSoftmax32f31(const Exp& exp, __m128 buf[3])
        {
            __m128 max = _mm_max_ps(buf[0], _mm_max_ps(buf[1], buf[2]));
            buf[0] = exp.Exponent(_mm_sub_ps(buf[0], max));
            buf[1] = exp.Exponent(_mm_sub_ps(buf[1], max));
            buf[2] = exp.Exponent(_mm_sub_ps(buf[2], max));
            __m128 sum = _mm_add_ps(buf[0], _mm_add_ps(buf[1], buf[2]));
            buf[0] = _mm_div_ps(buf[0], sum);
            buf[1] = _mm_div_ps(buf[1], sum);
            buf[2] = _mm_div_ps(buf[2], sum);
        }

        void SynetSoftmax32f31(const float* src, size_t outer, float* dst)
        {
            Exp exp;
            __m128 buf[3];
            size_t aligned = Simd::AlignLo(outer, F);
            for (size_t o = 0; o < aligned; o += F)
            {
                buf[0] = Gather<3>(src + 0);
                buf[1] = Gather<3>(src + 1);
                buf[2] = Gather<3>(src + 2);
                SynetSoftmax32f31(exp, buf);
                Scater<3>(dst + 0, buf[0]);
                Scater<3>(dst + 1, buf[1]);
                Scater<3>(dst + 2, buf[2]);
                src += 3 * F;
                dst += 3 * F;
            }
            if (aligned < outer)
            {
                size_t tail = outer - aligned;
                buf[0] = Gather<3>(src + 0, tail);
                buf[1] = Gather<3>(src + 1, tail);
                buf[2] = Gather<3>(src + 2, tail);
                SynetSoftmax32f31(exp, buf);
                Scater<3>(dst + 0, buf[0], tail);
                Scater<3>(dst + 1, buf[1], tail);
                Scater<3>(dst + 2, buf[2], tail);
            }
        }

        SIMD_INLINE void LoadTansp4x4(const float* src, size_t count, float* dst, __m128& max)
        {
            __m128 a0 = _mm_loadu_ps(src + 0 * count);
            __m128 a1 = _mm_loadu_ps(src + 1 * count);
            __m128 a2 = _mm_loadu_ps(src + 2 * count);
            __m128 a3 = _mm_loadu_ps(src + 3 * count);
            __m128 b0 = _mm_unpacklo_ps(a0, a2);
            __m128 b1 = _mm_unpacklo_ps(a1, a3);
            __m128 b2 = _mm_unpackhi_ps(a0, a2);
            __m128 b3 = _mm_unpackhi_ps(a1, a3);
            a0 = _mm_unpacklo_ps(b0, b1);
            max = _mm_max_ps(max, a0);
            _mm_storeu_ps(dst + 0 * F, a0);
            a1 = _mm_unpackhi_ps(b0, b1);
            max = _mm_max_ps(max, a1);
            _mm_storeu_ps(dst + 1 * F, a1);
            a2 = _mm_unpacklo_ps(b2, b3);
            max = _mm_max_ps(max, a2);
            _mm_storeu_ps(dst + 2 * F, a2);
            a3 = _mm_unpackhi_ps(b2, b3);
            max = _mm_max_ps(max, a3);
            _mm_storeu_ps(dst + 3 * F, a3);
        }

        SIMD_INLINE void StoreTansp4x4(const float* src, __m128 k, float* dst, size_t count)
        {
            __m128 a0 = _mm_mul_ps(_mm_loadu_ps(src + 0 * F), k);
            __m128 a1 = _mm_mul_ps(_mm_loadu_ps(src + 1 * F), k);
            __m128 a2 = _mm_mul_ps(_mm_loadu_ps(src + 2 * F), k);
            __m128 a3 = _mm_mul_ps(_mm_loadu_ps(src + 3 * F), k);
            __m128 b0 = _mm_unpacklo_ps(a0, a2);
            __m128 b1 = _mm_unpacklo_ps(a1, a3);
            __m128 b2 = _mm_unpackhi_ps(a0, a2);
            __m128 b3 = _mm_unpackhi_ps(a1, a3);
            _mm_storeu_ps(dst + 0 * count, _mm_unpacklo_ps(b0, b1));
            _mm_storeu_ps(dst + 1 * count, _mm_unpackhi_ps(b0, b1));
            _mm_storeu_ps(dst + 2 * count, _mm_unpacklo_ps(b2, b3));
            _mm_storeu_ps(dst + 3 * count, _mm_unpackhi_ps(b2, b3));
        }

        void SynetSoftmax32fX1(const float* src, size_t outer, size_t count, float* dst)
        {
            size_t o = 0, c = 0, outerF = AlignLo(outer, F), countF = AlignLo(count, F);
            Array32f buf(AlignHi(count, F) * F);
            Exp exp;
            for (; o < outerF; o += F)
            {
                __m128 _max = _mm_set1_ps(-FLT_MAX);
                for (c = 0; c < countF; c += F)
                    LoadTansp4x4(src + c, count, buf.data + c * F, _max);
                if (c < count)
                {
                    c = count - F;
                    LoadTansp4x4(src + c, count, buf.data + c * F, _max);
                }
                __m128 _sum = _mm_setzero_ps();
                for (size_t c = 0; c < count; ++c)
                {
                    __m128 _exp = exp.Exponent(_mm_sub_ps(_mm_loadu_ps(buf.data + c * F), _max));
                    _sum = _mm_add_ps(_sum, _exp);
                    _mm_storeu_ps(buf.data + c * F, _exp);
                }
                __m128 _k = _mm_div_ps(_mm_set1_ps(1.0f), _sum);
                for (c = 0; c < countF; c += F)
                    StoreTansp4x4(buf.data + c * F, _k, dst + c, count);
                if (c < count)
                {
                    c = count - F;
                    StoreTansp4x4(buf.data + c * F, _k, dst + c, count);
                }
                src += count * F;
                dst += count * F;
            }
            for (; o < outer; ++o)
            {
                float max = src[0];
                for (size_t c = 1; c < count; ++c)
                    max = Simd::Max(max, src[c]);
                float sum = 0;
                for (size_t c = 0; c < count; ++c)
                {
                    dst[c] = ::exp(src[c] - max);
                    sum += dst[c];
                }
                float k = 1.0f / sum;
                for (size_t c = 0; c < count; ++c)
                    dst[c] *= k;
                src += count;
                dst += count;
            }
        }

        void SynetSoftmax32f(const float* src, size_t outer, size_t count, size_t inner, float* dst)
        {
            if (inner == 1)
            {
                if (count == 2)
                    SynetSoftmax32f21(src, outer, dst);
                else if (count == 3)
                    SynetSoftmax32f31(src, outer, dst);
                else
                    SynetSoftmax32fX1(src, outer, count, dst);
            }
            else
            {
                Exp exp;
                size_t aligned = Simd::AlignLo(inner, F);
                Array32f tmp(inner * 2);
                const float* s;
                float* max = tmp.data, * sum = tmp.data + inner, * d;
                for (size_t o = 0; o < outer; ++o)
                {
                    memcpy(max, src, inner * sizeof(float));
                    s = src + inner;
                    for (size_t c = 1; c < count; ++c)
                    {
                        size_t i = 0;
                        for (; i < aligned; i += F)
                            _mm_storeu_ps(max + i, _mm_max_ps(_mm_loadu_ps(s + i), _mm_loadu_ps(max + i)));
                        for (; i < inner; ++i)
                            max[i] = Simd::Max(max[i], s[i]);
                        s += inner;
                    }

                    s = src;
                    d = dst;
                    memset(sum, 0, inner * sizeof(float));
                    for (size_t c = 0; c < count; ++c)
                    {
                        size_t i = 0;
                        for (; i < aligned; i += F)
                        {
                            __m128 _d = exp.Exponent(_mm_sub_ps(_mm_loadu_ps(s + i), _mm_loadu_ps(max + i)));
                            _mm_storeu_ps(d + i, _d);
                            _mm_storeu_ps(sum + i, _mm_add_ps(_d, _mm_loadu_ps(sum + i)));
                        }
                        for (; i < inner; ++i)
                        {
                            d[i] = ::exp(s[i] - max[i]);
                            sum[i] += d[i];
                        }
                        s += inner;
                        d += inner;
                    }

                    d = dst;
                    for (size_t c = 0; c < count; ++c)
                    {
                        size_t i = 0;
                        for (; i < aligned; i += F)
                            _mm_storeu_ps(d + i, _mm_div_ps(_mm_loadu_ps(d + i), _mm_loadu_ps(sum + i)));
                        for (; i < inner; ++i)
                            d[i] /= sum[i];
                        d += inner;
                    }
                    src += count * inner;
                    dst += count * inner;
                }
            }
        }
    }
#endif
}

Coverage Report

Created: 2026-04-09 07:14

Line	Count	Source
1		/*
2		* Simd Library (http://ermig1979.github.io/Simd).
3		*
4		* Copyright (c) 2011-2026 Yermalayeu Ihar.
5		*
6		* Permission is hereby granted, free of charge, to any person obtaining a copy
7		* of this software and associated documentation files (the "Software"), to deal
8		* in the Software without restriction, including without limitation the rights
9		* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10		* copies of the Software, and to permit persons to whom the Software is
11		* furnished to do so, subject to the following conditions:
12		*
13		* The above copyright notice and this permission notice shall be included in
14		* all copies or substantial portions of the Software.
15		*
16		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17		* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18		* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19		* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20		* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21		* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22		* SOFTWARE.
23		*/
24		#include "Simd/SimdMemory.h"
25		#include "Simd/SimdArray.h"
26		#include "Simd/SimdStore.h"
27		#include "Simd/SimdExtract.h"
28		#include "Simd/SimdSynet.h"
29		#include "Simd/SimdBase.h"
30		#include "Simd/SimdSse41.h"
31		#include "Simd/SimdExp.h"
32		#include "Simd/SimdGather.h"
33		#include "Simd/SimdPow.h"
34
35		namespace Simd
36		{
37		#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE)
38		namespace Sse41
39		{
40		void SynetSoftmax32f21(const float* src, size_t outer, float* dst)
41	0	{
42	0	Exp exp;
43	0	size_t aligned = Simd::AlignLo(outer, F);
44	0	size_t o = 0;
45	0	for (; o < aligned; o += F)
46	0	{
47	0	__m128 s0 = _mm_loadu_ps(src + 0);
48	0	__m128 s1 = _mm_loadu_ps(src + F);
49	0	__m128 ss0 = _mm_shuffle_ps(s0, s1, 0x88);
50	0	__m128 ss1 = _mm_shuffle_ps(s0, s1, 0xDD);
51	0	__m128 max = _mm_max_ps(ss0, ss1);
52	0	__m128 exp0 = exp.Exponent(_mm_sub_ps(ss0, max));
53	0	__m128 exp1 = exp.Exponent(_mm_sub_ps(ss1, max));
54	0	__m128 sum = _mm_add_ps(exp0, exp1);
55	0	__m128 d0 = _mm_div_ps(exp0, sum);
56	0	__m128 d1 = _mm_div_ps(exp1, sum);
57	0	_mm_storeu_ps(dst + 0, _mm_unpacklo_ps(d0, d1));
58	0	_mm_storeu_ps(dst + F, _mm_unpackhi_ps(d0, d1));
59	0	src += DF;
60	0	dst += DF;
61	0	}
62	0	for (; o < outer; ++o)
63	0	{
64	0	float max = Simd::Max(src[0], src[1]);
65	0	float exp0 = ::exp(src[0] - max);
66	0	float exp1 = ::exp(src[1] - max);
67	0	float sum = exp0 + exp1;
68	0	dst[0] = exp0 / sum;
69	0	dst[1] = exp1 / sum;
70	0	src += 2;
71	0	dst += 2;
72	0	}
73	0	}
74
75		SIMD_INLINE void SynetSoftmax32f31(const Exp& exp, __m128 buf[3])
76	0	{
77	0	__m128 max = _mm_max_ps(buf[0], _mm_max_ps(buf[1], buf[2]));
78	0	buf[0] = exp.Exponent(_mm_sub_ps(buf[0], max));
79	0	buf[1] = exp.Exponent(_mm_sub_ps(buf[1], max));
80	0	buf[2] = exp.Exponent(_mm_sub_ps(buf[2], max));
81	0	__m128 sum = _mm_add_ps(buf[0], _mm_add_ps(buf[1], buf[2]));
82	0	buf[0] = _mm_div_ps(buf[0], sum);
83	0	buf[1] = _mm_div_ps(buf[1], sum);
84	0	buf[2] = _mm_div_ps(buf[2], sum);
85	0	}
86
87		void SynetSoftmax32f31(const float* src, size_t outer, float* dst)
88	0	{
89	0	Exp exp;
90	0	__m128 buf[3];
91	0	size_t aligned = Simd::AlignLo(outer, F);
92	0	for (size_t o = 0; o < aligned; o += F)
93	0	{
94	0	buf[0] = Gather<3>(src + 0);
95	0	buf[1] = Gather<3>(src + 1);
96	0	buf[2] = Gather<3>(src + 2);
97	0	SynetSoftmax32f31(exp, buf);
98	0	Scater<3>(dst + 0, buf[0]);
99	0	Scater<3>(dst + 1, buf[1]);
100	0	Scater<3>(dst + 2, buf[2]);
101	0	src += 3 * F;
102	0	dst += 3 * F;
103	0	}
104	0	if (aligned < outer)
105	0	{
106	0	size_t tail = outer - aligned;
107	0	buf[0] = Gather<3>(src + 0, tail);
108	0	buf[1] = Gather<3>(src + 1, tail);
109	0	buf[2] = Gather<3>(src + 2, tail);
110	0	SynetSoftmax32f31(exp, buf);
111	0	Scater<3>(dst + 0, buf[0], tail);
112	0	Scater<3>(dst + 1, buf[1], tail);
113	0	Scater<3>(dst + 2, buf[2], tail);
114	0	}
115	0	}
116
117		SIMD_INLINE void LoadTansp4x4(const float* src, size_t count, float* dst, __m128& max)
118	0	{
119	0	__m128 a0 = _mm_loadu_ps(src + 0 * count);
120	0	__m128 a1 = _mm_loadu_ps(src + 1 * count);
121	0	__m128 a2 = _mm_loadu_ps(src + 2 * count);
122	0	__m128 a3 = _mm_loadu_ps(src + 3 * count);
123	0	__m128 b0 = _mm_unpacklo_ps(a0, a2);
124	0	__m128 b1 = _mm_unpacklo_ps(a1, a3);
125	0	__m128 b2 = _mm_unpackhi_ps(a0, a2);
126	0	__m128 b3 = _mm_unpackhi_ps(a1, a3);
127	0	a0 = _mm_unpacklo_ps(b0, b1);
128	0	max = _mm_max_ps(max, a0);
129	0	_mm_storeu_ps(dst + 0 * F, a0);
130	0	a1 = _mm_unpackhi_ps(b0, b1);
131	0	max = _mm_max_ps(max, a1);
132	0	_mm_storeu_ps(dst + 1 * F, a1);
133	0	a2 = _mm_unpacklo_ps(b2, b3);
134	0	max = _mm_max_ps(max, a2);
135	0	_mm_storeu_ps(dst + 2 * F, a2);
136	0	a3 = _mm_unpackhi_ps(b2, b3);
137	0	max = _mm_max_ps(max, a3);
138	0	_mm_storeu_ps(dst + 3 * F, a3);
139	0	}
140
141		SIMD_INLINE void StoreTansp4x4(const float* src, __m128 k, float* dst, size_t count)
142	0	{
143	0	__m128 a0 = _mm_mul_ps(_mm_loadu_ps(src + 0 * F), k);
144	0	__m128 a1 = _mm_mul_ps(_mm_loadu_ps(src + 1 * F), k);
145	0	__m128 a2 = _mm_mul_ps(_mm_loadu_ps(src + 2 * F), k);
146	0	__m128 a3 = _mm_mul_ps(_mm_loadu_ps(src + 3 * F), k);
147	0	__m128 b0 = _mm_unpacklo_ps(a0, a2);
148	0	__m128 b1 = _mm_unpacklo_ps(a1, a3);
149	0	__m128 b2 = _mm_unpackhi_ps(a0, a2);
150	0	__m128 b3 = _mm_unpackhi_ps(a1, a3);
151	0	_mm_storeu_ps(dst + 0 * count, _mm_unpacklo_ps(b0, b1));
152	0	_mm_storeu_ps(dst + 1 * count, _mm_unpackhi_ps(b0, b1));
153	0	_mm_storeu_ps(dst + 2 * count, _mm_unpacklo_ps(b2, b3));
154	0	_mm_storeu_ps(dst + 3 * count, _mm_unpackhi_ps(b2, b3));
155	0	}
156
157		void SynetSoftmax32fX1(const float* src, size_t outer, size_t count, float* dst)
158	0	{
159	0	size_t o = 0, c = 0, outerF = AlignLo(outer, F), countF = AlignLo(count, F);
160	0	Array32f buf(AlignHi(count, F) * F);
161	0	Exp exp;
162	0	for (; o < outerF; o += F)
163	0	{
164	0	__m128 _max = _mm_set1_ps(-FLT_MAX);
165	0	for (c = 0; c < countF; c += F)
166	0	LoadTansp4x4(src + c, count, buf.data + c * F, _max);
167	0	if (c < count)
168	0	{
169	0	c = count - F;
170	0	LoadTansp4x4(src + c, count, buf.data + c * F, _max);
171	0	}
172	0	__m128 _sum = _mm_setzero_ps();
173	0	for (size_t c = 0; c < count; ++c)
174	0	{
175	0	__m128 _exp = exp.Exponent(_mm_sub_ps(_mm_loadu_ps(buf.data + c * F), _max));
176	0	_sum = _mm_add_ps(_sum, _exp);
177	0	_mm_storeu_ps(buf.data + c * F, _exp);
178	0	}
179	0	__m128 _k = _mm_div_ps(_mm_set1_ps(1.0f), _sum);
180	0	for (c = 0; c < countF; c += F)
181	0	StoreTansp4x4(buf.data + c * F, _k, dst + c, count);
182	0	if (c < count)
183	0	{
184	0	c = count - F;
185	0	StoreTansp4x4(buf.data + c * F, _k, dst + c, count);
186	0	}
187	0	src += count * F;
188	0	dst += count * F;
189	0	}
190	0	for (; o < outer; ++o)
191	0	{
192	0	float max = src[0];
193	0	for (size_t c = 1; c < count; ++c)
194	0	max = Simd::Max(max, src[c]);
195	0	float sum = 0;
196	0	for (size_t c = 0; c < count; ++c)
197	0	{
198	0	dst[c] = ::exp(src[c] - max);
199	0	sum += dst[c];
200	0	}
201	0	float k = 1.0f / sum;
202	0	for (size_t c = 0; c < count; ++c)
203	0	dst[c] *= k;
204	0	src += count;
205	0	dst += count;
206	0	}
207	0	}
208
209		void SynetSoftmax32f(const float* src, size_t outer, size_t count, size_t inner, float* dst)
210	0	{
211	0	if (inner == 1)
212	0	{
213	0	if (count == 2)
214	0	SynetSoftmax32f21(src, outer, dst);
215	0	else if (count == 3)
216	0	SynetSoftmax32f31(src, outer, dst);
217	0	else
218	0	SynetSoftmax32fX1(src, outer, count, dst);
219	0	}
220	0	else
221	0	{
222	0	Exp exp;
223	0	size_t aligned = Simd::AlignLo(inner, F);
224	0	Array32f tmp(inner * 2);
225	0	const float* s;
226	0	float* max = tmp.data, * sum = tmp.data + inner, * d;
227	0	for (size_t o = 0; o < outer; ++o)
228	0	{
229	0	memcpy(max, src, inner * sizeof(float));
230	0	s = src + inner;
231	0	for (size_t c = 1; c < count; ++c)
232	0	{
233	0	size_t i = 0;
234	0	for (; i < aligned; i += F)
235	0	_mm_storeu_ps(max + i, _mm_max_ps(_mm_loadu_ps(s + i), _mm_loadu_ps(max + i)));
236	0	for (; i < inner; ++i)
237	0	max[i] = Simd::Max(max[i], s[i]);
238	0	s += inner;
239	0	}
240
241	0	s = src;
242	0	d = dst;
243	0	memset(sum, 0, inner * sizeof(float));
244	0	for (size_t c = 0; c < count; ++c)
245	0	{
246	0	size_t i = 0;
247	0	for (; i < aligned; i += F)
248	0	{
249	0	__m128 _d = exp.Exponent(_mm_sub_ps(_mm_loadu_ps(s + i), _mm_loadu_ps(max + i)));
250	0	_mm_storeu_ps(d + i, _d);
251	0	_mm_storeu_ps(sum + i, _mm_add_ps(_d, _mm_loadu_ps(sum + i)));
252	0	}
253	0	for (; i < inner; ++i)
254	0	{
255	0	d[i] = ::exp(s[i] - max[i]);
256	0	sum[i] += d[i];
257	0	}
258	0	s += inner;
259	0	d += inner;
260	0	}
261
262	0	d = dst;
263	0	for (size_t c = 0; c < count; ++c)
264	0	{
265	0	size_t i = 0;
266	0	for (; i < aligned; i += F)
267	0	_mm_storeu_ps(d + i, _mm_div_ps(_mm_loadu_ps(d + i), _mm_loadu_ps(sum + i)));
268	0	for (; i < inner; ++i)
269	0	d[i] /= sum[i];
270	0	d += inner;
271	0	}
272	0	src += count * inner;
273	0	dst += count * inner;
274	0	}
275	0	}
276	0	}
277		}
278		#endif
279		}