/src/Simd/src/Simd/SimdSse41Float32.cpp

Source (jump to first uncovered line)
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2022 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "Simd/SimdMemory.h"
#include "Simd/SimdStore.h"
#include "Simd/SimdExtract.h"
#include "Simd/SimdUnpack.h"

namespace Simd
{
#ifdef SIMD_SSE41_ENABLE    
    namespace Sse41
    {
        // Writes to *distance the value 1 - (a.b) / sqrt((a.a) * (b.b)) computed over the first
        // `size` floats of `a` and `b`. With align == true both pointers must satisfy Aligned().
        template<bool align> void CosineDistance32f(const float* a, const float* b, size_t size, float* distance)
        {
            if (align)
                assert(Aligned(a) && Aligned(b));

            const size_t sizeF = AlignLo(size, F);
            const size_t sizeDF = AlignLo(size, DF);
            size_t pos = 0;

            // Two accumulator sets (index 0 and 1) are used by the DF-wide loop and merged after it.
            __m128 sumAA0 = _mm_setzero_ps(), sumAA1 = _mm_setzero_ps();
            __m128 sumAB0 = _mm_setzero_ps(), sumAB1 = _mm_setzero_ps();
            __m128 sumBB0 = _mm_setzero_ps(), sumBB1 = _mm_setzero_ps();

            if (sizeDF != 0)
            {
                while (pos < sizeDF)
                {
                    const float* pa = a + pos;
                    const float* pb = b + pos;

                    const __m128 va0 = Load<align>(pa);
                    const __m128 vb0 = Load<align>(pb);
                    sumAA0 = _mm_add_ps(sumAA0, _mm_mul_ps(va0, va0));
                    sumAB0 = _mm_add_ps(sumAB0, _mm_mul_ps(va0, vb0));
                    sumBB0 = _mm_add_ps(sumBB0, _mm_mul_ps(vb0, vb0));

                    const __m128 va1 = Load<align>(pa + F);
                    const __m128 vb1 = Load<align>(pb + F);
                    sumAA1 = _mm_add_ps(sumAA1, _mm_mul_ps(va1, va1));
                    sumAB1 = _mm_add_ps(sumAB1, _mm_mul_ps(va1, vb1));
                    sumBB1 = _mm_add_ps(sumBB1, _mm_mul_ps(vb1, vb1));

                    pos += DF;
                }
                // Fold the second accumulator set into the first.
                sumAA0 = _mm_add_ps(sumAA0, sumAA1);
                sumAB0 = _mm_add_ps(sumAB0, sumAB1);
                sumBB0 = _mm_add_ps(sumBB0, sumBB1);
            }

            // Remaining whole vectors of F floats.
            while (pos < sizeF)
            {
                const __m128 va = Load<align>(a + pos);
                const __m128 vb = Load<align>(b + pos);
                sumAA0 = _mm_add_ps(sumAA0, _mm_mul_ps(va, va));
                sumAB0 = _mm_add_ps(sumAB0, _mm_mul_ps(va, vb));
                sumBB0 = _mm_add_ps(sumBB0, _mm_mul_ps(vb, vb));
                pos += F;
            }

            // Horizontal reduction, then the scalar tail for the last (size - sizeF) elements.
            float aa = ExtractSum(sumAA0);
            float ab = ExtractSum(sumAB0);
            float bb = ExtractSum(sumBB0);
            while (pos < size)
            {
                const float x = a[pos];
                const float y = b[pos];
                aa += x * x;
                ab += x * y;
                bb += y * y;
                ++pos;
            }

            // NOTE(review): no guard for aa * bb == 0 (e.g. an all-zero input); the division is performed as is.
            *distance = 1.0f - ab / ::sqrt(aa * bb);
        }

        // Entry point: selects the aligned-load instantiation only when both inputs pass Aligned().
        void CosineDistance32f(const float* a, const float* b, size_t size, float* distance)
        {
            const bool bothAligned = Aligned(a) && Aligned(b);
            if (!bothAligned)
            {
                CosineDistance32f<false>(a, b, size, distance);
                return;
            }
            CosineDistance32f<true>(a, b, size, distance);
        }

        //-----------------------------------------------------------------------------------------

        // Loads one vector of floats from src, clamps it to [lower, upper], subtracts lower,
        // multiplies by boost and converts the result to 32-bit integers.
        template <bool align> SIMD_INLINE __m128i Float32ToUint8(const float * src, const __m128 & lower, const __m128 & upper, const __m128 & boost)
        {
            const __m128 value = Load<align>(src);
            const __m128 clamped = _mm_min_ps(_mm_max_ps(value, lower), upper);
            const __m128 scaled = _mm_mul_ps(_mm_sub_ps(clamped, lower), boost);
            return _mm_cvtps_epi32(scaled);
        }

        // Converts four consecutive float vectors (at src + F * 0..3) and stores them as one
        // 128-bit vector of bytes at dst.
        template <bool align> SIMD_INLINE void Float32ToUint8(const float * src, const __m128 & lower, const __m128 & upper, const __m128 & boost, uint8_t * dst)
        {
            const __m128i q0 = Float32ToUint8<align>(src + F * 0, lower, upper, boost);
            const __m128i q1 = Float32ToUint8<align>(src + F * 1, lower, upper, boost);
            const __m128i q2 = Float32ToUint8<align>(src + F * 2, lower, upper, boost);
            const __m128i q3 = Float32ToUint8<align>(src + F * 3, lower, upper, boost);

            // 32-bit -> 16-bit (signed saturation), then 16-bit -> 8-bit (unsigned saturation).
            const __m128i lo16 = _mm_packs_epi32(q0, q1);
            const __m128i hi16 = _mm_packs_epi32(q2, q3);
            Store<align>((__m128i*)dst, _mm_packus_epi16(lo16, hi16));
        }

        template <bool align> void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst)
        {
            assert(size >= A);
            if (align)
                assert(Aligned(src) && Aligned(dst));

            __m128 _lower = _mm_set1_ps(lower[0]);
            __m128 _upper = _mm_set1_ps(upper[0]);
            __m128 boost = _mm_set1_ps(255.0f / (upper[0] - lower[0]));

            size_t alignedSize = AlignLo(size, A);
            for (size_t i = 0; i < alignedSize; i += A)
                Float32ToUint8<align>(src + i, _lower, _upper, boost, dst + i);
            if (alignedSize != size)
                Float32ToUint8<false>(src + size - A, _lower, _upper, boost, dst + size - A);
        }

        // Entry point: uses the aligned instantiation only when both src and dst pass Aligned().
        void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst)
        {
            const bool bothAligned = Aligned(src) && Aligned(dst);
            if (!bothAligned)
            {
                Float32ToUint8<false>(src, size, lower, upper, dst);
                return;
            }
            Float32ToUint8<true>(src, size, lower, upper, dst);
        }

        //-----------------------------------------------------------------------------------------

        // Converts four 32-bit integers to float and returns value * boost + lower.
        SIMD_INLINE __m128 Uint8ToFloat32(const __m128i & value, const __m128 & lower, const __m128 & boost)
        {
            const __m128 asFloat = _mm_cvtepi32_ps(value);
            const __m128 scaled = _mm_mul_ps(asFloat, boost);
            return _mm_add_ps(scaled, lower);
        }

        // Loads one 128-bit vector of bytes from src and writes four float vectors to
        // dst + F * 0..3, each computed as value * boost + lower.
        template <bool align> SIMD_INLINE void Uint8ToFloat32(const uint8_t * src, const __m128 & lower, const __m128 & boost, float * dst)
        {
            const __m128i bytes = Load<align>((__m128i*)src);

            // Presumably UnpackU8/UnpackU16 zero-extend the low (<0>) and high (<1>) halves
            // to the next wider lane size - confirm against SimdUnpack.h.
            const __m128i words0 = UnpackU8<0>(bytes);
            const __m128i words1 = UnpackU8<1>(bytes);

            Store<align>(dst + F * 0, Uint8ToFloat32(UnpackU16<0>(words0), lower, boost));
            Store<align>(dst + F * 1, Uint8ToFloat32(UnpackU16<1>(words0), lower, boost));
            Store<align>(dst + F * 2, Uint8ToFloat32(UnpackU16<0>(words1), lower, boost));
            Store<align>(dst + F * 3, Uint8ToFloat32(UnpackU16<1>(words1), lower, boost));
        }

        template <bool align> void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst)
        {
            assert(size >= A);
            if (align)
                assert(Aligned(src) && Aligned(dst));

            __m128 _lower = _mm_set1_ps(lower[0]);
            __m128 boost = _mm_set1_ps((upper[0] - lower[0]) / 255.0f);

            size_t alignedSize = AlignLo(size, A);
            for (size_t i = 0; i < alignedSize; i += A)
                Uint8ToFloat32<align>(src + i, _lower, boost, dst + i);
            if (alignedSize != size)
                Uint8ToFloat32<false>(src + size - A, _lower, boost, dst + size - A);
        }

        // Entry point: uses the aligned instantiation only when both src and dst pass Aligned().
        void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst)
        {
            const bool bothAligned = Aligned(src) && Aligned(dst);
            if (!bothAligned)
            {
                Uint8ToFloat32<false>(src, size, lower, upper, dst);
                return;
            }
            Uint8ToFloat32<true>(src, size, lower, upper, dst);
        }
    }
#endif
}

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Simd Library (http://ermig1979.github.io/Simd).
3		*
4		* Copyright (c) 2011-2022 Yermalayeu Ihar.
5		*
6		* Permission is hereby granted, free of charge, to any person obtaining a copy
7		* of this software and associated documentation files (the "Software"), to deal
8		* in the Software without restriction, including without limitation the rights
9		* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10		* copies of the Software, and to permit persons to whom the Software is
11		* furnished to do so, subject to the following conditions:
12		*
13		* The above copyright notice and this permission notice shall be included in
14		* all copies or substantial portions of the Software.
15		*
16		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17		* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18		* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19		* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20		* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21		* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22		* SOFTWARE.
23		*/
24		#include "Simd/SimdMemory.h"
25		#include "Simd/SimdStore.h"
26		#include "Simd/SimdExtract.h"
27		#include "Simd/SimdUnpack.h"
28
29		namespace Simd
30		{
31		#ifdef SIMD_SSE41_ENABLE
32		namespace Sse41
33		{
34		template<bool align> void CosineDistance32f(const float* a, const float* b, size_t size, float* distance)
35	0	{
36	0	if (align)
37	0	assert(Aligned(a) && Aligned(b));
38
39	0	size_t partialAlignedSize = AlignLo(size, F);
40	0	size_t fullAlignedSize = AlignLo(size, DF);
41	0	size_t i = 0;
42	0	__m128 _aa[2] = { _mm_setzero_ps(), _mm_setzero_ps() };
43	0	__m128 _ab[2] = { _mm_setzero_ps(), _mm_setzero_ps() };
44	0	__m128 _bb[2] = { _mm_setzero_ps(), _mm_setzero_ps() };
45	0	if (fullAlignedSize)
46	0	{
47	0	for (; i < fullAlignedSize; i += DF)
48	0	{
49	0	__m128 a0 = Load<align>(a + i + 0 * F);
50	0	__m128 b0 = Load<align>(b + i + 0 * F);
51	0	_aa[0] = _mm_add_ps(_aa[0], _mm_mul_ps(a0, a0));
52	0	_ab[0] = _mm_add_ps(_ab[0], _mm_mul_ps(a0, b0));
53	0	_bb[0] = _mm_add_ps(_bb[0], _mm_mul_ps(b0, b0));
54	0	__m128 a1 = Load<align>(a + i + 1 * F);
55	0	__m128 b1 = Load<align>(b + i + 1 * F);
56	0	_aa[1] = _mm_add_ps(_aa[1], _mm_mul_ps(a1, a1));
57	0	_ab[1] = _mm_add_ps(_ab[1], _mm_mul_ps(a1, b1));
58	0	_bb[1] = _mm_add_ps(_bb[1], _mm_mul_ps(b1, b1));
59	0	}
60	0	_aa[0] = _mm_add_ps(_aa[0], _aa[1]);
61	0	_ab[0] = _mm_add_ps(_ab[0], _ab[1]);
62	0	_bb[0] = _mm_add_ps(_bb[0], _bb[1]);
63	0	}
64	0	for (; i < partialAlignedSize; i += F)
65	0	{
66	0	__m128 a0 = Load<align>(a + i);
67	0	__m128 b0 = Load<align>(b + i);
68	0	_aa[0] = _mm_add_ps(_aa[0], _mm_mul_ps(a0, a0));
69	0	_ab[0] = _mm_add_ps(_ab[0], _mm_mul_ps(a0, b0));
70	0	_bb[0] = _mm_add_ps(_bb[0], _mm_mul_ps(b0, b0));
71	0	}
72	0	float aa = ExtractSum(_aa[0]), ab = ExtractSum(_ab[0]), bb = ExtractSum(_bb[0]);
73	0	for (; i < size; ++i)
74	0	{
75	0	float _a = a[i];
76	0	float _b = b[i];
77	0	aa += _a * _a;
78	0	ab += _a * _b;
79	0	bb += _b * _b;
80	0	}
81	0	*distance = 1.0f - ab / ::sqrt(aa * bb);
82	0	} Unexecuted instantiation: void Simd::Sse41::CosineDistance32f<true>(float const*, float const*, unsigned long, float*) Unexecuted instantiation: void Simd::Sse41::CosineDistance32f<false>(float const*, float const*, unsigned long, float*)
83
84		void CosineDistance32f(const float* a, const float* b, size_t size, float* distance)
85	0	{
86	0	if (Aligned(a) && Aligned(b))
87	0	CosineDistance32f<true>(a, b, size, distance);
88	0	else
89	0	CosineDistance32f<false>(a, b, size, distance);
90	0	}
91
92		//-----------------------------------------------------------------------------------------
93
94		template <bool align> SIMD_INLINE __m128i Float32ToUint8(const float * src, const __m128 & lower, const __m128 & upper, const __m128 & boost)
95	0	{
96	0	return _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(_mm_min_ps(_mm_max_ps(Load<align>(src), lower), upper), lower), boost));
97	0	} Unexecuted instantiation: long long __vector(2) Simd::Sse41::Float32ToUint8<true>(float const*, float __vector(4) const&, float __vector(4) const&, float __vector(4) const&) Unexecuted instantiation: long long __vector(2) Simd::Sse41::Float32ToUint8<false>(float const*, float __vector(4) const&, float __vector(4) const&, float __vector(4) const&)
98
99		template <bool align> SIMD_INLINE void Float32ToUint8(const float * src, const __m128 & lower, const __m128 & upper, const __m128 & boost, uint8_t * dst)
100	0	{
101	0	__m128i d0 = Float32ToUint8<align>(src + F * 0, lower, upper, boost);
102	0	__m128i d1 = Float32ToUint8<align>(src + F * 1, lower, upper, boost);
103	0	__m128i d2 = Float32ToUint8<align>(src + F * 2, lower, upper, boost);
104	0	__m128i d3 = Float32ToUint8<align>(src + F * 3, lower, upper, boost);
105	0	Store<align>((__m128i*)dst, _mm_packus_epi16(_mm_packs_epi32(d0, d1), _mm_packs_epi32(d2, d3)));
106	0	} Unexecuted instantiation: void Simd::Sse41::Float32ToUint8<true>(float const*, float __vector(4) const&, float __vector(4) const&, float __vector(4) const&, unsigned char*) Unexecuted instantiation: void Simd::Sse41::Float32ToUint8<false>(float const*, float __vector(4) const&, float __vector(4) const&, float __vector(4) const&, unsigned char*)
107
108		template <bool align> void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst)
109	0	{
110	0	assert(size >= A);
111	0	if (align)
112	0	assert(Aligned(src) && Aligned(dst));
113
114	0	__m128 _lower = _mm_set1_ps(lower[0]);
115	0	__m128 _upper = _mm_set1_ps(upper[0]);
116	0	__m128 boost = _mm_set1_ps(255.0f / (upper[0] - lower[0]));
117
118	0	size_t alignedSize = AlignLo(size, A);
119	0	for (size_t i = 0; i < alignedSize; i += A)
120	0	Float32ToUint8<align>(src + i, _lower, _upper, boost, dst + i);
121	0	if (alignedSize != size)
122	0	Float32ToUint8<false>(src + size - A, _lower, _upper, boost, dst + size - A);
123	0	} Unexecuted instantiation: void Simd::Sse41::Float32ToUint8<true>(float const*, unsigned long, float const*, float const*, unsigned char*) Unexecuted instantiation: void Simd::Sse41::Float32ToUint8<false>(float const*, unsigned long, float const*, float const*, unsigned char*)
124
125		void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst)
126	0	{
127	0	if (Aligned(src) && Aligned(dst))
128	0	Float32ToUint8<true>(src, size, lower, upper, dst);
129	0	else
130	0	Float32ToUint8<false>(src, size, lower, upper, dst);
131	0	}
132
133		//-----------------------------------------------------------------------------------------
134
135		SIMD_INLINE __m128 Uint8ToFloat32(const __m128i & value, const __m128 & lower, const __m128 & boost)
136	0	{
137	0	return _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(value), boost), lower);
138	0	}
139
140		template <bool align> SIMD_INLINE void Uint8ToFloat32(const uint8_t * src, const __m128 & lower, const __m128 & boost, float * dst)
141	0	{
142	0	__m128i _src = Load<align>((__m128i*)src);
143	0	__m128i lo = UnpackU8<0>(_src);
144	0	__m128i hi = UnpackU8<1>(_src);
145	0	Store<align>(dst + F * 0, Uint8ToFloat32(UnpackU16<0>(lo), lower, boost));
146	0	Store<align>(dst + F * 1, Uint8ToFloat32(UnpackU16<1>(lo), lower, boost));
147	0	Store<align>(dst + F * 2, Uint8ToFloat32(UnpackU16<0>(hi), lower, boost));
148	0	Store<align>(dst + F * 3, Uint8ToFloat32(UnpackU16<1>(hi), lower, boost));
149	0	} Unexecuted instantiation: void Simd::Sse41::Uint8ToFloat32<true>(unsigned char const*, float __vector(4) const&, float __vector(4) const&, float*) Unexecuted instantiation: void Simd::Sse41::Uint8ToFloat32<false>(unsigned char const*, float __vector(4) const&, float __vector(4) const&, float*)
150
151		template <bool align> void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst)
152	0	{
153	0	assert(size >= A);
154	0	if (align)
155	0	assert(Aligned(src) && Aligned(dst));
156
157	0	__m128 _lower = _mm_set1_ps(lower[0]);
158	0	__m128 boost = _mm_set1_ps((upper[0] - lower[0]) / 255.0f);
159
160	0	size_t alignedSize = AlignLo(size, A);
161	0	for (size_t i = 0; i < alignedSize; i += A)
162	0	Uint8ToFloat32<align>(src + i, _lower, boost, dst + i);
163	0	if (alignedSize != size)
164	0	Uint8ToFloat32<false>(src + size - A, _lower, boost, dst + size - A);
165	0	} Unexecuted instantiation: void Simd::Sse41::Uint8ToFloat32<true>(unsigned char const*, unsigned long, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::Uint8ToFloat32<false>(unsigned char const*, unsigned long, float const*, float const*, float*)
166
167		void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst)
168	0	{
169	0	if (Aligned(src) && Aligned(dst))
170	0	Uint8ToFloat32<true>(src, size, lower, upper, dst);
171	0	else
172	0	Uint8ToFloat32<false>(src, size, lower, upper, dst);
173	0	}
174		}
175		#endif
176		}

Coverage Report

Created: 2025-08-11 07:29