/src/Simd/src/Simd/SimdSse41Float32.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2022 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #include "Simd/SimdMemory.h" |
25 | | #include "Simd/SimdStore.h" |
26 | | #include "Simd/SimdExtract.h" |
27 | | #include "Simd/SimdUnpack.h" |
28 | | |
29 | | namespace Simd |
30 | | { |
31 | | #ifdef SIMD_SSE41_ENABLE |
32 | | namespace Sse41 |
33 | | { |
34 | | template<bool align> void CosineDistance32f(const float* a, const float* b, size_t size, float* distance) |
35 | 0 | { |
36 | 0 | if (align) |
37 | 0 | assert(Aligned(a) && Aligned(b)); |
38 | |
|
39 | 0 | size_t partialAlignedSize = AlignLo(size, F); |
40 | 0 | size_t fullAlignedSize = AlignLo(size, DF); |
41 | 0 | size_t i = 0; |
42 | 0 | __m128 _aa[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; |
43 | 0 | __m128 _ab[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; |
44 | 0 | __m128 _bb[2] = { _mm_setzero_ps(), _mm_setzero_ps() }; |
45 | 0 | if (fullAlignedSize) |
46 | 0 | { |
47 | 0 | for (; i < fullAlignedSize; i += DF) |
48 | 0 | { |
49 | 0 | __m128 a0 = Load<align>(a + i + 0 * F); |
50 | 0 | __m128 b0 = Load<align>(b + i + 0 * F); |
51 | 0 | _aa[0] = _mm_add_ps(_aa[0], _mm_mul_ps(a0, a0)); |
52 | 0 | _ab[0] = _mm_add_ps(_ab[0], _mm_mul_ps(a0, b0)); |
53 | 0 | _bb[0] = _mm_add_ps(_bb[0], _mm_mul_ps(b0, b0)); |
54 | 0 | __m128 a1 = Load<align>(a + i + 1 * F); |
55 | 0 | __m128 b1 = Load<align>(b + i + 1 * F); |
56 | 0 | _aa[1] = _mm_add_ps(_aa[1], _mm_mul_ps(a1, a1)); |
57 | 0 | _ab[1] = _mm_add_ps(_ab[1], _mm_mul_ps(a1, b1)); |
58 | 0 | _bb[1] = _mm_add_ps(_bb[1], _mm_mul_ps(b1, b1)); |
59 | 0 | } |
60 | 0 | _aa[0] = _mm_add_ps(_aa[0], _aa[1]); |
61 | 0 | _ab[0] = _mm_add_ps(_ab[0], _ab[1]); |
62 | 0 | _bb[0] = _mm_add_ps(_bb[0], _bb[1]); |
63 | 0 | } |
64 | 0 | for (; i < partialAlignedSize; i += F) |
65 | 0 | { |
66 | 0 | __m128 a0 = Load<align>(a + i); |
67 | 0 | __m128 b0 = Load<align>(b + i); |
68 | 0 | _aa[0] = _mm_add_ps(_aa[0], _mm_mul_ps(a0, a0)); |
69 | 0 | _ab[0] = _mm_add_ps(_ab[0], _mm_mul_ps(a0, b0)); |
70 | 0 | _bb[0] = _mm_add_ps(_bb[0], _mm_mul_ps(b0, b0)); |
71 | 0 | } |
72 | 0 | float aa = ExtractSum(_aa[0]), ab = ExtractSum(_ab[0]), bb = ExtractSum(_bb[0]); |
73 | 0 | for (; i < size; ++i) |
74 | 0 | { |
75 | 0 | float _a = a[i]; |
76 | 0 | float _b = b[i]; |
77 | 0 | aa += _a * _a; |
78 | 0 | ab += _a * _b; |
79 | 0 | bb += _b * _b; |
80 | 0 | } |
81 | 0 | *distance = 1.0f - ab / ::sqrt(aa * bb); |
82 | 0 | } Unexecuted instantiation: void Simd::Sse41::CosineDistance32f<true>(float const*, float const*, unsigned long, float*) Unexecuted instantiation: void Simd::Sse41::CosineDistance32f<false>(float const*, float const*, unsigned long, float*) |
83 | | |
84 | | void CosineDistance32f(const float* a, const float* b, size_t size, float* distance) |
85 | 0 | { |
86 | 0 | if (Aligned(a) && Aligned(b)) |
87 | 0 | CosineDistance32f<true>(a, b, size, distance); |
88 | 0 | else |
89 | 0 | CosineDistance32f<false>(a, b, size, distance); |
90 | 0 | } |
91 | | |
92 | | //----------------------------------------------------------------------------------------- |
93 | | |
94 | | template <bool align> SIMD_INLINE __m128i Float32ToUint8(const float * src, const __m128 & lower, const __m128 & upper, const __m128 & boost) |
95 | 0 | { |
96 | 0 | return _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(_mm_min_ps(_mm_max_ps(Load<align>(src), lower), upper), lower), boost)); |
97 | 0 | } Unexecuted instantiation: long long __vector(2) Simd::Sse41::Float32ToUint8<true>(float const*, float __vector(4) const&, float __vector(4) const&, float __vector(4) const&) Unexecuted instantiation: long long __vector(2) Simd::Sse41::Float32ToUint8<false>(float const*, float __vector(4) const&, float __vector(4) const&, float __vector(4) const&) |
98 | | |
99 | | template <bool align> SIMD_INLINE void Float32ToUint8(const float * src, const __m128 & lower, const __m128 & upper, const __m128 & boost, uint8_t * dst) |
100 | 0 | { |
101 | 0 | __m128i d0 = Float32ToUint8<align>(src + F * 0, lower, upper, boost); |
102 | 0 | __m128i d1 = Float32ToUint8<align>(src + F * 1, lower, upper, boost); |
103 | 0 | __m128i d2 = Float32ToUint8<align>(src + F * 2, lower, upper, boost); |
104 | 0 | __m128i d3 = Float32ToUint8<align>(src + F * 3, lower, upper, boost); |
105 | 0 | Store<align>((__m128i*)dst, _mm_packus_epi16(_mm_packs_epi32(d0, d1), _mm_packs_epi32(d2, d3))); |
106 | 0 | } Unexecuted instantiation: void Simd::Sse41::Float32ToUint8<true>(float const*, float __vector(4) const&, float __vector(4) const&, float __vector(4) const&, unsigned char*) Unexecuted instantiation: void Simd::Sse41::Float32ToUint8<false>(float const*, float __vector(4) const&, float __vector(4) const&, float __vector(4) const&, unsigned char*) |
107 | | |
108 | | template <bool align> void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst) |
109 | 0 | { |
110 | 0 | assert(size >= A); |
111 | 0 | if (align) |
112 | 0 | assert(Aligned(src) && Aligned(dst)); |
113 | |
|
114 | 0 | __m128 _lower = _mm_set1_ps(lower[0]); |
115 | 0 | __m128 _upper = _mm_set1_ps(upper[0]); |
116 | 0 | __m128 boost = _mm_set1_ps(255.0f / (upper[0] - lower[0])); |
117 | |
|
118 | 0 | size_t alignedSize = AlignLo(size, A); |
119 | 0 | for (size_t i = 0; i < alignedSize; i += A) |
120 | 0 | Float32ToUint8<align>(src + i, _lower, _upper, boost, dst + i); |
121 | 0 | if (alignedSize != size) |
122 | 0 | Float32ToUint8<false>(src + size - A, _lower, _upper, boost, dst + size - A); |
123 | 0 | } Unexecuted instantiation: void Simd::Sse41::Float32ToUint8<true>(float const*, unsigned long, float const*, float const*, unsigned char*) Unexecuted instantiation: void Simd::Sse41::Float32ToUint8<false>(float const*, unsigned long, float const*, float const*, unsigned char*) |
124 | | |
125 | | void Float32ToUint8(const float * src, size_t size, const float * lower, const float * upper, uint8_t * dst) |
126 | 0 | { |
127 | 0 | if (Aligned(src) && Aligned(dst)) |
128 | 0 | Float32ToUint8<true>(src, size, lower, upper, dst); |
129 | 0 | else |
130 | 0 | Float32ToUint8<false>(src, size, lower, upper, dst); |
131 | 0 | } |
132 | | |
133 | | //----------------------------------------------------------------------------------------- |
134 | | |
135 | | SIMD_INLINE __m128 Uint8ToFloat32(const __m128i & value, const __m128 & lower, const __m128 & boost) |
136 | 0 | { |
137 | 0 | return _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(value), boost), lower); |
138 | 0 | } |
139 | | |
140 | | template <bool align> SIMD_INLINE void Uint8ToFloat32(const uint8_t * src, const __m128 & lower, const __m128 & boost, float * dst) |
141 | 0 | { |
142 | 0 | __m128i _src = Load<align>((__m128i*)src); |
143 | 0 | __m128i lo = UnpackU8<0>(_src); |
144 | 0 | __m128i hi = UnpackU8<1>(_src); |
145 | 0 | Store<align>(dst + F * 0, Uint8ToFloat32(UnpackU16<0>(lo), lower, boost)); |
146 | 0 | Store<align>(dst + F * 1, Uint8ToFloat32(UnpackU16<1>(lo), lower, boost)); |
147 | 0 | Store<align>(dst + F * 2, Uint8ToFloat32(UnpackU16<0>(hi), lower, boost)); |
148 | 0 | Store<align>(dst + F * 3, Uint8ToFloat32(UnpackU16<1>(hi), lower, boost)); |
149 | 0 | } Unexecuted instantiation: void Simd::Sse41::Uint8ToFloat32<true>(unsigned char const*, float __vector(4) const&, float __vector(4) const&, float*) Unexecuted instantiation: void Simd::Sse41::Uint8ToFloat32<false>(unsigned char const*, float __vector(4) const&, float __vector(4) const&, float*) |
150 | | |
151 | | template <bool align> void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst) |
152 | 0 | { |
153 | 0 | assert(size >= A); |
154 | 0 | if (align) |
155 | 0 | assert(Aligned(src) && Aligned(dst)); |
156 | |
|
157 | 0 | __m128 _lower = _mm_set1_ps(lower[0]); |
158 | 0 | __m128 boost = _mm_set1_ps((upper[0] - lower[0]) / 255.0f); |
159 | |
|
160 | 0 | size_t alignedSize = AlignLo(size, A); |
161 | 0 | for (size_t i = 0; i < alignedSize; i += A) |
162 | 0 | Uint8ToFloat32<align>(src + i, _lower, boost, dst + i); |
163 | 0 | if (alignedSize != size) |
164 | 0 | Uint8ToFloat32<false>(src + size - A, _lower, boost, dst + size - A); |
165 | 0 | } Unexecuted instantiation: void Simd::Sse41::Uint8ToFloat32<true>(unsigned char const*, unsigned long, float const*, float const*, float*) Unexecuted instantiation: void Simd::Sse41::Uint8ToFloat32<false>(unsigned char const*, unsigned long, float const*, float const*, float*) |
166 | | |
167 | | void Uint8ToFloat32(const uint8_t * src, size_t size, const float * lower, const float * upper, float * dst) |
168 | 0 | { |
169 | 0 | if (Aligned(src) && Aligned(dst)) |
170 | 0 | Uint8ToFloat32<true>(src, size, lower, upper, dst); |
171 | 0 | else |
172 | 0 | Uint8ToFloat32<false>(src, size, lower, upper, dst); |
173 | 0 | } |
174 | | } |
175 | | #endif |
176 | | } |