/src/Simd/src/Simd/SimdFmadd.h
Line | Count | Source |
1 | | /* |
2 | | * Simd Library (http://ermig1979.github.io/Simd). |
3 | | * |
4 | | * Copyright (c) 2011-2023 Yermalayeu Ihar. |
5 | | * |
6 | | * Permission is hereby granted, free of charge, to any person obtaining a copy |
7 | | * of this software and associated documentation files (the "Software"), to deal |
8 | | * in the Software without restriction, including without limitation the rights |
9 | | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 | | * copies of the Software, and to permit persons to whom the Software is |
11 | | * furnished to do so, subject to the following conditions: |
12 | | * |
13 | | * The above copyright notice and this permission notice shall be included in |
14 | | * all copies or substantial portions of the Software. |
15 | | * |
16 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
17 | | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
18 | | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
19 | | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
20 | | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
21 | | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
22 | | * SOFTWARE. |
23 | | */ |
24 | | #ifndef __SimdFmadd_h__ |
25 | | #define __SimdFmadd_h__ |
26 | | |
27 | | #include "Simd/SimdDefs.h" |
28 | | #include "Simd/SimdConvert.h" |
29 | | |
30 | | namespace Simd |
31 | | { |
namespace Base
{
    // Scalar multiply-add: every specialization returns the value of a * b + c.
    // The boolean template argument picks how the intermediate product is handled.
    template<bool nofma> SIMD_INLINE float Fmadd(float a, float b, float c);

    // nofma == false: the arithmetic is carried out in double precision and only
    // the final sum is converted back to float.
    template <> SIMD_INLINE float Fmadd<false>(float a, float b, float c)
    {
        const double product = static_cast<double>(a) * static_cast<double>(b);
        return static_cast<float>(product + static_cast<double>(c));
    }

    // nofma == true: plain single-precision multiply followed by an add.
    // The expression is intentionally left as a single 'a * b + c' so that the
    // compiler treats it exactly as it treated the original form.
    template <> SIMD_INLINE float Fmadd<true>(float a, float b, float c)
    {
        return a * b + c;
    }
}
46 | | |
47 | | #ifdef SIMD_SSE41_ENABLE |
48 | | namespace Sse41 |
49 | | { |
50 | | template<bool nofma> SIMD_INLINE __m128 Fmadd(__m128 a, __m128 b, __m128 c); |
51 | | |
52 | | template <> SIMD_INLINE __m128 Fmadd<false>(__m128 a, __m128 b, __m128 c) |
53 | 0 | { |
54 | 0 | __m128d lo = _mm_add_pd(_mm_mul_pd(Fp32ToFp64<0>(a), Fp32ToFp64<0>(b)), Fp32ToFp64<0>(c)); |
55 | 0 | __m128d hi = _mm_add_pd(_mm_mul_pd(Fp32ToFp64<1>(a), Fp32ToFp64<1>(b)), Fp32ToFp64<1>(c)); |
56 | 0 | return Fp64ToFp32(lo, hi); |
57 | 0 | } |
58 | | |
59 | | template <> SIMD_INLINE __m128 Fmadd<true>(__m128 a, __m128 b, __m128 c) |
60 | 0 | { |
61 | 0 | return _mm_add_ps(_mm_mul_ps(a, b), c); |
62 | 0 | } |
63 | | } |
64 | | #endif |
65 | | |
66 | | #ifdef SIMD_AVX2_ENABLE |
67 | | namespace Avx2 |
68 | | { |
69 | | template<bool nofma> SIMD_INLINE __m128 Fmadd(__m128 a, __m128 b, __m128 c); |
70 | | |
71 | | template <> SIMD_INLINE __m128 Fmadd<false>(__m128 a, __m128 b, __m128 c) |
72 | 0 | { |
73 | 0 | return _mm_fmadd_ps(a, b, c); |
74 | 0 | } |
75 | | |
76 | | template <> SIMD_INLINE __m128 Fmadd<true>(__m128 a, __m128 b, __m128 c) |
77 | 0 | { |
78 | 0 | return _mm_add_ps(_mm_or_ps(_mm_mul_ps(a, b), _mm_setzero_ps()), c); |
79 | 0 | } |
80 | | |
81 | | //----------------------------------------------------------------------------------------- |
82 | | |
83 | | template<bool nofma> SIMD_INLINE __m128 Fmadd(__m128 a, __m128 b, __m128 c, const __m128 & d); |
84 | | |
85 | | template <> SIMD_INLINE __m128 Fmadd<false>(__m128 a, __m128 b, __m128 c, const __m128 & d) |
86 | 0 | { |
87 | 0 | return _mm_fmadd_ps(a, b, _mm_mul_ps(c, d)); |
88 | 0 | } |
89 | | |
90 | | template <> SIMD_INLINE __m128 Fmadd<true>(__m128 a, __m128 b, __m128 c, const __m128 & d) |
91 | 0 | { |
92 | 0 | return _mm_add_ps(_mm_or_ps(_mm_mul_ps(a, b), _mm_setzero_ps()), _mm_or_ps(_mm_mul_ps(c, d), _mm_setzero_ps())); |
93 | 0 | } |
94 | | |
95 | | //----------------------------------------------------------------------------------------- |
96 | | |
97 | | template<bool nofma> SIMD_INLINE __m256 Fmadd(__m256 a, __m256 b, __m256 c); |
98 | | |
99 | | template <> SIMD_INLINE __m256 Fmadd<false>(__m256 a, __m256 b, __m256 c) |
100 | 0 | { |
101 | 0 | return _mm256_fmadd_ps(a, b, c); |
102 | 0 | } |
103 | | |
104 | | template <> SIMD_INLINE __m256 Fmadd<true>(__m256 a, __m256 b, __m256 c) |
105 | 0 | { |
106 | 0 | return _mm256_add_ps(_mm256_or_ps(_mm256_mul_ps(a, b), _mm256_setzero_ps()), c); |
107 | 0 | } |
108 | | |
109 | | //----------------------------------------------------------------------------------------- |
110 | | |
111 | | template<bool nofma> SIMD_INLINE __m256 Fmadd(__m256 a, __m256 b, __m256 c, const __m256 & d); |
112 | | |
113 | | template <> SIMD_INLINE __m256 Fmadd<false>(__m256 a, __m256 b, __m256 c, const __m256 & d) |
114 | 0 | { |
115 | 0 | return _mm256_fmadd_ps(a, b, _mm256_mul_ps(c, d)); |
116 | 0 | } |
117 | | |
118 | | template <> SIMD_INLINE __m256 Fmadd<true>(__m256 a, __m256 b, __m256 c, const __m256 & d) |
119 | 0 | { |
120 | 0 | return _mm256_add_ps(_mm256_or_ps(_mm256_mul_ps(a, b), _mm256_setzero_ps()), _mm256_or_ps(_mm256_mul_ps(c, d), _mm256_setzero_ps())); |
121 | 0 | } |
122 | | } |
123 | | #endif |
124 | | |
125 | | #ifdef SIMD_AVX512BW_ENABLE |
126 | | namespace Avx512bw |
127 | | { |
128 | | template<bool nofma> SIMD_INLINE __m512 Fmadd(__m512 a, __m512 b, __m512 c); |
129 | | |
130 | | template <> SIMD_INLINE __m512 Fmadd<false>(__m512 a, __m512 b, __m512 c) |
131 | 0 | { |
132 | 0 | return _mm512_fmadd_ps(a, b, c); |
133 | 0 | } |
134 | | |
135 | | template <> SIMD_INLINE __m512 Fmadd<true>(__m512 a, __m512 b, __m512 c) |
136 | 0 | { |
137 | | #ifdef _MSC_VER |
138 | | return _mm512_add_ps(_mm512_fmadd_ps(a, b, _mm512_setzero_ps()), c); |
139 | | #else |
140 | 0 | return _mm512_maskz_add_ps(-1, _mm512_mul_ps(a, b), c); |
141 | 0 | #endif |
142 | 0 | } |
143 | | } |
144 | | #endif |
145 | | } |
146 | | |
147 | | #endif |