/src/Simd/src/Simd/SimdFmadd.h

Source
/*
* Simd Library (http://ermig1979.github.io/Simd).
*
* Copyright (c) 2011-2023 Yermalayeu Ihar.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef __SimdFmadd_h__
#define __SimdFmadd_h__

#include "Simd/SimdDefs.h"
#include "Simd/SimdConvert.h"

namespace Simd
{
    namespace Base
    {
        // Scalar multiply-add computing a * b + c. The 'nofma' template flag selects the evaluation strategy:
        //  - nofma == false: product and sum are evaluated in double precision, then narrowed to float;
        //  - nofma == true : product and sum are evaluated directly in float.
        // Only the two explicit specializations below are defined.
        template<bool nofma> SIMD_INLINE float Fmadd(float a, float b, float c);

        // Double-precision evaluation of a * b + c with a final conversion to float.
        // NOTE(review): presumably this emulates the result of a hardware fused multiply-add
        // on the scalar path - confirm against the SIMD counterparts.
        template <> SIMD_INLINE float Fmadd<false>(float a, float b, float c)
        {
            const double product = static_cast<double>(a) * static_cast<double>(b);
            return static_cast<float>(product + static_cast<double>(c));
        }

        // Plain single-precision evaluation: float product followed by float sum.
        template <> SIMD_INLINE float Fmadd<true>(float a, float b, float c)
        {
            // Deliberately kept as one float expression, exactly as written.
            return (a * b) + c;
        }
    }

#ifdef SIMD_SSE41_ENABLE
    namespace Sse41
    {
        // Packed multiply-add a * b + c over a __m128 of floats. The 'nofma' template flag selects
        // between a double-precision evaluation (false) and a plain float mul + add (true).
        template<bool nofma> SIMD_INLINE __m128 Fmadd(__m128 a, __m128 b, __m128 c);

        // Widens the operands to double with Fp32ToFp64<0> / Fp32ToFp64<1>, multiplies and adds in
        // double precision, then packs both results back to float with Fp64ToFp32.
        // NOTE(review): indices 0 and 1 presumably select the low and high pair of lanes - confirm
        // against the helpers in SimdConvert.h.
        template <> SIMD_INLINE __m128 Fmadd<false>(__m128 a, __m128 b, __m128 c)
        {
            const __m128d productLo = _mm_mul_pd(Fp32ToFp64<0>(a), Fp32ToFp64<0>(b));
            const __m128d sumLo = _mm_add_pd(productLo, Fp32ToFp64<0>(c));

            const __m128d productHi = _mm_mul_pd(Fp32ToFp64<1>(a), Fp32ToFp64<1>(b));
            const __m128d sumHi = _mm_add_pd(productHi, Fp32ToFp64<1>(c));

            return Fp64ToFp32(sumLo, sumHi);
        }

        // Single-precision packed multiply followed by a single-precision packed add.
        template <> SIMD_INLINE __m128 Fmadd<true>(__m128 a, __m128 b, __m128 c)
        {
            const __m128 product = _mm_mul_ps(a, b);
            return _mm_add_ps(product, c);
        }
    }
#endif

#ifdef SIMD_AVX2_ENABLE
    namespace Avx2
    {
        // Packed a * b + c over a __m128. nofma == false uses the fused intrinsic;
        // nofma == true uses a separate multiply and add.
        template<bool nofma> SIMD_INLINE __m128 Fmadd(__m128 a, __m128 b, __m128 c);

        template <> SIMD_INLINE __m128 Fmadd<false>(__m128 a, __m128 b, __m128 c)
        {
            const __m128 fused = _mm_fmadd_ps(a, b, c);
            return fused;
        }

        template <> SIMD_INLINE __m128 Fmadd<true>(__m128 a, __m128 b, __m128 c)
        {
            // OR with an all-zero vector leaves the product bits unchanged.
            // NOTE(review): presumably it is there to stop the compiler from contracting
            // the multiply and the add into a single FMA - confirm.
            const __m128 product = _mm_or_ps(_mm_mul_ps(a, b), _mm_setzero_ps());
            return _mm_add_ps(product, c);
        }

        //-----------------------------------------------------------------------------------------

        // Packed a * b + c * d over a __m128. nofma == false feeds the product c * d into the fused
        // intrinsic as the addend; nofma == true computes both products separately and adds them.
        template<bool nofma> SIMD_INLINE __m128 Fmadd(__m128 a, __m128 b, __m128 c, const __m128 & d);

        template <> SIMD_INLINE __m128 Fmadd<false>(__m128 a, __m128 b, __m128 c, const __m128 & d)
        {
            const __m128 addend = _mm_mul_ps(c, d);
            return _mm_fmadd_ps(a, b, addend);
        }

        template <> SIMD_INLINE __m128 Fmadd<true>(__m128 a, __m128 b, __m128 c, const __m128 & d)
        {
            // Each product is OR-ed with zero (a bitwise no-op) before the final add.
            const __m128 productAB = _mm_or_ps(_mm_mul_ps(a, b), _mm_setzero_ps());
            const __m128 productCD = _mm_or_ps(_mm_mul_ps(c, d), _mm_setzero_ps());
            return _mm_add_ps(productAB, productCD);
        }

        //-----------------------------------------------------------------------------------------

        // Packed a * b + c over a __m256; same 'nofma' contract as the __m128 overload.
        template<bool nofma> SIMD_INLINE __m256 Fmadd(__m256 a, __m256 b, __m256 c);

        template <> SIMD_INLINE __m256 Fmadd<false>(__m256 a, __m256 b, __m256 c)
        {
            const __m256 fused = _mm256_fmadd_ps(a, b, c);
            return fused;
        }

        template <> SIMD_INLINE __m256 Fmadd<true>(__m256 a, __m256 b, __m256 c)
        {
            // OR with an all-zero vector leaves the product bits unchanged.
            const __m256 product = _mm256_or_ps(_mm256_mul_ps(a, b), _mm256_setzero_ps());
            return _mm256_add_ps(product, c);
        }

        //-----------------------------------------------------------------------------------------

        // Packed a * b + c * d over a __m256; same 'nofma' contract as the __m128 overload.
        template<bool nofma> SIMD_INLINE __m256 Fmadd(__m256 a, __m256 b, __m256 c, const __m256 & d);

        template <> SIMD_INLINE __m256 Fmadd<false>(__m256 a, __m256 b, __m256 c, const __m256 & d)
        {
            const __m256 addend = _mm256_mul_ps(c, d);
            return _mm256_fmadd_ps(a, b, addend);
        }

        template <> SIMD_INLINE __m256 Fmadd<true>(__m256 a, __m256 b, __m256 c, const __m256 & d)
        {
            // Each product is OR-ed with zero (a bitwise no-op) before the final add.
            const __m256 productAB = _mm256_or_ps(_mm256_mul_ps(a, b), _mm256_setzero_ps());
            const __m256 productCD = _mm256_or_ps(_mm256_mul_ps(c, d), _mm256_setzero_ps());
            return _mm256_add_ps(productAB, productCD);
        }
    }
#endif

#ifdef SIMD_AVX512BW_ENABLE
    namespace Avx512bw
    {
        // Packed a * b + c over a __m512. nofma == false uses the fused intrinsic;
        // nofma == true produces the product first and adds c in a second operation.
        template<bool nofma> SIMD_INLINE __m512 Fmadd(__m512 a, __m512 b, __m512 c);

        template <> SIMD_INLINE __m512 Fmadd<false>(__m512 a, __m512 b, __m512 c)
        {
            const __m512 fused = _mm512_fmadd_ps(a, b, c);
            return fused;
        }

        template <> SIMD_INLINE __m512 Fmadd<true>(__m512 a, __m512 b, __m512 c)
        {
#ifdef _MSC_VER
            // MSVC path: the product is obtained as fmadd(a, b, 0), then c is added separately.
            const __m512 product = _mm512_fmadd_ps(a, b, _mm512_setzero_ps());
            return _mm512_add_ps(product, c);
#else
            // Other compilers: plain multiply, then a zero-masked add with an all-ones mask (-1).
            // NOTE(review): the masked form is presumably used to keep the compiler from fusing
            // the multiply and add - confirm.
            const __m512 product = _mm512_mul_ps(a, b);
            return _mm512_maskz_add_ps(-1, product, c);
#endif
        }
    }
#endif
}

#endif

Line	Count	Source
1		/*
2		* Simd Library (http://ermig1979.github.io/Simd).
3		*
4		* Copyright (c) 2011-2023 Yermalayeu Ihar.
5		*
6		* Permission is hereby granted, free of charge, to any person obtaining a copy
7		* of this software and associated documentation files (the "Software"), to deal
8		* in the Software without restriction, including without limitation the rights
9		* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10		* copies of the Software, and to permit persons to whom the Software is
11		* furnished to do so, subject to the following conditions:
12		*
13		* The above copyright notice and this permission notice shall be included in
14		* all copies or substantial portions of the Software.
15		*
16		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17		* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18		* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19		* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20		* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21		* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22		* SOFTWARE.
23		*/
24		#ifndef __SimdFmadd_h__
25		#define __SimdFmadd_h__
26
27		#include "Simd/SimdDefs.h"
28		#include "Simd/SimdConvert.h"
29
30		namespace Simd
31		{
32		namespace Base
33		{
34		template<bool nofma> SIMD_INLINE float Fmadd(float a, float b, float c);
35
36		template <> SIMD_INLINE float Fmadd<false>(float a, float b, float c)
37	0	{
38	0	return float(double(a) * double(b) + double(c));
39	0	}
40
41		template <> SIMD_INLINE float Fmadd<true>(float a, float b, float c)
42	0	{
43	0	return a * b + c;
44	0	}
45		}
46
47		#ifdef SIMD_SSE41_ENABLE
48		namespace Sse41
49		{
50		template<bool nofma> SIMD_INLINE __m128 Fmadd(__m128 a, __m128 b, __m128 c);
51
52		template <> SIMD_INLINE __m128 Fmadd<false>(__m128 a, __m128 b, __m128 c)
53	0	{
54	0	__m128d lo = _mm_add_pd(_mm_mul_pd(Fp32ToFp64<0>(a), Fp32ToFp64<0>(b)), Fp32ToFp64<0>(c));
55	0	__m128d hi = _mm_add_pd(_mm_mul_pd(Fp32ToFp64<1>(a), Fp32ToFp64<1>(b)), Fp32ToFp64<1>(c));
56	0	return Fp64ToFp32(lo, hi);
57	0	}
58
59		template <> SIMD_INLINE __m128 Fmadd<true>(__m128 a, __m128 b, __m128 c)
60	0	{
61	0	return _mm_add_ps(_mm_mul_ps(a, b), c);
62	0	}
63		}
64		#endif
65
66		#ifdef SIMD_AVX2_ENABLE
67		namespace Avx2
68		{
69		template<bool nofma> SIMD_INLINE __m128 Fmadd(__m128 a, __m128 b, __m128 c);
70
71		template <> SIMD_INLINE __m128 Fmadd<false>(__m128 a, __m128 b, __m128 c)
72	0	{
73	0	return _mm_fmadd_ps(a, b, c);
74	0	}
75
76		template <> SIMD_INLINE __m128 Fmadd<true>(__m128 a, __m128 b, __m128 c)
77	0	{
78	0	return _mm_add_ps(_mm_or_ps(_mm_mul_ps(a, b), _mm_setzero_ps()), c);
79	0	}
80
81		//-----------------------------------------------------------------------------------------
82
83		template<bool nofma> SIMD_INLINE __m128 Fmadd(__m128 a, __m128 b, __m128 c, const __m128 & d);
84
85		template <> SIMD_INLINE __m128 Fmadd<false>(__m128 a, __m128 b, __m128 c, const __m128 & d)
86	0	{
87	0	return _mm_fmadd_ps(a, b, _mm_mul_ps(c, d));
88	0	}
89
90		template <> SIMD_INLINE __m128 Fmadd<true>(__m128 a, __m128 b, __m128 c, const __m128 & d)
91	0	{
92	0	return _mm_add_ps(_mm_or_ps(_mm_mul_ps(a, b), _mm_setzero_ps()), _mm_or_ps(_mm_mul_ps(c, d), _mm_setzero_ps()));
93	0	}
94
95		//-----------------------------------------------------------------------------------------
96
97		template<bool nofma> SIMD_INLINE __m256 Fmadd(__m256 a, __m256 b, __m256 c);
98
99		template <> SIMD_INLINE __m256 Fmadd<false>(__m256 a, __m256 b, __m256 c)
100	0	{
101	0	return _mm256_fmadd_ps(a, b, c);
102	0	}
103
104		template <> SIMD_INLINE __m256 Fmadd<true>(__m256 a, __m256 b, __m256 c)
105	0	{
106	0	return _mm256_add_ps(_mm256_or_ps(_mm256_mul_ps(a, b), _mm256_setzero_ps()), c);
107	0	}
108
109		//-----------------------------------------------------------------------------------------
110
111		template<bool nofma> SIMD_INLINE __m256 Fmadd(__m256 a, __m256 b, __m256 c, const __m256 & d);
112
113		template <> SIMD_INLINE __m256 Fmadd<false>(__m256 a, __m256 b, __m256 c, const __m256 & d)
114	0	{
115	0	return _mm256_fmadd_ps(a, b, _mm256_mul_ps(c, d));
116	0	}
117
118		template <> SIMD_INLINE __m256 Fmadd<true>(__m256 a, __m256 b, __m256 c, const __m256 & d)
119	0	{
120	0	return _mm256_add_ps(_mm256_or_ps(_mm256_mul_ps(a, b), _mm256_setzero_ps()), _mm256_or_ps(_mm256_mul_ps(c, d), _mm256_setzero_ps()));
121	0	}
122		}
123		#endif
124
125		#ifdef SIMD_AVX512BW_ENABLE
126		namespace Avx512bw
127		{
128		template<bool nofma> SIMD_INLINE __m512 Fmadd(__m512 a, __m512 b, __m512 c);
129
130		template <> SIMD_INLINE __m512 Fmadd<false>(__m512 a, __m512 b, __m512 c)
131	0	{
132	0	return _mm512_fmadd_ps(a, b, c);
133	0	}
134
135		template <> SIMD_INLINE __m512 Fmadd<true>(__m512 a, __m512 b, __m512 c)
136	0	{
137		#ifdef _MSC_VER
138		return _mm512_add_ps(_mm512_fmadd_ps(a, b, _mm512_setzero_ps()), c);
139		#else
140	0	return _mm512_maskz_add_ps(-1, _mm512_mul_ps(a, b), c);
141	0	#endif
142	0	}
143		}
144		#endif
145		}
146
147		#endif

Coverage Report

Created: 2025-12-10 07:04