/src/ogre/OgreMain/src/OgreSIMDHelper.h
/*
-----------------------------------------------------------------------------
This source file is part of OGRE
(Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2014 Torus Knot Software Ltd

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
-----------------------------------------------------------------------------
*/
#ifndef __SIMDHelper_H__
#define __SIMDHelper_H__

// Stack-alignment hackery.
//
// If the macro __OGRE_SIMD_ALIGN_STACK is defined, special code is requested
// to ensure the stack is aligned to a 16-byte boundary.
//
// Note:
//   This macro can only guarantee that the callee stack pointer (esp) is
//   aligned to a 16-byte boundary, not the frame pointer (ebp). Because most
//   compilers may use the frame pointer to access stack variables, functions
//   that require alignment need to be wrapped in an extra function call.
//
#if defined(__INTEL_COMPILER)
// For Intel's compiler, simply calling alloca seems to do the right
// thing. The size of the allocated block seems to be irrelevant.
#define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#elif OGRE_CPU == OGRE_CPU_X86 && (OGRE_COMPILER == OGRE_COMPILER_GNUC || OGRE_COMPILER == OGRE_COMPILER_CLANG) && (OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64)
// Mark functions with a GCC attribute that forces 16-byte stack alignment.
#define __OGRE_SIMD_ALIGN_ATTRIBUTE __attribute__((force_align_arg_pointer))

#elif defined(_MSC_VER)
// Fortunately, MSVC will align the stack automatically.
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#else
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#endif
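
// Illustrative sketch, not part of the original header: a SIMD entry point is
// declared with the attribute so that, where the target ABI requires it, the
// compiler realigns the stack on entry. The function name and signature below
// are hypothetical.
__OGRE_SIMD_ALIGN_ATTRIBUTE
void exampleSimdKernel(float* dst, const float* src, unsigned count);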


// Additional platform-dependent header files and declarations.
//
// NOTE: Should be kept in sync with the __OGRE_HAVE_SSE macro.
//

#if __OGRE_HAVE_SSE
#include <xmmintrin.h>
#elif __OGRE_HAVE_NEON
#include "SSE2NEON.h"

// Conversions specific to OGRE: for non-NaN inputs, "not less than or equal"
// is equivalent to "greater than".
#define _mm_cmpnle_ps _mm_cmpgt_ps

// Hand-written: load two floats from p into the upper half of a, keeping the
// lower half unchanged.
OGRE_FORCE_INLINE __m128 _mm_loadh_pi( __m128 a , __m64 const * p )
{
    return vcombine_f32(vget_low_f32(a), vld1_f32((float32_t const *)p));
}
// Hand-written: store the upper two floats of a to p.
OGRE_FORCE_INLINE void _mm_storeh_pi( __m64 * p , __m128 a )
{
    vst1_f32((float32_t *)p, vget_high_f32((float32x4_t)a));
}

// Hand-written: multiply only the lowest elements, passing the upper three
// elements of a through unchanged.
OGRE_FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
{
    a[0] *= b[0];
    return a;
}

// Hand-written: subtract only the lowest elements, passing the upper three
// elements of a through unchanged.
OGRE_FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
{
    a[0] -= b[0];
    return a;
}
#endif



//---------------------------------------------------------------------
// SIMD macros and helpers
//---------------------------------------------------------------------


namespace Ogre {
/** \addtogroup Core
* @{
*/
/** \addtogroup Math
* @{
*/

#if __OGRE_HAVE_SSE || __OGRE_HAVE_NEON

#define __MM_RSQRT_PS(x) _mm_rsqrt_ps(x)


/** Perform the transpose of a 4x4 matrix of single-precision floating-point
    values.
    Arguments r0, r1, r2, and r3 are __m128 values whose elements form the
    corresponding rows of a 4x4 matrix.
    The transposed matrix is returned in arguments r0, r1, r2, and r3, where
    r0 now holds column 0 of the original matrix, r1 holds column 1, etc.
*/
#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                        \
    {                                                               \
        __m128 tmp3, tmp2, tmp1, tmp0;                              \
                                                                    \
                                        /* r00 r01 r02 r03 */       \
                                        /* r10 r11 r12 r13 */       \
                                        /* r20 r21 r22 r23 */       \
                                        /* r30 r31 r32 r33 */       \
                                                                    \
        tmp0 = _mm_unpacklo_ps(r0, r1); /* r00 r10 r01 r11 */       \
        tmp2 = _mm_unpackhi_ps(r0, r1); /* r02 r12 r03 r13 */       \
        tmp1 = _mm_unpacklo_ps(r2, r3); /* r20 r30 r21 r31 */       \
        tmp3 = _mm_unpackhi_ps(r2, r3); /* r22 r32 r23 r33 */       \
                                                                    \
        r0 = _mm_movelh_ps(tmp0, tmp1); /* r00 r10 r20 r30 */       \
        r1 = _mm_movehl_ps(tmp1, tmp0); /* r01 r11 r21 r31 */       \
        r2 = _mm_movelh_ps(tmp2, tmp3); /* r02 r12 r22 r32 */       \
        r3 = _mm_movehl_ps(tmp3, tmp2); /* r03 r13 r23 r33 */       \
    }
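
// Illustrative usage sketch, not part of the original header: transpose a
// row-major 4x4 float array in place. The function name is hypothetical;
// unaligned loads/stores are used so the input needs no special alignment.
static OGRE_FORCE_INLINE void exampleTranspose4x4(float m[16])
{
    __m128 r0 = _mm_loadu_ps(m + 0);
    __m128 r1 = _mm_loadu_ps(m + 4);
    __m128 r2 = _mm_loadu_ps(m + 8);
    __m128 r3 = _mm_loadu_ps(m + 12);
    __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3);
    _mm_storeu_ps(m + 0,  r0);
    _mm_storeu_ps(m + 4,  r1);
    _mm_storeu_ps(m + 8,  r2);
    _mm_storeu_ps(m + 12, r3);
}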

/** Perform the transpose of the contiguously stored rows of a 4x3 matrix into
    a 3x4 matrix of single-precision floating-point values.
    Arguments v0, v1, and v2 are __m128 values whose elements form the
    contiguously stored rows of a 4x3 matrix.
    The transposed matrix is returned in arguments v0, v1, and v2, where v0
    now holds column 0 of the original matrix, v1 holds column 1, etc.
*/
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                            \
    {                                                               \
        __m128 tmp0, tmp1, tmp2;                                    \
                                                                    \
                                        /* r00 r01 r02 r10 */       \
                                        /* r11 r12 r20 r21 */       \
                                        /* r22 r30 r31 r32 */       \
                                                                    \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));    /* r00 r10 r22 r32 */   \
        tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));    /* r01 r02 r11 r12 */   \
        tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));    /* r20 r21 r30 r31 */   \
                                                                    \
        v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0));  /* r00 r10 r20 r30 */   \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r01 r11 r21 r31 */   \
        v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1));  /* r02 r12 r22 r32 */   \
    }

/** Perform the transpose of a 3x4 matrix into the contiguously stored rows of
    a 4x3 matrix of single-precision floating-point values.
    Arguments v0, v1, and v2 are __m128 values whose elements form the
    corresponding columns of a 3x4 matrix.
    The transposed matrix is returned in arguments v0, v1, and v2 as the
    contiguously stored rows of a 4x3 matrix.
*/
#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                            \
    {                                                               \
        __m128 tmp0, tmp1, tmp2;                                    \
                                                                    \
                                        /* r00 r10 r20 r30 */       \
                                        /* r01 r11 r21 r31 */       \
                                        /* r02 r12 r22 r32 */       \
                                                                    \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));    /* r10 r30 r02 r22 */   \
        tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));    /* r11 r31 r12 r32 */   \
        tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));    /* r00 r20 r01 r21 */   \
                                                                    \
        v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0));  /* r00 r01 r02 r10 */   \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r11 r12 r20 r21 */   \
        v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3));  /* r22 r30 r31 r32 */   \
    }
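
// Illustrative usage sketch, not part of the original header: load the twelve
// packed floats of a row-major 4x3 matrix, convert them to column registers,
// and pack them back, demonstrating that the two macros are inverses. The
// function name is hypothetical.
static OGRE_FORCE_INLINE void exampleRepack4x3(const float src[12], float dst[12])
{
    __m128 v0 = _mm_loadu_ps(src + 0);  // r00 r01 r02 r10
    __m128 v1 = _mm_loadu_ps(src + 4);  // r11 r12 r20 r21
    __m128 v2 = _mm_loadu_ps(src + 8);  // r22 r30 r31 r32

    __MM_TRANSPOSE4x3_PS(v0, v1, v2);   // v0..v2 now hold columns 0..2

    __MM_TRANSPOSE3x4_PS(v0, v1, v2);   // back to packed rows
    _mm_storeu_ps(dst + 0, v0);
    _mm_storeu_ps(dst + 4, v1);
    _mm_storeu_ps(dst + 8, v2);
}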

/** Fill a vector of single-precision floating-point values with a selected
    element. Argument 'fp' is a literal digit in [0,3] selecting which element
    of argument 'v' to broadcast.
*/
#define __MM_SELECT(v, fp)                                          \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))

/// Accumulate four vectors of single-precision floating-point values.
#define __MM_ACCUM4_PS(a, b, c, d)                                  \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

/** Perform the dot product between two sets of four vectors of
    single-precision floating-point values.
*/
#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)              \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

/** Perform the dot product between a set of four vectors and a set of three
    vectors of single-precision floating-point values (r3 is added in as-is).
*/
#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                  \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)

/// Accumulate three vectors of single-precision floating-point values.
#define __MM_ACCUM3_PS(a, b, c)                                     \
    _mm_add_ps(_mm_add_ps(a, b), c)

/** Perform the dot product between two sets of three vectors of
    single-precision floating-point values.
*/
#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                      \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))

/// Multiply two vectors and add a third vector (multiply-add).
#define __MM_MADD_PS(a, b, c)                                       \
    _mm_add_ps(_mm_mul_ps(a, b), c)

/// Linear interpolation: a + t * (b - a).
#define __MM_LERP_PS(t, a, b)                                       \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)

/// Multiply two scalar single-precision values and add a third (multiply-add
/// on the lowest element only).
#define __MM_MADD_SS(a, b, c)                                       \
    _mm_add_ss(_mm_mul_ss(a, b), c)

/// Linear interpolation on the lowest element only: a + t * (b - a).
#define __MM_LERP_SS(t, a, b)                                       \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)
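
// Illustrative usage sketch, not part of the original header: transform a
// point (x, y, z, 1) by a matrix given as four column registers c0..c3, using
// __MM_SELECT to broadcast each component and __MM_DOT4x3_PS to accumulate
// c0*x + c1*y + c2*z + c3. The function name is hypothetical.
static OGRE_FORCE_INLINE __m128 exampleTransformPoint(
    __m128 c0, __m128 c1, __m128 c2, __m128 c3, __m128 p)
{
    return __MM_DOT4x3_PS(c0, c1, c2, c3,
                          __MM_SELECT(p, 0),
                          __MM_SELECT(p, 1),
                          __MM_SELECT(p, 2));
}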

/// Same as _mm_load_ps, but can help VC generate more optimised code.
#define __MM_LOAD_PS(p)                                             \
    (*(const __m128*)(p))

/// Same as _mm_store_ps, but can help VC generate more optimised code.
#define __MM_STORE_PS(p, v)                                         \
    (*(__m128*)(p) = (v))


/** Helper to load/store SSE data depending on whether the pointer is aligned.
*/
template <bool aligned = false>
struct SSEMemoryAccessor
{
    static OGRE_FORCE_INLINE __m128 load(const float *p)
    {
        return _mm_loadu_ps(p);
    }
    static OGRE_FORCE_INLINE void store(float *p, const __m128& v)
    {
        _mm_storeu_ps(p, v);
    }
};
// Specialised accessor for 16-byte aligned pointers
template <>
struct SSEMemoryAccessor<true>
{
    static OGRE_FORCE_INLINE const __m128& load(const float *p)
    {
        return __MM_LOAD_PS(p);
    }
    static OGRE_FORCE_INLINE void store(float *p, const __m128& v)
    {
        __MM_STORE_PS(p, v);
    }
};
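
// Illustrative usage sketch, not part of the original header: scale a packet
// of four floats in place, selecting aligned or unaligned memory access at
// compile time. The function name is hypothetical.
template <bool aligned>
OGRE_FORCE_INLINE void exampleScale4(float *p, float scale)
{
    __m128 v = SSEMemoryAccessor<aligned>::load(p);
    v = _mm_mul_ps(v, _mm_set1_ps(scale));
    SSEMemoryAccessor<aligned>::store(p, v);
}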

/** Check whether the given pointer is perfectly aligned (16 bytes) for SSE.
*/
static OGRE_FORCE_INLINE bool _isAlignedForSSE(const void *p)
{
    return (((size_t)p) & 15) == 0;
}
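
// Illustrative usage sketch, not part of the original header: dispatch to the
// aligned accessor only when the pointer really is 16-byte aligned. The
// function name is hypothetical.
static OGRE_FORCE_INLINE __m128 exampleLoad4(const float *p)
{
    return _isAlignedForSSE(p) ? SSEMemoryAccessor<true>::load(p)
                               : SSEMemoryAccessor<false>::load(p);
}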

// Macro to check that the stack is aligned for SSE
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()        \
    {                                               \
        __m128 test = {};                           \
        assert(_isAlignedForSSE(&test));            \
    }

#else   // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif  // OGRE_DEBUG_MODE
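
// Illustrative usage sketch, not part of the original header: a SIMD routine
// asserts (in debug builds) that the stack really is 16-byte aligned before
// it touches any __m128 locals. The function name is hypothetical.
__OGRE_SIMD_ALIGN_ATTRIBUTE
static OGRE_FORCE_INLINE void exampleZero4(float *p)
{
    __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
    __m128 zero = _mm_setzero_ps();
    _mm_storeu_ps(p, zero);
}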


#endif // __OGRE_HAVE_SSE || __OGRE_HAVE_NEON
/** @} */
/** @} */

}

#endif // __SIMDHelper_H__