Coverage Report

Created: 2025-08-29 06:18

/src/ogre/OgreMain/src/OgreSIMDHelper.h
Every instrumented line in this file has an execution count of 0 (the file is entirely uncovered).
/*
-----------------------------------------------------------------------------
This source file is part of OGRE
    (Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2014 Torus Knot Software Ltd

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
-----------------------------------------------------------------------------
*/
#ifndef __SIMDHelper_H__
#define __SIMDHelper_H__

// Stack-alignment hackery.
//
// If the macro __OGRE_SIMD_ALIGN_STACK is defined, special code is needed
// to ensure the stack is aligned to a 16-byte boundary.
//
// Note:
//   This macro can only guarantee that the callee stack pointer (esp) is
// aligned to a 16-byte boundary, not the frame pointer (ebp). Because most
// compilers may use the frame pointer to access stack variables, functions
// that require alignment need to be wrapped in an extra function call.
//
#if defined(__INTEL_COMPILER)
// For Intel's compiler, simply calling alloca seems to do the right
// thing. The size of the allocated block seems to be irrelevant.
#define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#elif OGRE_CPU == OGRE_CPU_X86 && (OGRE_COMPILER == OGRE_COMPILER_GNUC || OGRE_COMPILER == OGRE_COMPILER_CLANG) && (OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64)
// Mark functions with a GCC attribute to force stack alignment to 16 bytes.
#define __OGRE_SIMD_ALIGN_ATTRIBUTE __attribute__((force_align_arg_pointer))

#elif defined(_MSC_VER)
// Fortunately, MSVC will align the stack automatically.
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#else
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#endif
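
// Usage sketch (editorial illustration; the function name is hypothetical):
// the attribute goes on the definition of any function that spills __m128
// locals to the stack, so the compiler realigns the incoming stack first.
//
//     __OGRE_SIMD_ALIGN_ATTRIBUTE void processVerticesSSE(float* dst, const float* src, size_t n)
//     {
//         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
//         // ... SSE code with __m128 locals ...
//     }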


// Additional platform-dependent header files and declarations.
//
// NOTE: Should be kept in sync with the __OGRE_HAVE_SSE macro.
//

#if __OGRE_HAVE_SSE
#include <xmmintrin.h>
#elif __OGRE_HAVE_NEON
#include "SSE2NEON.h"

// Some conversions custom to OGRE.
#define _mm_cmpnle_ps _mm_cmpgt_ps

// Self-written: loads two floats from p into the high half of a, keeping the low half.
OGRE_FORCE_INLINE __m128 _mm_loadh_pi( __m128 a , __m64 const * p )
{
  return vcombine_f32(vget_low_f32(a), vld1_f32((float32_t const *)p));
}
// Self-written: stores the high half of a to two floats at p.
OGRE_FORCE_INLINE void _mm_storeh_pi( __m64 * p , __m128 a )
{
  vst1_f32((float32_t *)p, vget_high_f32((float32x4_t)a));
}

// Multiplies only the lowest elements; the upper three lanes of a pass through.
OGRE_FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
{
    a[0] *= b[0];
    return a;
}

// Subtracts only the lowest elements; the upper three lanes of a pass through.
OGRE_FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
{
    a[0] -= b[0];
    return a;
}
#endif
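
// Semantics sketch (editorial illustration): given a = {a0 a1 a2 a3} and
// p pointing at {p0 p1}, the shims above behave like the SSE intrinsics of
// the same names:
//
//     float hi[2] = {3.0f, 4.0f};
//     __m128 v = _mm_loadh_pi(_mm_set_ps(0, 0, 2.0f, 1.0f), (__m64 const *)hi);
//     // v = {1 2 3 4}; _mm_storeh_pi((__m64*)hi, v) writes {3 4} back.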



//---------------------------------------------------------------------
// SIMD macros and helpers
//---------------------------------------------------------------------


namespace Ogre {
    /** \addtogroup Core
    *  @{
    */
    /** \addtogroup Math
    *  @{
    */

#if __OGRE_HAVE_SSE || __OGRE_HAVE_NEON

#define __MM_RSQRT_PS(x)    _mm_rsqrt_ps(x)
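
// Editorial note: _mm_rsqrt_ps returns only an ~12-bit approximation. Where
// more precision is needed, a single Newton-Raphson step is the standard
// refinement; a sketch (the helper name is hypothetical, not part of this
// header):
//
//     OGRE_FORCE_INLINE __m128 __mm_rsqrt_nr_ps(__m128 x)
//     {
//         __m128 r = _mm_rsqrt_ps(x);                 // initial estimate
//         // r' = 0.5 * r * (3 - x * r * r)
//         return _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(0.5f), r),
//             _mm_sub_ps(_mm_set1_ps(3.0f), _mm_mul_ps(x, _mm_mul_ps(r, r))));
//     }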

/** Performs the transpose of a 4x4 matrix of single-precision floating
    point values.
    Arguments r0, r1, r2, and r3 are __m128 values whose elements
    form the corresponding rows of a 4x4 matrix.
    The matrix transpose is returned in arguments r0, r1, r2, and
    r3, where r0 now holds column 0 of the original matrix, r1 now
    holds column 1 of the original matrix, etc.
*/
#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                                            \
    {                                                                                   \
        __m128 tmp3, tmp2, tmp1, tmp0;                                                  \
                                                                                        \
                                                            /* r00 r01 r02 r03 */       \
                                                            /* r10 r11 r12 r13 */       \
                                                            /* r20 r21 r22 r23 */       \
                                                            /* r30 r31 r32 r33 */       \
                                                                                        \
        tmp0 = _mm_unpacklo_ps(r0, r1);                       /* r00 r10 r01 r11 */     \
        tmp2 = _mm_unpackhi_ps(r0, r1);                       /* r02 r12 r03 r13 */     \
        tmp1 = _mm_unpacklo_ps(r2, r3);                       /* r20 r30 r21 r31 */     \
        tmp3 = _mm_unpackhi_ps(r2, r3);                       /* r22 r32 r23 r33 */     \
                                                                                        \
        r0 = _mm_movelh_ps(tmp0, tmp1);                         /* r00 r10 r20 r30 */   \
        r1 = _mm_movehl_ps(tmp1, tmp0);                         /* r01 r11 r21 r31 */   \
        r2 = _mm_movelh_ps(tmp2, tmp3);                         /* r02 r12 r22 r32 */   \
        r3 = _mm_movehl_ps(tmp3, tmp2);                         /* r03 r13 r23 r33 */   \
    }
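
// Usage sketch (editorial illustration; 'm' and r0..r3 are hypothetical):
// transposing a row-major 4x4 matrix held in four registers.
//
//     float m[16];                            // assumed 16-byte aligned, row-major
//     __m128 r0 = __MM_LOAD_PS(m + 0);
//     __m128 r1 = __MM_LOAD_PS(m + 4);
//     __m128 r2 = __MM_LOAD_PS(m + 8);
//     __m128 r3 = __MM_LOAD_PS(m + 12);
//     __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3);   // r0..r3 now hold the columns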

/** Performs the transpose of the contiguously stored rows of a 4x3 matrix
    into a 3x4 matrix of single-precision floating point values.
    Arguments v0, v1, and v2 are __m128 values whose elements form the
    corresponding contiguously stored rows of a 4x3 matrix.
    The matrix transpose is returned in arguments v0, v1, and v2, where
    v0 now holds column 0 of the original matrix, v1 now holds column 1
    of the original matrix, etc.
*/
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                                                \
    {                                                                                   \
        __m128 tmp0, tmp1, tmp2;                                                        \
                                                                                        \
                                                            /* r00 r01 r02 r10 */       \
                                                            /* r11 r12 r20 r21 */       \
                                                            /* r22 r30 r31 r32 */       \
                                                                                        \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));  /* r00 r10 r22 r32 */     \
        tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));  /* r01 r02 r11 r12 */     \
        tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));  /* r20 r21 r30 r31 */     \
                                                                                        \
        v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0));  /* r00 r10 r20 r30 */   \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r01 r11 r21 r31 */   \
        v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1));  /* r02 r12 r22 r32 */   \
    }

/** Performs the transpose of a 3x4 matrix into the contiguously stored rows
    of a 4x3 matrix of single-precision floating point values.
    Arguments v0, v1, and v2 are __m128 values whose elements form the
    corresponding columns of a 3x4 matrix.
    The matrix transpose is returned in arguments v0, v1, and v2, as the
    contiguously stored rows of a 4x3 matrix.
*/
#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                                            \
    {                                                                               \
        __m128 tmp0, tmp1, tmp2;                                                    \
                                                                                    \
                                                            /* r00 r10 r20 r30 */   \
                                                            /* r01 r11 r21 r31 */   \
                                                            /* r02 r12 r22 r32 */   \
                                                                                    \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));  /* r10 r30 r02 r22 */   \
        tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));  /* r11 r31 r12 r32 */   \
        tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));  /* r00 r20 r01 r21 */   \
                                                                                    \
        v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0));  /* r00 r01 r02 r10 */   \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r11 r12 r20 r21 */   \
        v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3));  /* r22 r30 r31 r32 */   \
    }
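
// Usage sketch (editorial illustration): __MM_TRANSPOSE4x3_PS and
// __MM_TRANSPOSE3x4_PS are inverses, so loading the 12 contiguous floats of
// a 4x3 matrix, transposing in, and transposing back out is an identity.
// 'mat12' is a hypothetical float[12]:
//
//     __m128 v0 = _mm_loadu_ps(mat12 + 0);
//     __m128 v1 = _mm_loadu_ps(mat12 + 4);
//     __m128 v2 = _mm_loadu_ps(mat12 + 8);
//     __MM_TRANSPOSE4x3_PS(v0, v1, v2);   // v0..v2 now hold the three columns
//     __MM_TRANSPOSE3x4_PS(v0, v1, v2);   // back to contiguous row storage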

/** Fill a vector of single-precision floating point values with a selected
    element. Argument 'fp' is a digit in [0, 3] selecting which element of
    argument 'v' to replicate into all four lanes.
*/
#define __MM_SELECT(v, fp)                                                          \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))
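
// Usage sketch (editorial illustration): broadcasting element 2 of a vector
// into all four lanes.
//
//     __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   // v = {1 2 3 4}
//     __m128 z = __MM_SELECT(v, 2);                    // z = {3 3 3 3}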
/// Accumulate four vectors of single-precision floating point values.
#define __MM_ACCUM4_PS(a, b, c, d)                                                  \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

/** Performs a dot product between two sets of four vectors of
    single-precision floating point values.
*/
#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)                              \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

/** Performs a dot product between four vectors and three vectors of
    single-precision floating point values; the fourth term (r3) is added
    in unmultiplied, i.e. the implicit fourth component is 1.
*/
#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                                  \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)
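
// Usage sketch (editorial illustration; pt and c0..c3 are hypothetical):
// transforming a point (x, y, z, 1) by a matrix held as column registers
// c0..c3, where the unmultiplied c3 term supplies the translation:
//
//     __m128 x = __MM_SELECT(pt, 0);
//     __m128 y = __MM_SELECT(pt, 1);
//     __m128 z = __MM_SELECT(pt, 2);
//     __m128 transformed = __MM_DOT4x3_PS(c0, c1, c2, c3, x, y, z);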

/// Accumulate three vectors of single-precision floating point values.
#define __MM_ACCUM3_PS(a, b, c)                                                     \
    _mm_add_ps(_mm_add_ps(a, b), c)

/** Performs a dot product between two sets of three vectors of
    single-precision floating point values.
*/
#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                                      \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))

/// Multiply two vectors and add a third: a * b + c.
#define __MM_MADD_PS(a, b, c)                                                       \
    _mm_add_ps(_mm_mul_ps(a, b), c)

/// Linear interpolation: a + t * (b - a).
#define __MM_LERP_PS(t, a, b)                                                       \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)
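
// Usage sketch (editorial illustration; posA and posB are hypothetical
// __m128 keyframe values): blending four floats at a time with a weight
// t in [0, 1].
//
//     __m128 t = _mm_set1_ps(0.25f);
//     __m128 p = __MM_LERP_PS(t, posA, posB);   // posA + 0.25 * (posB - posA)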

/// Multiply two single floating point values and add a third (lowest lane only): a * b + c.
#define __MM_MADD_SS(a, b, c)                                                       \
    _mm_add_ss(_mm_mul_ss(a, b), c)

/// Linear interpolation (lowest lane only): a + t * (b - a).
#define __MM_LERP_SS(t, a, b)                                                       \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)

/// Same as _mm_load_ps, but can help VC generate more optimised code.
/// The pointer must be 16-byte aligned.
#define __MM_LOAD_PS(p)                                                             \
    (*(const __m128*)(p))

/// Same as _mm_store_ps, but can help VC generate more optimised code.
/// The pointer must be 16-byte aligned.
#define __MM_STORE_PS(p, v)                                                         \
    (*(__m128*)(p) = (v))


    /** Helper to load/store SSE data depending on whether or not the
        pointer is aligned.
    */
    template <bool aligned = false>
    struct SSEMemoryAccessor
    {
        static OGRE_FORCE_INLINE __m128 load(const float *p)
        {
            return _mm_loadu_ps(p);
        }
        static OGRE_FORCE_INLINE void store(float *p, const __m128& v)
        {
            _mm_storeu_ps(p, v);
        }
    };
    // Special aligned accessor
    template <>
    struct SSEMemoryAccessor<true>
    {
        static OGRE_FORCE_INLINE const __m128& load(const float *p)
        {
            return __MM_LOAD_PS(p);
        }
        static OGRE_FORCE_INLINE void store(float *p, const __m128& v)
        {
            __MM_STORE_PS(p, v);
        }
    };
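
    // Usage sketch (editorial illustration; the function is hypothetical):
    // code templated on alignment dispatches to the right instructions at
    // compile time.
    //
    //     template <bool aligned>
    //     void scaleFloats4(float *dst, const float *src, __m128 factor)
    //     {
    //         __m128 v = SSEMemoryAccessor<aligned>::load(src);
    //         SSEMemoryAccessor<aligned>::store(dst, _mm_mul_ps(v, factor));
    //     }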

    /** Check whether or not the given pointer is perfectly aligned for SSE.
    */
    static OGRE_FORCE_INLINE bool _isAlignedForSSE(const void *p)
    {
        return (((size_t)p) & 15) == 0;
    }
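
    // Usage sketch (editorial illustration, continuing the hypothetical
    // scaleFloats4 above): pick the accessor at run time from the actual
    // pointer values.
    //
    //     if (_isAlignedForSSE(src) && _isAlignedForSSE(dst))
    //         scaleFloats4<true>(dst, src, factor);
    //     else
    //         scaleFloats4<false>(dst, src, factor);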

// Macro to check that the stack is aligned for SSE.
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()        \
    {                                               \
        __m128 test = {};                           \
        assert(_isAlignedForSSE(&test));            \
    }

#else   // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif  // OGRE_DEBUG_MODE


#endif  // __OGRE_HAVE_SSE || __OGRE_HAVE_NEON

    /** @} */
    /** @} */

}

#endif // __SIMDHelper_H__