/src/ogre/OgreMain/src/OgreSIMDHelper.h
/*
-----------------------------------------------------------------------------
This source file is part of OGRE
(Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2014 Torus Knot Software Ltd

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
-----------------------------------------------------------------------------
*/
#ifndef __SIMDHelper_H__
#define __SIMDHelper_H__

// Stack-alignment hackery.
//
// If the macro __OGRE_SIMD_ALIGN_STACK is defined, special code is requested
// to ensure the stack is aligned to a 16-byte boundary.
//
// Note:
//   This macro can only guarantee that the callee stack pointer (esp) is
//   aligned to a 16-byte boundary, not the frame pointer (ebp). Because most
//   compilers may use the frame pointer to access stack variables, functions
//   that require alignment need to be wrapped in an extra function call.
//
#if defined(__INTEL_COMPILER)
// For Intel's compiler, simply calling alloca seems to do the right
// thing. The size of the allocated block seems to be irrelevant.
#define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#elif OGRE_CPU == OGRE_CPU_X86 && (OGRE_COMPILER == OGRE_COMPILER_GNUC || OGRE_COMPILER == OGRE_COMPILER_CLANG) && (OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64)
// Mark functions with a GCC attribute that forces 16-byte stack alignment.
#define __OGRE_SIMD_ALIGN_ATTRIBUTE __attribute__((force_align_arg_pointer))

#elif defined(_MSC_VER)
// Fortunately, MSVC will align the stack automatically.
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#else
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#endif
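
// Illustrative sketch, not part of the original header: a SIMD entry point is
// declared with the attribute so that, where the target ABI requires it, the
// compiler realigns the stack on entry. The function name and signature below
// are hypothetical.
__OGRE_SIMD_ALIGN_ATTRIBUTE
void exampleSimdKernel(float* dst, const float* src, unsigned count);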


// Additional platform-dependent header files and declarations.
//
// NOTE: Should be kept in sync with the __OGRE_HAVE_SSE macro.
//

#if __OGRE_HAVE_SSE
#include <xmmintrin.h>
#elif __OGRE_HAVE_NEON
#include "SSE2NEON.h"

// Conversions specific to OGRE: for non-NaN inputs, "not less than or equal"
// is equivalent to "greater than".
#define _mm_cmpnle_ps _mm_cmpgt_ps

// Hand-written: load two floats from p into the upper half of a, keeping the
// lower half unchanged.
OGRE_FORCE_INLINE __m128 _mm_loadh_pi( __m128 a , __m64 const * p )
{
    return vcombine_f32(vget_low_f32(a), vld1_f32((float32_t const *)p));
}
// Hand-written: store the upper two floats of a to p.
OGRE_FORCE_INLINE void _mm_storeh_pi( __m64 * p , __m128 a )
{
    vst1_f32((float32_t *)p, vget_high_f32((float32x4_t)a));
}

// Hand-written: multiply only the lowest elements, passing the upper three
// elements of a through unchanged.
OGRE_FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
{
    a[0] *= b[0];
    return a;
}

// Hand-written: subtract only the lowest elements, passing the upper three
// elements of a through unchanged.
OGRE_FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
{
    a[0] -= b[0];
    return a;
}
#endif



//---------------------------------------------------------------------
// SIMD macros and helpers
//---------------------------------------------------------------------


namespace Ogre {
/** \addtogroup Core
* @{
*/
/** \addtogroup Math
* @{
*/

#if __OGRE_HAVE_SSE || __OGRE_HAVE_NEON

#define __MM_RSQRT_PS(x) _mm_rsqrt_ps(x)


/** Perform the transpose of a 4x4 matrix of single-precision floating-point
    values.
    Arguments r0, r1, r2, and r3 are __m128 values whose elements form the
    corresponding rows of a 4x4 matrix.
    The transposed matrix is returned in arguments r0, r1, r2, and r3, where
    r0 now holds column 0 of the original matrix, r1 holds column 1, etc.
*/
#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                        \
    {                                                               \
        __m128 tmp3, tmp2, tmp1, tmp0;                              \
                                                                    \
                                        /* r00 r01 r02 r03 */       \
                                        /* r10 r11 r12 r13 */       \
                                        /* r20 r21 r22 r23 */       \
                                        /* r30 r31 r32 r33 */       \
                                                                    \
        tmp0 = _mm_unpacklo_ps(r0, r1); /* r00 r10 r01 r11 */       \
        tmp2 = _mm_unpackhi_ps(r0, r1); /* r02 r12 r03 r13 */       \
        tmp1 = _mm_unpacklo_ps(r2, r3); /* r20 r30 r21 r31 */       \
        tmp3 = _mm_unpackhi_ps(r2, r3); /* r22 r32 r23 r33 */       \
                                                                    \
        r0 = _mm_movelh_ps(tmp0, tmp1); /* r00 r10 r20 r30 */       \
        r1 = _mm_movehl_ps(tmp1, tmp0); /* r01 r11 r21 r31 */       \
        r2 = _mm_movelh_ps(tmp2, tmp3); /* r02 r12 r22 r32 */       \
        r3 = _mm_movehl_ps(tmp3, tmp2); /* r03 r13 r23 r33 */       \
    }
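
// Illustrative usage sketch, not part of the original header: transpose a
// row-major 4x4 float array in place. The function name is hypothetical;
// unaligned loads/stores are used so the input needs no special alignment.
static OGRE_FORCE_INLINE void exampleTranspose4x4(float m[16])
{
    __m128 r0 = _mm_loadu_ps(m + 0);
    __m128 r1 = _mm_loadu_ps(m + 4);
    __m128 r2 = _mm_loadu_ps(m + 8);
    __m128 r3 = _mm_loadu_ps(m + 12);
    __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3);
    _mm_storeu_ps(m + 0,  r0);
    _mm_storeu_ps(m + 4,  r1);
    _mm_storeu_ps(m + 8,  r2);
    _mm_storeu_ps(m + 12, r3);
}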

/** Perform the transpose of the contiguously stored rows of a 4x3 matrix into
    a 3x4 matrix of single-precision floating-point values.
    Arguments v0, v1, and v2 are __m128 values whose elements form the
    contiguously stored rows of a 4x3 matrix.
    The transposed matrix is returned in arguments v0, v1, and v2, where v0
    now holds column 0 of the original matrix, v1 holds column 1, etc.
*/
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                            \
    {                                                               \
        __m128 tmp0, tmp1, tmp2;                                    \
                                                                    \
                                        /* r00 r01 r02 r10 */       \
                                        /* r11 r12 r20 r21 */       \
                                        /* r22 r30 r31 r32 */       \
                                                                    \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));    /* r00 r10 r22 r32 */   \
        tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));    /* r01 r02 r11 r12 */   \
        tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));    /* r20 r21 r30 r31 */   \
                                                                    \
        v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0));  /* r00 r10 r20 r30 */   \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r01 r11 r21 r31 */   \
        v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1));  /* r02 r12 r22 r32 */   \
    }

/** Perform the transpose of a 3x4 matrix into the contiguously stored rows of
    a 4x3 matrix of single-precision floating-point values.
    Arguments v0, v1, and v2 are __m128 values whose elements form the
    corresponding columns of a 3x4 matrix.
    The transposed matrix is returned in arguments v0, v1, and v2 as the
    contiguously stored rows of a 4x3 matrix.
*/
#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                            \
    {                                                               \
        __m128 tmp0, tmp1, tmp2;                                    \
                                                                    \
                                        /* r00 r10 r20 r30 */       \
                                        /* r01 r11 r21 r31 */       \
                                        /* r02 r12 r22 r32 */       \
                                                                    \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));    /* r10 r30 r02 r22 */   \
        tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));    /* r11 r31 r12 r32 */   \
        tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));    /* r00 r20 r01 r21 */   \
                                                                    \
        v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0));  /* r00 r01 r02 r10 */   \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r11 r12 r20 r21 */   \
        v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3));  /* r22 r30 r31 r32 */   \
    }
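
// Illustrative usage sketch, not part of the original header: load the twelve
// packed floats of a row-major 4x3 matrix, convert them to column registers,
// and pack them back, demonstrating that the two macros are inverses. The
// function name is hypothetical.
static OGRE_FORCE_INLINE void exampleRepack4x3(const float src[12], float dst[12])
{
    __m128 v0 = _mm_loadu_ps(src + 0);  // r00 r01 r02 r10
    __m128 v1 = _mm_loadu_ps(src + 4);  // r11 r12 r20 r21
    __m128 v2 = _mm_loadu_ps(src + 8);  // r22 r30 r31 r32

    __MM_TRANSPOSE4x3_PS(v0, v1, v2);   // v0..v2 now hold columns 0..2

    __MM_TRANSPOSE3x4_PS(v0, v1, v2);   // back to packed rows
    _mm_storeu_ps(dst + 0, v0);
    _mm_storeu_ps(dst + 4, v1);
    _mm_storeu_ps(dst + 8, v2);
}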

/** Fill a vector of single-precision floating-point values with a selected
    element. Argument 'fp' is a literal digit in [0,3] selecting which element
    of argument 'v' to broadcast.
*/
#define __MM_SELECT(v, fp)                                          \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))

/// Accumulate four vectors of single-precision floating-point values.
#define __MM_ACCUM4_PS(a, b, c, d)                                  \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

/** Perform the dot product between two sets of four vectors of
    single-precision floating-point values.
*/
#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)              \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

/** Perform the dot product between a set of four vectors and a set of three
    vectors of single-precision floating-point values (r3 is added in as-is).
*/
#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                  \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)

/// Accumulate three vectors of single-precision floating-point values.
#define __MM_ACCUM3_PS(a, b, c)                                     \
    _mm_add_ps(_mm_add_ps(a, b), c)

/** Perform the dot product between two sets of three vectors of
    single-precision floating-point values.
*/
#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                      \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))

/// Multiply two vectors and add a third vector (multiply-add).
#define __MM_MADD_PS(a, b, c)                                       \
    _mm_add_ps(_mm_mul_ps(a, b), c)

/// Linear interpolation: a + t * (b - a).
#define __MM_LERP_PS(t, a, b)                                       \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)

/// Multiply two scalar single-precision values and add a third (multiply-add
/// on the lowest element only).
#define __MM_MADD_SS(a, b, c)                                       \
    _mm_add_ss(_mm_mul_ss(a, b), c)

/// Linear interpolation on the lowest element only: a + t * (b - a).
#define __MM_LERP_SS(t, a, b)                                       \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)
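
// Illustrative usage sketch, not part of the original header: transform a
// point (x, y, z, 1) by a matrix given as four column registers c0..c3, using
// __MM_SELECT to broadcast each component and __MM_DOT4x3_PS to accumulate
// c0*x + c1*y + c2*z + c3. The function name is hypothetical.
static OGRE_FORCE_INLINE __m128 exampleTransformPoint(
    __m128 c0, __m128 c1, __m128 c2, __m128 c3, __m128 p)
{
    return __MM_DOT4x3_PS(c0, c1, c2, c3,
                          __MM_SELECT(p, 0),
                          __MM_SELECT(p, 1),
                          __MM_SELECT(p, 2));
}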

/// Same as _mm_load_ps, but can help VC generate more optimised code.
#define __MM_LOAD_PS(p)                                             \
    (*(const __m128*)(p))

/// Same as _mm_store_ps, but can help VC generate more optimised code.
#define __MM_STORE_PS(p, v)                                         \
    (*(__m128*)(p) = (v))


/** Helper to load/store SSE data depending on whether the pointer is aligned.
*/
template <bool aligned = false>
struct SSEMemoryAccessor
{
    static OGRE_FORCE_INLINE __m128 load(const float *p)
    {
        return _mm_loadu_ps(p);
    }
    static OGRE_FORCE_INLINE void store(float *p, const __m128& v)
    {
        _mm_storeu_ps(p, v);
    }
};
// Specialised accessor for 16-byte aligned pointers
template <>
struct SSEMemoryAccessor<true>
{
    static OGRE_FORCE_INLINE const __m128& load(const float *p)
    {
        return __MM_LOAD_PS(p);
    }
    static OGRE_FORCE_INLINE void store(float *p, const __m128& v)
    {
        __MM_STORE_PS(p, v);
    }
};
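
// Illustrative usage sketch, not part of the original header: scale a packet
// of four floats in place, selecting aligned or unaligned memory access at
// compile time. The function name is hypothetical.
template <bool aligned>
OGRE_FORCE_INLINE void exampleScale4(float *p, float scale)
{
    __m128 v = SSEMemoryAccessor<aligned>::load(p);
    v = _mm_mul_ps(v, _mm_set1_ps(scale));
    SSEMemoryAccessor<aligned>::store(p, v);
}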

/** Check whether the given pointer is perfectly aligned (16 bytes) for SSE.
*/
static OGRE_FORCE_INLINE bool _isAlignedForSSE(const void *p)
{
    return (((size_t)p) & 15) == 0;
}
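
// Illustrative usage sketch, not part of the original header: dispatch to the
// aligned accessor only when the pointer really is 16-byte aligned. The
// function name is hypothetical.
static OGRE_FORCE_INLINE __m128 exampleLoad4(const float *p)
{
    return _isAlignedForSSE(p) ? SSEMemoryAccessor<true>::load(p)
                               : SSEMemoryAccessor<false>::load(p);
}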

// Macro to check that the stack is aligned for SSE
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()        \
    {                                               \
        __m128 test = {};                           \
        assert(_isAlignedForSSE(&test));            \
    }

#else   // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif  // OGRE_DEBUG_MODE
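
// Illustrative usage sketch, not part of the original header: a SIMD routine
// asserts (in debug builds) that the stack really is 16-byte aligned before
// it touches any __m128 locals. The function name is hypothetical.
__OGRE_SIMD_ALIGN_ATTRIBUTE
static OGRE_FORCE_INLINE void exampleZero4(float *p)
{
    __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
    __m128 zero = _mm_setzero_ps();
    _mm_storeu_ps(p, zero);
}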


#endif // __OGRE_HAVE_SSE || __OGRE_HAVE_NEON
/** @} */
/** @} */

}

#endif // __SIMDHelper_H__