/src/ogre/OgreMain/src/OgreOptimisedUtilSSE.cpp
Line | Count | Source
1 | | /* |
2 | | ----------------------------------------------------------------------------- |
3 | | This source file is part of OGRE |
4 | | (Object-oriented Graphics Rendering Engine) |
5 | | For the latest info, see http://www.ogre3d.org/ |
6 | | |
7 | | Copyright (c) 2000-2014 Torus Knot Software Ltd |
8 | | |
9 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
10 | | of this software and associated documentation files (the "Software"), to deal |
11 | | in the Software without restriction, including without limitation the rights |
12 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
13 | | copies of the Software, and to permit persons to whom the Software is |
14 | | furnished to do so, subject to the following conditions: |
15 | | |
16 | | The above copyright notice and this permission notice shall be included in |
17 | | all copies or substantial portions of the Software. |
18 | | |
19 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
20 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
21 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
22 | | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
23 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
24 | | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
25 | | THE SOFTWARE. |
26 | | ----------------------------------------------------------------------------- |
27 | | */ |
28 | | #include "OgreStableHeaders.h" |
29 | | #include "OgreOptimisedUtil.h" |
30 | | |
31 | | |
32 | | #if __OGRE_HAVE_SSE || __OGRE_HAVE_NEON |
33 | | |
34 | | // Keep this include last, to avoid "xmmintrin.h" being pulled in earlier by
35 | | // another header file on some platforms for some reason.
36 | | #include "OgreSIMDHelper.h" |
37 | | |
38 | | // I'd like to merge this file with OgreOptimisedUtil.cpp, but that's
39 | | // impossible when compiling with gcc, because SSE instructions can only
40 | | // be enabled/disabled at file level.
41 | | |
42 | | //------------------------------------------------------------------------- |
43 | | // |
44 | | // The routines implemented in this file are performance oriented,
45 | | // which means squeezing out every cycle we can. This requirement might
46 | | // break some C++/STL rules.
47 | | //
48 | | //
49 | | // Some rules I'd like to respect:
50 | | //
51 | | // 1. Prefer unpacklo/hi and movelh/hl over shuffle, because they
52 | | // save one byte of binary code :)
53 | | // 2. Use add/sub instead of mul.
54 | | // 3. Eliminate function-call prologue code.
55 | | //
56 | | // Finally, anything recommended by the Intel Optimization Reference Manual.
57 | | // |
58 | | //------------------------------------------------------------------------- |
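As an aside on rule 1, here is a minimal standalone illustration (not part of this file) of the shuffle-versus-movelh choice: SHUFPS carries an imm8 byte in its encoding while MOVLHPS and UNPCKLPS/UNPCKHPS do not, so the two forms below produce identical lanes but the second encodes one byte shorter.

#include <xmmintrin.h>

// Illustration only, plain SSE1 intrinsics outside Ogre.
static inline void rule1Example(void)
{
    __m128 a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);                      // lanes a0..a3 = 0 1 2 3
    __m128 b = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);                      // lanes b0..b3 = 4 5 6 7
    __m128 viaShuffle = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 1, 0));  // a0 a1 b0 b1, SHUFPS + imm8
    __m128 viaMovelh  = _mm_movelh_ps(a, b);                            // a0 a1 b0 b1, MOVLHPS, no imm8
    (void)viaShuffle; (void)viaMovelh;
}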
59 | | |
60 | | // Use the unrolled SSE version when the vertex count exceeds this limit
61 | 0 | #define OGRE_SSE_SKINNING_UNROLL_VERTICES 16 |
62 | | |
63 | | namespace Ogre { |
64 | | |
65 | | //------------------------------------------------------------------------- |
66 | | // Local classes |
67 | | //------------------------------------------------------------------------- |
68 | | |
69 | | /** SSE implementation of OptimisedUtil. |
70 | | @note |
71 | | Don't use this class directly, use OptimisedUtil instead. |
72 | | */ |
73 | | class _OgrePrivate OptimisedUtilSSE : public OptimisedUtil |
74 | | { |
75 | | protected: |
76 | | /// Do we prefer to use a general SSE version for position/normal shared buffers? |
77 | | bool mPreferGeneralVersionForSharedBuffers; |
78 | | |
79 | | public: |
80 | | /// Constructor |
81 | | OptimisedUtilSSE(void); |
82 | | |
83 | | /// @copydoc OptimisedUtil::softwareVertexSkinning |
84 | | void __OGRE_SIMD_ALIGN_ATTRIBUTE softwareVertexSkinning( |
85 | | const float *srcPosPtr, float *destPosPtr, |
86 | | const float *srcNormPtr, float *destNormPtr, |
87 | | const float *blendWeightPtr, const unsigned char* blendIndexPtr, |
88 | | const Affine3* const* blendMatrices, |
89 | | size_t srcPosStride, size_t destPosStride, |
90 | | size_t srcNormStride, size_t destNormStride, |
91 | | size_t blendWeightStride, size_t blendIndexStride, |
92 | | size_t numWeightsPerVertex, |
93 | | size_t numVertices) override; |
94 | | |
95 | | /// @copydoc OptimisedUtil::softwareVertexMorph |
96 | | void __OGRE_SIMD_ALIGN_ATTRIBUTE softwareVertexMorph( |
97 | | float t, |
98 | | const float *srcPos1, const float *srcPos2, |
99 | | float *dstPos, |
100 | | size_t pos1VSize, size_t pos2VSize, size_t dstVSize, |
101 | | size_t numVertices, |
102 | | bool morphNormals) override; |
103 | | |
104 | | /// @copydoc OptimisedUtil::concatenateAffineMatrices |
105 | | void __OGRE_SIMD_ALIGN_ATTRIBUTE concatenateAffineMatrices( |
106 | | const Affine3& baseMatrix, |
107 | | const Affine3* srcMatrices, |
108 | | Affine3* dstMatrices, |
109 | | size_t numMatrices) override; |
110 | | |
111 | | /// @copydoc OptimisedUtil::calculateFaceNormals |
112 | | void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateFaceNormals( |
113 | | const float *positions, |
114 | | const EdgeData::Triangle *triangles, |
115 | | Vector4 *faceNormals, |
116 | | size_t numTriangles) override; |
117 | | |
118 | | /// @copydoc OptimisedUtil::calculateLightFacing |
119 | | void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateLightFacing( |
120 | | const Vector4& lightPos, |
121 | | const Vector4* faceNormals, |
122 | | char* lightFacings, |
123 | | size_t numFaces) override; |
124 | | |
125 | | /// @copydoc OptimisedUtil::extrudeVertices |
126 | | void __OGRE_SIMD_ALIGN_ATTRIBUTE extrudeVertices( |
127 | | const Vector4& lightPos, |
128 | | Real extrudeDist, |
129 | | const float* srcPositions, |
130 | | float* destPositions, |
131 | | size_t numVertices) override; |
132 | | }; |
133 | | |
134 | | #if defined(__OGRE_SIMD_ALIGN_STACK) |
135 | | /** Stack-align implementation of OptimisedUtil. |
136 | | |
137 | | User code compiled by icc and gcc might not align the stack
138 | | properly, so we need to ensure the stack is aligned to a 16-byte
139 | | boundary when executing SSE functions.
140 | | @par
141 | | We implement this by aligning the stack after a virtual function call,
142 | | which should guarantee that a call instruction is used rather than
143 | | inlining the underlying function body here (which might cause problems).
144 | | @note |
145 | | Don't use this class directly, use OptimisedUtil instead. |
146 | | */ |
147 | | class _OgrePrivate OptimisedUtilWithStackAlign : public OptimisedUtil |
148 | | { |
149 | | protected: |
150 | | /// The actual implementation |
151 | | OptimisedUtil* mImpl; |
152 | | |
153 | | public: |
154 | | /// Constructor |
155 | | OptimisedUtilWithStackAlign(OptimisedUtil* impl) |
156 | | : mImpl(impl) |
157 | | { |
158 | | } |
159 | | |
160 | | /// @copydoc OptimisedUtil::softwareVertexSkinning |
161 | | virtual void softwareVertexSkinning( |
162 | | const float *srcPosPtr, float *destPosPtr, |
163 | | const float *srcNormPtr, float *destNormPtr, |
164 | | const float *blendWeightPtr, const unsigned char* blendIndexPtr, |
165 | | const Affine3* const* blendMatrices, |
166 | | size_t srcPosStride, size_t destPosStride, |
167 | | size_t srcNormStride, size_t destNormStride, |
168 | | size_t blendWeightStride, size_t blendIndexStride, |
169 | | size_t numWeightsPerVertex, |
170 | | size_t numVertices) |
171 | | { |
172 | | __OGRE_SIMD_ALIGN_STACK(); |
173 | | |
174 | | mImpl->softwareVertexSkinning( |
175 | | srcPosPtr, destPosPtr, |
176 | | srcNormPtr, destNormPtr, |
177 | | blendWeightPtr, blendIndexPtr, |
178 | | blendMatrices, |
179 | | srcPosStride, destPosStride, |
180 | | srcNormStride, destNormStride, |
181 | | blendWeightStride, blendIndexStride, |
182 | | numWeightsPerVertex, |
183 | | numVertices); |
184 | | } |
185 | | |
186 | | /// @copydoc OptimisedUtil::softwareVertexMorph |
187 | | virtual void softwareVertexMorph( |
188 | | float t, |
189 | | const float *srcPos1, const float *srcPos2, |
190 | | float *dstPos, |
191 | | size_t pos1VSize, size_t pos2VSize, size_t dstVSize, |
192 | | size_t numVertices, |
193 | | bool morphNormals) |
194 | | { |
195 | | __OGRE_SIMD_ALIGN_STACK(); |
196 | | |
197 | | mImpl->softwareVertexMorph( |
198 | | t, |
199 | | srcPos1, srcPos2, |
200 | | dstPos, |
201 | | pos1VSize, pos2VSize, dstVSize, |
202 | | numVertices, |
203 | | morphNormals); |
204 | | } |
205 | | |
206 | | /// @copydoc OptimisedUtil::concatenateAffineMatrices |
207 | | virtual void concatenateAffineMatrices( |
208 | | const Affine3& baseMatrix, |
209 | | const Affine3* srcMatrices, |
210 | | Affine3* dstMatrices, |
211 | | size_t numMatrices) |
212 | | { |
213 | | __OGRE_SIMD_ALIGN_STACK(); |
214 | | |
215 | | mImpl->concatenateAffineMatrices( |
216 | | baseMatrix, |
217 | | srcMatrices, |
218 | | dstMatrices, |
219 | | numMatrices); |
220 | | } |
221 | | |
222 | | /// @copydoc OptimisedUtil::calculateFaceNormals |
223 | | virtual void calculateFaceNormals( |
224 | | const float *positions, |
225 | | const EdgeData::Triangle *triangles, |
226 | | Vector4 *faceNormals, |
227 | | size_t numTriangles) |
228 | | { |
229 | | __OGRE_SIMD_ALIGN_STACK(); |
230 | | |
231 | | mImpl->calculateFaceNormals( |
232 | | positions, |
233 | | triangles, |
234 | | faceNormals, |
235 | | numTriangles); |
236 | | } |
237 | | |
238 | | /// @copydoc OptimisedUtil::calculateLightFacing |
239 | | virtual void calculateLightFacing( |
240 | | const Vector4& lightPos, |
241 | | const Vector4* faceNormals, |
242 | | char* lightFacings, |
243 | | size_t numFaces) |
244 | | { |
245 | | __OGRE_SIMD_ALIGN_STACK(); |
246 | | |
247 | | mImpl->calculateLightFacing( |
248 | | lightPos, |
249 | | faceNormals, |
250 | | lightFacings, |
251 | | numFaces); |
252 | | } |
253 | | |
254 | | /// @copydoc OptimisedUtil::extrudeVertices |
255 | | virtual void extrudeVertices( |
256 | | const Vector4& lightPos, |
257 | | Real extrudeDist, |
258 | | const float* srcPositions, |
259 | | float* destPositions, |
260 | | size_t numVertices) |
261 | | { |
262 | | __OGRE_SIMD_ALIGN_STACK(); |
263 | | |
264 | | mImpl->extrudeVertices( |
265 | | lightPos, |
266 | | extrudeDist, |
267 | | srcPositions, |
268 | | destPositions, |
269 | | numVertices); |
270 | | } |
271 | | }; |
272 | | #endif // defined(__OGRE_SIMD_ALIGN_STACK)
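For context, the guarantee this wrapper provides can also be pictured with a compiler attribute; the sketch below is gcc-specific on 32-bit x86 and is only an illustration of the underlying problem, not Ogre's actual __OGRE_SIMD_ALIGN_STACK definition.

// Sketch: force a 16-byte aligned stack frame on entry, so __m128 spills inside
// SSE code reached from here are safely aligned (assumption: gcc, 32-bit x86).
#if defined(__GNUC__) && defined(__i386__)
__attribute__((force_align_arg_pointer))
#endif
static void entryPointIntoSseCode(void)
{
    // ... call SSE routines here; the compiler realigns the stack on entry ...
}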
273 | | |
274 | | //--------------------------------------------------------------------- |
275 | | // Some useful macros for collapsing matrices.
276 | | //--------------------------------------------------------------------- |
277 | | |
278 | | #define __LOAD_MATRIX(row0, row1, row2, pMatrix) \ |
279 | 0 | { \ |
280 | 0 | row0 = __MM_LOAD_PS((*pMatrix)[0]); \ |
281 | 0 | row1 = __MM_LOAD_PS((*pMatrix)[1]); \ |
282 | 0 | row2 = __MM_LOAD_PS((*pMatrix)[2]); \ |
283 | 0 | } |
284 | | |
285 | | #define __LERP_MATRIX(row0, row1, row2, weight, pMatrix) \ |
286 | 0 | { \ |
287 | 0 | row0 = __MM_LERP_PS(weight, row0, __MM_LOAD_PS((*pMatrix)[0])); \ |
288 | 0 | row1 = __MM_LERP_PS(weight, row1, __MM_LOAD_PS((*pMatrix)[1])); \ |
289 | 0 | row2 = __MM_LERP_PS(weight, row2, __MM_LOAD_PS((*pMatrix)[2])); \ |
290 | 0 | } |
291 | | |
292 | | #define __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix) \ |
293 | 0 | { \ |
294 | 0 | row0 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[0]), weight); \ |
295 | 0 | row1 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[1]), weight); \ |
296 | 0 | row2 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[2]), weight); \ |
297 | 0 | } |
298 | | |
299 | | #define __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix) \ |
300 | 0 | { \ |
301 | 0 | row0 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[0]), weight, row0); \ |
302 | 0 | row1 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[1]), weight, row1); \ |
303 | 0 | row2 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[2]), weight, row2); \ |
304 | 0 | } |
305 | | |
306 | | //--------------------------------------------------------------------- |
307 | | // The following macros require variables declared by the caller.
308 | | //
309 | | // :) Thanks to the row-major matrices used in Ogre, accessing affine matrices is easy.
310 | | //--------------------------------------------------------------------- |
311 | | |
312 | | /** Collapse a one-weight matrix.
313 | | The multiply by the weight is eliminated, since the weight is always equal to one.
314 | | */ |
315 | | #define __COLLAPSE_MATRIX_W1(row0, row1, row2, ppMatrices, pIndices, pWeights) \ |
316 | 0 | { \ |
317 | 0 | pMatrix0 = blendMatrices[pIndices[0]]; \ |
318 | 0 | __LOAD_MATRIX(row0, row1, row2, pMatrix0); \ |
319 | 0 | } |
320 | | |
321 | | /** Collapse a two-weight matrix.
322 | | Because the accumulated weights are equal to one, using a lerp
323 | | replaces two multiplies and one add with one multiply and two adds.
324 | | */ |
325 | | #define __COLLAPSE_MATRIX_W2(row0, row1, row2, ppMatrices, pIndices, pWeights) \ |
326 | 0 | { \ |
327 | 0 | weight = _mm_load_ps1(pWeights + 1); \ |
328 | 0 | pMatrix0 = ppMatrices[pIndices[0]]; \ |
329 | 0 | __LOAD_MATRIX(row0, row1, row2, pMatrix0); \ |
330 | 0 | pMatrix1 = ppMatrices[pIndices[1]]; \ |
331 | 0 | __LERP_MATRIX(row0, row1, row2, weight, pMatrix1); \ |
332 | 0 | } |
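A scalar mirror of the identity the two-weight case exploits (illustration only; __MM_LERP_PS is assumed to compute a + t*(b - a), with its real definition in OgreSIMDHelper.h):

// With w0 + w1 == 1:   w0*a + w1*b  ==  a + w1*(b - a)
static inline float blendTwoDirect(float a, float b, float w0, float w1) { return w0 * a + w1 * b; }
static inline float blendTwoLerp(float a, float b, float w1)             { return a + w1 * (b - a); }
// blendTwoDirect(a, b, 1.0f - w1, w1) matches blendTwoLerp(a, b, w1) up to rounding,
// trading two multiplies and one add for one multiply and two adds per matrix element.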
333 | | |
334 | | /** Collapse a three-weight matrix.
335 | | */ |
336 | | #define __COLLAPSE_MATRIX_W3(row0, row1, row2, ppMatrices, pIndices, pWeights) \ |
337 | 0 | { \ |
338 | 0 | weight = _mm_load_ps1(pWeights + 0); \ |
339 | 0 | pMatrix0 = ppMatrices[pIndices[0]]; \ |
340 | 0 | __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0); \ |
341 | 0 | weight = _mm_load_ps1(pWeights + 1); \ |
342 | 0 | pMatrix1 = ppMatrices[pIndices[1]]; \ |
343 | 0 | __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1); \ |
344 | 0 | weight = _mm_load_ps1(pWeights + 2); \ |
345 | 0 | pMatrix2 = ppMatrices[pIndices[2]]; \ |
346 | 0 | __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2); \ |
347 | 0 | } |
348 | | |
349 | | /** Collapse a four-weight matrix.
350 | | */ |
351 | | #define __COLLAPSE_MATRIX_W4(row0, row1, row2, ppMatrices, pIndices, pWeights) \ |
352 | 0 | { \ |
353 | 0 | /* Load four blend weights at once; they will be shuffled later */ \
354 | 0 | weights = _mm_loadu_ps(pWeights); \ |
355 | 0 | \ |
356 | 0 | pMatrix0 = ppMatrices[pIndices[0]]; \ |
357 | 0 | weight = __MM_SELECT(weights, 0); \ |
358 | 0 | __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0); \ |
359 | 0 | pMatrix1 = ppMatrices[pIndices[1]]; \ |
360 | 0 | weight = __MM_SELECT(weights, 1); \ |
361 | 0 | __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1); \ |
362 | 0 | pMatrix2 = ppMatrices[pIndices[2]]; \ |
363 | 0 | weight = __MM_SELECT(weights, 2); \ |
364 | 0 | __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2); \ |
365 | 0 | pMatrix3 = ppMatrices[pIndices[3]]; \ |
366 | 0 | weight = __MM_SELECT(weights, 3); \ |
367 | 0 | __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix3); \ |
368 | 0 | } |
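The four-weight path loads all four weights with a single unaligned load and then broadcasts one lane at a time. __MM_SELECT is assumed to expand to a same-register shuffle roughly like the sketch below (its real definition is in OgreSIMDHelper.h):

// Broadcast lane 2 of the packed weights (illustration only):
static inline __m128 broadcastWeight2(const float* pWeights)
{
    __m128 weights = _mm_loadu_ps(pWeights);                           // w0 w1 w2 w3
    return _mm_shuffle_ps(weights, weights, _MM_SHUFFLE(2, 2, 2, 2));  // w2 w2 w2 w2
}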
369 | | |
370 | | |
371 | | |
372 | | //--------------------------------------------------------------------- |
373 | | // Collapse one matrix at a time. The collapsed matrix is weighted by the
374 | | // blend weights and can then be used to transform the corresponding vertex directly.
375 | | //
376 | | // I'd like to use an inline function instead of a macro here, but I also want to
377 | | // ensure the compiler integrates this code into its callers (in release builds at
378 | | // least), regardless of specific compile options. An inline function works fine
379 | | // for VC, but gcc (3.4.4 here) appears to generate a function call when this is
380 | | // implemented as an inline function, even when compiling with "-O3".
381 | | // |
382 | | #define _collapseOneMatrix( \ |
383 | | m00, m01, m02, \ |
384 | | pBlendWeight, pBlendIndex, \ |
385 | | blendMatrices, \ |
386 | | blendWeightStride, blendIndexStride, \ |
387 | | numWeightsPerVertex) \ |
388 | 0 | { \ |
389 | 0 | /* Important note: if the pMatrixXXX variables are reused frequently, */ \
390 | 0 | /* M$ VC7.1 will generate wrong code here!!! */ \
391 | 0 | const Affine3* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3; \ |
392 | 0 | __m128 weight, weights; \ |
393 | 0 | \ |
394 | 0 | switch (numWeightsPerVertex) \ |
395 | 0 | { \ |
396 | 0 | default: /* Just in case, and to keep the compiler happy */ \
397 | 0 | case 1: \ |
398 | 0 | __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices, \ |
399 | 0 | rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ |
400 | 0 | rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ |
401 | 0 | break; \ |
402 | 0 | \ |
403 | 0 | case 2: \ |
404 | 0 | __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices, \ |
405 | 0 | rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ |
406 | 0 | rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ |
407 | 0 | break; \ |
408 | 0 | \ |
409 | 0 | case 3: \ |
410 | 0 | __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices, \ |
411 | 0 | rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ |
412 | 0 | rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ |
413 | 0 | break; \ |
414 | 0 | \ |
415 | 0 | case 4: \ |
416 | 0 | __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices, \ |
417 | 0 | rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ |
418 | 0 | rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ |
419 | 0 | break; \ |
420 | 0 | } \ |
421 | 0 | } |
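For readers new to this code, a scalar picture of what "collapsing" computes. This is an illustration only; it assumes Affine3 rows are addressable as float arrays, as the __MM_LOAD_PS calls above suggest, and it always multiplies by the weight, even in the one-weight case that the SSE macro skips.

// Weighted sum of the referenced bone matrices; the vertex is then transformed
// once by 'out' instead of once per bone.
static inline void collapseOneMatrixScalar(float out[3][4],
                                           const Affine3* const* blendMatrices,
                                           const unsigned char* pIndices,
                                           const float* pWeights,
                                           size_t numWeightsPerVertex)
{
    for (int r = 0; r < 3; ++r)
        for (int c = 0; c < 4; ++c)
            out[r][c] = 0.0f;
    for (size_t i = 0; i < numWeightsPerVertex; ++i)
    {
        const Affine3& m = *blendMatrices[pIndices[i]];
        for (int r = 0; r < 3; ++r)
            for (int c = 0; c < 4; ++c)
                out[r][c] += pWeights[i] * m[r][c];
    }
}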
422 | | |
423 | | //--------------------------------------------------------------------- |
424 | | // Collapse four matrices at a time. The collapsed matrices are weighted by the
425 | | // blend weights and can then be used to transform the corresponding vertices directly.
426 | | //
427 | | // I'd like to use an inline function instead of a macro here, but I also want to
428 | | // ensure the compiler integrates this code into its callers (in release builds at
429 | | // least), regardless of specific compile options. An inline function works fine
430 | | // for VC, but gcc (3.4.4 here) appears to generate a function call when this is
431 | | // implemented as an inline function, even when compiling with "-O3".
432 | | // |
433 | | #define _collapseFourMatrices( \ |
434 | | m00, m01, m02, \ |
435 | | m10, m11, m12, \ |
436 | | m20, m21, m22, \ |
437 | | m30, m31, m32, \ |
438 | | pBlendWeight, pBlendIndex, \ |
439 | | blendMatrices, \ |
440 | | blendWeightStride, blendIndexStride, \ |
441 | | numWeightsPerVertex) \ |
442 | 0 | { \ |
443 | 0 | /* Important note: if the pMatrixXXX variables are reused frequently, */ \
444 | 0 | /* M$ VC7.1 will generate wrong code here!!! */ \
445 | 0 | const Affine3* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3; \ |
446 | 0 | __m128 weight, weights; \ |
447 | 0 | \ |
448 | 0 | switch (numWeightsPerVertex) \ |
449 | 0 | { \ |
450 | 0 | default: /* Just in case, and to keep the compiler happy */ \
451 | 0 | case 1: \ |
452 | 0 | __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices, \ |
453 | 0 | rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ |
454 | 0 | rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ |
455 | 0 | __COLLAPSE_MATRIX_W1(m10, m11, m12, blendMatrices, \ |
456 | 0 | rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \ |
457 | 0 | rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \ |
458 | 0 | __COLLAPSE_MATRIX_W1(m20, m21, m22, blendMatrices, \ |
459 | 0 | rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \ |
460 | 0 | rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \ |
461 | 0 | __COLLAPSE_MATRIX_W1(m30, m31, m32, blendMatrices, \ |
462 | 0 | rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \ |
463 | 0 | rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \ |
464 | 0 | break; \ |
465 | 0 | \ |
466 | 0 | case 2: \ |
467 | 0 | __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices, \ |
468 | 0 | rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ |
469 | 0 | rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ |
470 | 0 | __COLLAPSE_MATRIX_W2(m10, m11, m12, blendMatrices, \ |
471 | 0 | rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \ |
472 | 0 | rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \ |
473 | 0 | __COLLAPSE_MATRIX_W2(m20, m21, m22, blendMatrices, \ |
474 | 0 | rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \ |
475 | 0 | rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \ |
476 | 0 | __COLLAPSE_MATRIX_W2(m30, m31, m32, blendMatrices, \ |
477 | 0 | rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \ |
478 | 0 | rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \ |
479 | 0 | break; \ |
480 | 0 | \ |
481 | 0 | case 3: \ |
482 | 0 | __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices, \ |
483 | 0 | rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ |
484 | 0 | rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ |
485 | 0 | __COLLAPSE_MATRIX_W3(m10, m11, m12, blendMatrices, \ |
486 | 0 | rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \ |
487 | 0 | rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \ |
488 | 0 | __COLLAPSE_MATRIX_W3(m20, m21, m22, blendMatrices, \ |
489 | 0 | rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \ |
490 | 0 | rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \ |
491 | 0 | __COLLAPSE_MATRIX_W3(m30, m31, m32, blendMatrices, \ |
492 | 0 | rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \ |
493 | 0 | rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \ |
494 | 0 | break; \ |
495 | 0 | \ |
496 | 0 | case 4: \ |
497 | 0 | __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices, \ |
498 | 0 | rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \ |
499 | 0 | rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \ |
500 | 0 | __COLLAPSE_MATRIX_W4(m10, m11, m12, blendMatrices, \ |
501 | 0 | rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \ |
502 | 0 | rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \ |
503 | 0 | __COLLAPSE_MATRIX_W4(m20, m21, m22, blendMatrices, \ |
504 | 0 | rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \ |
505 | 0 | rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \ |
506 | 0 | __COLLAPSE_MATRIX_W4(m30, m31, m32, blendMatrices, \ |
507 | 0 | rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \ |
508 | 0 | rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \ |
509 | 0 | break; \ |
510 | 0 | } \ |
511 | 0 | } |
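rawOffsetPointer and advanceRawPointer, used above and throughout the rest of the file, are byte-stride pointer helpers declared elsewhere in Ogre; their assumed shape is roughly the following sketch.

#include <cstddef>

// Assumed shape only; the real helpers live in Ogre's headers.
template <class T>
static inline const T* rawOffsetPointerSketch(const T* ptr, std::ptrdiff_t offsetInBytes)
{
    return reinterpret_cast<const T*>(reinterpret_cast<const char*>(ptr) + offsetInBytes);
}
template <class T>
static inline void advanceRawPointerSketch(const T*& ptr, std::ptrdiff_t offsetInBytes)
{
    ptr = rawOffsetPointerSketch(ptr, offsetInBytes);
}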
512 | | |
513 | | |
514 | | //--------------------------------------------------------------------- |
515 | | // General SSE version: skins positions, and optionally skins normals.
516 | | static void softwareVertexSkinning_SSE_General( |
517 | | const float *pSrcPos, float *pDestPos, |
518 | | const float *pSrcNorm, float *pDestNorm, |
519 | | const float *pBlendWeight, const unsigned char* pBlendIndex, |
520 | | const Affine3* const* blendMatrices, |
521 | | size_t srcPosStride, size_t destPosStride, |
522 | | size_t srcNormStride, size_t destNormStride, |
523 | | size_t blendWeightStride, size_t blendIndexStride, |
524 | | size_t numWeightsPerVertex, |
525 | | size_t numVertices) |
526 | 0 | { |
527 | 0 | for (size_t i = 0; i < numVertices; ++i) |
528 | 0 | { |
529 | | // Collapse matrices |
530 | 0 | __m128 m00, m01, m02; |
531 | 0 | _collapseOneMatrix( |
532 | 0 | m00, m01, m02, |
533 | 0 | pBlendWeight, pBlendIndex, |
534 | 0 | blendMatrices, |
535 | 0 | blendWeightStride, blendIndexStride, |
536 | 0 | numWeightsPerVertex); |
537 | | |
538 | | // Advance blend weight and index pointers |
539 | 0 | advanceRawPointer(pBlendWeight, blendWeightStride); |
540 | 0 | advanceRawPointer(pBlendIndex, blendIndexStride); |
541 | | |
542 | | //------------------------------------------------------------------ |
543 | | |
544 | | // Rearrange to a column-major matrix with the rows shuffled to the order: Z 0 X Y
545 | 0 | __m128 m03 = _mm_setzero_ps(); |
546 | 0 | __MM_TRANSPOSE4x4_PS(m02, m03, m00, m01); |
547 | | |
548 | | //------------------------------------------------------------------ |
549 | | // Transform position |
550 | | //------------------------------------------------------------------ |
551 | |
552 | 0 | __m128 s0, s1, s2; |
553 | | |
554 | | // Load source position |
555 | 0 | s0 = _mm_load_ps1(pSrcPos + 0); |
556 | 0 | s1 = _mm_load_ps1(pSrcPos + 1); |
557 | 0 | s2 = _mm_load_ps1(pSrcPos + 2); |
558 | | |
559 | | // Transform by collapsed matrix |
560 | 0 | __m128 accumPos = __MM_DOT4x3_PS(m02, m03, m00, m01, s0, s1, s2); // z 0 x y |
561 | | |
562 | | // Store blended position; no alignment requirement
563 | 0 | _mm_storeh_pi((__m64*)pDestPos, accumPos); |
564 | 0 | _mm_store_ss(pDestPos+2, accumPos); |
565 | | |
566 | | // Advance source and target position pointers |
567 | 0 | advanceRawPointer(pSrcPos, srcPosStride); |
568 | 0 | advanceRawPointer(pDestPos, destPosStride); |
569 | | |
570 | | //------------------------------------------------------------------ |
571 | | // Optional blend normal |
572 | | //------------------------------------------------------------------ |
573 | |
574 | 0 | if (pSrcNorm) |
575 | 0 | { |
576 | | // Load source normal |
577 | 0 | s0 = _mm_load_ps1(pSrcNorm + 0); |
578 | 0 | s1 = _mm_load_ps1(pSrcNorm + 1); |
579 | 0 | s2 = _mm_load_ps1(pSrcNorm + 2); |
580 | | |
581 | | // Transform by collapsed matrix |
582 | 0 | __m128 accumNorm = __MM_DOT3x3_PS(m02, m03, m00, s0, s1, s2); // z 0 x y |
583 | | |
584 | | // Normalise normal |
585 | 0 | __m128 tmp = _mm_mul_ps(accumNorm, accumNorm); // z^2 0 x^2 y^2 |
586 | 0 | tmp = __MM_ACCUM3_PS(tmp, |
587 | 0 | _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,1,2)), // x^2 0 y^2 z^2 |
588 | 0 | _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,0,1,3))); // y^2 0 z^2 x^2 |
589 | | // Note: a divide by zero can happen here, but it's negligible
590 | 0 | tmp = __MM_RSQRT_PS(tmp); |
591 | 0 | accumNorm = _mm_mul_ps(accumNorm, tmp); |
592 | | |
593 | | // Store blended normal; no alignment requirement
594 | 0 | _mm_storeh_pi((__m64*)pDestNorm, accumNorm); |
595 | 0 | _mm_store_ss(pDestNorm+2, accumNorm); |
596 | | |
597 | | // Advance source and target normal pointers |
598 | 0 | advanceRawPointer(pSrcNorm, srcNormStride); |
599 | 0 | advanceRawPointer(pDestNorm, destNormStride); |
600 | 0 | } |
601 | 0 | } |
602 | 0 | } |
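The unaligned three-float stores above depend on the accumulator's lane layout; a standalone sketch of the same pattern (illustration only, not part of this file):

// The accumulator lanes are laid out as [z, 0, x, y], so the high 64 bits hold x and y.
static inline void storeXyzFromZ0xy(float* dst, __m128 z0xy)
{
    _mm_storeh_pi(reinterpret_cast<__m64*>(dst), z0xy);  // dst[0] = x, dst[1] = y
    _mm_store_ss(dst + 2, z0xy);                         // dst[2] = z
}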
603 | | //--------------------------------------------------------------------- |
604 | | // Special SSE version: skins a shared buffer of positions and normals,
605 | | // where the buffer is packed.
606 | | template <bool srcAligned, bool destAligned> |
607 | | struct SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed |
608 | | { |
609 | | static void apply( |
610 | | const float* pSrc, float* pDest, |
611 | | const float* pBlendWeight, const unsigned char* pBlendIndex, |
612 | | const Affine3* const* blendMatrices, |
613 | | size_t blendWeightStride, size_t blendIndexStride, |
614 | | size_t numWeightsPerVertex, |
615 | | size_t numIterations) |
616 | 0 | { |
617 | 0 | typedef SSEMemoryAccessor<srcAligned> SrcAccessor; |
618 | 0 | typedef SSEMemoryAccessor<destAligned> DestAccessor; |
619 | | |
620 | | // Blending 4 vertices per-iteration |
621 | 0 | for (size_t i = 0; i < numIterations; ++i) |
622 | 0 | { |
623 | | // Collapse matrices |
624 | 0 | __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32; |
625 | 0 | _collapseFourMatrices( |
626 | 0 | m00, m01, m02, |
627 | 0 | m10, m11, m12, |
628 | 0 | m20, m21, m22, |
629 | 0 | m30, m31, m32, |
630 | 0 | pBlendWeight, pBlendIndex, |
631 | 0 | blendMatrices, |
632 | 0 | blendWeightStride, blendIndexStride, |
633 | 0 | numWeightsPerVertex); |
634 | | |
635 | | // Advance 4 vertices |
636 | 0 | advanceRawPointer(pBlendWeight, 4 * blendWeightStride); |
637 | 0 | advanceRawPointer(pBlendIndex, 4 * blendIndexStride); |
638 | | |
639 | | //------------------------------------------------------------------ |
640 | | // Transform position/normals |
641 | | //------------------------------------------------------------------ |
642 | |
643 | 0 | __m128 s0, s1, s2, s3, s4, s5, d0, d1, d2, d3, d4, d5; |
644 | 0 | __m128 t0, t1, t2, t3, t4, t5; |
645 | | |
646 | | // Load source position/normals |
647 | 0 | s0 = SrcAccessor::load(pSrc + 0); // px0 py0 pz0 nx0 |
648 | 0 | s1 = SrcAccessor::load(pSrc + 4); // ny0 nz0 px1 py1 |
649 | 0 | s2 = SrcAccessor::load(pSrc + 8); // pz1 nx1 ny1 nz1 |
650 | 0 | s3 = SrcAccessor::load(pSrc + 12); // px2 py2 pz2 nx2 |
651 | 0 | s4 = SrcAccessor::load(pSrc + 16); // ny2 nz2 px3 py3 |
652 | 0 | s5 = SrcAccessor::load(pSrc + 20); // pz3 nx3 ny3 nz3 |
653 | | |
654 | | // Rearrange to component-major order for batch calculation.
655 | |
656 | 0 | t0 = _mm_unpacklo_ps(s0, s3); // px0 px2 py0 py2 |
657 | 0 | t1 = _mm_unpackhi_ps(s0, s3); // pz0 pz2 nx0 nx2 |
658 | 0 | t2 = _mm_unpacklo_ps(s1, s4); // ny0 ny2 nz0 nz2 |
659 | 0 | t3 = _mm_unpackhi_ps(s1, s4); // px1 px3 py1 py3 |
660 | 0 | t4 = _mm_unpacklo_ps(s2, s5); // pz1 pz3 nx1 nx3 |
661 | 0 | t5 = _mm_unpackhi_ps(s2, s5); // ny1 ny3 nz1 nz3 |
662 | |
663 | 0 | s0 = _mm_unpacklo_ps(t0, t3); // px0 px1 px2 px3 |
664 | 0 | s1 = _mm_unpackhi_ps(t0, t3); // py0 py1 py2 py3 |
665 | 0 | s2 = _mm_unpacklo_ps(t1, t4); // pz0 pz1 pz2 pz3 |
666 | 0 | s3 = _mm_unpackhi_ps(t1, t4); // nx0 nx1 nx2 nx3 |
667 | 0 | s4 = _mm_unpacklo_ps(t2, t5); // ny0 ny1 ny2 ny3 |
668 | 0 | s5 = _mm_unpackhi_ps(t2, t5); // nz0 nz1 nz2 nz3 |
669 | | |
670 | | // Transform by collapsed matrix |
671 | | |
672 | | // Shuffle row 0 of the four collapsed matrices for calculating the X component
673 | 0 | __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30); |
674 | | |
675 | | // Transform X components |
676 | 0 | d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2); // PX0 PX1 PX2 PX3 |
677 | 0 | d3 = __MM_DOT3x3_PS(m00, m10, m20, s3, s4, s5); // NX0 NX1 NX2 NX3 |
678 | | |
679 | | // Shuffle row 1 of the four collapsed matrices for calculating the Y component
680 | 0 | __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31); |
681 | | |
682 | | // Transform Y components |
683 | 0 | d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2); // PY0 PY1 PY2 PY3 |
684 | 0 | d4 = __MM_DOT3x3_PS(m01, m11, m21, s3, s4, s5); // NY0 NY1 NY2 NY3 |
685 | | |
686 | | // Shuffle row 2 of the four collapsed matrices for calculating the Z component
687 | 0 | __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32); |
688 | | |
689 | | // Transform Z components |
690 | 0 | d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2); // PZ0 PZ1 PZ2 PZ3 |
691 | 0 | d5 = __MM_DOT3x3_PS(m02, m12, m22, s3, s4, s5); // NZ0 NZ1 NZ2 NZ3 |
692 | | |
693 | | // Normalise normals |
694 | 0 | __m128 tmp = __MM_DOT3x3_PS(d3, d4, d5, d3, d4, d5); |
695 | 0 | tmp = __MM_RSQRT_PS(tmp); |
696 | 0 | d3 = _mm_mul_ps(d3, tmp); |
697 | 0 | d4 = _mm_mul_ps(d4, tmp); |
698 | 0 | d5 = _mm_mul_ps(d5, tmp); |
699 | | |
700 | | // Arrange back to continuous format for storing results
701 | |
702 | 0 | t0 = _mm_unpacklo_ps(d0, d1); // PX0 PY0 PX1 PY1 |
703 | 0 | t1 = _mm_unpackhi_ps(d0, d1); // PX2 PY2 PX3 PY3 |
704 | 0 | t2 = _mm_unpacklo_ps(d2, d3); // PZ0 NX0 PZ1 NX1 |
705 | 0 | t3 = _mm_unpackhi_ps(d2, d3); // PZ2 NX2 PZ3 NX3 |
706 | 0 | t4 = _mm_unpacklo_ps(d4, d5); // NY0 NZ0 NY1 NZ1 |
707 | 0 | t5 = _mm_unpackhi_ps(d4, d5); // NY2 NZ2 NY3 NZ3 |
708 | |
709 | 0 | d0 = _mm_movelh_ps(t0, t2); // PX0 PY0 PZ0 NX0 |
710 | 0 | d1 = _mm_shuffle_ps(t4, t0, _MM_SHUFFLE(3,2,1,0)); // NY0 NZ0 PX1 PY1 |
711 | 0 | d2 = _mm_movehl_ps(t4, t2); // PZ1 NX1 NY1 NZ1 |
712 | 0 | d3 = _mm_movelh_ps(t1, t3); // PX2 PY2 PZ2 NX2 |
713 | 0 | d4 = _mm_shuffle_ps(t5, t1, _MM_SHUFFLE(3,2,1,0)); // NY2 NZ2 PX3 PY3 |
714 | 0 | d5 = _mm_movehl_ps(t5, t3); // PZ3 NX3 NY3 NZ3 |
715 | | |
716 | | // Store blended position/normals |
717 | 0 | DestAccessor::store(pDest + 0, d0); |
718 | 0 | DestAccessor::store(pDest + 4, d1); |
719 | 0 | DestAccessor::store(pDest + 8, d2); |
720 | 0 | DestAccessor::store(pDest + 12, d3); |
721 | 0 | DestAccessor::store(pDest + 16, d4); |
722 | 0 | DestAccessor::store(pDest + 20, d5); |
723 | | |
724 | | // Advance 4 vertices |
725 | 0 | pSrc += 4 * (3 + 3); |
726 | 0 | pDest += 4 * (3 + 3); |
727 | 0 | } |
728 | 0 | }
Unexecuted instantiation: Ogre::SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<true, true>::apply(float const*, float*, float const*, unsigned char const*, Ogre::Affine3 const* const*, unsigned long, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: Ogre::SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<false, false>::apply(float const*, float*, float const*, unsigned char const*, Ogre::Affine3 const* const*, unsigned long, unsigned long, unsigned long, unsigned long)
729 | | }; |
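SrcAccessor and DestAccessor come from the SSEMemoryAccessor<aligned> meta-function in OgreSIMDHelper.h; the assumed shape below is shown only so the load/store calls above read naturally, and is not the actual Ogre definition.

template <bool aligned> struct SSEMemoryAccessorSketch;
template <> struct SSEMemoryAccessorSketch<true>     // compiles to movaps
{
    static __m128 load(const float* p)    { return _mm_load_ps(p); }
    static void store(float* p, __m128 v) { _mm_store_ps(p, v); }
};
template <> struct SSEMemoryAccessorSketch<false>    // compiles to movups
{
    static __m128 load(const float* p)    { return _mm_loadu_ps(p); }
    static void store(float* p, __m128 v) { _mm_storeu_ps(p, v); }
};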
730 | | static OGRE_FORCE_INLINE void softwareVertexSkinning_SSE_PosNorm_Shared_Packed( |
731 | | const float* pSrcPos, float* pDestPos, |
732 | | const float* pBlendWeight, const unsigned char* pBlendIndex, |
733 | | const Affine3* const* blendMatrices, |
734 | | size_t blendWeightStride, size_t blendIndexStride, |
735 | | size_t numWeightsPerVertex, |
736 | | size_t numIterations) |
737 | 0 | { |
738 | | // pSrcPos might not be 16-byte aligned, because the alignment shifts by 8 bytes per vertex
739 | | |
740 | | // Instantiate two versions only, since the other alignment combinations are not that important.
741 | 0 | if (_isAlignedForSSE(pSrcPos) && _isAlignedForSSE(pDestPos)) |
742 | 0 | { |
743 | 0 | SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<true, true>::apply( |
744 | 0 | pSrcPos, pDestPos, |
745 | 0 | pBlendWeight, pBlendIndex, |
746 | 0 | blendMatrices, |
747 | 0 | blendWeightStride, blendIndexStride, |
748 | 0 | numWeightsPerVertex, |
749 | 0 | numIterations); |
750 | 0 | } |
751 | 0 | else |
752 | 0 | { |
753 | 0 | SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<false, false>::apply( |
754 | 0 | pSrcPos, pDestPos, |
755 | 0 | pBlendWeight, pBlendIndex, |
756 | 0 | blendMatrices, |
757 | 0 | blendWeightStride, blendIndexStride, |
758 | 0 | numWeightsPerVertex, |
759 | 0 | numIterations); |
760 | 0 | } |
761 | 0 | } |
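The 8-byte shift mentioned in the comment above follows directly from the interleaved stride: each position+normal vertex is (3 + 3) floats = 24 bytes, and 24 mod 16 = 8, so consecutive vertices alternate between 0- and 8-byte offsets within a 16-byte line. A one-line check of that arithmetic (illustration only, assumes C++11 static_assert):

static_assert((3 + 3) * sizeof(float) % 16 == 8,
              "interleaved pos+normal vertices alternate between 0- and 8-byte misalignment");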
762 | | //--------------------------------------------------------------------- |
763 | | // Special SSE version: skins separate buffers of positions and normals,
764 | | // where both the position and normal buffers are packed.
765 | | template <bool srcPosAligned, bool destPosAligned, bool srcNormAligned, bool destNormAligned> |
766 | | struct SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed |
767 | | { |
768 | | static void apply( |
769 | | const float* pSrcPos, float* pDestPos, |
770 | | const float* pSrcNorm, float* pDestNorm, |
771 | | const float* pBlendWeight, const unsigned char* pBlendIndex, |
772 | | const Affine3* const* blendMatrices, |
773 | | size_t blendWeightStride, size_t blendIndexStride, |
774 | | size_t numWeightsPerVertex, |
775 | | size_t numIterations) |
776 | 0 | { |
777 | 0 | typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor; |
778 | 0 | typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor; |
779 | 0 | typedef SSEMemoryAccessor<srcNormAligned> SrcNormAccessor; |
780 | 0 | typedef SSEMemoryAccessor<destNormAligned> DestNormAccessor; |
781 | | |
782 | | // Blending 4 vertices per-iteration |
783 | 0 | for (size_t i = 0; i < numIterations; ++i) |
784 | 0 | { |
785 | | // Collapse matrices |
786 | 0 | __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32; |
787 | 0 | _collapseFourMatrices( |
788 | 0 | m00, m01, m02, |
789 | 0 | m10, m11, m12, |
790 | 0 | m20, m21, m22, |
791 | 0 | m30, m31, m32, |
792 | 0 | pBlendWeight, pBlendIndex, |
793 | 0 | blendMatrices, |
794 | 0 | blendWeightStride, blendIndexStride, |
795 | 0 | numWeightsPerVertex); |
796 | | |
797 | | // Advance 4 vertices |
798 | 0 | advanceRawPointer(pBlendWeight, 4 * blendWeightStride); |
799 | 0 | advanceRawPointer(pBlendIndex, 4 * blendIndexStride); |
800 | | |
801 | | //------------------------------------------------------------------ |
802 | | // Transform positions |
803 | | //------------------------------------------------------------------ |
804 | |
805 | 0 | __m128 s0, s1, s2, d0, d1, d2; |
806 | | |
807 | | // Load source positions |
808 | 0 | s0 = SrcPosAccessor::load(pSrcPos + 0); // x0 y0 z0 x1 |
809 | 0 | s1 = SrcPosAccessor::load(pSrcPos + 4); // y1 z1 x2 y2 |
810 | 0 | s2 = SrcPosAccessor::load(pSrcPos + 8); // z2 x3 y3 z3 |
811 | | |
812 | | // Arrange to 3x4 component-major order for batch calculation
813 | 0 | __MM_TRANSPOSE4x3_PS(s0, s1, s2); |
814 | | |
815 | | // Transform by collapsed matrix |
816 | | |
817 | | // Shuffle row 0 of the four collapsed matrices for calculating the X component
818 | 0 | __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30); |
819 | | |
820 | | // Transform X components |
821 | 0 | d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2); // X0 X1 X2 X3 |
822 | | |
823 | | // Shuffle row 1 of the four collapsed matrices for calculating the Y component
824 | 0 | __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31); |
825 | | |
826 | | // Transform Y components |
827 | 0 | d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2); // Y0 Y1 Y2 Y3 |
828 | | |
829 | | // Shuffle row 2 of the four collapsed matrices for calculating the Z component
830 | 0 | __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32); |
831 | | |
832 | | // Transform Z components |
833 | 0 | d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2); // Z0 Z1 Z2 Z3 |
834 | | |
835 | | // Arrange back to 4x3 continuous format for storing results
836 | 0 | __MM_TRANSPOSE3x4_PS(d0, d1, d2); |
837 | | |
838 | | // Store blended positions |
839 | 0 | DestPosAccessor::store(pDestPos + 0, d0); |
840 | 0 | DestPosAccessor::store(pDestPos + 4, d1); |
841 | 0 | DestPosAccessor::store(pDestPos + 8, d2); |
842 | | |
843 | | // Advance 4 vertices |
844 | 0 | pSrcPos += 4 * 3; |
845 | 0 | pDestPos += 4 * 3; |
846 | | |
847 | | //------------------------------------------------------------------ |
848 | | // Transform normals |
849 | | //------------------------------------------------------------------ |
850 | | |
851 | | // Load source normals |
852 | 0 | s0 = SrcNormAccessor::load(pSrcNorm + 0); // x0 y0 z0 x1 |
853 | 0 | s1 = SrcNormAccessor::load(pSrcNorm + 4); // y1 z1 x2 y2 |
854 | 0 | s2 = SrcNormAccessor::load(pSrcNorm + 8); // z2 x3 y3 z3 |
855 | | |
856 | | // Arrange to 3x4 component-major order for batch calculation
857 | 0 | __MM_TRANSPOSE4x3_PS(s0, s1, s2); |
858 | | |
859 | | // Transform by collapsed and shuffled matrices |
860 | 0 | d0 = __MM_DOT3x3_PS(m00, m10, m20, s0, s1, s2); // X0 X1 X2 X3 |
861 | 0 | d1 = __MM_DOT3x3_PS(m01, m11, m21, s0, s1, s2); // Y0 Y1 Y2 Y3 |
862 | 0 | d2 = __MM_DOT3x3_PS(m02, m12, m22, s0, s1, s2); // Z0 Z1 Z2 Z3 |
863 | | |
864 | | // Normalise normals |
865 | 0 | __m128 tmp = __MM_DOT3x3_PS(d0, d1, d2, d0, d1, d2); |
866 | 0 | tmp = __MM_RSQRT_PS(tmp); |
867 | 0 | d0 = _mm_mul_ps(d0, tmp); |
868 | 0 | d1 = _mm_mul_ps(d1, tmp); |
869 | 0 | d2 = _mm_mul_ps(d2, tmp); |
870 | | |
871 | | // Arrange back to 4x3 continuous format for storing results
872 | 0 | __MM_TRANSPOSE3x4_PS(d0, d1, d2); |
873 | | |
874 | | // Store blended normals |
875 | 0 | DestNormAccessor::store(pDestNorm + 0, d0); |
876 | 0 | DestNormAccessor::store(pDestNorm + 4, d1); |
877 | 0 | DestNormAccessor::store(pDestNorm + 8, d2); |
878 | | |
879 | | // Advance 4 vertices |
880 | 0 | pSrcNorm += 4 * 3; |
881 | 0 | pDestNorm += 4 * 3; |
882 | 0 | } |
883 | 0 | }
Unexecuted instantiation: Ogre::SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, true, true, true>::apply(float const*, float*, float const*, float*, float const*, unsigned char const*, Ogre::Affine3 const* const*, unsigned long, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: Ogre::SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, false, false, false>::apply(float const*, float*, float const*, float*, float const*, unsigned char const*, Ogre::Affine3 const* const*, unsigned long, unsigned long, unsigned long, unsigned long)
884 | | }; |
885 | | static OGRE_FORCE_INLINE void softwareVertexSkinning_SSE_PosNorm_Separated_Packed( |
886 | | const float* pSrcPos, float* pDestPos, |
887 | | const float* pSrcNorm, float* pDestNorm, |
888 | | const float* pBlendWeight, const unsigned char* pBlendIndex, |
889 | | const Affine3* const* blendMatrices, |
890 | | size_t blendWeightStride, size_t blendIndexStride, |
891 | | size_t numWeightsPerVertex, |
892 | | size_t numIterations) |
893 | 0 | { |
894 | 0 | assert(_isAlignedForSSE(pSrcPos)); |
895 | | |
896 | | // Instantiate two versions only, since the other alignment combinations are not that important.
897 | 0 | if (_isAlignedForSSE(pSrcNorm) && _isAlignedForSSE(pDestPos) && _isAlignedForSSE(pDestNorm)) |
898 | 0 | { |
899 | 0 | SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, true, true, true>::apply( |
900 | 0 | pSrcPos, pDestPos, |
901 | 0 | pSrcNorm, pDestNorm, |
902 | 0 | pBlendWeight, pBlendIndex, |
903 | 0 | blendMatrices, |
904 | 0 | blendWeightStride, blendIndexStride, |
905 | 0 | numWeightsPerVertex, |
906 | 0 | numIterations); |
907 | 0 | } |
908 | 0 | else |
909 | 0 | { |
910 | 0 | SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, false, false, false>::apply( |
911 | 0 | pSrcPos, pDestPos, |
912 | 0 | pSrcNorm, pDestNorm, |
913 | 0 | pBlendWeight, pBlendIndex, |
914 | 0 | blendMatrices, |
915 | 0 | blendWeightStride, blendIndexStride, |
916 | 0 | numWeightsPerVertex, |
917 | 0 | numIterations); |
918 | 0 | } |
919 | 0 | } |
920 | | //--------------------------------------------------------------------- |
921 | | // Special SSE version: skins positions only, where the position buffer is
922 | | // packed.
923 | | template <bool srcPosAligned, bool destPosAligned> |
924 | | struct SoftwareVertexSkinning_SSE_PosOnly_Packed |
925 | | { |
926 | | static void apply( |
927 | | const float* pSrcPos, float* pDestPos, |
928 | | const float* pBlendWeight, const unsigned char* pBlendIndex, |
929 | | const Affine3* const* blendMatrices, |
930 | | size_t blendWeightStride, size_t blendIndexStride, |
931 | | size_t numWeightsPerVertex, |
932 | | size_t numIterations) |
933 | 0 | { |
934 | 0 | typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor; |
935 | 0 | typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor; |
936 | | |
937 | | // Blending 4 vertices per-iteration |
938 | 0 | for (size_t i = 0; i < numIterations; ++i) |
939 | 0 | { |
940 | | // Collapse matrices |
941 | 0 | __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32; |
942 | 0 | _collapseFourMatrices( |
943 | 0 | m00, m01, m02, |
944 | 0 | m10, m11, m12, |
945 | 0 | m20, m21, m22, |
946 | 0 | m30, m31, m32, |
947 | 0 | pBlendWeight, pBlendIndex, |
948 | 0 | blendMatrices, |
949 | 0 | blendWeightStride, blendIndexStride, |
950 | 0 | numWeightsPerVertex); |
951 | | |
952 | | // Advance 4 vertices |
953 | 0 | advanceRawPointer(pBlendWeight, 4 * blendWeightStride); |
954 | 0 | advanceRawPointer(pBlendIndex, 4 * blendIndexStride); |
955 | | |
956 | | //------------------------------------------------------------------ |
957 | | // Transform positions |
958 | | //------------------------------------------------------------------ |
959 | |
960 | 0 | __m128 s0, s1, s2, d0, d1, d2; |
961 | | |
962 | | // Load source positions |
963 | 0 | s0 = SrcPosAccessor::load(pSrcPos + 0); // x0 y0 z0 x1 |
964 | 0 | s1 = SrcPosAccessor::load(pSrcPos + 4); // y1 z1 x2 y2 |
965 | 0 | s2 = SrcPosAccessor::load(pSrcPos + 8); // z2 x3 y3 z3 |
966 | | |
967 | | // Arrange to 3x4 component-major order for batch calculation
968 | 0 | __MM_TRANSPOSE4x3_PS(s0, s1, s2); |
969 | | |
970 | | // Transform by collapsed matrix |
971 | | |
972 | | // Shuffle row 0 of the four collapsed matrices for calculating the X component
973 | 0 | __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30); |
974 | | |
975 | | // Transform X components |
976 | 0 | d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2); // X0 X1 X2 X3 |
977 | | |
978 | | // Shuffle row 1 of the four collapsed matrices for calculating the Y component
979 | 0 | __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31); |
980 | | |
981 | | // Transform Y components |
982 | 0 | d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2); // Y0 Y1 Y2 Y3 |
983 | | |
984 | | // Shuffle row 2 of the four collapsed matrices for calculating the Z component
985 | 0 | __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32); |
986 | | |
987 | | // Transform Z components |
988 | 0 | d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2); // Z0 Z1 Z2 Z3 |
989 | | |
990 | | // Arrange back to 4x3 continuous format for storing results
991 | 0 | __MM_TRANSPOSE3x4_PS(d0, d1, d2); |
992 | | |
993 | | // Store blended positions |
994 | 0 | DestPosAccessor::store(pDestPos + 0, d0); |
995 | 0 | DestPosAccessor::store(pDestPos + 4, d1); |
996 | 0 | DestPosAccessor::store(pDestPos + 8, d2); |
997 | | |
998 | | // Advance 4 vertices |
999 | 0 | pSrcPos += 4 * 3; |
1000 | 0 | pDestPos += 4 * 3; |
1001 | 0 | } |
1002 | 0 | }
Unexecuted instantiation: Ogre::SoftwareVertexSkinning_SSE_PosOnly_Packed<true, true>::apply(float const*, float*, float const*, unsigned char const*, Ogre::Affine3 const* const*, unsigned long, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: Ogre::SoftwareVertexSkinning_SSE_PosOnly_Packed<true, false>::apply(float const*, float*, float const*, unsigned char const*, Ogre::Affine3 const* const*, unsigned long, unsigned long, unsigned long, unsigned long)
1003 | | }; |
1004 | | static OGRE_FORCE_INLINE void softwareVertexSkinning_SSE_PosOnly_Packed( |
1005 | | const float* pSrcPos, float* pDestPos, |
1006 | | const float* pBlendWeight, const unsigned char* pBlendIndex, |
1007 | | const Affine3* const* blendMatrices, |
1008 | | size_t blendWeightStride, size_t blendIndexStride, |
1009 | | size_t numWeightsPerVertex, |
1010 | | size_t numIterations) |
1011 | 0 | { |
1012 | 0 | assert(_isAlignedForSSE(pSrcPos)); |
1013 | | |
1014 | | // Instantiate two versions only, since the other alignment combinations are not that important.
1015 | 0 | if (_isAlignedForSSE(pDestPos)) |
1016 | 0 | { |
1017 | 0 | SoftwareVertexSkinning_SSE_PosOnly_Packed<true, true>::apply( |
1018 | 0 | pSrcPos, pDestPos, |
1019 | 0 | pBlendWeight, pBlendIndex, |
1020 | 0 | blendMatrices, |
1021 | 0 | blendWeightStride, blendIndexStride, |
1022 | 0 | numWeightsPerVertex, |
1023 | 0 | numIterations); |
1024 | 0 | } |
1025 | 0 | else |
1026 | 0 | { |
1027 | 0 | SoftwareVertexSkinning_SSE_PosOnly_Packed<true, false>::apply( |
1028 | 0 | pSrcPos, pDestPos, |
1029 | 0 | pBlendWeight, pBlendIndex, |
1030 | 0 | blendMatrices, |
1031 | 0 | blendWeightStride, blendIndexStride, |
1032 | 0 | numWeightsPerVertex, |
1033 | 0 | numIterations); |
1034 | 0 | } |
1035 | 0 | } |
1036 | | //--------------------------------------------------------------------- |
1037 | | //--------------------------------------------------------------------- |
1038 | | //--------------------------------------------------------------------- |
1039 | | OptimisedUtilSSE::OptimisedUtilSSE(void) |
1040 | 2 | : mPreferGeneralVersionForSharedBuffers(false) |
1041 | 2 | { |
1042 | | // For the AMD Athlon XP (but not the Athlon 64), it's preferable to never use the
1043 | | // unrolled version for shared buffers at all; I guess that version
1044 | | // runs out of usable CPU registers, or hits an L1/L2 cache related problem, causing
1045 | | // a slight performance loss compared to the general version.
1046 | | // |
1047 | 2 | #if __OGRE_HAVE_NEON == 0 |
1048 | 2 | if (PlatformInformation::getCpuIdentifier().find("AuthenticAMD") != String::npos) |
1049 | 0 | { |
1050 | | // How can I check that it's an Athlon XP and not an Athlon 64?
1051 | | // OK, just test whether it supports SSE2/SSE3 or not; if not,
1052 | | // assume the general version is faster than the unrolled version :)
1053 | | // |
1054 | 0 | if (!(PlatformInformation::getCpuFeatures() & |
1055 | 0 | (PlatformInformation::CPU_FEATURE_SSE2 | PlatformInformation::CPU_FEATURE_SSE3))) |
1056 | 0 | { |
1057 | 0 | mPreferGeneralVersionForSharedBuffers = true; |
1058 | 0 | } |
1059 | 0 | } |
1060 | 2 | #endif |
1061 | 2 | } |
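Callers are not expected to construct OptimisedUtilSSE directly (see the class note above); they go through the OptimisedUtil accessor. A usage sketch, assuming OptimisedUtil::getImplementation() is the accessor declared in OgreOptimisedUtil.h and that the buffers are tightly packed with a 12-byte position stride:

static void morphPositions(const float* src1, const float* src2, float* dst,
                           size_t numVertices, float t)
{
    Ogre::OptimisedUtil* util = Ogre::OptimisedUtil::getImplementation();
    util->softwareVertexMorph(t, src1, src2, dst,
                              12, 12, 12,   // pos1VSize, pos2VSize, dstVSize (bytes per vertex)
                              numVertices,
                              false);       // morphNormals
}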
1062 | | //--------------------------------------------------------------------- |
1063 | | void OptimisedUtilSSE::softwareVertexSkinning( |
1064 | | const float *pSrcPos, float *pDestPos, |
1065 | | const float *pSrcNorm, float *pDestNorm, |
1066 | | const float *pBlendWeight, const unsigned char* pBlendIndex, |
1067 | | const Affine3* const* blendMatrices, |
1068 | | size_t srcPosStride, size_t destPosStride, |
1069 | | size_t srcNormStride, size_t destNormStride, |
1070 | | size_t blendWeightStride, size_t blendIndexStride, |
1071 | | size_t numWeightsPerVertex, |
1072 | | size_t numVertices) |
1073 | 0 | { |
1074 | |
1075 | 0 | __OGRE_CHECK_STACK_ALIGNED_FOR_SSE(); |
1076 | | |
1077 | | // All position/normal pointers should be perfectly aligned, but still check here
1078 | | // to guard against hardware buffers allocated by a potentially buggy driver that
1079 | | // doesn't support alignment properly.
1080 | | // Because we use a meta-function technique here, the code is easy to maintain
1081 | | // and still covers every possible alignment combination.
1082 | | // |
1083 | | |
1084 | | // Use unrolled routines only if there are a lot of vertices
1085 | 0 | if (numVertices > OGRE_SSE_SKINNING_UNROLL_VERTICES) |
1086 | 0 | { |
1087 | 0 | if (pSrcNorm) |
1088 | 0 | { |
1089 | | // Blend position and normal |
1090 | |
1091 | 0 | if (!mPreferGeneralVersionForSharedBuffers && |
1092 | 0 | srcPosStride == sizeof(float) * (3 + 3) && destPosStride == sizeof(float) * (3 + 3) && |
1093 | 0 | pSrcNorm == pSrcPos + 3 && pDestNorm == pDestPos + 3) |
1094 | 0 | { |
1095 | | // Position and normal share a packed buffer
1096 | |
1097 | 0 | size_t srcPosAlign = (size_t)pSrcPos & 15; |
1098 | 0 | assert((srcPosAlign & 3) == 0); |
1099 | | |
1100 | | // Blend unaligned vertices with general SIMD routine |
1101 | 0 | if (srcPosAlign == 8) // Because the alignment shifts by 8 bytes per vertex
1102 | 0 | { |
1103 | 0 | size_t count = srcPosAlign / 8; |
1104 | 0 | numVertices -= count; |
1105 | 0 | softwareVertexSkinning_SSE_General( |
1106 | 0 | pSrcPos, pDestPos, |
1107 | 0 | pSrcNorm, pDestNorm, |
1108 | 0 | pBlendWeight, pBlendIndex, |
1109 | 0 | blendMatrices, |
1110 | 0 | srcPosStride, destPosStride, |
1111 | 0 | srcNormStride, destNormStride, |
1112 | 0 | blendWeightStride, blendIndexStride, |
1113 | 0 | numWeightsPerVertex, |
1114 | 0 | count); |
1115 | |
1116 | 0 | pSrcPos += count * (3 + 3); |
1117 | 0 | pDestPos += count * (3 + 3); |
1118 | 0 | pSrcNorm += count * (3 + 3); |
1119 | 0 | pDestNorm += count * (3 + 3); |
1120 | 0 | advanceRawPointer(pBlendWeight, count * blendWeightStride); |
1121 | 0 | advanceRawPointer(pBlendIndex, count * blendIndexStride); |
1122 | 0 | } |
1123 | | |
1124 | | // Blend vertices, four vertices per-iteration |
1125 | 0 | size_t numIterations = numVertices / 4; |
1126 | 0 | softwareVertexSkinning_SSE_PosNorm_Shared_Packed( |
1127 | 0 | pSrcPos, pDestPos, |
1128 | 0 | pBlendWeight, pBlendIndex, |
1129 | 0 | blendMatrices, |
1130 | 0 | blendWeightStride, blendIndexStride, |
1131 | 0 | numWeightsPerVertex, |
1132 | 0 | numIterations); |
1133 | | |
1134 | | // Advance pointers for remaining vertices |
1135 | 0 | numVertices &= 3; |
1136 | 0 | if (numVertices) |
1137 | 0 | { |
1138 | 0 | pSrcPos += numIterations * 4 * (3 + 3); |
1139 | 0 | pDestPos += numIterations * 4 * (3 + 3); |
1140 | 0 | pSrcNorm += numIterations * 4 * (3 + 3); |
1141 | 0 | pDestNorm += numIterations * 4 * (3 + 3); |
1142 | 0 | advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride); |
1143 | 0 | advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride); |
1144 | 0 | } |
1145 | 0 | } |
1146 | 0 | else if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3 && |
1147 | 0 | srcNormStride == sizeof(float) * 3 && destNormStride == sizeof(float) * 3) |
1148 | 0 | { |
1149 | | // Position and normal are separate buffers, and all of them are packed |
1150 | |
1151 | 0 | size_t srcPosAlign = (size_t)pSrcPos & 15; |
1152 | 0 | assert((srcPosAlign & 3) == 0); |
1153 | | |
1154 | | // Blend unaligned vertices with general SIMD routine |
1155 | 0 | if (srcPosAlign) |
1156 | 0 | { |
1157 | 0 | size_t count = srcPosAlign / 4; |
1158 | 0 | numVertices -= count; |
1159 | 0 | softwareVertexSkinning_SSE_General( |
1160 | 0 | pSrcPos, pDestPos, |
1161 | 0 | pSrcNorm, pDestNorm, |
1162 | 0 | pBlendWeight, pBlendIndex, |
1163 | 0 | blendMatrices, |
1164 | 0 | srcPosStride, destPosStride, |
1165 | 0 | srcNormStride, destNormStride, |
1166 | 0 | blendWeightStride, blendIndexStride, |
1167 | 0 | numWeightsPerVertex, |
1168 | 0 | count); |
1169 | |
1170 | 0 | pSrcPos += count * 3; |
1171 | 0 | pDestPos += count * 3; |
1172 | 0 | pSrcNorm += count * 3; |
1173 | 0 | pDestNorm += count * 3; |
1174 | 0 | advanceRawPointer(pBlendWeight, count * blendWeightStride); |
1175 | 0 | advanceRawPointer(pBlendIndex, count * blendIndexStride); |
1176 | 0 | } |
1177 | | |
1178 | | // Blend vertices, four vertices per-iteration |
1179 | 0 | size_t numIterations = numVertices / 4; |
1180 | 0 | softwareVertexSkinning_SSE_PosNorm_Separated_Packed( |
1181 | 0 | pSrcPos, pDestPos, |
1182 | 0 | pSrcNorm, pDestNorm, |
1183 | 0 | pBlendWeight, pBlendIndex, |
1184 | 0 | blendMatrices, |
1185 | 0 | blendWeightStride, blendIndexStride, |
1186 | 0 | numWeightsPerVertex, |
1187 | 0 | numIterations); |
1188 | | |
1189 | | // Advance pointers for remaining vertices |
1190 | 0 | numVertices &= 3; |
1191 | 0 | if (numVertices) |
1192 | 0 | { |
1193 | 0 | pSrcPos += numIterations * 4 * 3; |
1194 | 0 | pDestPos += numIterations * 4 * 3; |
1195 | 0 | pSrcNorm += numIterations * 4 * 3; |
1196 | 0 | pDestNorm += numIterations * 4 * 3; |
1197 | 0 | advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride); |
1198 | 0 | advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride); |
1199 | 0 | } |
1200 | 0 | } |
1201 | 0 | else // Not 'packed' form or wrong order between position and normal |
1202 | 0 | { |
1203 | | // Should never occur, do nothing here just in case |
1204 | 0 | } |
1205 | 0 | } |
1206 | 0 | else // !pSrcNorm |
1207 | 0 | { |
1208 | | // Blend position only |
1209 | |
1210 | 0 | if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3) |
1211 | 0 | { |
1212 | | // All buffers are packed |
1213 | |
1214 | 0 | size_t srcPosAlign = (size_t)pSrcPos & 15; |
1215 | 0 | assert((srcPosAlign & 3) == 0); |
1216 | | |
1217 | | // Blend unaligned vertices with general SIMD routine |
1218 | 0 | if (srcPosAlign) |
1219 | 0 | { |
1220 | 0 | size_t count = srcPosAlign / 4; |
1221 | 0 | numVertices -= count; |
1222 | 0 | softwareVertexSkinning_SSE_General( |
1223 | 0 | pSrcPos, pDestPos, |
1224 | 0 | pSrcNorm, pDestNorm, |
1225 | 0 | pBlendWeight, pBlendIndex, |
1226 | 0 | blendMatrices, |
1227 | 0 | srcPosStride, destPosStride, |
1228 | 0 | srcNormStride, destNormStride, |
1229 | 0 | blendWeightStride, blendIndexStride, |
1230 | 0 | numWeightsPerVertex, |
1231 | 0 | count); |
1232 | |
1233 | 0 | pSrcPos += count * 3; |
1234 | 0 | pDestPos += count * 3; |
1235 | 0 | advanceRawPointer(pBlendWeight, count * blendWeightStride); |
1236 | 0 | advanceRawPointer(pBlendIndex, count * blendIndexStride); |
1237 | 0 | } |
1238 | | |
1239 | | // Blend vertices, four vertices per-iteration |
1240 | 0 | size_t numIterations = numVertices / 4; |
1241 | 0 | softwareVertexSkinning_SSE_PosOnly_Packed( |
1242 | 0 | pSrcPos, pDestPos, |
1243 | 0 | pBlendWeight, pBlendIndex, |
1244 | 0 | blendMatrices, |
1245 | 0 | blendWeightStride, blendIndexStride, |
1246 | 0 | numWeightsPerVertex, |
1247 | 0 | numIterations); |
1248 | | |
1249 | | // Advance pointers for remaining vertices |
1250 | 0 | numVertices &= 3; |
1251 | 0 | if (numVertices) |
1252 | 0 | { |
1253 | 0 | pSrcPos += numIterations * 4 * 3; |
1254 | 0 | pDestPos += numIterations * 4 * 3; |
1255 | 0 | advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride); |
1256 | 0 | advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride); |
1257 | 0 | } |
1258 | 0 | } |
1259 | 0 | else // Not 'packed' form |
1260 | 0 | { |
1261 | | // Might occur only if the user forced software blending of positions only
1262 | 0 | } |
1263 | 0 | } |
1264 | 0 | } |
1265 | | |
1266 | | // Blend the remaining vertices. This has to be done with SIMD for an identical result,
1267 | | // since mixing the general floating-point and SIMD algorithms would cause
1268 | | // floating-point discrepancies.
1269 | 0 | if (numVertices) |
1270 | 0 | { |
1271 | 0 | softwareVertexSkinning_SSE_General( |
1272 | 0 | pSrcPos, pDestPos, |
1273 | 0 | pSrcNorm, pDestNorm, |
1274 | 0 | pBlendWeight, pBlendIndex, |
1275 | 0 | blendMatrices, |
1276 | 0 | srcPosStride, destPosStride, |
1277 | 0 | srcNormStride, destNormStride, |
1278 | 0 | blendWeightStride, blendIndexStride, |
1279 | 0 | numWeightsPerVertex, |
1280 | 0 | numVertices); |
1281 | 0 | } |
1282 | 0 | } |
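A worked example of the head/body/tail split performed above, for hypothetical inputs (illustration only):

// Hypothetical call: shared packed pos+normal buffer, pSrcPos misaligned by 8 bytes,
// numVertices = 23 (above OGRE_SSE_SKINNING_UNROLL_VERTICES, so the unrolled path runs).
//   head : count         = 8 / 8  = 1 vertex     -> general routine, restores 16-byte alignment
//   body : numIterations = 22 / 4 = 5 (20 verts) -> unrolled routine, four vertices per iteration
//   tail : numVertices  &= 3      = 2 vertices   -> general routine again, so every vertex is
//                                                   blended with SIMD and results stay identical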
1283 | | //--------------------------------------------------------------------- |
1284 | | void OptimisedUtilSSE::softwareVertexMorph( |
1285 | | float t, |
1286 | | const float *pSrc1, const float *pSrc2, |
1287 | | float *pDst, |
1288 | | size_t pos1VSize, size_t pos2VSize, size_t dstVSize, |
1289 | | size_t numVertices, |
1290 | | bool morphNormals) |
1291 | 0 | { |
1292 | 0 | OgreAssert(pos1VSize == pos2VSize && pos2VSize == dstVSize && dstVSize == (morphNormals ? 24 : 12), |
1293 | 0 | "stride not supported"); |
1294 | 0 | __OGRE_CHECK_STACK_ALIGNED_FOR_SSE(); |
1295 | |
|
1296 | 0 | __m128 src01, src02, src11, src12, src21, src22; |
1297 | 0 | __m128 dst0, dst1, dst2; |
1298 | |
|
1299 | 0 | __m128 t4 = _mm_load_ps1(&t); |
1300 | | |
1301 | | |
1302 | | // If we're morphing normals, we have twice the number of floats to process |
1303 | | // Positions are interleaved with normals, so we'll have to separately |
1304 | | // normalise just the normals later; we'll just lerp in the first pass |
1305 | | // We can't normalise as we go because normals & positions are only 3 floats |
1306 | | // each so are not aligned for SSE, we'd mix the data up |
1307 | 0 | size_t normalsMultiplier = morphNormals ? 2 : 1; |
1308 | 0 | size_t numIterations = (numVertices*normalsMultiplier) / 4; |
1309 | 0 | size_t numVerticesRemainder = (numVertices*normalsMultiplier) & 3; |
1310 | | |
1311 | | // Save for later |
1312 | 0 | float *pStartDst = pDst; |
1313 | | |
1314 | | // Never use the meta-function technique for accessing memory here, because VC7.1 |
1315 | | // appears to generate somewhat inefficient binary code when the following code is |
1316 | | // placed inside an inline function. |
1317 | |
|
1318 | 0 | if (_isAlignedForSSE(pSrc1) && _isAlignedForSSE(pSrc2) && _isAlignedForSSE(pDst)) |
1319 | 0 | { |
1320 | | // All data aligned |
1321 | | |
1322 | | // Morph 4 vertices per iteration. Specially designed to use as many of the |
1323 | | // available CPU registers as possible (7 registers used here), and to |
1324 | | // avoid temporary values allocated on the stack, suppressing extra |
1325 | | // memory accesses. |
1326 | 0 | for (size_t i = 0; i < numIterations; ++i) |
1327 | 0 | { |
1328 | | // 12 floating-point values |
1329 | 0 | src01 = __MM_LOAD_PS(pSrc1 + 0); |
1330 | 0 | src02 = __MM_LOAD_PS(pSrc2 + 0); |
1331 | 0 | src11 = __MM_LOAD_PS(pSrc1 + 4); |
1332 | 0 | src12 = __MM_LOAD_PS(pSrc2 + 4); |
1333 | 0 | src21 = __MM_LOAD_PS(pSrc1 + 8); |
1334 | 0 | src22 = __MM_LOAD_PS(pSrc2 + 8); |
1335 | 0 | pSrc1 += 12; pSrc2 += 12; |
1336 | |
|
1337 | 0 | dst0 = __MM_LERP_PS(t4, src01, src02); |
1338 | 0 | dst1 = __MM_LERP_PS(t4, src11, src12); |
1339 | 0 | dst2 = __MM_LERP_PS(t4, src21, src22); |
1340 | |
|
1341 | 0 | __MM_STORE_PS(pDst + 0, dst0); |
1342 | 0 | __MM_STORE_PS(pDst + 4, dst1); |
1343 | 0 | __MM_STORE_PS(pDst + 8, dst2); |
1344 | 0 | pDst += 12; |
1345 | 0 | } |
1346 | | |
1347 | | // Morph remaining vertices |
1348 | 0 | switch (numVerticesRemainder) |
1349 | 0 | { |
1350 | 0 | case 3: |
1351 | | // 9 floating-point values |
1352 | 0 | src01 = __MM_LOAD_PS(pSrc1 + 0); |
1353 | 0 | src02 = __MM_LOAD_PS(pSrc2 + 0); |
1354 | 0 | src11 = __MM_LOAD_PS(pSrc1 + 4); |
1355 | 0 | src12 = __MM_LOAD_PS(pSrc2 + 4); |
1356 | 0 | src21 = _mm_load_ss(pSrc1 + 8); |
1357 | 0 | src22 = _mm_load_ss(pSrc2 + 8); |
1358 | |
|
1359 | 0 | dst0 = __MM_LERP_PS(t4, src01, src02); |
1360 | 0 | dst1 = __MM_LERP_PS(t4, src11, src12); |
1361 | 0 | dst2 = __MM_LERP_SS(t4, src21, src22); |
1362 | |
|
1363 | 0 | __MM_STORE_PS(pDst + 0, dst0); |
1364 | 0 | __MM_STORE_PS(pDst + 4, dst1); |
1365 | 0 | _mm_store_ss(pDst + 8, dst2); |
1366 | 0 | break; |
1367 | | |
1368 | 0 | case 2: |
1369 | | // 6 floating-point values |
1370 | 0 | src01 = __MM_LOAD_PS(pSrc1 + 0); |
1371 | 0 | src02 = __MM_LOAD_PS(pSrc2 + 0); |
1372 | 0 | src11 = _mm_loadl_pi(t4, (const __m64*)(pSrc1 + 4)); // t4 is meaningless here |
1373 | 0 | src12 = _mm_loadl_pi(t4, (const __m64*)(pSrc2 + 4)); // t4 is meaningless here |
1374 | |
|
1375 | 0 | dst0 = __MM_LERP_PS(t4, src01, src02); |
1376 | 0 | dst1 = __MM_LERP_PS(t4, src11, src12); |
1377 | |
|
1378 | 0 | __MM_STORE_PS(pDst + 0, dst0); |
1379 | 0 | _mm_storel_pi((__m64*)(pDst + 4), dst1); |
1380 | 0 | break; |
1381 | | |
1382 | 0 | case 1: |
1383 | | // 3 floating-point values |
1384 | 0 | src01 = _mm_load_ss(pSrc1 + 2); |
1385 | 0 | src02 = _mm_load_ss(pSrc2 + 2); |
1386 | 0 | src01 = _mm_loadh_pi(src01, (const __m64*)(pSrc1 + 0)); |
1387 | 0 | src02 = _mm_loadh_pi(src02, (const __m64*)(pSrc2 + 0)); |
1388 | |
|
1389 | 0 | dst0 = __MM_LERP_PS(t4, src01, src02); |
1390 | |
|
1391 | 0 | _mm_storeh_pi((__m64*)(pDst + 0), dst0); |
1392 | 0 | _mm_store_ss(pDst + 2, dst0); |
1393 | 0 | break; |
1394 | 0 | } |
1395 | 0 | } |
1396 | 0 | else // Should never occur, just in case of buggy drivers |
1397 | 0 | { |
1398 | | // Assume all data unaligned |
1399 | | |
1400 | | // Morph 4 vertices per iteration. Specially designed to use as many of the |
1401 | | // available CPU registers as possible (7 registers used here), and to |
1402 | | // avoid temporary values allocated on the stack, suppressing extra |
1403 | | // memory accesses. |
1404 | 0 | for (size_t i = 0; i < numIterations; ++i) |
1405 | 0 | { |
1406 | | // 12 floating-point values |
1407 | 0 | src01 = _mm_loadu_ps(pSrc1 + 0); |
1408 | 0 | src02 = _mm_loadu_ps(pSrc2 + 0); |
1409 | 0 | src11 = _mm_loadu_ps(pSrc1 + 4); |
1410 | 0 | src12 = _mm_loadu_ps(pSrc2 + 4); |
1411 | 0 | src21 = _mm_loadu_ps(pSrc1 + 8); |
1412 | 0 | src22 = _mm_loadu_ps(pSrc2 + 8); |
1413 | 0 | pSrc1 += 12; pSrc2 += 12; |
1414 | |
|
1415 | 0 | dst0 = __MM_LERP_PS(t4, src01, src02); |
1416 | 0 | dst1 = __MM_LERP_PS(t4, src11, src12); |
1417 | 0 | dst2 = __MM_LERP_PS(t4, src21, src22); |
1418 | |
|
1419 | 0 | _mm_storeu_ps(pDst + 0, dst0); |
1420 | 0 | _mm_storeu_ps(pDst + 4, dst1); |
1421 | 0 | _mm_storeu_ps(pDst + 8, dst2); |
1422 | 0 | pDst += 12; |
1423 | | |
1424 | 0 | } |
1425 | | |
1426 | | // Morph remaining vertices |
1427 | 0 | switch (numVerticesRemainder) |
1428 | 0 | { |
1429 | 0 | case 3: |
1430 | | // 9 floating-point values |
1431 | 0 | src01 = _mm_loadu_ps(pSrc1 + 0); |
1432 | 0 | src02 = _mm_loadu_ps(pSrc2 + 0); |
1433 | 0 | src11 = _mm_loadu_ps(pSrc1 + 4); |
1434 | 0 | src12 = _mm_loadu_ps(pSrc2 + 4); |
1435 | 0 | src21 = _mm_load_ss(pSrc1 + 8); |
1436 | 0 | src22 = _mm_load_ss(pSrc2 + 8); |
1437 | |
|
1438 | 0 | dst0 = __MM_LERP_PS(t4, src01, src02); |
1439 | 0 | dst1 = __MM_LERP_PS(t4, src11, src12); |
1440 | 0 | dst2 = __MM_LERP_SS(t4, src21, src22); |
1441 | |
|
1442 | 0 | _mm_storeu_ps(pDst + 0, dst0); |
1443 | 0 | _mm_storeu_ps(pDst + 4, dst1); |
1444 | 0 | _mm_store_ss(pDst + 8, dst2); |
1445 | 0 | break; |
1446 | | |
1447 | 0 | case 2: |
1448 | | // 6 floating-point values |
1449 | 0 | src01 = _mm_loadu_ps(pSrc1 + 0); |
1450 | 0 | src02 = _mm_loadu_ps(pSrc2 + 0); |
1451 | 0 | src11 = _mm_loadl_pi(t4, (const __m64*)(pSrc1 + 4)); // t4 is meaningless here |
1452 | 0 | src12 = _mm_loadl_pi(t4, (const __m64*)(pSrc2 + 4)); // t4 is meaningless here |
1453 | |
|
1454 | 0 | dst0 = __MM_LERP_PS(t4, src01, src02); |
1455 | 0 | dst1 = __MM_LERP_PS(t4, src11, src12); |
1456 | |
|
1457 | 0 | _mm_storeu_ps(pDst + 0, dst0); |
1458 | 0 | _mm_storel_pi((__m64*)(pDst + 4), dst1); |
1459 | 0 | break; |
1460 | | |
1461 | 0 | case 1: |
1462 | | // 3 floating-point values |
1463 | 0 | src01 = _mm_load_ss(pSrc1 + 2); |
1464 | 0 | src02 = _mm_load_ss(pSrc2 + 2); |
1465 | 0 | src01 = _mm_loadh_pi(src01, (const __m64*)(pSrc1 + 0)); |
1466 | 0 | src02 = _mm_loadh_pi(src02, (const __m64*)(pSrc2 + 0)); |
1467 | |
|
1468 | 0 | dst0 = __MM_LERP_PS(t4, src01, src02); |
1469 | |
|
1470 | 0 | _mm_storeh_pi((__m64*)(pDst + 0), dst0); |
1471 | 0 | _mm_store_ss(pDst + 2, dst0); |
1472 | 0 | break; |
1473 | 0 | } |
1474 | | |
1475 | 0 | } |
1476 | | |
1477 | 0 | if (morphNormals) |
1478 | 0 | { |
1479 | | |
1480 | | // Now we need to do an unaligned normalise on the normal data we just |
1481 | | // lerped; because normals are 3 elements each, they're always unaligned |
1482 | 0 | float *pNorm = pStartDst; |
1483 | | |
1484 | | // Offset past first position |
1485 | 0 | pNorm += 3; |
1486 | | |
1487 | | // We'll do one normal each iteration, but still use SSE |
1488 | 0 | for (size_t n = 0; n < numVertices; ++n) |
1489 | 0 | { |
1490 | | // normalise function |
1491 | 0 | __m128 norm; |
1492 | | |
1493 | | // load 3 floating-point normal values |
1494 | | // This loads into [0] and clears the rest |
1495 | 0 | norm = _mm_load_ss(pNorm + 2); |
1496 | | // This loads into [2,3]. [1] is unused |
1497 | 0 | norm = _mm_loadh_pi(norm, (__m64*)(pNorm + 0)); |
1498 | | |
1499 | | // Fill a 4-vec with vector length |
1500 | | // square |
1501 | 0 | __m128 tmp = _mm_mul_ps(norm, norm); |
1502 | | // Add - for this we want this effect: |
1503 | | // orig 3 | 2 | 1 | 0 |
1504 | | // add1 0 | 0 | 0 | 2 |
1505 | | // add2 2 | 3 | 0 | 3 |
1506 | | // This way elements 0, 2 and 3 have the sum of all entries (except 1 which is unused) |
1507 | | |
1508 | 0 | tmp = _mm_add_ps(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,0,0,2))); |
1509 | | // Add the final combination; the relevant elements of tmp now hold the squared |
1510 | | // length (element 1 is unused, and the sqrt is applied in the divide below) |
1511 | 0 | tmp = _mm_add_ps(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,3,0,3))); |
1512 | | // Then divide to normalise |
1513 | 0 | norm = _mm_div_ps(norm, _mm_sqrt_ps(tmp)); |
1514 | | |
1515 | | // Store back in the same place |
1516 | 0 | _mm_storeh_pi((__m64*)(pNorm + 0), norm); |
1517 | 0 | _mm_store_ss(pNorm + 2, norm); |
1518 | | |
1519 | | // Skip to next vertex (3x normal components, 3x position components) |
1520 | 0 | pNorm += 6; |
1521 | | |
1522 | | |
1523 | 0 | } |
1524 | | |
1525 | |
|
1526 | 0 | } |
1527 | 0 | } |
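// [Editorial sketch] Scalar equivalent of the morph above, assuming the same packed
// layout (3 floats per position, optionally interleaved with a 3-float normal) and
// that std::sqrt from <cmath> is available via the surrounding headers. The helper
// name 'morphVerticesScalar' is illustrative only; the SSE path above does the same
// lerp first and renormalises the normals in a second pass.
static inline void morphVerticesScalar(
    float t, const float* pSrc1, const float* pSrc2, float* pDst,
    size_t numVertices, bool morphNormals)
{
    const size_t floatsPerVertex = morphNormals ? 6 : 3;
    for (size_t v = 0; v < numVertices; ++v)
    {
        for (size_t i = 0; i < floatsPerVertex; ++i)     // lerp positions (and normals)
            pDst[i] = pSrc1[i] + t * (pSrc2[i] - pSrc1[i]);
        if (morphNormals)
        {
            float* n = pDst + 3;                         // renormalise the lerped normal
            const float invLen = 1.0f / std::sqrt(n[0]*n[0] + n[1]*n[1] + n[2]*n[2]);
            n[0] *= invLen; n[1] *= invLen; n[2] *= invLen;
        }
        pSrc1 += floatsPerVertex; pSrc2 += floatsPerVertex; pDst += floatsPerVertex;
    }
}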
1528 | | //--------------------------------------------------------------------- |
1529 | | void OptimisedUtilSSE::concatenateAffineMatrices( |
1530 | | const Affine3& baseMatrix, |
1531 | | const Affine3* pSrcMat, |
1532 | | Affine3* pDstMat, |
1533 | | size_t numMatrices) |
1534 | 0 | { |
1535 | 0 | __OGRE_CHECK_STACK_ALIGNED_FOR_SSE(); |
1536 | |
|
1537 | 0 | assert(_isAlignedForSSE(pSrcMat)); |
1538 | 0 | assert(_isAlignedForSSE(pDstMat)); |
1539 | | |
1540 | | // Load base matrix, unaligned |
1541 | 0 | __m128 m0 = _mm_loadu_ps(baseMatrix[0]); |
1542 | 0 | __m128 m1 = _mm_loadu_ps(baseMatrix[1]); |
1543 | 0 | __m128 m2 = _mm_loadu_ps(baseMatrix[2]); |
1544 | 0 | __m128 m3 = _mm_loadu_ps(baseMatrix[3]); // m3 should be equal to (0, 0, 0, 1) |
1545 | |
|
1546 | 0 | for (size_t i = 0; i < numMatrices; ++i) |
1547 | 0 | { |
1548 | | // Load source matrix, aligned |
1549 | 0 | __m128 s0 = __MM_LOAD_PS((*pSrcMat)[0]); |
1550 | 0 | __m128 s1 = __MM_LOAD_PS((*pSrcMat)[1]); |
1551 | 0 | __m128 s2 = __MM_LOAD_PS((*pSrcMat)[2]); |
1552 | |
|
1553 | 0 | ++pSrcMat; |
1554 | |
|
1555 | 0 | __m128 t0, t1, t2, t3; |
1556 | | |
1557 | | // Concatenate matrix, and store results |
1558 | | |
1559 | | // Row 0 |
1560 | 0 | t0 = _mm_mul_ps(__MM_SELECT(m0, 0), s0); |
1561 | 0 | t1 = _mm_mul_ps(__MM_SELECT(m0, 1), s1); |
1562 | 0 | t2 = _mm_mul_ps(__MM_SELECT(m0, 2), s2); |
1563 | 0 | t3 = _mm_mul_ps(m0, m3); // Compiler should optimise this out of the loop |
1564 | 0 | __MM_STORE_PS((*pDstMat)[0], __MM_ACCUM4_PS(t0,t1,t2,t3)); |
1565 | | |
1566 | | // Row 1 |
1567 | 0 | t0 = _mm_mul_ps(__MM_SELECT(m1, 0), s0); |
1568 | 0 | t1 = _mm_mul_ps(__MM_SELECT(m1, 1), s1); |
1569 | 0 | t2 = _mm_mul_ps(__MM_SELECT(m1, 2), s2); |
1570 | 0 | t3 = _mm_mul_ps(m1, m3); // Compiler should optimise this out of the loop |
1571 | 0 | __MM_STORE_PS((*pDstMat)[1], __MM_ACCUM4_PS(t0,t1,t2,t3)); |
1572 | | |
1573 | | // Row 2 |
1574 | 0 | t0 = _mm_mul_ps(__MM_SELECT(m2, 0), s0); |
1575 | 0 | t1 = _mm_mul_ps(__MM_SELECT(m2, 1), s1); |
1576 | 0 | t2 = _mm_mul_ps(__MM_SELECT(m2, 2), s2); |
1577 | 0 | t3 = _mm_mul_ps(m2, m3); // Compiler should optimise this out of the loop |
1578 | 0 | __MM_STORE_PS((*pDstMat)[2], __MM_ACCUM4_PS(t0,t1,t2,t3)); |
1579 | |
|
1580 | 0 | ++pDstMat; |
1581 | 0 | } |
1582 | 0 | } |
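// [Editorial sketch] What each loop iteration above computes, written out in scalar
// form for 3x4 affine matrices stored row-major with an implicit (0, 0, 0, 1) last
// row. The function name 'concatenateAffineScalar' is illustrative, not OGRE API.
static inline void concatenateAffineScalar(
    const float base[3][4], const float src[3][4], float dst[3][4])
{
    for (int r = 0; r < 3; ++r)
    {
        for (int c = 0; c < 4; ++c)
        {
            // dst = base * src, treating the missing 4th row of 'src' as (0, 0, 0, 1)
            dst[r][c] = base[r][0] * src[0][c]
                      + base[r][1] * src[1][c]
                      + base[r][2] * src[2][c]
                      + (c == 3 ? base[r][3] : 0.0f);
        }
    }
}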
1583 | | //--------------------------------------------------------------------- |
1584 | | void OptimisedUtilSSE::calculateFaceNormals( |
1585 | | const float *positions, |
1586 | | const EdgeData::Triangle *triangles, |
1587 | | Vector4 *faceNormals, |
1588 | | size_t numTriangles) |
1589 | 0 | { |
1590 | 0 | __OGRE_CHECK_STACK_ALIGNED_FOR_SSE(); |
1591 | |
|
1592 | 0 | assert(_isAlignedForSSE(faceNormals)); |
1593 | | |
1594 | | // Load Vector3 as: (x, 0, y, z) |
1595 | 0 | #define __LOAD_VECTOR3(p) _mm_loadh_pi(_mm_load_ss(p), (const __m64*)((p)+1)) |
1596 | | |
1597 | | // Mask used to change the sign of single-precision floating-point values. |
1598 | 0 | OGRE_SIMD_ALIGNED_DECL(static const uint32, msSignMask[4]) = |
1599 | 0 | { |
1600 | 0 | 0x80000000, 0x80000000, 0x80000000, 0x80000000, |
1601 | 0 | }; |
1602 | |
|
1603 | 0 | size_t numIterations = numTriangles / 4; |
1604 | 0 | numTriangles &= 3; |
1605 | | |
1606 | | // Four triangles per iteration |
1607 | 0 | for (size_t i = 0; i < numIterations; ++i) |
1608 | 0 | { |
1609 | | |
1610 | | // Load four Vector3 as: (x0, x1, x2, x3), (y0, y1, y2, y3), (z0, z1, z2, z3) |
1611 | 0 | #define __LOAD_FOUR_VECTOR3(x, y, z, p0, p1, p2, p3) \ |
1612 | 0 | { \ |
1613 | 0 | __m128 v0 = __LOAD_VECTOR3(p0); /* x0 -- y0 z0 */ \ |
1614 | 0 | __m128 v1 = __LOAD_VECTOR3(p1); /* x1 -- y1 z1 */ \ |
1615 | 0 | __m128 v2 = __LOAD_VECTOR3(p2); /* x2 -- y2 z2 */ \ |
1616 | 0 | __m128 v3 = __LOAD_VECTOR3(p3); /* x3 -- y3 z3 */ \ |
1617 | 0 | __m128 t0, t1; \ |
1618 | 0 | \ |
1619 | 0 | t0 = _mm_unpacklo_ps(v0, v2); /* x0 x2 -- -- */ \ |
1620 | 0 | t1 = _mm_unpacklo_ps(v1, v3); /* x1 x3 -- -- */ \ |
1621 | 0 | x = _mm_unpacklo_ps(t0, t1); /* x0 x1 x2 x3 */ \ |
1622 | 0 | \ |
1623 | 0 | t0 = _mm_unpackhi_ps(v0, v2); /* y0 y2 z0 z2 */ \ |
1624 | 0 | t1 = _mm_unpackhi_ps(v1, v3); /* y1 y3 z1 z3 */ \ |
1625 | 0 | y = _mm_unpacklo_ps(t0, t1); /* y0 y1 y2 y3 */ \ |
1626 | 0 | z = _mm_unpackhi_ps(t0, t1); /* z0 z1 z2 z3 */ \ |
1627 | 0 | } |
1628 | |
|
1629 | 0 | __m128 x0, x1, x2, y0, y1, y2, z0, z1, z2; |
1630 | | |
1631 | | // Load vertex 0 of four triangles, packed as component-major format: xxxx yyyy zzzz |
1632 | 0 | __LOAD_FOUR_VECTOR3(x0, y0, z0, |
1633 | 0 | positions + triangles[0].vertIndex[0] * 3, |
1634 | 0 | positions + triangles[1].vertIndex[0] * 3, |
1635 | 0 | positions + triangles[2].vertIndex[0] * 3, |
1636 | 0 | positions + triangles[3].vertIndex[0] * 3); |
1637 | | |
1638 | | // Load vertex 1 of four triangles, packed as component-major format: xxxx yyyy zzzz |
1639 | 0 | __LOAD_FOUR_VECTOR3(x1, y1, z1, |
1640 | 0 | positions + triangles[0].vertIndex[1] * 3, |
1641 | 0 | positions + triangles[1].vertIndex[1] * 3, |
1642 | 0 | positions + triangles[2].vertIndex[1] * 3, |
1643 | 0 | positions + triangles[3].vertIndex[1] * 3); |
1644 | | |
1645 | | // Load vertex 2 of four triangles, packed as component-major format: xxxx yyyy zzzz |
1646 | 0 | __LOAD_FOUR_VECTOR3(x2, y2, z2, |
1647 | 0 | positions + triangles[0].vertIndex[2] * 3, |
1648 | 0 | positions + triangles[1].vertIndex[2] * 3, |
1649 | 0 | positions + triangles[2].vertIndex[2] * 3, |
1650 | 0 | positions + triangles[3].vertIndex[2] * 3); |
1651 | |
|
1652 | 0 | triangles += 4; |
1653 | | |
1654 | | // Calculate triangle face normals |
1655 | | |
1656 | | // a = v1 - v0 |
1657 | 0 | __m128 ax = _mm_sub_ps(x1, x0); |
1658 | 0 | __m128 ay = _mm_sub_ps(y1, y0); |
1659 | 0 | __m128 az = _mm_sub_ps(z1, z0); |
1660 | | |
1661 | | // b = v2 - v0 |
1662 | 0 | __m128 bx = _mm_sub_ps(x2, x0); |
1663 | 0 | __m128 by = _mm_sub_ps(y2, y0); |
1664 | 0 | __m128 bz = _mm_sub_ps(z2, z0); |
1665 | | |
1666 | | // n = a cross b |
1667 | 0 | __m128 nx = _mm_sub_ps(_mm_mul_ps(ay, bz), _mm_mul_ps(az, by)); |
1668 | 0 | __m128 ny = _mm_sub_ps(_mm_mul_ps(az, bx), _mm_mul_ps(ax, bz)); |
1669 | 0 | __m128 nz = _mm_sub_ps(_mm_mul_ps(ax, by), _mm_mul_ps(ay, bx)); |
1670 | | |
1671 | | // w = - (n dot v0) |
1672 | 0 | __m128 nw = _mm_xor_ps( |
1673 | 0 | __MM_DOT3x3_PS(nx, ny, nz, x0, y0, z0), |
1674 | 0 | *(const __m128 *)&msSignMask); |
1675 | | |
1676 | | // Arrange to per-triangle face normal major format |
1677 | 0 | __MM_TRANSPOSE4x4_PS(nx, ny, nz, nw); |
1678 | | |
1679 | | // Store results |
1680 | 0 | __MM_STORE_PS(&faceNormals[0].x, nx); |
1681 | 0 | __MM_STORE_PS(&faceNormals[1].x, ny); |
1682 | 0 | __MM_STORE_PS(&faceNormals[2].x, nz); |
1683 | 0 | __MM_STORE_PS(&faceNormals[3].x, nw); |
1684 | 0 | faceNormals += 4; |
1685 | |
|
1686 | 0 | #undef __LOAD_FOUR_VECTOR3 |
1687 | 0 | } |
1688 | | |
1689 | | // Dealing with remaining triangles |
1690 | 0 | for (size_t j = 0; j < numTriangles; ++j) |
1691 | 0 | { |
1692 | | // Load vertices of the triangle |
1693 | 0 | __m128 v0 = __LOAD_VECTOR3(positions + triangles->vertIndex[0] * 3); |
1694 | 0 | __m128 v1 = __LOAD_VECTOR3(positions + triangles->vertIndex[1] * 3); |
1695 | 0 | __m128 v2 = __LOAD_VECTOR3(positions + triangles->vertIndex[2] * 3); |
1696 | 0 | ++triangles; |
1697 | | |
1698 | | // Calculate face normal |
1699 | |
|
1700 | 0 | __m128 t0, t1; |
1701 | |
|
1702 | 0 | __m128 a = _mm_sub_ps(v1, v0); // ax 0 ay az |
1703 | 0 | __m128 b = _mm_sub_ps(v2, v0); // bx 0 by bz |
1704 | 0 | t0 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,0,1,3)); // az 0 ax ay |
1705 | 0 | t1 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2,0,1,3)); // bz 0 bx by |
1706 | 0 | t0 = _mm_mul_ps(t0, b); // az*bx 0 ax*by ay*bz |
1707 | 0 | t1 = _mm_mul_ps(t1, a); // ax*bz 0 ay*bx az*by |
1708 | |
|
1709 | 0 | __m128 n = _mm_sub_ps(t0, t1); // ny 0 nz nx |
1710 | |
|
1711 | 0 | __m128 d = _mm_mul_ps( // dy 0 dz dx |
1712 | 0 | _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(0,3,1,2)), n); |
1713 | |
|
1714 | 0 | n = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps( // nx ny nz -(dx+dy+dz) |
1715 | 0 | _mm_shuffle_ps(n, n, _MM_SHUFFLE(1,2,0,3)), // nx ny nz 0 |
1716 | 0 | _mm_shuffle_ps(d, d, _MM_SHUFFLE(3,1,1,1))), // 0 0 0 dx |
1717 | 0 | _mm_shuffle_ps(d, d, _MM_SHUFFLE(0,1,1,1))), // 0 0 0 dy |
1718 | 0 | _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,1,1,1))); // 0 0 0 dz |
1719 | | |
1720 | | // Store result |
1721 | 0 | __MM_STORE_PS(&faceNormals->x, n); |
1722 | 0 | ++faceNormals; |
1723 | 0 | } |
1724 | |
|
1725 | 0 | #undef __LOAD_VECTOR3 |
1726 | 0 | } |
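// [Editorial sketch] Scalar version of the per-triangle work above: the face normal is
// the cross product of two edges, and the fourth component is chosen so that
// (nx, ny, nz, w) is the plane equation through the triangle, i.e. w = -(n . v0).
// 'faceNormalScalar' is an illustrative name only; the SSE code packs four triangles
// into component-major registers to do the same arithmetic four at a time.
static inline void faceNormalScalar(
    const float v0[3], const float v1[3], const float v2[3], float out[4])
{
    const float ax = v1[0] - v0[0], ay = v1[1] - v0[1], az = v1[2] - v0[2];   // a = v1 - v0
    const float bx = v2[0] - v0[0], by = v2[1] - v0[1], bz = v2[2] - v0[2];   // b = v2 - v0
    out[0] = ay * bz - az * by;                                               // n = a cross b
    out[1] = az * bx - ax * bz;
    out[2] = ax * by - ay * bx;
    out[3] = -(out[0] * v0[0] + out[1] * v0[1] + out[2] * v0[2]);             // w = -(n dot v0)
}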
1727 | | //--------------------------------------------------------------------- |
1728 | | void OptimisedUtilSSE::calculateLightFacing( |
1729 | | const Vector4& lightPos, |
1730 | | const Vector4* faceNormals, |
1731 | | char* lightFacings, |
1732 | | size_t numFaces) |
1733 | 0 | { |
1734 | 0 | __OGRE_CHECK_STACK_ALIGNED_FOR_SSE(); |
1735 | |
|
1736 | 0 | assert(_isAlignedForSSE(faceNormals)); |
1737 | | |
1738 | | // Map to convert a 4-bit mask to 4 byte values |
1739 | 0 | static const char msMaskMapping[16][4] = |
1740 | 0 | { |
1741 | 0 | {0, 0, 0, 0}, {1, 0, 0, 0}, {0, 1, 0, 0}, {1, 1, 0, 0}, |
1742 | 0 | {0, 0, 1, 0}, {1, 0, 1, 0}, {0, 1, 1, 0}, {1, 1, 1, 0}, |
1743 | 0 | {0, 0, 0, 1}, {1, 0, 0, 1}, {0, 1, 0, 1}, {1, 1, 0, 1}, |
1744 | 0 | {0, 0, 1, 1}, {1, 0, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 1}, |
1745 | 0 | }; |
1746 | |
|
1747 | 0 | __m128 n0, n1, n2, n3; |
1748 | 0 | __m128 t0, t1; |
1749 | 0 | __m128 dp; |
1750 | 0 | int bitmask; |
1751 | | |
1752 | | // Load light vector, unaligned |
1753 | 0 | __m128 lp = _mm_loadu_ps(&lightPos.x); |
1754 | | |
1755 | | // Preload zero into a register for comparing dot product values |
1756 | 0 | __m128 zero = _mm_setzero_ps(); |
1757 | |
|
1758 | 0 | size_t numIterations = numFaces / 4; |
1759 | 0 | numFaces &= 3; |
1760 | | |
1761 | | // Four faces per iteration |
1762 | 0 | for (size_t i = 0; i < numIterations; ++i) |
1763 | 0 | { |
1764 | | // Load face normals, aligned |
1765 | 0 | n0 = __MM_LOAD_PS(&faceNormals[0].x); |
1766 | 0 | n1 = __MM_LOAD_PS(&faceNormals[1].x); |
1767 | 0 | n2 = __MM_LOAD_PS(&faceNormals[2].x); |
1768 | 0 | n3 = __MM_LOAD_PS(&faceNormals[3].x); |
1769 | 0 | faceNormals += 4; |
1770 | | |
1771 | | // Multiply by light vector |
1772 | 0 | n0 = _mm_mul_ps(n0, lp); // x0 y0 z0 w0 |
1773 | 0 | n1 = _mm_mul_ps(n1, lp); // x1 y1 z1 w1 |
1774 | 0 | n2 = _mm_mul_ps(n2, lp); // x2 y2 z2 w2 |
1775 | 0 | n3 = _mm_mul_ps(n3, lp); // x3 y3 z3 w3 |
1776 | | |
1777 | | // Horizontally add the four vector values. |
1778 | 0 | t0 = _mm_add_ps( // x0+z0 x1+z1 y0+w0 y1+w1 |
1779 | 0 | _mm_unpacklo_ps(n0, n1), // x0 x1 y0 y1 |
1780 | 0 | _mm_unpackhi_ps(n0, n1)); // z0 z1 w0 w1 |
1781 | 0 | t1 = _mm_add_ps( // x2+z2 x3+z3 y2+w2 y3+w3 |
1782 | 0 | _mm_unpacklo_ps(n2, n3), // x2 x3 y2 y3 |
1783 | 0 | _mm_unpackhi_ps(n2, n3)); // z2 z3 w2 w3 |
1784 | 0 | dp = _mm_add_ps( // dp0 dp1 dp2 dp3 |
1785 | 0 | _mm_movelh_ps(t0, t1), // x0+z0 x1+z1 x2+z2 x3+z3 |
1786 | 0 | _mm_movehl_ps(t1, t0)); // y0+w0 y1+w1 y2+w2 y3+w3 |
1787 | | |
1788 | | // Compare greater than zero and set up a 4-bit mask. Use '_mm_cmpnle_ps' |
1789 | | // instead of '_mm_cmpgt_ps' here because we want to keep 'zero' untouched, |
1790 | | // i.e. as the 2nd operand of the assembly instruction. In fact, |
1791 | | // '_mm_cmpgt_ps' was implemented as 'CMPLTPS' with the operands swapped |
1792 | | // in VC7.1. |
1793 | 0 | bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero)); |
1794 | | |
1795 | | // Convert the 4-bit mask to 4 bytes, and store the results. |
1796 | | /* |
1797 | | *reinterpret_cast<uint32*>(lightFacings) = |
1798 | | *reinterpret_cast<const uint32*>(msMaskMapping[bitmask]); |
1799 | | */ |
1800 | 0 | memcpy(lightFacings, msMaskMapping[bitmask], sizeof(uint32)); |
1801 | | |
1802 | | |
1803 | 0 | lightFacings += 4; |
1804 | 0 | } |
1805 | | |
1806 | | // Dealing with remaining faces |
1807 | 0 | switch (numFaces) |
1808 | 0 | { |
1809 | 0 | case 3: |
1810 | 0 | n0 = __MM_LOAD_PS(&faceNormals[0].x); |
1811 | 0 | n1 = __MM_LOAD_PS(&faceNormals[1].x); |
1812 | 0 | n2 = __MM_LOAD_PS(&faceNormals[2].x); |
1813 | |
|
1814 | 0 | n0 = _mm_mul_ps(n0, lp); // x0 y0 z0 w0 |
1815 | 0 | n1 = _mm_mul_ps(n1, lp); // x1 y1 z1 w1 |
1816 | 0 | n2 = _mm_mul_ps(n2, lp); // x2 y2 z2 w2 |
1817 | |
|
1818 | 0 | t0 = _mm_add_ps( // x0+z0 x1+z1 y0+w0 y1+w1 |
1819 | 0 | _mm_unpacklo_ps(n0, n1), // x0 x1 y0 y1 |
1820 | 0 | _mm_unpackhi_ps(n0, n1)); // z0 z1 w0 w1 |
1821 | 0 | t1 = _mm_add_ps( // x2+z2 x2+z2 y2+w2 y2+w2 |
1822 | 0 | _mm_unpacklo_ps(n2, n2), // x2 x2 y2 y2 |
1823 | 0 | _mm_unpackhi_ps(n2, n2)); // z2 z2 w2 w2 |
1824 | 0 | dp = _mm_add_ps( // dp0 dp1 dp2 dp2 |
1825 | 0 | _mm_movelh_ps(t0, t1), // x0+z0 x1+z1 x2+z2 x2+z2 |
1826 | 0 | _mm_movehl_ps(t1, t0)); // y0+w0 y1+w1 y2+w2 y2+w2 |
1827 | |
|
1828 | 0 | bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero)); |
1829 | |
|
1830 | 0 | lightFacings[0] = msMaskMapping[bitmask][0]; |
1831 | 0 | lightFacings[1] = msMaskMapping[bitmask][1]; |
1832 | 0 | lightFacings[2] = msMaskMapping[bitmask][2]; |
1833 | 0 | break; |
1834 | | |
1835 | 0 | case 2: |
1836 | 0 | n0 = __MM_LOAD_PS(&faceNormals[0].x); |
1837 | 0 | n1 = __MM_LOAD_PS(&faceNormals[1].x); |
1838 | |
|
1839 | 0 | n0 = _mm_mul_ps(n0, lp); // x0 y0 z0 w0 |
1840 | 0 | n1 = _mm_mul_ps(n1, lp); // x1 y1 z1 w1 |
1841 | |
|
1842 | 0 | t0 = _mm_add_ps( // x0+z0 x1+z1 y0+w0 y1+w1 |
1843 | 0 | _mm_unpacklo_ps(n0, n1), // x0 x1 y0 y1 |
1844 | 0 | _mm_unpackhi_ps(n0, n1)); // z0 z1 w0 w1 |
1845 | 0 | dp = _mm_add_ps( // dp0 dp1 dp0 dp1 |
1846 | 0 | _mm_movelh_ps(t0, t0), // x0+z0 x1+z1 x0+z0 x1+z1 |
1847 | 0 | _mm_movehl_ps(t0, t0)); // y0+w0 y1+w1 y0+w0 y1+w1 |
1848 | |
|
1849 | 0 | bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero)); |
1850 | |
|
1851 | 0 | lightFacings[0] = msMaskMapping[bitmask][0]; |
1852 | 0 | lightFacings[1] = msMaskMapping[bitmask][1]; |
1853 | 0 | break; |
1854 | | |
1855 | 0 | case 1: |
1856 | 0 | n0 = __MM_LOAD_PS(&faceNormals[0].x); |
1857 | |
|
1858 | 0 | n0 = _mm_mul_ps(n0, lp); // x0 y0 z0 w0 |
1859 | |
|
1860 | 0 | t0 = _mm_add_ps( // x0+z0 x0+z0 y0+w0 y0+w0 |
1861 | 0 | _mm_unpacklo_ps(n0, n0), // x0 x0 y0 y0 |
1862 | 0 | _mm_unpackhi_ps(n0, n0)); // z0 z0 w0 w0 |
1863 | 0 | dp = _mm_add_ps( // dp0 dp0 dp0 dp0 |
1864 | 0 | _mm_movelh_ps(t0, t0), // x0+z0 x0+z0 x0+z0 x0+z0 |
1865 | 0 | _mm_movehl_ps(t0, t0)); // y0+w0 y0+w0 y0+w0 y0+w0 |
1866 | |
|
1867 | 0 | bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero)); |
1868 | |
|
1869 | 0 | lightFacings[0] = msMaskMapping[bitmask][0]; |
1870 | 0 | break; |
1871 | 0 | } |
1872 | 0 | } |
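// [Editorial sketch] Per-face scalar equivalent of the test above: a face is light
// facing when the 4-component dot product of its plane equation with the homogeneous
// light position is positive (this works for both point and directional lights).
// 'isLightFacingScalar' is an illustrative name, not OGRE API.
static inline char isLightFacingScalar(const float faceNormal[4], const float lightPos[4])
{
    const float dp = faceNormal[0] * lightPos[0] + faceNormal[1] * lightPos[1]
                   + faceNormal[2] * lightPos[2] + faceNormal[3] * lightPos[3];
    return dp > 0.0f ? 1 : 0;
}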
1873 | | //--------------------------------------------------------------------- |
1874 | | // Template to extrude vertices for directional light. |
1875 | | template <bool srcAligned, bool destAligned> |
1876 | | struct ExtrudeVertices_SSE_DirectionalLight |
1877 | | { |
1878 | | static void apply( |
1879 | | const Vector4& lightPos, |
1880 | | Real extrudeDist, |
1881 | | const float* pSrcPos, |
1882 | | float* pDestPos, |
1883 | | size_t numVertices) |
1884 | 0 | { |
1885 | 0 | typedef SSEMemoryAccessor<srcAligned> SrcAccessor; |
1886 | 0 | typedef SSEMemoryAccessor<destAligned> DestAccessor; |
1887 | | |
1888 | | // Directional light, extrusion is along light direction |
1889 | | |
1890 | | // Load light vector, unaligned |
1891 | 0 | __m128 lp = _mm_loadu_ps(&lightPos.x); |
1892 | | |
1893 | | // Calculate the extrusion direction. Note that we use the inverted direction |
1894 | | // here to eliminate an extra negate instruction; we'll compensate for that |
1895 | | // later by using a subtract instruction instead. |
1896 | 0 | __m128 tmp = _mm_mul_ps(lp, lp); |
1897 | 0 | tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)), _mm_movehl_ps(tmp, tmp)); |
1898 | | // VC7.1 seems to generate somewhat inefficient code for 'rsqrtss', so use 'rsqrtps' instead |
1899 | 0 | tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), _mm_load_ss(&extrudeDist)); |
1900 | 0 | __m128 dir = _mm_mul_ps(lp, __MM_SELECT(tmp, 0)); // X Y Z - |
1901 | | |
1902 | | // Prepare the extrusion direction for extruding 4 vertices in parallel |
1903 | 0 | __m128 dir0 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(0,2,1,0)); // X Y Z X |
1904 | 0 | __m128 dir1 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(1,0,2,1)); // Y Z X Y |
1905 | 0 | __m128 dir2 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(2,1,0,2)); // Z X Y Z |
1906 | |
|
1907 | 0 | __m128 s0, s1, s2; |
1908 | 0 | __m128 d0, d1, d2; |
1909 | |
|
1910 | 0 | size_t numIterations = numVertices / 4; |
1911 | 0 | numVertices &= 3; |
1912 | | |
1913 | | // Extruding 4 vertices per iteration |
1914 | 0 | for (size_t i = 0; i < numIterations; ++i) |
1915 | 0 | { |
1916 | 0 | s0 = SrcAccessor::load(pSrcPos + 0); |
1917 | 0 | s1 = SrcAccessor::load(pSrcPos + 4); |
1918 | 0 | s2 = SrcAccessor::load(pSrcPos + 8); |
1919 | 0 | pSrcPos += 12; |
1920 | | |
1921 | | // The extrusion direction is inverted, use subtract instruction here |
1922 | 0 | d0 = _mm_sub_ps(s0, dir0); // X0 Y0 Z0 X1 |
1923 | 0 | d1 = _mm_sub_ps(s1, dir1); // Y1 Z1 X2 Y2 |
1924 | 0 | d2 = _mm_sub_ps(s2, dir2); // Z2 X3 Y3 Z3 |
1925 | |
|
1926 | 0 | DestAccessor::store(pDestPos + 0, d0); |
1927 | 0 | DestAccessor::store(pDestPos + 4, d1); |
1928 | 0 | DestAccessor::store(pDestPos + 8, d2); |
1929 | 0 | pDestPos += 12; |
1930 | 0 | } |
1931 | | |
1932 | | // Dealing with remaining vertices |
1933 | 0 | switch (numVertices) |
1934 | 0 | { |
1935 | 0 | case 3: |
1936 | | // 9 floating-point values |
1937 | 0 | s0 = SrcAccessor::load(pSrcPos + 0); |
1938 | 0 | s1 = SrcAccessor::load(pSrcPos + 4); |
1939 | 0 | s2 = _mm_load_ss(pSrcPos + 8); |
1940 | | |
1941 | | // The extrusion direction is inverted, use subtract instruction here |
1942 | 0 | d0 = _mm_sub_ps(s0, dir0); // X0 Y0 Z0 X1 |
1943 | 0 | d1 = _mm_sub_ps(s1, dir1); // Y1 Z1 X2 Y2 |
1944 | 0 | d2 = _mm_sub_ss(s2, dir2); // Z2 -- -- -- |
1945 | |
|
1946 | 0 | DestAccessor::store(pDestPos + 0, d0); |
1947 | 0 | DestAccessor::store(pDestPos + 4, d1); |
1948 | 0 | _mm_store_ss(pDestPos + 8, d2); |
1949 | 0 | break; |
1950 | | |
1951 | 0 | case 2: |
1952 | | // 6 floating-point values |
1953 | 0 | s0 = SrcAccessor::load(pSrcPos + 0); |
1954 | 0 | s1 = _mm_loadl_pi(dir1, (const __m64*)(pSrcPos + 4)); // dir1 is meaningless here |
1955 | | |
1956 | | // The extrusion direction is inverted, use subtract instruction here |
1957 | 0 | d0 = _mm_sub_ps(s0, dir0); // X0 Y0 Z0 X1 |
1958 | 0 | d1 = _mm_sub_ps(s1, dir1); // Y1 Z1 -- -- |
1959 | |
|
1960 | 0 | DestAccessor::store(pDestPos + 0, d0); |
1961 | 0 | _mm_storel_pi((__m64*)(pDestPos + 4), d1); |
1962 | 0 | break; |
1963 | | |
1964 | 0 | case 1: |
1965 | | // 3 floating-point values |
1966 | 0 | s0 = _mm_loadl_pi(dir0, (const __m64*)(pSrcPos + 0)); // dir0 is meaningless here |
1967 | 0 | s1 = _mm_load_ss(pSrcPos + 2); |
1968 | | |
1969 | | // The extrusion direction is inverted, use subtract instruction here |
1970 | 0 | d0 = _mm_sub_ps(s0, dir0); // X0 Y0 -- -- |
1971 | 0 | d1 = _mm_sub_ss(s1, dir2); // Z0 -- -- -- |
1972 | |
|
1973 | 0 | _mm_storel_pi((__m64*)(pDestPos + 0), d0); |
1974 | 0 | _mm_store_ss(pDestPos + 2, d1); |
1975 | 0 | break; |
1976 | 0 | } |
1977 | 0 | } Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_DirectionalLight<true, true>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long) Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_DirectionalLight<true, false>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long) Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_DirectionalLight<false, true>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long) Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_DirectionalLight<false, false>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long) |
1978 | | }; |
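// [Editorial sketch] Scalar form of the directional-light template above: every vertex
// is pushed the same distance along the negated, normalised light direction. The SSE
// path approximates 1/sqrt with 'rsqrtps'; this sketch uses the exact value and assumes
// std::sqrt from <cmath>. 'extrudeDirectionalScalar' is an illustrative name only.
static inline void extrudeDirectionalScalar(
    const float lightPos[3], float extrudeDist,
    const float* pSrcPos, float* pDestPos, size_t numVertices)
{
    const float len = std::sqrt(lightPos[0]*lightPos[0] + lightPos[1]*lightPos[1]
                              + lightPos[2]*lightPos[2]);
    const float scale = extrudeDist / len;                // extrusion step along the light direction
    for (size_t v = 0; v < numVertices; ++v)
    {
        pDestPos[0] = pSrcPos[0] - lightPos[0] * scale;   // subtract: move away from the light
        pDestPos[1] = pSrcPos[1] - lightPos[1] * scale;
        pDestPos[2] = pSrcPos[2] - lightPos[2] * scale;
        pSrcPos += 3; pDestPos += 3;
    }
}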
1979 | | //--------------------------------------------------------------------- |
1980 | | // Template to extrude vertices for point light. |
1981 | | template <bool srcAligned, bool destAligned> |
1982 | | struct ExtrudeVertices_SSE_PointLight |
1983 | | { |
1984 | | static void apply( |
1985 | | const Vector4& lightPos, |
1986 | | Real extrudeDist, |
1987 | | const float* pSrcPos, |
1988 | | float* pDestPos, |
1989 | | size_t numVertices) |
1990 | 0 | { |
1991 | 0 | typedef SSEMemoryAccessor<srcAligned> SrcAccessor; |
1992 | 0 | typedef SSEMemoryAccessor<destAligned> DestAccessor; |
1993 | | |
1994 | | // Point light, will calculate extrusion direction for every vertex |
1995 | | |
1996 | | // Load light vector, unaligned |
1997 | 0 | __m128 lp = _mm_loadu_ps(&lightPos.x); |
1998 | | |
1999 | | // Load extrude distance |
2000 | 0 | __m128 extrudeDist4 = _mm_load_ps1(&extrudeDist); |
2001 | |
|
2002 | 0 | size_t numIterations = numVertices / 4; |
2003 | 0 | numVertices &= 3; |
2004 | | |
2005 | | // Extruding 4 vertices per iteration |
2006 | 0 | for (size_t i = 0; i < numIterations; ++i) |
2007 | 0 | { |
2008 | | // Load source positions |
2009 | 0 | __m128 s0 = SrcAccessor::load(pSrcPos + 0); // x0 y0 z0 x1 |
2010 | 0 | __m128 s1 = SrcAccessor::load(pSrcPos + 4); // y1 z1 x2 y2 |
2011 | 0 | __m128 s2 = SrcAccessor::load(pSrcPos + 8); // z2 x3 y3 z3 |
2012 | 0 | pSrcPos += 12; |
2013 | | |
2014 | | // Arrange into 3x4 component-major form for batch calculation |
2015 | 0 | __MM_TRANSPOSE4x3_PS(s0, s1, s2); |
2016 | | |
2017 | | // Calculate unnormalised extrusion direction |
2018 | 0 | __m128 dx = _mm_sub_ps(s0, __MM_SELECT(lp, 0)); // X0 X1 X2 X3 |
2019 | 0 | __m128 dy = _mm_sub_ps(s1, __MM_SELECT(lp, 1)); // Y0 Y1 Y2 Y3 |
2020 | 0 | __m128 dz = _mm_sub_ps(s2, __MM_SELECT(lp, 2)); // Z0 Z1 Z2 Z3 |
2021 | | |
2022 | | // Normalise extrusion direction and multiply by extrude distance |
2023 | 0 | __m128 tmp = __MM_DOT3x3_PS(dx, dy, dz, dx, dy, dz); |
2024 | 0 | tmp = _mm_mul_ps(_mm_rsqrt_ps(tmp), extrudeDist4); |
2025 | 0 | dx = _mm_mul_ps(dx, tmp); |
2026 | 0 | dy = _mm_mul_ps(dy, tmp); |
2027 | 0 | dz = _mm_mul_ps(dz, tmp); |
2028 | | |
2029 | | // Calculate extruded positions |
2030 | 0 | __m128 d0 = _mm_add_ps(dx, s0); |
2031 | 0 | __m128 d1 = _mm_add_ps(dy, s1); |
2032 | 0 | __m128 d2 = _mm_add_ps(dz, s2); |
2033 | | |
2034 | | // Arrange back into 4x3 contiguous format to store the results |
2035 | 0 | __MM_TRANSPOSE3x4_PS(d0, d1, d2); |
2036 | | |
2037 | | // Store extruded positions |
2038 | 0 | DestAccessor::store(pDestPos + 0, d0); |
2039 | 0 | DestAccessor::store(pDestPos + 4, d1); |
2040 | 0 | DestAccessor::store(pDestPos + 8, d2); |
2041 | 0 | pDestPos += 12; |
2042 | 0 | } |
2043 | | |
2044 | | // Dealing with remaining vertices |
2045 | 0 | for (size_t j = 0; j < numVertices; ++j) |
2046 | 0 | { |
2047 | | // Load source position |
2048 | 0 | __m128 src = _mm_loadh_pi(_mm_load_ss(pSrcPos + 0), (const __m64*)(pSrcPos + 1)); // x 0 y z |
2049 | 0 | pSrcPos += 3; |
2050 | | |
2051 | | // Calculate unnormalised extrusion direction |
2052 | 0 | __m128 dir = _mm_sub_ps(src, _mm_shuffle_ps(lp, lp, _MM_SHUFFLE(2,1,3,0))); // X 1 Y Z |
2053 | | |
2054 | | // Normalise extrusion direction and multiply by extrude distance |
2055 | 0 | __m128 tmp = _mm_mul_ps(dir, dir); |
2056 | 0 | tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_movehl_ps(tmp, tmp)), _mm_shuffle_ps(tmp, tmp, 3)); |
2057 | | // VC7.1 seems to generate somewhat inefficient code for 'rsqrtss', so use 'rsqrtps' instead |
2058 | 0 | tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), extrudeDist4); |
2059 | 0 | dir = _mm_mul_ps(dir, __MM_SELECT(tmp, 0)); |
2060 | | |
2061 | | // Calculate extruded position |
2062 | 0 | __m128 dst = _mm_add_ps(dir, src); |
2063 | | |
2064 | | // Store extruded position |
2065 | 0 | _mm_store_ss(pDestPos + 0, dst); |
2066 | 0 | _mm_storeh_pi((__m64*)(pDestPos + 1), dst); |
2067 | 0 | pDestPos += 3; |
2068 | 0 | } |
2069 | 0 | } Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_PointLight<true, true>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long) Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_PointLight<true, false>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long) Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_PointLight<false, true>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long) Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_PointLight<false, false>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long) |
2070 | | }; |
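// [Editorial sketch] Scalar form of the point-light template above: each vertex is
// pushed away from the light along its own (vertex - light) direction by extrudeDist.
// Again the SSE path uses the approximate 'rsqrtps'; this sketch uses exact math and
// assumes std::sqrt from <cmath>. 'extrudePointScalar' is an illustrative name only.
static inline void extrudePointScalar(
    const float lightPos[3], float extrudeDist,
    const float* pSrcPos, float* pDestPos, size_t numVertices)
{
    for (size_t v = 0; v < numVertices; ++v)
    {
        const float dx = pSrcPos[0] - lightPos[0];        // unnormalised extrusion direction
        const float dy = pSrcPos[1] - lightPos[1];
        const float dz = pSrcPos[2] - lightPos[2];
        const float scale = extrudeDist / std::sqrt(dx*dx + dy*dy + dz*dz);
        pDestPos[0] = pSrcPos[0] + dx * scale;            // extruded position
        pDestPos[1] = pSrcPos[1] + dy * scale;
        pDestPos[2] = pSrcPos[2] + dz * scale;
        pSrcPos += 3; pDestPos += 3;
    }
}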
2071 | | //--------------------------------------------------------------------- |
2072 | | void OptimisedUtilSSE::extrudeVertices( |
2073 | | const Vector4& lightPos, |
2074 | | Real extrudeDist, |
2075 | | const float* pSrcPos, |
2076 | | float* pDestPos, |
2077 | | size_t numVertices) |
2078 | 0 | { |
2079 | 0 | __OGRE_CHECK_STACK_ALIGNED_FOR_SSE(); |
2080 | | |
2081 | | // Note: Since pDestPos follows the tail of pSrcPos, we can't assume |
2082 | | // it's properly aligned for SIMD, so we must check for that here. |
2083 | | // |
2084 | | // TODO: Add an extra vertex to the vertex buffer to make sure pDestPos |
2085 | | // is aligned the same as pSrcPos. |
2086 | | // |
2087 | | |
2088 | | // We use the SSE reciprocal square root directly while calculating the |
2089 | | // extrusion direction, since the precision loss is not that important here. |
2090 | | // |
2091 | 0 | if (lightPos.w == 0.0f) |
2092 | 0 | { |
2093 | 0 | if (_isAlignedForSSE(pSrcPos)) |
2094 | 0 | { |
2095 | 0 | if (_isAlignedForSSE(pDestPos)) |
2096 | 0 | ExtrudeVertices_SSE_DirectionalLight<true, true>::apply( |
2097 | 0 | lightPos, extrudeDist, pSrcPos, pDestPos, numVertices); |
2098 | 0 | else |
2099 | 0 | ExtrudeVertices_SSE_DirectionalLight<true, false>::apply( |
2100 | 0 | lightPos, extrudeDist, pSrcPos, pDestPos, numVertices); |
2101 | 0 | } |
2102 | 0 | else |
2103 | 0 | { |
2104 | 0 | if (_isAlignedForSSE(pDestPos)) |
2105 | 0 | ExtrudeVertices_SSE_DirectionalLight<false, true>::apply( |
2106 | 0 | lightPos, extrudeDist, pSrcPos, pDestPos, numVertices); |
2107 | 0 | else |
2108 | 0 | ExtrudeVertices_SSE_DirectionalLight<false, false>::apply( |
2109 | 0 | lightPos, extrudeDist, pSrcPos, pDestPos, numVertices); |
2110 | 0 | } |
2111 | 0 | } |
2112 | 0 | else |
2113 | 0 | { |
2114 | 0 | assert(lightPos.w == 1.0f); |
2115 | |
|
2116 | 0 | if (_isAlignedForSSE(pSrcPos)) |
2117 | 0 | { |
2118 | 0 | if (_isAlignedForSSE(pDestPos)) |
2119 | 0 | ExtrudeVertices_SSE_PointLight<true, true>::apply( |
2120 | 0 | lightPos, extrudeDist, pSrcPos, pDestPos, numVertices); |
2121 | 0 | else |
2122 | 0 | ExtrudeVertices_SSE_PointLight<true, false>::apply( |
2123 | 0 | lightPos, extrudeDist, pSrcPos, pDestPos, numVertices); |
2124 | 0 | } |
2125 | 0 | else |
2126 | 0 | { |
2127 | 0 | if (_isAlignedForSSE(pDestPos)) |
2128 | 0 | ExtrudeVertices_SSE_PointLight<false, true>::apply( |
2129 | 0 | lightPos, extrudeDist, pSrcPos, pDestPos, numVertices); |
2130 | 0 | else |
2131 | 0 | ExtrudeVertices_SSE_PointLight<false, false>::apply( |
2132 | 0 | lightPos, extrudeDist, pSrcPos, pDestPos, numVertices); |
2133 | 0 | } |
2134 | 0 | } |
2135 | 0 | } |
2136 | | //--------------------------------------------------------------------- |
2137 | | //--------------------------------------------------------------------- |
2138 | | //--------------------------------------------------------------------- |
2139 | | extern OptimisedUtil* _getOptimisedUtilSSE(void); |
2140 | | extern OptimisedUtil* _getOptimisedUtilSSE(void) |
2141 | 2 | { |
2142 | 2 | static OptimisedUtilSSE msOptimisedUtilSSE; |
2143 | | #if defined(__OGRE_SIMD_ALIGN_STACK) |
2144 | | static OptimisedUtilWithStackAlign msOptimisedUtilWithStackAlign(&msOptimisedUtilSSE); |
2145 | | return &msOptimisedUtilWithStackAlign; |
2146 | | #else |
2147 | 2 | return &msOptimisedUtilSSE; |
2148 | 2 | #endif |
2149 | 2 | } |
2150 | | |
2151 | | } |
2152 | | |
2153 | | #endif // __OGRE_HAVE_SSE |