Coverage Report

Created: 2025-08-28 06:22

/src/ogre/OgreMain/src/OgreOptimisedUtilSSE.cpp
Line
Count
Source
1
/*
2
-----------------------------------------------------------------------------
3
This source file is part of OGRE
4
    (Object-oriented Graphics Rendering Engine)
5
For the latest info, see http://www.ogre3d.org/
6
7
Copyright (c) 2000-2014 Torus Knot Software Ltd
8
9
Permission is hereby granted, free of charge, to any person obtaining a copy
10
of this software and associated documentation files (the "Software"), to deal
11
in the Software without restriction, including without limitation the rights
12
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
copies of the Software, and to permit persons to whom the Software is
14
furnished to do so, subject to the following conditions:
15
16
The above copyright notice and this permission notice shall be included in
17
all copies or substantial portions of the Software.
18
19
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25
THE SOFTWARE.
26
-----------------------------------------------------------------------------
27
*/
28
#include "OgreStableHeaders.h"
29
#include "OgreOptimisedUtil.h"
30
31
32
#if __OGRE_HAVE_SSE || __OGRE_HAVE_NEON
33
34
// Keep this include last to avoid "xmmintrin.h" being pulled in first by
35
// another header file on some platforms for some reason.
36
#include "OgreSIMDHelper.h"
37
38
// I'd like to merge this file with OgreOptimisedUtil.cpp, but that's
39
// impossible when compiling with gcc, because SSE instructions can only
40
// be enabled/disabled at file level.
41
42
//-------------------------------------------------------------------------
43
//
44
// The routines implemented in this file are performance oriented,
45
// which means saving every penny possible. This requirement might
46
// break some C++/STL rules.
47
//
48
//
49
// Some rules I'd like to respect:
50
//
51
// 1. Prefer unpacklo/hi and movelh/hl to shuffle because
52
//    they can save one byte of binary code :)
53
// 2. Use add/sub instead of mul.
54
// 3. Eliminate function-call prologue code.
55
//
56
// Lastly, anything recommended by the Intel Optimization Reference Manual.
57
//
58
//-------------------------------------------------------------------------
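A minimal illustration of rule 1 above, not part of the OGRE source: _mm_movelh_ps and the equivalent _mm_shuffle_ps produce the same lanes, but movlhps carries no immediate byte, so its encoding is one byte shorter than shufps.

    #include <xmmintrin.h>

    // Both results hold { a0, a1, b0, b1 }; the movelh form encodes one byte smaller.
    static inline void movelhVsShuffle(__m128 a, __m128 b, __m128* r1, __m128* r2)
    {
        *r1 = _mm_movelh_ps(a, b);
        *r2 = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 1, 0));
    }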
59
60
// Use the unrolled SSE version when the vertex count exceeds this limit
61
0
#define OGRE_SSE_SKINNING_UNROLL_VERTICES  16
62
63
namespace Ogre {
64
65
//-------------------------------------------------------------------------
66
// Local classes
67
//-------------------------------------------------------------------------
68
69
    /** SSE implementation of OptimisedUtil.
70
    @note
71
        Don't use this class directly, use OptimisedUtil instead.
72
    */
73
    class _OgrePrivate OptimisedUtilSSE : public OptimisedUtil
74
    {
75
    protected:
76
        /// Do we prefer to use a general SSE version for position/normal shared buffers?
77
        bool mPreferGeneralVersionForSharedBuffers;
78
79
    public:
80
        /// Constructor
81
        OptimisedUtilSSE(void);
82
83
        /// @copydoc OptimisedUtil::softwareVertexSkinning
84
        void __OGRE_SIMD_ALIGN_ATTRIBUTE softwareVertexSkinning(
85
            const float *srcPosPtr, float *destPosPtr,
86
            const float *srcNormPtr, float *destNormPtr,
87
            const float *blendWeightPtr, const unsigned char* blendIndexPtr,
88
            const Affine3* const* blendMatrices,
89
            size_t srcPosStride, size_t destPosStride,
90
            size_t srcNormStride, size_t destNormStride,
91
            size_t blendWeightStride, size_t blendIndexStride,
92
            size_t numWeightsPerVertex,
93
            size_t numVertices) override;
94
95
        /// @copydoc OptimisedUtil::softwareVertexMorph
96
        void __OGRE_SIMD_ALIGN_ATTRIBUTE softwareVertexMorph(
97
            float t,
98
            const float *srcPos1, const float *srcPos2,
99
            float *dstPos,
100
            size_t pos1VSize, size_t pos2VSize, size_t dstVSize, 
101
            size_t numVertices,
102
            bool morphNormals) override;
103
104
        /// @copydoc OptimisedUtil::concatenateAffineMatrices
105
        void __OGRE_SIMD_ALIGN_ATTRIBUTE concatenateAffineMatrices(
106
            const Affine3& baseMatrix,
107
            const Affine3* srcMatrices,
108
            Affine3* dstMatrices,
109
            size_t numMatrices) override;
110
111
        /// @copydoc OptimisedUtil::calculateFaceNormals
112
        void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateFaceNormals(
113
            const float *positions,
114
            const EdgeData::Triangle *triangles,
115
            Vector4 *faceNormals,
116
            size_t numTriangles) override;
117
118
        /// @copydoc OptimisedUtil::calculateLightFacing
119
        void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateLightFacing(
120
            const Vector4& lightPos,
121
            const Vector4* faceNormals,
122
            char* lightFacings,
123
            size_t numFaces) override;
124
125
        /// @copydoc OptimisedUtil::extrudeVertices
126
        void __OGRE_SIMD_ALIGN_ATTRIBUTE extrudeVertices(
127
            const Vector4& lightPos,
128
            Real extrudeDist,
129
            const float* srcPositions,
130
            float* destPositions,
131
            size_t numVertices) override;
132
    };
133
134
#if defined(__OGRE_SIMD_ALIGN_STACK)
135
    /** Stack-align implementation of OptimisedUtil.
136
137
        User code compiled by icc and gcc might not align the stack
138
        properly, so we need to ensure the stack is aligned to a 16-byte boundary
139
        when executing an SSE function.
140
    @par
141
        We implement this by aligning the stack after a virtual function call,
142
        which should guarantee that a call instruction is used instead of inlining
143
        the underlying function body here (which might cause problems; an attribute-based alternative is sketched after this block).
144
    @note
145
        Don't use this class directly, use OptimisedUtil instead.
146
    */
147
    class _OgrePrivate OptimisedUtilWithStackAlign : public OptimisedUtil
148
    {
149
    protected:
150
        /// The actual implementation
151
        OptimisedUtil* mImpl;
152
153
    public:
154
        /// Constructor
155
        OptimisedUtilWithStackAlign(OptimisedUtil* impl)
156
            : mImpl(impl)
157
        {
158
        }
159
160
        /// @copydoc OptimisedUtil::softwareVertexSkinning
161
        virtual void softwareVertexSkinning(
162
            const float *srcPosPtr, float *destPosPtr,
163
            const float *srcNormPtr, float *destNormPtr,
164
            const float *blendWeightPtr, const unsigned char* blendIndexPtr,
165
            const Affine3* const* blendMatrices,
166
            size_t srcPosStride, size_t destPosStride,
167
            size_t srcNormStride, size_t destNormStride,
168
            size_t blendWeightStride, size_t blendIndexStride,
169
            size_t numWeightsPerVertex,
170
            size_t numVertices)
171
        {
172
            __OGRE_SIMD_ALIGN_STACK();
173
174
            mImpl->softwareVertexSkinning(
175
                srcPosPtr, destPosPtr,
176
                srcNormPtr, destNormPtr,
177
                blendWeightPtr, blendIndexPtr,
178
                blendMatrices,
179
                srcPosStride, destPosStride,
180
                srcNormStride, destNormStride,
181
                blendWeightStride, blendIndexStride,
182
                numWeightsPerVertex,
183
                numVertices);
184
        }
185
186
        /// @copydoc OptimisedUtil::softwareVertexMorph
187
        virtual void softwareVertexMorph(
188
            float t,
189
            const float *srcPos1, const float *srcPos2,
190
            float *dstPos,
191
            size_t pos1VSize, size_t pos2VSize, size_t dstVSize, 
192
            size_t numVertices,
193
            bool morphNormals)
194
        {
195
            __OGRE_SIMD_ALIGN_STACK();
196
197
            mImpl->softwareVertexMorph(
198
                t,
199
                srcPos1, srcPos2,
200
                dstPos,
201
                pos1VSize, pos2VSize, dstVSize, 
202
                numVertices,
203
                morphNormals);
204
        }
205
206
        /// @copydoc OptimisedUtil::concatenateAffineMatrices
207
        virtual void concatenateAffineMatrices(
208
            const Affine3& baseMatrix,
209
            const Affine3* srcMatrices,
210
            Affine3* dstMatrices,
211
            size_t numMatrices)
212
        {
213
            __OGRE_SIMD_ALIGN_STACK();
214
215
            mImpl->concatenateAffineMatrices(
216
                baseMatrix,
217
                srcMatrices,
218
                dstMatrices,
219
                numMatrices);
220
        }
221
222
        /// @copydoc OptimisedUtil::calculateFaceNormals
223
        virtual void calculateFaceNormals(
224
            const float *positions,
225
            const EdgeData::Triangle *triangles,
226
            Vector4 *faceNormals,
227
            size_t numTriangles)
228
        {
229
            __OGRE_SIMD_ALIGN_STACK();
230
231
            mImpl->calculateFaceNormals(
232
                positions,
233
                triangles,
234
                faceNormals,
235
                numTriangles);
236
        }
237
238
        /// @copydoc OptimisedUtil::calculateLightFacing
239
        virtual void calculateLightFacing(
240
            const Vector4& lightPos,
241
            const Vector4* faceNormals,
242
            char* lightFacings,
243
            size_t numFaces)
244
        {
245
            __OGRE_SIMD_ALIGN_STACK();
246
247
            mImpl->calculateLightFacing(
248
                lightPos,
249
                faceNormals,
250
                lightFacings,
251
                numFaces);
252
        }
253
254
        /// @copydoc OptimisedUtil::extrudeVertices
255
        virtual void extrudeVertices(
256
            const Vector4& lightPos,
257
            Real extrudeDist,
258
            const float* srcPositions,
259
            float* destPositions,
260
            size_t numVertices)
261
        {
262
            __OGRE_SIMD_ALIGN_STACK();
263
264
            mImpl->extrudeVertices(
265
                lightPos,
266
                extrudeDist,
267
                srcPositions,
268
                destPositions,
269
                numVertices);
270
        }
271
    };
272
#endif  // defined(__OGRE_SIMD_ALIGN_STACK)
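As an aside on the stack-alignment wrapper above (an assumption about alternatives, not OGRE's actual __OGRE_SIMD_ALIGN_STACK definition): on 32-bit x86, gcc can also be asked to realign the stack on function entry, which gives the same 16-byte guarantee without a forwarding call.

    // Hypothetical attribute-based alternative; MY_SIMD_ENTRY is an illustrative name only.
    #if defined(__GNUC__) && defined(__i386__)
    #   define MY_SIMD_ENTRY __attribute__((force_align_arg_pointer))
    #else
    #   define MY_SIMD_ENTRY
    #endif

    // The stack is 16-byte aligned inside functions declared with this attribute.
    void MY_SIMD_ENTRY someSseEntryPoint(void);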
273
274
//---------------------------------------------------------------------
275
// Some useful macros for collapsing matrices.
276
//---------------------------------------------------------------------
277
278
#define __LOAD_MATRIX(row0, row1, row2, pMatrix)                        \
279
0
    {                                                                   \
280
0
        row0 = __MM_LOAD_PS((*pMatrix)[0]);                             \
281
0
        row1 = __MM_LOAD_PS((*pMatrix)[1]);                             \
282
0
        row2 = __MM_LOAD_PS((*pMatrix)[2]);                             \
283
0
    }
284
285
#define __LERP_MATRIX(row0, row1, row2, weight, pMatrix)                \
286
0
    {                                                                   \
287
0
        row0 = __MM_LERP_PS(weight, row0, __MM_LOAD_PS((*pMatrix)[0])); \
288
0
        row1 = __MM_LERP_PS(weight, row1, __MM_LOAD_PS((*pMatrix)[1])); \
289
0
        row2 = __MM_LERP_PS(weight, row2, __MM_LOAD_PS((*pMatrix)[2])); \
290
0
    }
291
292
#define __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix)       \
293
0
    {                                                                   \
294
0
        row0 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[0]), weight);         \
295
0
        row1 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[1]), weight);         \
296
0
        row2 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[2]), weight);         \
297
0
    }
298
299
#define __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix)      \
300
0
    {                                                                   \
301
0
        row0 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[0]), weight, row0); \
302
0
        row1 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[1]), weight, row1); \
303
0
        row2 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[2]), weight, row2); \
304
0
    }
305
306
//---------------------------------------------------------------------
307
// The following macros require variables declared by the caller.
308
//
309
// :) Thanks to the row-major matrices used in Ogre, accessing affine matrices is easy.
310
//---------------------------------------------------------------------
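A short sketch of why those whole-row loads work, assuming Ogre::Affine3 stores its 3x4 matrix as contiguous rows of four floats (the layout is an assumption here, and Affine3ish below is only a stand-in, not the real class):

    #include <xmmintrin.h>

    // Hypothetical stand-in for Ogre::Affine3 with a 3x4 row-major layout.
    struct Affine3ish { float m[3][4]; };

    // Because each row is a contiguous float[4], one 16-byte load fetches a whole row,
    // which is all that __LOAD_MATRIX and friends need.
    static inline __m128 loadRow0(const Affine3ish& mat)
    {
        return _mm_loadu_ps(mat.m[0]);   // m00 m01 m02 m03
    }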
311
312
/** Collapse one-weighted matrix.
313
    Eliminates the multiply by the weight, since the weight should always equal one.
314
*/
315
#define __COLLAPSE_MATRIX_W1(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
316
0
    {                                                                           \
317
0
        pMatrix0 = blendMatrices[pIndices[0]];                                  \
318
0
        __LOAD_MATRIX(row0, row1, row2, pMatrix0);                              \
319
0
    }
320
321
/** Collapse two-weighted matrix.
322
    Based on the fact that the accumulated weights sum to one, using lerp
323
    replaces two multiplies and one add with one multiply and two additive ops (see the scalar sketch after this macro).
324
*/
325
#define __COLLAPSE_MATRIX_W2(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
326
0
    {                                                                           \
327
0
        weight = _mm_load_ps1(pWeights + 1);                                    \
328
0
        pMatrix0 = ppMatrices[pIndices[0]];                                     \
329
0
        __LOAD_MATRIX(row0, row1, row2, pMatrix0);                              \
330
0
        pMatrix1 = ppMatrices[pIndices[1]];                                     \
331
0
        __LERP_MATRIX(row0, row1, row2, weight, pMatrix1);                      \
332
0
    }
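The operation-count claim above follows from a simple identity; a scalar sketch of what the lerp form presumably computes per element, assuming the two weights sum to one:

    // With w0 + w1 == 1:
    //   w0*m0 + w1*m1            -> 2 multiplies, 1 add
    //   m0 + w1*(m1 - m0)        -> 1 multiply, 1 subtract, 1 add (the lerp form)
    static inline float blendTwoWeights(float m0, float m1, float w1)
    {
        return m0 + w1 * (m1 - m0);
    }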
333
334
/** Collapse three-weighted matrix.
335
*/
336
#define __COLLAPSE_MATRIX_W3(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
337
0
    {                                                                           \
338
0
        weight = _mm_load_ps1(pWeights + 0);                                    \
339
0
        pMatrix0 = ppMatrices[pIndices[0]];                                     \
340
0
        __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0);             \
341
0
        weight = _mm_load_ps1(pWeights + 1);                                    \
342
0
        pMatrix1 = ppMatrices[pIndices[1]];                                     \
343
0
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1);            \
344
0
        weight = _mm_load_ps1(pWeights + 2);                                    \
345
0
        pMatrix2 = ppMatrices[pIndices[2]];                                     \
346
0
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2);            \
347
0
    }
348
349
/** Collapse four-weighted matrix.
350
*/
351
#define __COLLAPSE_MATRIX_W4(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
352
0
    {                                                                           \
353
0
        /* Load four blend weights at one time, they will be shuffled later */  \
354
0
        weights = _mm_loadu_ps(pWeights);                                       \
355
0
                                                                                \
356
0
        pMatrix0 = ppMatrices[pIndices[0]];                                     \
357
0
        weight = __MM_SELECT(weights, 0);                                       \
358
0
        __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0);             \
359
0
        pMatrix1 = ppMatrices[pIndices[1]];                                     \
360
0
        weight = __MM_SELECT(weights, 1);                                       \
361
0
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1);            \
362
0
        pMatrix2 = ppMatrices[pIndices[2]];                                     \
363
0
        weight = __MM_SELECT(weights, 2);                                       \
364
0
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2);            \
365
0
        pMatrix3 = ppMatrices[pIndices[3]];                                     \
366
0
        weight = __MM_SELECT(weights, 3);                                       \
367
0
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix3);            \
368
0
    }
369
370
371
372
    //---------------------------------------------------------------------
373
    // Collapse one matrix at a time. The collapsed matrix is weighted by the
374
    // blend weights, and can then be used to transform the corresponding vertex directly.
375
    //
376
    // I'd like to use an inline function instead of a macro here, but I also want to
377
    // ensure the compiler integrates this code into its callers (in release builds at
378
    // least), regardless of the specific compile options. An inline function
379
    // works fine for VC, but gcc (3.4.4 here) seems to generate a function call
380
    // when this is implemented as an inline function, even when compiling with "-O3".
381
    //
382
#define _collapseOneMatrix(                                                     \
383
        m00, m01, m02,                                                          \
384
        pBlendWeight, pBlendIndex,                                              \
385
        blendMatrices,                                                          \
386
        blendWeightStride, blendIndexStride,                                    \
387
        numWeightsPerVertex)                                                    \
388
0
    {                                                                           \
389
0
        /* Important Note: If reuse pMatrixXXX frequently, M$ VC7.1 will */     \
390
0
        /* generate wrong code here!!!                                   */     \
391
0
        const Affine3* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3;               \
392
0
        __m128 weight, weights;                                                 \
393
0
                                                                                \
394
0
        switch (numWeightsPerVertex)                                            \
395
0
        {                                                                       \
396
0
        default:    /* Just in case and make compiler happy */                  \
397
0
        case 1:                                                                 \
398
0
            __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices,                  \
399
0
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
400
0
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
401
0
            break;                                                              \
402
0
                                                                                \
403
0
        case 2:                                                                 \
404
0
            __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices,                  \
405
0
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
406
0
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
407
0
            break;                                                              \
408
0
                                                                                \
409
0
        case 3:                                                                 \
410
0
            __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices,                  \
411
0
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
412
0
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
413
0
            break;                                                              \
414
0
                                                                                \
415
0
        case 4:                                                                 \
416
0
            __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices,                  \
417
0
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
418
0
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
419
0
            break;                                                              \
420
0
        }                                                                       \
421
0
    }
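The rawOffsetPointer/advanceRawPointer calls above and below treat the blend-weight and blend-index strides as byte offsets; as a sketch (assumed behaviour, not the actual OGRE helpers declared elsewhere), they amount to plain byte-level pointer arithmetic:

    #include <cstddef>

    // Sketch only: offset a typed pointer by a byte count.
    template <class T>
    static inline const T* rawOffsetPointerSketch(const T* ptr, std::ptrdiff_t offsetBytes)
    {
        return reinterpret_cast<const T*>(reinterpret_cast<const char*>(ptr) + offsetBytes);
    }

    // Sketch only: advance the pointer in place by a byte count.
    template <class T>
    static inline void advanceRawPointerSketch(const T*& ptr, std::ptrdiff_t offsetBytes)
    {
        ptr = rawOffsetPointerSketch(ptr, offsetBytes);
    }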
422
423
    //---------------------------------------------------------------------
424
    // Collapse four matrices at one time. The collapsed matrices are weighted by the
425
    // blend weights, and can then be used to transform the corresponding vertices directly.
426
    //
427
    // I'd like to use an inline function instead of a macro here, but I also want to
428
    // ensure the compiler integrates this code into its callers (in release builds at
429
    // least), regardless of the specific compile options. An inline function
430
    // works fine for VC, but gcc (3.4.4 here) seems to generate a function call
431
    // when this is implemented as an inline function, even when compiling with "-O3".
432
    //
433
#define _collapseFourMatrices(                                                  \
434
        m00, m01, m02,                                                          \
435
        m10, m11, m12,                                                          \
436
        m20, m21, m22,                                                          \
437
        m30, m31, m32,                                                          \
438
        pBlendWeight, pBlendIndex,                                              \
439
        blendMatrices,                                                          \
440
        blendWeightStride, blendIndexStride,                                    \
441
        numWeightsPerVertex)                                                    \
442
0
    {                                                                           \
443
0
        /* Important Note: If reuse pMatrixXXX frequently, M$ VC7.1 will */     \
444
0
        /* generate wrong code here!!!                                   */     \
445
0
        const Affine3* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3;               \
446
0
        __m128 weight, weights;                                                 \
447
0
                                                                                \
448
0
        switch (numWeightsPerVertex)                                            \
449
0
        {                                                                       \
450
0
        default:    /* Just in case and make compiler happy */                  \
451
0
        case 1:                                                                 \
452
0
            __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices,                  \
453
0
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
454
0
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
455
0
            __COLLAPSE_MATRIX_W1(m10, m11, m12, blendMatrices,                  \
456
0
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
457
0
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
458
0
            __COLLAPSE_MATRIX_W1(m20, m21, m22, blendMatrices,                  \
459
0
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
460
0
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
461
0
            __COLLAPSE_MATRIX_W1(m30, m31, m32, blendMatrices,                  \
462
0
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
463
0
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
464
0
            break;                                                              \
465
0
                                                                                \
466
0
        case 2:                                                                 \
467
0
            __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices,                  \
468
0
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
469
0
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
470
0
            __COLLAPSE_MATRIX_W2(m10, m11, m12, blendMatrices,                  \
471
0
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
472
0
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
473
0
            __COLLAPSE_MATRIX_W2(m20, m21, m22, blendMatrices,                  \
474
0
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
475
0
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
476
0
            __COLLAPSE_MATRIX_W2(m30, m31, m32, blendMatrices,                  \
477
0
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
478
0
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
479
0
            break;                                                              \
480
0
                                                                                \
481
0
        case 3:                                                                 \
482
0
            __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices,                  \
483
0
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
484
0
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
485
0
            __COLLAPSE_MATRIX_W3(m10, m11, m12, blendMatrices,                  \
486
0
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
487
0
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
488
0
            __COLLAPSE_MATRIX_W3(m20, m21, m22, blendMatrices,                  \
489
0
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
490
0
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
491
0
            __COLLAPSE_MATRIX_W3(m30, m31, m32, blendMatrices,                  \
492
0
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
493
0
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
494
0
            break;                                                              \
495
0
                                                                                \
496
0
        case 4:                                                                 \
497
0
            __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices,                  \
498
0
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
499
0
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
500
0
            __COLLAPSE_MATRIX_W4(m10, m11, m12, blendMatrices,                  \
501
0
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
502
0
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
503
0
            __COLLAPSE_MATRIX_W4(m20, m21, m22, blendMatrices,                  \
504
0
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
505
0
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
506
0
            __COLLAPSE_MATRIX_W4(m30, m31, m32, blendMatrices,                  \
507
0
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
508
0
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
509
0
            break;                                                              \
510
0
        }                                                                       \
511
0
    }
512
513
514
    //---------------------------------------------------------------------
515
    // General SSE version: skins positions, and optionally skins normals.
516
    static void softwareVertexSkinning_SSE_General(
517
        const float *pSrcPos, float *pDestPos,
518
        const float *pSrcNorm, float *pDestNorm,
519
        const float *pBlendWeight, const unsigned char* pBlendIndex,
520
        const Affine3* const* blendMatrices,
521
        size_t srcPosStride, size_t destPosStride,
522
        size_t srcNormStride, size_t destNormStride,
523
        size_t blendWeightStride, size_t blendIndexStride,
524
        size_t numWeightsPerVertex,
525
        size_t numVertices)
526
0
    {
527
0
        for (size_t i = 0; i < numVertices; ++i)
528
0
        {
529
            // Collapse matrices
530
0
            __m128 m00, m01, m02;
531
0
            _collapseOneMatrix(
532
0
                m00, m01, m02,
533
0
                pBlendWeight, pBlendIndex,
534
0
                blendMatrices,
535
0
                blendWeightStride, blendIndexStride,
536
0
                numWeightsPerVertex);
537
538
            // Advance blend weight and index pointers
539
0
            advanceRawPointer(pBlendWeight, blendWeightStride);
540
0
            advanceRawPointer(pBlendIndex, blendIndexStride);
541
542
            //------------------------------------------------------------------
543
544
            // Rearrange to a column-major matrix with rows shuffled into the order: Z 0 X Y
545
0
            __m128 m03 = _mm_setzero_ps();
546
0
            __MM_TRANSPOSE4x4_PS(m02, m03, m00, m01);
547
548
            //------------------------------------------------------------------
549
            // Transform position
550
            //------------------------------------------------------------------
551
552
0
            __m128 s0, s1, s2;
553
554
            // Load source position
555
0
            s0 = _mm_load_ps1(pSrcPos + 0);
556
0
            s1 = _mm_load_ps1(pSrcPos + 1);
557
0
            s2 = _mm_load_ps1(pSrcPos + 2);
558
559
            // Transform by collapsed matrix
560
0
            __m128 accumPos = __MM_DOT4x3_PS(m02, m03, m00, m01, s0, s1, s2);   // z 0 x y
561
562
            // Store blended position, no alignment requirement
563
0
            _mm_storeh_pi((__m64*)pDestPos, accumPos);
564
0
            _mm_store_ss(pDestPos+2, accumPos);
565
566
            // Advance source and target position pointers
567
0
            advanceRawPointer(pSrcPos, srcPosStride);
568
0
            advanceRawPointer(pDestPos, destPosStride);
569
570
            //------------------------------------------------------------------
571
            // Optional blend normal
572
            //------------------------------------------------------------------
573
574
0
            if (pSrcNorm)
575
0
            {
576
                // Load source normal
577
0
                s0 = _mm_load_ps1(pSrcNorm + 0);
578
0
                s1 = _mm_load_ps1(pSrcNorm + 1);
579
0
                s2 = _mm_load_ps1(pSrcNorm + 2);
580
581
                // Transform by collapsed matrix
582
0
                __m128 accumNorm = __MM_DOT3x3_PS(m02, m03, m00, s0, s1, s2);   // z 0 x y
583
584
                // Normalise normal
585
0
                __m128 tmp = _mm_mul_ps(accumNorm, accumNorm);                  // z^2 0 x^2 y^2
586
0
                tmp = __MM_ACCUM3_PS(tmp,
587
0
                        _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,1,2)),         // x^2 0 y^2 z^2
588
0
                        _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,0,1,3)));        // y^2 0 z^2 x^2
589
                // Note: this may divide by zero, but the effect is negligible
590
0
                tmp = __MM_RSQRT_PS(tmp);
591
0
                accumNorm = _mm_mul_ps(accumNorm, tmp);
592
593
                // Store blended normal, no alignment requirement
594
0
                _mm_storeh_pi((__m64*)pDestNorm, accumNorm);
595
0
                _mm_store_ss(pDestNorm+2, accumNorm);
596
597
                // Advance source and target normal pointers
598
0
                advanceRawPointer(pSrcNorm, srcNormStride);
599
0
                advanceRawPointer(pDestNorm, destNormStride);
600
0
            }
601
0
        }
602
0
    }
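The transform above leans on the dot-product helpers from OgreSIMDHelper.h; judging from how they are used (a point transform with an implied w of 1 versus a direction transform), their per-lane effect is presumably along these lines (an assumption about the macros, not their actual expansion):

    // Assumed semantics, sketched with plain SSE intrinsics:
    //   __MM_DOT4x3_PS(r0, r1, r2, r3, x, y, z) ~ r0*x + r1*y + r2*z + r3   (point, w = 1)
    //   __MM_DOT3x3_PS(r0, r1, r2,     x, y, z) ~ r0*x + r1*y + r2*z        (direction)
    static inline __m128 dot4x3Sketch(__m128 r0, __m128 r1, __m128 r2, __m128 r3,
                                      __m128 x, __m128 y, __m128 z)
    {
        return _mm_add_ps(_mm_add_ps(_mm_mul_ps(r0, x), _mm_mul_ps(r1, y)),
                          _mm_add_ps(_mm_mul_ps(r2, z), r3));
    }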
603
    //---------------------------------------------------------------------
604
    // Special SSE version for skinning shared position/normal buffers,
605
    // where the buffer is packed.
606
    template <bool srcAligned, bool destAligned>
607
    struct SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed
608
    {
609
        static void apply(
610
            const float* pSrc, float* pDest,
611
            const float* pBlendWeight, const unsigned char* pBlendIndex,
612
            const Affine3* const* blendMatrices,
613
            size_t blendWeightStride, size_t blendIndexStride,
614
            size_t numWeightsPerVertex,
615
            size_t numIterations)
616
0
        {
617
0
            typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
618
0
            typedef SSEMemoryAccessor<destAligned> DestAccessor;
619
620
            // Blending 4 vertices per-iteration
621
0
            for (size_t i = 0; i < numIterations; ++i)
622
0
            {
623
                // Collapse matrices
624
0
                __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
625
0
                _collapseFourMatrices(
626
0
                    m00, m01, m02,
627
0
                    m10, m11, m12,
628
0
                    m20, m21, m22,
629
0
                    m30, m31, m32,
630
0
                    pBlendWeight, pBlendIndex,
631
0
                    blendMatrices,
632
0
                    blendWeightStride, blendIndexStride,
633
0
                    numWeightsPerVertex);
634
635
                // Advance 4 vertices
636
0
                advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
637
0
                advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
638
639
                //------------------------------------------------------------------
640
                // Transform position/normals
641
                //------------------------------------------------------------------
642
643
0
                __m128 s0, s1, s2, s3, s4, s5, d0, d1, d2, d3, d4, d5;
644
0
                __m128 t0, t1, t2, t3, t4, t5;
645
646
                // Load source position/normals
647
0
                s0 = SrcAccessor::load(pSrc + 0);                       // px0 py0 pz0 nx0
648
0
                s1 = SrcAccessor::load(pSrc + 4);                       // ny0 nz0 px1 py1
649
0
                s2 = SrcAccessor::load(pSrc + 8);                       // pz1 nx1 ny1 nz1
650
0
                s3 = SrcAccessor::load(pSrc + 12);                      // px2 py2 pz2 nx2
651
0
                s4 = SrcAccessor::load(pSrc + 16);                      // ny2 nz2 px3 py3
652
0
                s5 = SrcAccessor::load(pSrc + 20);                      // pz3 nx3 ny3 nz3
653
654
                // Rearrange to component-major order for batch calculation.
655
656
0
                t0 = _mm_unpacklo_ps(s0, s3);                           // px0 px2 py0 py2
657
0
                t1 = _mm_unpackhi_ps(s0, s3);                           // pz0 pz2 nx0 nx2
658
0
                t2 = _mm_unpacklo_ps(s1, s4);                           // ny0 ny2 nz0 nz2
659
0
                t3 = _mm_unpackhi_ps(s1, s4);                           // px1 px3 py1 py3
660
0
                t4 = _mm_unpacklo_ps(s2, s5);                           // pz1 pz3 nx1 nx3
661
0
                t5 = _mm_unpackhi_ps(s2, s5);                           // ny1 ny3 nz1 nz3
662
663
0
                s0 = _mm_unpacklo_ps(t0, t3);                           // px0 px1 px2 px3
664
0
                s1 = _mm_unpackhi_ps(t0, t3);                           // py0 py1 py2 py3
665
0
                s2 = _mm_unpacklo_ps(t1, t4);                           // pz0 pz1 pz2 pz3
666
0
                s3 = _mm_unpackhi_ps(t1, t4);                           // nx0 nx1 nx2 nx3
667
0
                s4 = _mm_unpacklo_ps(t2, t5);                           // ny0 ny1 ny2 ny3
668
0
                s5 = _mm_unpackhi_ps(t2, t5);                           // nz0 nz1 nz2 nz3
669
670
                // Transform by collapsed matrix
671
672
                // Shuffle row 0 of the four collapsed matrices to calculate the X components
673
0
                __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);
674
675
                // Transform X components
676
0
                d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // PX0 PX1 PX2 PX3
677
0
                d3 = __MM_DOT3x3_PS(m00, m10, m20, s3, s4, s5);         // NX0 NX1 NX2 NX3
678
679
                // Shuffle row 1 of the four collapsed matrices to calculate the Y components
680
0
                __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);
681
682
                // Transform Y components
683
0
                d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // PY0 PY1 PY2 PY3
684
0
                d4 = __MM_DOT3x3_PS(m01, m11, m21, s3, s4, s5);         // NY0 NY1 NY2 NY3
685
686
                // Shuffle row 2 of the four collapsed matrices to calculate the Z components
687
0
                __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);
688
689
                // Transform Z components
690
0
                d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // PZ0 PZ1 PZ2 PZ3
691
0
                d5 = __MM_DOT3x3_PS(m02, m12, m22, s3, s4, s5);         // NZ0 NZ1 NZ2 NZ3
692
693
                // Normalise normals
694
0
                __m128 tmp = __MM_DOT3x3_PS(d3, d4, d5, d3, d4, d5);
695
0
                tmp = __MM_RSQRT_PS(tmp);
696
0
                d3 = _mm_mul_ps(d3, tmp);
697
0
                d4 = _mm_mul_ps(d4, tmp);
698
0
                d5 = _mm_mul_ps(d5, tmp);
699
700
                // Arrange back to contiguous format to store the results
701
702
0
                t0 = _mm_unpacklo_ps(d0, d1);                           // PX0 PY0 PX1 PY1
703
0
                t1 = _mm_unpackhi_ps(d0, d1);                           // PX2 PY2 PX3 PY3
704
0
                t2 = _mm_unpacklo_ps(d2, d3);                           // PZ0 NX0 PZ1 NX1
705
0
                t3 = _mm_unpackhi_ps(d2, d3);                           // PZ2 NX2 PZ3 NX3
706
0
                t4 = _mm_unpacklo_ps(d4, d5);                           // NY0 NZ0 NY1 NZ1
707
0
                t5 = _mm_unpackhi_ps(d4, d5);                           // NY2 NZ2 NY3 NZ3
708
709
0
                d0 = _mm_movelh_ps(t0, t2);                             // PX0 PY0 PZ0 NX0
710
0
                d1 = _mm_shuffle_ps(t4, t0, _MM_SHUFFLE(3,2,1,0));      // NY0 NZ0 PX1 PY1
711
0
                d2 = _mm_movehl_ps(t4, t2);                             // PZ1 NX1 NY1 NZ1
712
0
                d3 = _mm_movelh_ps(t1, t3);                             // PX2 PY2 PZ2 NX2
713
0
                d4 = _mm_shuffle_ps(t5, t1, _MM_SHUFFLE(3,2,1,0));      // NY2 NZ2 PX3 PY3
714
0
                d5 = _mm_movehl_ps(t5, t3);                             // PZ3 NX3 NY3 NZ3
715
716
                // Store blended position/normals
717
0
                DestAccessor::store(pDest + 0, d0);
718
0
                DestAccessor::store(pDest + 4, d1);
719
0
                DestAccessor::store(pDest + 8, d2);
720
0
                DestAccessor::store(pDest + 12, d3);
721
0
                DestAccessor::store(pDest + 16, d4);
722
0
                DestAccessor::store(pDest + 20, d5);
723
724
                // Advance 4 vertices
725
0
                pSrc += 4 * (3 + 3);
726
0
                pDest += 4 * (3 + 3);
727
0
            }
728
0
        }
Unexecuted instantiation: Ogre::SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<true, true>::apply(float const*, float*, float const*, unsigned char const*, Ogre::Affine3 const* const*, unsigned long, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: Ogre::SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<false, false>::apply(float const*, float*, float const*, unsigned char const*, Ogre::Affine3 const* const*, unsigned long, unsigned long, unsigned long, unsigned long)
729
    };
730
    static OGRE_FORCE_INLINE void softwareVertexSkinning_SSE_PosNorm_Shared_Packed(
731
            const float* pSrcPos, float* pDestPos,
732
            const float* pBlendWeight, const unsigned char* pBlendIndex,
733
            const Affine3* const* blendMatrices,
734
            size_t blendWeightStride, size_t blendIndexStride,
735
            size_t numWeightsPerVertex,
736
            size_t numIterations)
737
0
    {
738
        // pSrcPos might not be 16-byte aligned because of the 8-byte alignment shift per vertex
739
740
        // Instantiating two versions only, since the other alignment combinations are not that important.
741
0
        if (_isAlignedForSSE(pSrcPos) && _isAlignedForSSE(pDestPos))
742
0
        {
743
0
            SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<true, true>::apply(
744
0
                pSrcPos, pDestPos,
745
0
                pBlendWeight, pBlendIndex,
746
0
                blendMatrices,
747
0
                blendWeightStride, blendIndexStride,
748
0
                numWeightsPerVertex,
749
0
                numIterations);
750
0
        }
751
0
        else
752
0
        {
753
0
            SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<false, false>::apply(
754
0
                pSrcPos, pDestPos,
755
0
                pBlendWeight, pBlendIndex,
756
0
                blendMatrices,
757
0
                blendWeightStride, blendIndexStride,
758
0
                numWeightsPerVertex,
759
0
                numIterations);
760
0
        }
761
0
    }
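The alignment remark above follows from the packed vertex size: an interleaved position + normal vertex occupies 6 floats (24 bytes), and 24 mod 16 is 8, so after any odd number of vertices the pointer lands 8 bytes off a 16-byte boundary.

    // 6 floats per packed pos+norm vertex -> 24 bytes -> 8-byte alignment shift per vertex.
    static_assert((sizeof(float) * (3 + 3)) % 16 == 8, "packed pos+norm vertex is 24 bytes");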
762
    //---------------------------------------------------------------------
763
    // Special SSE version for skinning separate position and normal buffers,
764
    // where both the position and the normal buffer are packed.
765
    template <bool srcPosAligned, bool destPosAligned, bool srcNormAligned, bool destNormAligned>
766
    struct SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed
767
    {
768
        static void apply(
769
            const float* pSrcPos, float* pDestPos,
770
            const float* pSrcNorm, float* pDestNorm,
771
            const float* pBlendWeight, const unsigned char* pBlendIndex,
772
            const Affine3* const* blendMatrices,
773
            size_t blendWeightStride, size_t blendIndexStride,
774
            size_t numWeightsPerVertex,
775
            size_t numIterations)
776
0
        {
777
0
            typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor;
778
0
            typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor;
779
0
            typedef SSEMemoryAccessor<srcNormAligned> SrcNormAccessor;
780
0
            typedef SSEMemoryAccessor<destNormAligned> DestNormAccessor;
781
782
            // Blending 4 vertices per-iteration
783
0
            for (size_t i = 0; i < numIterations; ++i)
784
0
            {
785
                // Collapse matrices
786
0
                __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
787
0
                _collapseFourMatrices(
788
0
                    m00, m01, m02,
789
0
                    m10, m11, m12,
790
0
                    m20, m21, m22,
791
0
                    m30, m31, m32,
792
0
                    pBlendWeight, pBlendIndex,
793
0
                    blendMatrices,
794
0
                    blendWeightStride, blendIndexStride,
795
0
                    numWeightsPerVertex);
796
797
                // Advance 4 vertices
798
0
                advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
799
0
                advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
800
801
                //------------------------------------------------------------------
802
                // Transform positions
803
                //------------------------------------------------------------------
804
805
0
                __m128 s0, s1, s2, d0, d1, d2;
806
807
                // Load source positions
808
0
                s0 = SrcPosAccessor::load(pSrcPos + 0);                 // x0 y0 z0 x1
809
0
                s1 = SrcPosAccessor::load(pSrcPos + 4);                 // y1 z1 x2 y2
810
0
                s2 = SrcPosAccessor::load(pSrcPos + 8);                 // z2 x3 y3 z3
811
812
                // Arrange to 3x4 component-major order for batch calculation
813
0
                __MM_TRANSPOSE4x3_PS(s0, s1, s2);
814
815
                // Transform by collapsed matrix
816
817
                // Shuffle row 0 of the four collapsed matrices to calculate the X components
818
0
                __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);
819
820
                // Transform X components
821
0
                d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // X0 X1 X2 X3
822
823
                // Shuffle row 1 of the four collapsed matrices to calculate the Y components
824
0
                __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);
825
826
                // Transform Y components
827
0
                d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // Y0 Y1 Y2 Y3
828
829
                // Shuffle row 2 of the four collapsed matrices to calculate the Z components
830
0
                __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);
831
832
                // Transform Z components
833
0
                d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // Z0 Z1 Z2 Z3
834
835
                // Arrange back to 4x3 contiguous format to store the results
836
0
                __MM_TRANSPOSE3x4_PS(d0, d1, d2);
837
838
                // Store blended positions
839
0
                DestPosAccessor::store(pDestPos + 0, d0);
840
0
                DestPosAccessor::store(pDestPos + 4, d1);
841
0
                DestPosAccessor::store(pDestPos + 8, d2);
842
843
                // Advance 4 vertices
844
0
                pSrcPos += 4 * 3;
845
0
                pDestPos += 4 * 3;
846
847
                //------------------------------------------------------------------
848
                // Transform normals
849
                //------------------------------------------------------------------
850
851
                // Load source normals
852
0
                s0 = SrcNormAccessor::load(pSrcNorm + 0);               // x0 y0 z0 x1
853
0
                s1 = SrcNormAccessor::load(pSrcNorm + 4);               // y1 z1 x2 y2
854
0
                s2 = SrcNormAccessor::load(pSrcNorm + 8);               // z2 x3 y3 z3
855
856
                // Arrange to 3x4 component-major order for batch calculation
857
0
                __MM_TRANSPOSE4x3_PS(s0, s1, s2);
858
859
                // Transform by collapsed and shuffled matrices
860
0
                d0 = __MM_DOT3x3_PS(m00, m10, m20, s0, s1, s2);         // X0 X1 X2 X3
861
0
                d1 = __MM_DOT3x3_PS(m01, m11, m21, s0, s1, s2);         // Y0 Y1 Y2 Y3
862
0
                d2 = __MM_DOT3x3_PS(m02, m12, m22, s0, s1, s2);         // Z0 Z1 Z2 Z3
863
864
                // Normalise normals
865
0
                __m128 tmp = __MM_DOT3x3_PS(d0, d1, d2, d0, d1, d2);
866
0
                tmp = __MM_RSQRT_PS(tmp);
867
0
                d0 = _mm_mul_ps(d0, tmp);
868
0
                d1 = _mm_mul_ps(d1, tmp);
869
0
                d2 = _mm_mul_ps(d2, tmp);
870
871
                // Arrange back to 4x3 contiguous format to store the results
872
0
                __MM_TRANSPOSE3x4_PS(d0, d1, d2);
873
874
                // Store blended normals
875
0
                DestNormAccessor::store(pDestNorm + 0, d0);
876
0
                DestNormAccessor::store(pDestNorm + 4, d1);
877
0
                DestNormAccessor::store(pDestNorm + 8, d2);
878
879
                // Advance 4 vertices
880
0
                pSrcNorm += 4 * 3;
881
0
                pDestNorm += 4 * 3;
882
0
            }
883
0
        }
Unexecuted instantiation: Ogre::SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, true, true, true>::apply(float const*, float*, float const*, float*, float const*, unsigned char const*, Ogre::Affine3 const* const*, unsigned long, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: Ogre::SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, false, false, false>::apply(float const*, float*, float const*, float*, float const*, unsigned char const*, Ogre::Affine3 const* const*, unsigned long, unsigned long, unsigned long, unsigned long)
884
    };
885
    static OGRE_FORCE_INLINE void softwareVertexSkinning_SSE_PosNorm_Separated_Packed(
886
        const float* pSrcPos, float* pDestPos,
887
        const float* pSrcNorm, float* pDestNorm,
888
        const float* pBlendWeight, const unsigned char* pBlendIndex,
889
        const Affine3* const* blendMatrices,
890
        size_t blendWeightStride, size_t blendIndexStride,
891
        size_t numWeightsPerVertex,
892
        size_t numIterations)
893
0
    {
894
0
        assert(_isAlignedForSSE(pSrcPos));
895
896
        // Instantiating two versions only, since the other alignment combinations are not that important.
897
0
        if (_isAlignedForSSE(pSrcNorm) && _isAlignedForSSE(pDestPos) && _isAlignedForSSE(pDestNorm))
898
0
        {
899
0
            SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, true, true, true>::apply(
900
0
                pSrcPos, pDestPos,
901
0
                pSrcNorm, pDestNorm,
902
0
                pBlendWeight, pBlendIndex,
903
0
                blendMatrices,
904
0
                blendWeightStride, blendIndexStride,
905
0
                numWeightsPerVertex,
906
0
                numIterations);
907
0
        }
908
0
        else
909
0
        {
910
0
            SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, false, false, false>::apply(
911
0
                pSrcPos, pDestPos,
912
0
                pSrcNorm, pDestNorm,
913
0
                pBlendWeight, pBlendIndex,
914
0
                blendMatrices,
915
0
                blendWeightStride, blendIndexStride,
916
0
                numWeightsPerVertex,
917
0
                numIterations);
918
0
        }
919
0
    }
920
    //---------------------------------------------------------------------
921
    // Special SSE version for skinning positions only, where the position buffer is
922
    // packed.
923
    template <bool srcPosAligned, bool destPosAligned>
924
    struct SoftwareVertexSkinning_SSE_PosOnly_Packed
925
    {
926
        static void apply(
927
            const float* pSrcPos, float* pDestPos,
928
            const float* pBlendWeight, const unsigned char* pBlendIndex,
929
            const Affine3* const* blendMatrices,
930
            size_t blendWeightStride, size_t blendIndexStride,
931
            size_t numWeightsPerVertex,
932
            size_t numIterations)
933
0
        {
934
0
            typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor;
935
0
            typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor;
936
937
            // Blending 4 vertices per-iteration
938
0
            for (size_t i = 0; i < numIterations; ++i)
939
0
            {
940
                // Collapse matrices
941
0
                __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
942
0
                _collapseFourMatrices(
943
0
                    m00, m01, m02,
944
0
                    m10, m11, m12,
945
0
                    m20, m21, m22,
946
0
                    m30, m31, m32,
947
0
                    pBlendWeight, pBlendIndex,
948
0
                    blendMatrices,
949
0
                    blendWeightStride, blendIndexStride,
950
0
                    numWeightsPerVertex);
951
952
                // Advance 4 vertices
953
0
                advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
954
0
                advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
955
956
                //------------------------------------------------------------------
957
                // Transform positions
958
                //------------------------------------------------------------------
959
960
0
                __m128 s0, s1, s2, d0, d1, d2;
961
962
                // Load source positions
963
0
                s0 = SrcPosAccessor::load(pSrcPos + 0);                 // x0 y0 z0 x1
964
0
                s1 = SrcPosAccessor::load(pSrcPos + 4);                 // y1 z1 x2 y2
965
0
                s2 = SrcPosAccessor::load(pSrcPos + 8);                 // z2 x3 y3 z3
966
967
                // Arrange to 3x4 component-major order for batch calculation
968
0
                __MM_TRANSPOSE4x3_PS(s0, s1, s2);
969
970
                // Transform by collapsed matrix
971
972
                // Shuffle row 0 of the four collapsed matrices to calculate the X components
973
0
                __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);
974
975
                // Transform X components
976
0
                d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // X0 X1 X2 X3
977
978
                // Shuffle row 1 of the four collapsed matrices to calculate the Y components
979
0
                __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);
980
981
                // Transform Y components
982
0
                d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // Y0 Y1 Y2 Y3
983
984
                // Shuffle row 2 of the four collapsed matrices to calculate the Z components
985
0
                __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);
986
987
                // Transform Z components
988
0
                d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // Z0 Z1 Z2 Z3
989
990
                // Arrange back to 4x3 contiguous format to store the results
991
0
                __MM_TRANSPOSE3x4_PS(d0, d1, d2);
992
993
                // Store blended positions
994
0
                DestPosAccessor::store(pDestPos + 0, d0);
995
0
                DestPosAccessor::store(pDestPos + 4, d1);
996
0
                DestPosAccessor::store(pDestPos + 8, d2);
997
998
                // Advance 4 vertices
999
0
                pSrcPos += 4 * 3;
1000
0
                pDestPos += 4 * 3;
1001
0
            }
1002
0
        }
Unexecuted instantiation: Ogre::SoftwareVertexSkinning_SSE_PosOnly_Packed<true, true>::apply(float const*, float*, float const*, unsigned char const*, Ogre::Affine3 const* const*, unsigned long, unsigned long, unsigned long, unsigned long)
Unexecuted instantiation: Ogre::SoftwareVertexSkinning_SSE_PosOnly_Packed<true, false>::apply(float const*, float*, float const*, unsigned char const*, Ogre::Affine3 const* const*, unsigned long, unsigned long, unsigned long, unsigned long)
1003
    };
1004
    static OGRE_FORCE_INLINE void softwareVertexSkinning_SSE_PosOnly_Packed(
1005
        const float* pSrcPos, float* pDestPos,
1006
        const float* pBlendWeight, const unsigned char* pBlendIndex,
1007
        const Affine3* const* blendMatrices,
1008
        size_t blendWeightStride, size_t blendIndexStride,
1009
        size_t numWeightsPerVertex,
1010
        size_t numIterations)
1011
0
    {
1012
0
        assert(_isAlignedForSSE(pSrcPos));
1013
1014
        // Instantiating two versions only, since the other alignment combinations are not that important.
1015
0
        if (_isAlignedForSSE(pDestPos))
1016
0
        {
1017
0
            SoftwareVertexSkinning_SSE_PosOnly_Packed<true, true>::apply(
1018
0
                pSrcPos, pDestPos,
1019
0
                pBlendWeight, pBlendIndex,
1020
0
                blendMatrices,
1021
0
                blendWeightStride, blendIndexStride,
1022
0
                numWeightsPerVertex,
1023
0
                numIterations);
1024
0
        }
1025
0
        else
1026
0
        {
1027
0
            SoftwareVertexSkinning_SSE_PosOnly_Packed<true, false>::apply(
1028
0
                pSrcPos, pDestPos,
1029
0
                pBlendWeight, pBlendIndex,
1030
0
                blendMatrices,
1031
0
                blendWeightStride, blendIndexStride,
1032
0
                numWeightsPerVertex,
1033
0
                numIterations);
1034
0
        }
1035
0
    }
1036
    //---------------------------------------------------------------------
1037
    //---------------------------------------------------------------------
1038
    //---------------------------------------------------------------------
1039
    OptimisedUtilSSE::OptimisedUtilSSE(void)
1040
2
        : mPreferGeneralVersionForSharedBuffers(false)
1041
2
    {
1042
        // For AMD Athlon XP (but not Athlon 64) it is preferable never to use the
1043
        // unrolled version for shared buffers at all, presumably because that version
1044
        // runs out of usable CPU registers, or hits an L1/L2 cache problem, causing a
1045
        // slight performance loss compared with the general version.
1046
        //
1047
2
#if __OGRE_HAVE_NEON == 0
1048
2
        if (PlatformInformation::getCpuIdentifier().find("AuthenticAMD") != String::npos)
1049
0
        {
1050
            // How can we tell an Athlon XP from an Athlon 64?
1051
            // Just test whether SSE2/SSE3 are supported; if not,
1052
            // assume the general version is faster than the unrolled version :)
1053
            //
1054
0
            if (!(PlatformInformation::getCpuFeatures() &
1055
0
                (PlatformInformation::CPU_FEATURE_SSE2 | PlatformInformation::CPU_FEATURE_SSE3)))
1056
0
            {
1057
0
                mPreferGeneralVersionForSharedBuffers = true;
1058
0
            }
1059
0
        }
1060
2
#endif
1061
2
    }
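    // --- Editor's note: illustrative sketch, not part of the OGRE source. ---
    // The check above reduces to "an AMD CPU that reports neither SSE2 nor SSE3",
    // which at the time distinguished an Athlon XP from an Athlon 64.  Equivalent
    // boolean form, using only the names from the constructor above:
    //
    //     const bool isAMD = PlatformInformation::getCpuIdentifier()
    //                            .find("AuthenticAMD") != String::npos;
    //     const bool hasSSE2or3 = (PlatformInformation::getCpuFeatures() &
    //         (PlatformInformation::CPU_FEATURE_SSE2 |
    //          PlatformInformation::CPU_FEATURE_SSE3)) != 0;
    //     mPreferGeneralVersionForSharedBuffers = isAMD && !hasSSE2or3;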
1062
    //---------------------------------------------------------------------
1063
    void OptimisedUtilSSE::softwareVertexSkinning(
1064
        const float *pSrcPos, float *pDestPos,
1065
        const float *pSrcNorm, float *pDestNorm,
1066
        const float *pBlendWeight, const unsigned char* pBlendIndex,
1067
        const Affine3* const* blendMatrices,
1068
        size_t srcPosStride, size_t destPosStride,
1069
        size_t srcNormStride, size_t destNormStride,
1070
        size_t blendWeightStride, size_t blendIndexStride,
1071
        size_t numWeightsPerVertex,
1072
        size_t numVertices)
1073
0
    {
1074
1075
0
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1076
1077
        // All position/normal pointers should be perfectly aligned, but we still check
1078
        // here to guard against hardware buffers allocated by a potentially buggy
1079
        // driver that doesn't honour alignment properly.
1080
        // Because a meta-function technique is used here, the code is easy to maintain
1081
        // and still provides every possible alignment combination.
1082
        //
1083
1084
        // Use unrolled routines only if there are a lot of vertices
1085
0
        if (numVertices > OGRE_SSE_SKINNING_UNROLL_VERTICES)
1086
0
        {
1087
0
            if (pSrcNorm)
1088
0
            {
1089
                // Blend position and normal
1090
1091
0
                if (!mPreferGeneralVersionForSharedBuffers &&
1092
0
                    srcPosStride == sizeof(float) * (3 + 3) && destPosStride == sizeof(float) * (3 + 3) &&
1093
0
                    pSrcNorm == pSrcPos + 3 && pDestNorm == pDestPos + 3)
1094
0
                {
1095
                    // Position and normal share one packed buffer
1096
1097
0
                    size_t srcPosAlign = (size_t)pSrcPos & 15;
1098
0
                    assert((srcPosAlign & 3) == 0);
1099
1100
                    // Blend unaligned vertices with general SIMD routine
1101
0
                    if (srcPosAlign == 8)   // Because of the 8-byte alignment shift per vertex
1102
0
                    {
1103
0
                        size_t count = srcPosAlign / 8;
1104
0
                        numVertices -= count;
1105
0
                        softwareVertexSkinning_SSE_General(
1106
0
                            pSrcPos, pDestPos,
1107
0
                            pSrcNorm, pDestNorm,
1108
0
                            pBlendWeight, pBlendIndex,
1109
0
                            blendMatrices,
1110
0
                            srcPosStride, destPosStride,
1111
0
                            srcNormStride, destNormStride,
1112
0
                            blendWeightStride, blendIndexStride,
1113
0
                            numWeightsPerVertex,
1114
0
                            count);
1115
1116
0
                        pSrcPos += count * (3 + 3);
1117
0
                        pDestPos += count * (3 + 3);
1118
0
                        pSrcNorm += count * (3 + 3);
1119
0
                        pDestNorm += count * (3 + 3);
1120
0
                        advanceRawPointer(pBlendWeight, count * blendWeightStride);
1121
0
                        advanceRawPointer(pBlendIndex, count * blendIndexStride);
1122
0
                    }
1123
1124
                    // Blend vertices, four vertices per-iteration
1125
0
                    size_t numIterations = numVertices / 4;
1126
0
                    softwareVertexSkinning_SSE_PosNorm_Shared_Packed(
1127
0
                        pSrcPos, pDestPos,
1128
0
                        pBlendWeight, pBlendIndex,
1129
0
                        blendMatrices,
1130
0
                        blendWeightStride, blendIndexStride,
1131
0
                        numWeightsPerVertex,
1132
0
                        numIterations);
1133
1134
                    // Advance pointers for remaining vertices
1135
0
                    numVertices &= 3;
1136
0
                    if (numVertices)
1137
0
                    {
1138
0
                        pSrcPos += numIterations * 4 * (3 + 3);
1139
0
                        pDestPos += numIterations * 4 * (3 + 3);
1140
0
                        pSrcNorm += numIterations * 4 * (3 + 3);
1141
0
                        pDestNorm += numIterations * 4 * (3 + 3);
1142
0
                        advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1143
0
                        advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1144
0
                    }
1145
0
                }
1146
0
                else if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3 &&
1147
0
                         srcNormStride == sizeof(float) * 3 && destNormStride == sizeof(float) * 3)
1148
0
                {
1149
                    // Position and normal are separate buffers, and all of them are packed
1150
1151
0
                    size_t srcPosAlign = (size_t)pSrcPos & 15;
1152
0
                    assert((srcPosAlign & 3) == 0);
1153
1154
                    // Blend unaligned vertices with general SIMD routine
1155
0
                    if (srcPosAlign)
1156
0
                    {
1157
0
                        size_t count = srcPosAlign / 4;
1158
0
                        numVertices -= count;
1159
0
                        softwareVertexSkinning_SSE_General(
1160
0
                            pSrcPos, pDestPos,
1161
0
                            pSrcNorm, pDestNorm,
1162
0
                            pBlendWeight, pBlendIndex,
1163
0
                            blendMatrices,
1164
0
                            srcPosStride, destPosStride,
1165
0
                            srcNormStride, destNormStride,
1166
0
                            blendWeightStride, blendIndexStride,
1167
0
                            numWeightsPerVertex,
1168
0
                            count);
1169
1170
0
                        pSrcPos += count * 3;
1171
0
                        pDestPos += count * 3;
1172
0
                        pSrcNorm += count * 3;
1173
0
                        pDestNorm += count * 3;
1174
0
                        advanceRawPointer(pBlendWeight, count * blendWeightStride);
1175
0
                        advanceRawPointer(pBlendIndex, count * blendIndexStride);
1176
0
                    }
1177
1178
                    // Blend vertices, four vertices per-iteration
1179
0
                    size_t numIterations = numVertices / 4;
1180
0
                    softwareVertexSkinning_SSE_PosNorm_Separated_Packed(
1181
0
                        pSrcPos, pDestPos,
1182
0
                        pSrcNorm, pDestNorm,
1183
0
                        pBlendWeight, pBlendIndex,
1184
0
                        blendMatrices,
1185
0
                        blendWeightStride, blendIndexStride,
1186
0
                        numWeightsPerVertex,
1187
0
                        numIterations);
1188
1189
                    // Advance pointers for remaining vertices
1190
0
                    numVertices &= 3;
1191
0
                    if (numVertices)
1192
0
                    {
1193
0
                        pSrcPos += numIterations * 4 * 3;
1194
0
                        pDestPos += numIterations * 4 * 3;
1195
0
                        pSrcNorm += numIterations * 4 * 3;
1196
0
                        pDestNorm += numIterations * 4 * 3;
1197
0
                        advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1198
0
                        advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1199
0
                    }
1200
0
                }
1201
0
                else    // Not 'packed' form or wrong order between position and normal
1202
0
                {
1203
                    // Should never occur, do nothing here just in case
1204
0
                }
1205
0
            }
1206
0
            else    // !pSrcNorm
1207
0
            {
1208
                // Blend position only
1209
1210
0
                if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3)
1211
0
                {
1212
                    // All buffers are packed
1213
1214
0
                    size_t srcPosAlign = (size_t)pSrcPos & 15;
1215
0
                    assert((srcPosAlign & 3) == 0);
1216
1217
                    // Blend unaligned vertices with general SIMD routine
1218
0
                    if (srcPosAlign)
1219
0
                    {
1220
0
                        size_t count = srcPosAlign / 4;
1221
0
                        numVertices -= count;
1222
0
                        softwareVertexSkinning_SSE_General(
1223
0
                            pSrcPos, pDestPos,
1224
0
                            pSrcNorm, pDestNorm,
1225
0
                            pBlendWeight, pBlendIndex,
1226
0
                            blendMatrices,
1227
0
                            srcPosStride, destPosStride,
1228
0
                            srcNormStride, destNormStride,
1229
0
                            blendWeightStride, blendIndexStride,
1230
0
                            numWeightsPerVertex,
1231
0
                            count);
1232
1233
0
                        pSrcPos += count * 3;
1234
0
                        pDestPos += count * 3;
1235
0
                        advanceRawPointer(pBlendWeight, count * blendWeightStride);
1236
0
                        advanceRawPointer(pBlendIndex, count * blendIndexStride);
1237
0
                    }
1238
1239
                    // Blend vertices, four vertices per-iteration
1240
0
                    size_t numIterations = numVertices / 4;
1241
0
                    softwareVertexSkinning_SSE_PosOnly_Packed(
1242
0
                        pSrcPos, pDestPos,
1243
0
                        pBlendWeight, pBlendIndex,
1244
0
                        blendMatrices,
1245
0
                        blendWeightStride, blendIndexStride,
1246
0
                        numWeightsPerVertex,
1247
0
                        numIterations);
1248
1249
                    // Advance pointers for remaining vertices
1250
0
                    numVertices &= 3;
1251
0
                    if (numVertices)
1252
0
                    {
1253
0
                        pSrcPos += numIterations * 4 * 3;
1254
0
                        pDestPos += numIterations * 4 * 3;
1255
0
                        advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1256
0
                        advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1257
0
                    }
1258
0
                }
1259
0
                else    // Not 'packed' form
1260
0
                {
1261
                    // Might occur only if the user forced software blending of positions only
1262
0
                }
1263
0
            }
1264
0
        }
1265
1266
        // Blend remaining vertices. This needs to be done with SIMD for identical results,
1267
        // since mixing the general floating-point and SIMD algorithms would introduce
1268
        // floating-point discrepancies.
1269
0
        if (numVertices)
1270
0
        {
1271
0
            softwareVertexSkinning_SSE_General(
1272
0
                pSrcPos, pDestPos,
1273
0
                pSrcNorm, pDestNorm,
1274
0
                pBlendWeight, pBlendIndex,
1275
0
                blendMatrices,
1276
0
                srcPosStride, destPosStride,
1277
0
                srcNormStride, destNormStride,
1278
0
                blendWeightStride, blendIndexStride,
1279
0
                numWeightsPerVertex,
1280
0
                numVertices);
1281
0
        }
1282
0
    }
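    // --- Editor's note: illustrative arithmetic, not part of the OGRE source. ---
    // Why the alignment prologues above use those vertex counts:
    // - Shared position+normal buffer: the stride is 6 floats = 24 bytes, and
    //   24 mod 16 = 8, so (assuming the buffer starts 8- or 16-byte aligned) the
    //   misalignment alternates 0, 8, 0, 8, ... per vertex.  When the offset is 8,
    //   skinning one vertex (count = 8 / 8) restores 16-byte alignment.
    // - Separate packed buffers: the stride is 3 floats = 12 bytes, and
    //   12 mod 16 = 12, so the misalignment cycles 0, 12, 8, 4, ... with period 4.
    //   Skinning count = misalignment / 4 vertices returns to a 16-byte boundary,
    //   e.g. an offset of 12 bytes gives count = 3, and 12 + 3 * 12 = 48 = 3 * 16.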
1283
    //---------------------------------------------------------------------
1284
    void OptimisedUtilSSE::softwareVertexMorph(
1285
        float t,
1286
        const float *pSrc1, const float *pSrc2,
1287
        float *pDst,
1288
        size_t pos1VSize, size_t pos2VSize, size_t dstVSize, 
1289
        size_t numVertices,
1290
        bool morphNormals)
1291
0
    {
1292
0
        OgreAssert(pos1VSize == pos2VSize && pos2VSize == dstVSize && dstVSize == (morphNormals ? 24 : 12),
1293
0
                   "stride not supported");
1294
0
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1295
1296
0
        __m128 src01, src02, src11, src12, src21, src22;
1297
0
        __m128 dst0, dst1, dst2;
1298
1299
0
        __m128 t4 = _mm_load_ps1(&t);
1300
1301
1302
        // If we're morphing normals, we have twice the number of floats to process.
1303
        // Positions are interleaved with normals, so we'll have to separately
1304
        // normalise just the normals later; we'll just lerp in the first pass.
1305
        // We can't normalise as we go because normals & positions are only 3 floats
1306
        // each, so they are not aligned for SSE and we'd mix the data up.
1307
0
        size_t normalsMultiplier = morphNormals ? 2 : 1;
1308
0
        size_t numIterations = (numVertices*normalsMultiplier) / 4;
1309
0
        size_t numVerticesRemainder = (numVertices*normalsMultiplier) & 3;
1310
        
1311
        // Save for later
1312
0
        float *pStartDst = pDst;
1313
                        
1314
        // Never use the meta-function technique for accessing memory here, because
1315
        // VC7.1 appears to generate somewhat inefficient binary code when the
1316
        // following code is put into an inline function.
1317
1318
0
        if (_isAlignedForSSE(pSrc1) && _isAlignedForSSE(pSrc2) && _isAlignedForSSE(pDst))
1319
0
        {
1320
            // All data aligned
1321
1322
            // Morph 4 vertices per iteration. Specially designed to use as many
1323
            // available CPU registers as possible (7 registers used here),
1324
            // and to avoid temporary values being allocated on the stack, suppressing
1325
            // extra memory accesses.
1326
0
            for (size_t i = 0; i < numIterations; ++i)
1327
0
            {
1328
                // 12 floating-point values
1329
0
                src01 = __MM_LOAD_PS(pSrc1 + 0);
1330
0
                src02 = __MM_LOAD_PS(pSrc2 + 0);
1331
0
                src11 = __MM_LOAD_PS(pSrc1 + 4);
1332
0
                src12 = __MM_LOAD_PS(pSrc2 + 4);
1333
0
                src21 = __MM_LOAD_PS(pSrc1 + 8);
1334
0
                src22 = __MM_LOAD_PS(pSrc2 + 8);
1335
0
                pSrc1 += 12; pSrc2 += 12;
1336
1337
0
                dst0 = __MM_LERP_PS(t4, src01, src02);
1338
0
                dst1 = __MM_LERP_PS(t4, src11, src12);
1339
0
                dst2 = __MM_LERP_PS(t4, src21, src22);
1340
1341
0
                __MM_STORE_PS(pDst + 0, dst0);
1342
0
                __MM_STORE_PS(pDst + 4, dst1);
1343
0
                __MM_STORE_PS(pDst + 8, dst2);
1344
0
                pDst += 12;
1345
0
            }
1346
1347
            // Morph remaining vertices
1348
0
            switch (numVerticesRemainder)
1349
0
            {
1350
0
            case 3:
1351
                // 9 floating-point values
1352
0
                src01 = __MM_LOAD_PS(pSrc1 + 0);
1353
0
                src02 = __MM_LOAD_PS(pSrc2 + 0);
1354
0
                src11 = __MM_LOAD_PS(pSrc1 + 4);
1355
0
                src12 = __MM_LOAD_PS(pSrc2 + 4);
1356
0
                src21 = _mm_load_ss(pSrc1 + 8);
1357
0
                src22 = _mm_load_ss(pSrc2 + 8);
1358
1359
0
                dst0 = __MM_LERP_PS(t4, src01, src02);
1360
0
                dst1 = __MM_LERP_PS(t4, src11, src12);
1361
0
                dst2 = __MM_LERP_SS(t4, src21, src22);
1362
1363
0
                __MM_STORE_PS(pDst + 0, dst0);
1364
0
                __MM_STORE_PS(pDst + 4, dst1);
1365
0
                _mm_store_ss(pDst + 8, dst2);
1366
0
                break;
1367
1368
0
            case 2:
1369
                // 6 floating-point values
1370
0
                src01 = __MM_LOAD_PS(pSrc1 + 0);
1371
0
                src02 = __MM_LOAD_PS(pSrc2 + 0);
1372
0
                src11 = _mm_loadl_pi(t4, (const __m64*)(pSrc1 + 4));  // t4 is meaningless here
1373
0
                src12 = _mm_loadl_pi(t4, (const __m64*)(pSrc2 + 4));  // t4 is meaningless here
1374
1375
0
                dst0 = __MM_LERP_PS(t4, src01, src02);
1376
0
                dst1 = __MM_LERP_PS(t4, src11, src12);
1377
1378
0
                __MM_STORE_PS(pDst + 0, dst0);
1379
0
                _mm_storel_pi((__m64*)(pDst + 4), dst1);
1380
0
                break;
1381
1382
0
            case 1:
1383
                // 3 floating-point values
1384
0
                src01 = _mm_load_ss(pSrc1 + 2);
1385
0
                src02 = _mm_load_ss(pSrc2 + 2);
1386
0
                src01 = _mm_loadh_pi(src01, (const __m64*)(pSrc1 + 0));
1387
0
                src02 = _mm_loadh_pi(src02, (const __m64*)(pSrc2 + 0));
1388
1389
0
                dst0 = __MM_LERP_PS(t4, src01, src02);
1390
1391
0
                _mm_storeh_pi((__m64*)(pDst + 0), dst0);
1392
0
                _mm_store_ss(pDst + 2, dst0);
1393
0
                break;
1394
0
            }
1395
0
        }
1396
0
        else    // Should never occur, just in case of buggy drivers
1397
0
        {
1398
            // Assume all data unaligned
1399
1400
            // Morph 4 vertices per iteration. Specially designed to use as many
1401
            // available CPU registers as possible (7 registers used here),
1402
            // and to avoid temporary values being allocated on the stack, suppressing
1403
            // extra memory accesses.
1404
0
            for (size_t i = 0; i < numIterations; ++i)
1405
0
            {
1406
                // 12 floating-point values
1407
0
                src01 = _mm_loadu_ps(pSrc1 + 0);
1408
0
                src02 = _mm_loadu_ps(pSrc2 + 0);
1409
0
                src11 = _mm_loadu_ps(pSrc1 + 4);
1410
0
                src12 = _mm_loadu_ps(pSrc2 + 4);
1411
0
                src21 = _mm_loadu_ps(pSrc1 + 8);
1412
0
                src22 = _mm_loadu_ps(pSrc2 + 8);
1413
0
                pSrc1 += 12; pSrc2 += 12;
1414
1415
0
                dst0 = __MM_LERP_PS(t4, src01, src02);
1416
0
                dst1 = __MM_LERP_PS(t4, src11, src12);
1417
0
                dst2 = __MM_LERP_PS(t4, src21, src22);
1418
1419
0
                _mm_storeu_ps(pDst + 0, dst0);
1420
0
                _mm_storeu_ps(pDst + 4, dst1);
1421
0
                _mm_storeu_ps(pDst + 8, dst2);
1422
0
                pDst += 12;
1423
                
1424
0
            }
1425
1426
            // Morph remaining vertices
1427
0
            switch (numVerticesRemainder)
1428
0
            {
1429
0
            case 3:
1430
                // 9 floating-point values
1431
0
                src01 = _mm_loadu_ps(pSrc1 + 0);
1432
0
                src02 = _mm_loadu_ps(pSrc2 + 0);
1433
0
                src11 = _mm_loadu_ps(pSrc1 + 4);
1434
0
                src12 = _mm_loadu_ps(pSrc2 + 4);
1435
0
                src21 = _mm_load_ss(pSrc1 + 8);
1436
0
                src22 = _mm_load_ss(pSrc2 + 8);
1437
1438
0
                dst0 = __MM_LERP_PS(t4, src01, src02);
1439
0
                dst1 = __MM_LERP_PS(t4, src11, src12);
1440
0
                dst2 = __MM_LERP_SS(t4, src21, src22);
1441
1442
0
                _mm_storeu_ps(pDst + 0, dst0);
1443
0
                _mm_storeu_ps(pDst + 4, dst1);
1444
0
                _mm_store_ss(pDst + 8, dst2);
1445
0
                break;
1446
1447
0
            case 2:
1448
                // 6 floating-point values
1449
0
                src01 = _mm_loadu_ps(pSrc1 + 0);
1450
0
                src02 = _mm_loadu_ps(pSrc2 + 0);
1451
0
                src11 = _mm_loadl_pi(t4, (const __m64*)(pSrc1 + 4));  // t4 is meaningless here
1452
0
                src12 = _mm_loadl_pi(t4, (const __m64*)(pSrc2 + 4));  // t4 is meaningless here
1453
1454
0
                dst0 = __MM_LERP_PS(t4, src01, src02);
1455
0
                dst1 = __MM_LERP_PS(t4, src11, src12);
1456
1457
0
                _mm_storeu_ps(pDst + 0, dst0);
1458
0
                _mm_storel_pi((__m64*)(pDst + 4), dst1);
1459
0
                break;
1460
1461
0
            case 1:
1462
                // 3 floating-point values
1463
0
                src01 = _mm_load_ss(pSrc1 + 2);
1464
0
                src02 = _mm_load_ss(pSrc2 + 2);
1465
0
                src01 = _mm_loadh_pi(src01, (const __m64*)(pSrc1 + 0));
1466
0
                src02 = _mm_loadh_pi(src02, (const __m64*)(pSrc2 + 0));
1467
1468
0
                dst0 = __MM_LERP_PS(t4, src01, src02);
1469
1470
0
                _mm_storeh_pi((__m64*)(pDst + 0), dst0);
1471
0
                _mm_store_ss(pDst + 2, dst0);
1472
0
                break;
1473
0
            }
1474
            
1475
0
        }
1476
        
1477
0
        if (morphNormals)
1478
0
        {
1479
            
1480
            // Now we need to do an unaligned normalise on the normals data we just
1481
            // lerped; because normals are 3 elements each, they're always unaligned.
1482
0
            float *pNorm = pStartDst;
1483
            
1484
            // Offset past first position
1485
0
            pNorm += 3;
1486
            
1487
            // We'll do one normal each iteration, but still use SSE
1488
0
            for (size_t n = 0; n < numVertices; ++n)
1489
0
            {
1490
                // normalise function
1491
0
                __m128 norm;
1492
                
1493
                // load 3 floating-point normal values
1494
                // This loads into [0] and clears the rest
1495
0
                norm = _mm_load_ss(pNorm + 2);
1496
                // This loads into [2,3]. [1] is unused
1497
0
                norm = _mm_loadh_pi(norm, (__m64*)(pNorm + 0));
1498
                
1499
                // Fill a 4-vec with the squared
1500
                // vector length
1501
0
                __m128 tmp = _mm_mul_ps(norm, norm);
1502
                // Add - for this we want this effect:
1503
                // orig   3 | 2 | 1 | 0
1504
                // add1   0 | 0 | 0 | 2
1505
                // add2   2 | 3 | 0 | 3
1506
                // This way elements 0, 2 and 3 have the sum of all entries (except 1 which is unused)
1507
                
1508
0
                tmp = _mm_add_ps(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,0,0,2)));
1509
                // Add final combination & sqrt 
1510
                // elements 0, 2 and 3 will hold the length; element 1 is unused
1511
0
                tmp = _mm_add_ps(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,3,0,3)));
1512
                // Then divide to normalise
1513
0
                norm = _mm_div_ps(norm, _mm_sqrt_ps(tmp));
1514
                
1515
                // Store back in the same place
1516
0
                _mm_storeh_pi((__m64*)(pNorm + 0), norm);
1517
0
                _mm_store_ss(pNorm + 2, norm);
1518
                
1519
                // Skip to next vertex (3x normal components, 3x position components)
1520
0
                pNorm += 6;
1521
1522
                
1523
0
            }
1524
            
1525
1526
0
        }
1527
0
    }
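    // --- Editor's note: illustrative sketch, not part of the OGRE source. ---
    // The SIMD paths above are a vectorised form of the following scalar
    // reference (hypothetical helper, assuming the same packed layout of
    // 3 position floats optionally followed by 3 normal floats; needs <cmath>):
    //
    //     static void morphReference(float t, const float* a, const float* b,
    //                                float* dst, size_t numVertices, bool morphNormals)
    //     {
    //         const size_t floatsPerVertex = morphNormals ? 6 : 3;
    //         for (size_t i = 0; i < numVertices * floatsPerVertex; ++i)
    //             dst[i] = a[i] + t * (b[i] - a[i]);       // __MM_LERP_PS per lane
    //
    //         if (morphNormals)
    //         {
    //             for (size_t v = 0; v < numVertices; ++v) // renormalise each normal
    //             {
    //                 float* n = dst + v * 6 + 3;
    //                 float len = std::sqrt(n[0]*n[0] + n[1]*n[1] + n[2]*n[2]);
    //                 n[0] /= len; n[1] /= len; n[2] /= len;
    //             }
    //         }
    //     }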
1528
    //---------------------------------------------------------------------
1529
    void OptimisedUtilSSE::concatenateAffineMatrices(
1530
        const Affine3& baseMatrix,
1531
        const Affine3* pSrcMat,
1532
        Affine3* pDstMat,
1533
        size_t numMatrices)
1534
0
    {
1535
0
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1536
1537
0
        assert(_isAlignedForSSE(pSrcMat));
1538
0
        assert(_isAlignedForSSE(pDstMat));
1539
1540
        // Load base matrix, unaligned
1541
0
        __m128 m0 = _mm_loadu_ps(baseMatrix[0]);
1542
0
        __m128 m1 = _mm_loadu_ps(baseMatrix[1]);
1543
0
        __m128 m2 = _mm_loadu_ps(baseMatrix[2]);
1544
0
        __m128 m3 = _mm_loadu_ps(baseMatrix[3]);        // m3 should be equal to (0, 0, 0, 1)
1545
1546
0
        for (size_t i = 0; i < numMatrices; ++i)
1547
0
        {
1548
            // Load source matrix, aligned
1549
0
            __m128 s0 = __MM_LOAD_PS((*pSrcMat)[0]);
1550
0
            __m128 s1 = __MM_LOAD_PS((*pSrcMat)[1]);
1551
0
            __m128 s2 = __MM_LOAD_PS((*pSrcMat)[2]);
1552
1553
0
            ++pSrcMat;
1554
1555
0
            __m128 t0, t1, t2, t3;
1556
1557
            // Concatenate matrix, and store results
1558
1559
            // Row 0
1560
0
            t0 = _mm_mul_ps(__MM_SELECT(m0, 0), s0);
1561
0
            t1 = _mm_mul_ps(__MM_SELECT(m0, 1), s1);
1562
0
            t2 = _mm_mul_ps(__MM_SELECT(m0, 2), s2);
1563
0
            t3 = _mm_mul_ps(m0, m3);    // Compiler should optimise this out of the loop
1564
0
            __MM_STORE_PS((*pDstMat)[0], __MM_ACCUM4_PS(t0,t1,t2,t3));
1565
1566
            // Row 1
1567
0
            t0 = _mm_mul_ps(__MM_SELECT(m1, 0), s0);
1568
0
            t1 = _mm_mul_ps(__MM_SELECT(m1, 1), s1);
1569
0
            t2 = _mm_mul_ps(__MM_SELECT(m1, 2), s2);
1570
0
            t3 = _mm_mul_ps(m1, m3);    // Compiler should optimise this out of the loop
1571
0
            __MM_STORE_PS((*pDstMat)[1], __MM_ACCUM4_PS(t0,t1,t2,t3));
1572
1573
            // Row 2
1574
0
            t0 = _mm_mul_ps(__MM_SELECT(m2, 0), s0);
1575
0
            t1 = _mm_mul_ps(__MM_SELECT(m2, 1), s1);
1576
0
            t2 = _mm_mul_ps(__MM_SELECT(m2, 2), s2);
1577
0
            t3 = _mm_mul_ps(m2, m3);    // Compiler should optimise this out of the loop
1578
0
            __MM_STORE_PS((*pDstMat)[2], __MM_ACCUM4_PS(t0,t1,t2,t3));
1579
1580
0
            ++pDstMat;
1581
0
        }
1582
0
    }
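    // --- Editor's note: worked formula, not part of the OGRE source. ---
    // Each destination row computed above is the affine product base * src with
    // the rows treated as 4-component vectors:
    //
    //     dst[r] = base[r][0] * src[0] + base[r][1] * src[1]
    //            + base[r][2] * src[2] + base[r][3] * (0, 0, 0, 1)
    //
    // The last term is what the 't3 = _mm_mul_ps(m0/m1/m2, m3)' lines contribute:
    // since m3 is the implicit bottom row (0, 0, 0, 1), t3 only adds base[r][3]
    // to the translation (w) column of the result.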
1583
    //---------------------------------------------------------------------
1584
    void OptimisedUtilSSE::calculateFaceNormals(
1585
        const float *positions,
1586
        const EdgeData::Triangle *triangles,
1587
        Vector4 *faceNormals,
1588
        size_t numTriangles)
1589
0
    {
1590
0
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1591
1592
0
        assert(_isAlignedForSSE(faceNormals));
1593
1594
// Load Vector3 as: (x, 0, y, z)
1595
0
#define __LOAD_VECTOR3(p)   _mm_loadh_pi(_mm_load_ss(p), (const __m64*)((p)+1))
1596
1597
        // Mask used to change the sign of single precision floating point values.
1598
0
        OGRE_SIMD_ALIGNED_DECL(static const uint32, msSignMask[4]) =
1599
0
        {
1600
0
            0x80000000, 0x80000000, 0x80000000, 0x80000000,
1601
0
        };
1602
1603
0
        size_t numIterations = numTriangles / 4;
1604
0
        numTriangles &= 3;
1605
1606
        // Four triangles per-iteration
1607
0
        for (size_t i = 0; i < numIterations; ++i)
1608
0
        {
1609
1610
// Load four Vector3 as: (x0, x1, x2, x3), (y0, y1, y2, y3), (z0, z1, z2, z3)
1611
0
#define __LOAD_FOUR_VECTOR3(x, y, z, p0, p1, p2, p3)                    \
1612
0
            {                                                           \
1613
0
                __m128 v0 = __LOAD_VECTOR3(p0);     /* x0 -- y0 z0 */   \
1614
0
                __m128 v1 = __LOAD_VECTOR3(p1);     /* x1 -- y1 z1 */   \
1615
0
                __m128 v2 = __LOAD_VECTOR3(p2);     /* x2 -- y2 z2 */   \
1616
0
                __m128 v3 = __LOAD_VECTOR3(p3);     /* x3 -- y3 z3 */   \
1617
0
                __m128 t0, t1;                                          \
1618
0
                                                                        \
1619
0
                t0 = _mm_unpacklo_ps(v0, v2);       /* x0 x2 -- -- */   \
1620
0
                t1 = _mm_unpacklo_ps(v1, v3);       /* x1 x3 -- -- */   \
1621
0
                x  = _mm_unpacklo_ps(t0, t1);       /* x0 x1 x2 x3 */   \
1622
0
                                                                        \
1623
0
                t0 = _mm_unpackhi_ps(v0, v2);       /* y0 y2 z0 z2 */   \
1624
0
                t1 = _mm_unpackhi_ps(v1, v3);       /* y1 y3 z1 z3 */   \
1625
0
                y  = _mm_unpacklo_ps(t0, t1);       /* y0 y1 y2 y3 */   \
1626
0
                z  = _mm_unpackhi_ps(t0, t1);       /* z0 z1 z2 z3 */   \
1627
0
            }
1628
1629
0
            __m128 x0, x1, x2, y0, y1, y2, z0, z1, z2;
1630
1631
            // Load vertex 0 of four triangles, packed as component-major format: xxxx yyyy zzzz
1632
0
            __LOAD_FOUR_VECTOR3(x0, y0, z0,
1633
0
                positions + triangles[0].vertIndex[0] * 3,
1634
0
                positions + triangles[1].vertIndex[0] * 3,
1635
0
                positions + triangles[2].vertIndex[0] * 3,
1636
0
                positions + triangles[3].vertIndex[0] * 3);
1637
1638
            // Load vertex 1 of four triangles, packed as component-major format: xxxx yyyy zzzz
1639
0
            __LOAD_FOUR_VECTOR3(x1, y1, z1,
1640
0
                positions + triangles[0].vertIndex[1] * 3,
1641
0
                positions + triangles[1].vertIndex[1] * 3,
1642
0
                positions + triangles[2].vertIndex[1] * 3,
1643
0
                positions + triangles[3].vertIndex[1] * 3);
1644
1645
            // Load vertex 2 of four triangles, packed as component-major format: xxxx yyyy zzzz
1646
0
            __LOAD_FOUR_VECTOR3(x2, y2, z2,
1647
0
                positions + triangles[0].vertIndex[2] * 3,
1648
0
                positions + triangles[1].vertIndex[2] * 3,
1649
0
                positions + triangles[2].vertIndex[2] * 3,
1650
0
                positions + triangles[3].vertIndex[2] * 3);
1651
1652
0
            triangles += 4;
1653
1654
            // Calculate triangle face normals
1655
1656
            // a = v1 - v0
1657
0
            __m128 ax = _mm_sub_ps(x1, x0);
1658
0
            __m128 ay = _mm_sub_ps(y1, y0);
1659
0
            __m128 az = _mm_sub_ps(z1, z0);
1660
1661
            // b = v2 - v0
1662
0
            __m128 bx = _mm_sub_ps(x2, x0);
1663
0
            __m128 by = _mm_sub_ps(y2, y0);
1664
0
            __m128 bz = _mm_sub_ps(z2, z0);
1665
1666
            // n = a cross b
1667
0
            __m128 nx = _mm_sub_ps(_mm_mul_ps(ay, bz), _mm_mul_ps(az, by));
1668
0
            __m128 ny = _mm_sub_ps(_mm_mul_ps(az, bx), _mm_mul_ps(ax, bz));
1669
0
            __m128 nz = _mm_sub_ps(_mm_mul_ps(ax, by), _mm_mul_ps(ay, bx));
1670
1671
            // w = - (n dot v0)
1672
0
            __m128 nw = _mm_xor_ps(
1673
0
                __MM_DOT3x3_PS(nx, ny, nz, x0, y0, z0),
1674
0
                *(const __m128 *)&msSignMask);
1675
1676
            // Arrange to per-triangle face normal major format
1677
0
            __MM_TRANSPOSE4x4_PS(nx, ny, nz, nw);
1678
1679
            // Store results
1680
0
            __MM_STORE_PS(&faceNormals[0].x, nx);
1681
0
            __MM_STORE_PS(&faceNormals[1].x, ny);
1682
0
            __MM_STORE_PS(&faceNormals[2].x, nz);
1683
0
            __MM_STORE_PS(&faceNormals[3].x, nw);
1684
0
            faceNormals += 4;
1685
1686
0
#undef __LOAD_FOUR_VECTOR3
1687
0
        }
1688
1689
        // Dealing with remaining triangles
1690
0
        for (size_t j = 0; j < numTriangles; ++j)
1691
0
        {
1692
            // Load vertices of the triangle
1693
0
            __m128 v0 = __LOAD_VECTOR3(positions + triangles->vertIndex[0] * 3);
1694
0
            __m128 v1 = __LOAD_VECTOR3(positions + triangles->vertIndex[1] * 3);
1695
0
            __m128 v2 = __LOAD_VECTOR3(positions + triangles->vertIndex[2] * 3);
1696
0
            ++triangles;
1697
1698
            // Calculate face normal
1699
1700
0
            __m128 t0, t1;
1701
1702
0
            __m128 a = _mm_sub_ps(v1, v0);                      // ax 0 ay az
1703
0
            __m128 b = _mm_sub_ps(v2, v0);                      // bx 0 by bz
1704
0
            t0 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,0,1,3));    // az 0 ax ay
1705
0
            t1 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2,0,1,3));    // bz 0 bx by
1706
0
            t0 = _mm_mul_ps(t0, b);                             // az*bx 0 ax*by ay*bz
1707
0
            t1 = _mm_mul_ps(t1, a);                             // ax*bz 0 ay*bx az*by
1708
1709
0
            __m128 n = _mm_sub_ps(t0, t1);                      // ny 0  nz nx
1710
1711
0
            __m128 d = _mm_mul_ps(                              // dy 0  dz dx
1712
0
                _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(0,3,1,2)), n);
1713
1714
0
            n = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(               // nx ny nz -(dx+dy+dz)
1715
0
                _mm_shuffle_ps(n, n, _MM_SHUFFLE(1,2,0,3)),     // nx ny nz 0
1716
0
                _mm_shuffle_ps(d, d, _MM_SHUFFLE(3,1,1,1))),    // 0  0  0  dx
1717
0
                _mm_shuffle_ps(d, d, _MM_SHUFFLE(0,1,1,1))),    // 0  0  0  dy
1718
0
                _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,1,1,1)));    // 0  0  0  dz
1719
1720
            // Store result
1721
0
            __MM_STORE_PS(&faceNormals->x, n);
1722
0
            ++faceNormals;
1723
0
        }
1724
1725
0
#undef __LOAD_VECTOR3
1726
0
    }
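    // --- Editor's note: illustrative sketch, not part of the OGRE source. ---
    // Per triangle the routine computes the (unnormalised) plane equation
    //
    //     n = (v1 - v0) x (v2 - v0),      w = -(n . v0)
    //
    // so that every point p on the triangle's plane satisfies n . p + w = 0.
    // A scalar sketch of one triangle, using Ogre's Vector3/Vector4 types:
    //
    //     Vector3 a = v1 - v0, b = v2 - v0;
    //     Vector3 n = a.crossProduct(b);               // deliberately not normalised
    //     Vector4 faceNormal(n.x, n.y, n.z, -n.dotProduct(v0));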
1727
    //---------------------------------------------------------------------
1728
    void OptimisedUtilSSE::calculateLightFacing(
1729
        const Vector4& lightPos,
1730
        const Vector4* faceNormals,
1731
        char* lightFacings,
1732
        size_t numFaces)
1733
0
    {
1734
0
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1735
1736
0
        assert(_isAlignedForSSE(faceNormals));
1737
1738
        // Map to convert a 4-bit mask to 4 byte values
1739
0
        static const char msMaskMapping[16][4] =
1740
0
        {
1741
0
            {0, 0, 0, 0},   {1, 0, 0, 0},   {0, 1, 0, 0},   {1, 1, 0, 0},
1742
0
            {0, 0, 1, 0},   {1, 0, 1, 0},   {0, 1, 1, 0},   {1, 1, 1, 0},
1743
0
            {0, 0, 0, 1},   {1, 0, 0, 1},   {0, 1, 0, 1},   {1, 1, 0, 1},
1744
0
            {0, 0, 1, 1},   {1, 0, 1, 1},   {0, 1, 1, 1},   {1, 1, 1, 1},
1745
0
        };
1746
1747
0
        __m128 n0, n1, n2, n3;
1748
0
        __m128 t0, t1;
1749
0
        __m128 dp;
1750
0
        int bitmask;
1751
1752
        // Load light vector, unaligned
1753
0
        __m128 lp = _mm_loadu_ps(&lightPos.x);
1754
1755
        // Preload zero into a register for comparing the dot product values
1756
0
        __m128 zero = _mm_setzero_ps();
1757
1758
0
        size_t numIterations = numFaces / 4;
1759
0
        numFaces &= 3;
1760
1761
        // Four faces per-iteration
1762
0
        for (size_t i = 0; i < numIterations; ++i)
1763
0
        {
1764
            // Load face normals, aligned
1765
0
            n0 = __MM_LOAD_PS(&faceNormals[0].x);
1766
0
            n1 = __MM_LOAD_PS(&faceNormals[1].x);
1767
0
            n2 = __MM_LOAD_PS(&faceNormals[2].x);
1768
0
            n3 = __MM_LOAD_PS(&faceNormals[3].x);
1769
0
            faceNormals += 4;
1770
1771
            // Multiply by light vector
1772
0
            n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
1773
0
            n1 = _mm_mul_ps(n1, lp);        // x1 y1 z1 w1
1774
0
            n2 = _mm_mul_ps(n2, lp);        // x2 y2 z2 w2
1775
0
            n3 = _mm_mul_ps(n3, lp);        // x3 y3 z3 w3
1776
1777
            // Horizontal add four vector values.
1778
0
            t0 = _mm_add_ps(                                            // x0+z0 x1+z1 y0+w0 y1+w1
1779
0
                _mm_unpacklo_ps(n0, n1),    // x0 x1 y0 y1
1780
0
                _mm_unpackhi_ps(n0, n1));   // z0 z1 w0 w1
1781
0
            t1 = _mm_add_ps(                                            // x2+z2 x3+z3 y2+w2 y3+w3
1782
0
                _mm_unpacklo_ps(n2, n3),    // x2 x3 y2 y3
1783
0
                _mm_unpackhi_ps(n2, n3));   // z2 z3 w2 w3
1784
0
            dp = _mm_add_ps(                                            // dp0 dp1 dp2 dp3
1785
0
                _mm_movelh_ps(t0, t1),      // x0+z0 x1+z1 x2+z2 x3+z3
1786
0
                _mm_movehl_ps(t1, t0));     // y0+w0 y1+w1 y2+w2 y3+w3
1787
1788
            // Compare greater than zero and set up a 4-bit mask. Use '_mm_cmpnle_ps'
1790
            // instead of '_mm_cmpgt_ps' here because we want to keep 'zero' untouched,
1791
            // i.e. as the 2nd operand of the assembly instruction. In fact,
1792
            // '_mm_cmpgt_ps' was implemented as 'CMPLTPS' with the operands swapped
1793
            // in VC7.1.
1793
0
            bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1794
1795
            // Convert the 4-bit mask to 4 bytes, and store the results.
1796
            /*
1797
            *reinterpret_cast<uint32*>(lightFacings) =
1798
                *reinterpret_cast<const uint32*>(msMaskMapping[bitmask]);
1799
                */
1800
0
            memcpy(lightFacings, msMaskMapping[bitmask], sizeof(uint32));
1801
            
1802
            
1803
0
            lightFacings += 4;
1804
0
        }
1805
1806
        // Dealing with remaining faces
1807
0
        switch (numFaces)
1808
0
        {
1809
0
        case 3:
1810
0
            n0 = __MM_LOAD_PS(&faceNormals[0].x);
1811
0
            n1 = __MM_LOAD_PS(&faceNormals[1].x);
1812
0
            n2 = __MM_LOAD_PS(&faceNormals[2].x);
1813
1814
0
            n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
1815
0
            n1 = _mm_mul_ps(n1, lp);        // x1 y1 z1 w1
1816
0
            n2 = _mm_mul_ps(n2, lp);        // x2 y2 z2 w2
1817
1818
0
            t0 = _mm_add_ps(                                            // x0+z0 x1+z1 y0+w0 y1+w1
1819
0
                _mm_unpacklo_ps(n0, n1),    // x0 x1 y0 y1
1820
0
                _mm_unpackhi_ps(n0, n1));   // z0 z1 w0 w1
1821
0
            t1 = _mm_add_ps(                                            // x2+z2 x2+z2 y2+w2 y2+w2
1822
0
                _mm_unpacklo_ps(n2, n2),    // x2 x2 y2 y2
1823
0
                _mm_unpackhi_ps(n2, n2));   // z2 z2 w2 w2
1824
0
            dp = _mm_add_ps(                                            // dp0 dp1 dp2 dp2
1825
0
                _mm_movelh_ps(t0, t1),      // x0+z0 x1+z1 x2+z2 x2+z2
1826
0
                _mm_movehl_ps(t1, t0));     // y0+w0 y1+w1 y2+w2 y2+w2
1827
1828
0
            bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1829
1830
0
            lightFacings[0] = msMaskMapping[bitmask][0];
1831
0
            lightFacings[1] = msMaskMapping[bitmask][1];
1832
0
            lightFacings[2] = msMaskMapping[bitmask][2];
1833
0
            break;
1834
1835
0
        case 2:
1836
0
            n0 = __MM_LOAD_PS(&faceNormals[0].x);
1837
0
            n1 = __MM_LOAD_PS(&faceNormals[1].x);
1838
1839
0
            n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
1840
0
            n1 = _mm_mul_ps(n1, lp);        // x1 y1 z1 w1
1841
1842
0
            t0 = _mm_add_ps(                                            // x0+z0 x1+z1 y0+w0 y1+w1
1843
0
                _mm_unpacklo_ps(n0, n1),    // x0 x1 y0 y1
1844
0
                _mm_unpackhi_ps(n0, n1));   // z0 z1 w0 w1
1845
0
            dp = _mm_add_ps(                                            // dp0 dp1 dp0 dp1
1846
0
                _mm_movelh_ps(t0, t0),      // x0+z0 x1+z1 x0+z0 x1+z1
1847
0
                _mm_movehl_ps(t0, t0));     // y0+w0 y1+w1 y0+w0 y1+w1
1848
1849
0
            bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1850
1851
0
            lightFacings[0] = msMaskMapping[bitmask][0];
1852
0
            lightFacings[1] = msMaskMapping[bitmask][1];
1853
0
            break;
1854
1855
0
        case 1:
1856
0
            n0 = __MM_LOAD_PS(&faceNormals[0].x);
1857
1858
0
            n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
1859
1860
0
            t0 = _mm_add_ps(                                            // x0+z0 x0+z0 y0+w0 y0+w0
1861
0
                _mm_unpacklo_ps(n0, n0),    // x0 x0 y0 y0
1862
0
                _mm_unpackhi_ps(n0, n0));   // z0 z0 w0 w0
1863
0
            dp = _mm_add_ps(                                            // dp0 dp0 dp0 dp0
1864
0
                _mm_movelh_ps(t0, t0),      // x0+z0 x0+z0 x0+z0 x0+z0
1865
0
                _mm_movehl_ps(t0, t0));     // y0+w0 y0+w0 y0+w0 y0+w0
1866
1867
0
            bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1868
1869
0
            lightFacings[0] = msMaskMapping[bitmask][0];
1870
0
            break;
1871
0
        }
1872
0
    }
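    // --- Editor's note: illustrative sketch, not part of the OGRE source. ---
    // For each face the test reduces to the sign of a 4-component dot product
    // between the face's plane equation (nx, ny, nz, w) and the light position:
    // the face counts as light-facing when the result is greater than zero.
    // Scalar sketch for one face:
    //
    //     float dp = fn.x * lightPos.x + fn.y * lightPos.y
    //              + fn.z * lightPos.z + fn.w * lightPos.w;
    //     *lightFacings++ = (dp > 0.0f) ? 1 : 0;
    //
    // The SIMD path batches four such dot products, collapses the four compare
    // results into a 4-bit mask with _mm_movemask_ps, and expands that mask back
    // into four bytes via the msMaskMapping lookup table.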
1873
    //---------------------------------------------------------------------
1874
    // Template to extrude vertices for directional light.
1875
    template <bool srcAligned, bool destAligned>
1876
    struct ExtrudeVertices_SSE_DirectionalLight
1877
    {
1878
        static void apply(
1879
            const Vector4& lightPos,
1880
            Real extrudeDist,
1881
            const float* pSrcPos,
1882
            float* pDestPos,
1883
            size_t numVertices)
1884
0
        {
1885
0
            typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
1886
0
            typedef SSEMemoryAccessor<destAligned> DestAccessor;
1887
1888
            // Directional light, extrusion is along light direction
1889
1890
            // Load light vector, unaligned
1891
0
            __m128 lp = _mm_loadu_ps(&lightPos.x);
1892
1893
            // Calculate the extrusion direction. Note that we use the inverted direction
1895
            // here to eliminate an extra negate instruction; we'll compensate for that
1896
            // by using a subtract instruction later.
1896
0
            __m128 tmp = _mm_mul_ps(lp, lp);
1897
0
            tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)), _mm_movehl_ps(tmp, tmp));
1898
            // VC7.1 appears to generate somewhat inefficient code for 'rsqrtss', so use 'rsqrtps' instead
1899
0
            tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), _mm_load_ss(&extrudeDist));
1900
0
            __m128 dir = _mm_mul_ps(lp, __MM_SELECT(tmp, 0));               // X Y Z -
1901
1902
            // Prepare the extrusion direction for extruding 4 vertices in parallel
1903
0
            __m128 dir0 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(0,2,1,0));   // X Y Z X
1904
0
            __m128 dir1 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(1,0,2,1));   // Y Z X Y
1905
0
            __m128 dir2 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(2,1,0,2));   // Z X Y Z
1906
1907
0
            __m128 s0, s1, s2;
1908
0
            __m128 d0, d1, d2;
1909
1910
0
            size_t numIterations = numVertices / 4;
1911
0
            numVertices &= 3;
1912
1913
            // Extruding 4 vertices per-iteration
1914
0
            for (size_t i = 0; i < numIterations; ++i)
1915
0
            {
1916
0
                s0 = SrcAccessor::load(pSrcPos + 0);
1917
0
                s1 = SrcAccessor::load(pSrcPos + 4);
1918
0
                s2 = SrcAccessor::load(pSrcPos + 8);
1919
0
                pSrcPos += 12;
1920
1921
                // The extrusion direction is inverted, use subtract instruction here
1922
0
                d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
1923
0
                d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 X2 Y2
1924
0
                d2 = _mm_sub_ps(s2, dir2);                      // Z2 X3 Y3 Z3
1925
1926
0
                DestAccessor::store(pDestPos + 0, d0);
1927
0
                DestAccessor::store(pDestPos + 4, d1);
1928
0
                DestAccessor::store(pDestPos + 8, d2);
1929
0
                pDestPos += 12;
1930
0
            }
1931
1932
            // Dealing with remaining vertices
1933
0
            switch (numVertices)
1934
0
            {
1935
0
            case 3:
1936
                // 9 floating-point values
1937
0
                s0 = SrcAccessor::load(pSrcPos + 0);
1938
0
                s1 = SrcAccessor::load(pSrcPos + 4);
1939
0
                s2 = _mm_load_ss(pSrcPos + 8);
1940
1941
                // The extrusion direction is inverted, use subtract instruction here
1942
0
                d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
1943
0
                d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 X2 Y2
1944
0
                d2 = _mm_sub_ss(s2, dir2);                      // Z2 -- -- --
1945
1946
0
                DestAccessor::store(pDestPos + 0, d0);
1947
0
                DestAccessor::store(pDestPos + 4, d1);
1948
0
                _mm_store_ss(pDestPos + 8, d2);
1949
0
                break;
1950
1951
0
            case 2:
1952
                // 6 floating-point values
1953
0
                s0 = SrcAccessor::load(pSrcPos + 0);
1954
0
                s1 = _mm_loadl_pi(dir1, (const __m64*)(pSrcPos + 4)); // dir1 is meaningless here
1955
1956
                // The extrusion direction is inverted, use subtract instruction here
1957
0
                d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
1958
0
                d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 -- --
1959
1960
0
                DestAccessor::store(pDestPos + 0, d0);
1961
0
                _mm_storel_pi((__m64*)(pDestPos + 4), d1);
1962
0
                break;
1963
1964
0
            case 1:
1965
                // 3 floating-point values
1966
0
                s0 = _mm_loadl_pi(dir0, (const __m64*)(pSrcPos + 0)); // dir0 is meaningless here
1967
0
                s1 = _mm_load_ss(pSrcPos + 2);
1968
1969
                // The extrusion direction is inverted, use subtract instruction here
1970
0
                d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 -- --
1971
0
                d1 = _mm_sub_ss(s1, dir2);                      // Z0 -- -- --
1972
1973
0
                _mm_storel_pi((__m64*)(pDestPos + 0), d0);
1974
0
                _mm_store_ss(pDestPos + 2, d1);
1975
0
                break;
1976
0
            }
1977
0
        }
Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_DirectionalLight<true, true>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long)
Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_DirectionalLight<true, false>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long)
Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_DirectionalLight<false, true>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long)
Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_DirectionalLight<false, false>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long)
1978
    };
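    // --- Editor's note: illustrative sketch, not part of the OGRE source. ---
    // Scalar form of the directional-light extrusion above, for one vertex p and
    // light vector l = (lx, ly, lz) (needs <cmath>):
    //
    //     float inv = extrudeDist / std::sqrt(lx*lx + ly*ly + lz*lz); // rsqrt in SIMD
    //     out.x = p.x - lx * inv;      // subtract, because the direction was
    //     out.y = p.y - ly * inv;      // deliberately left inverted above
    //     out.z = p.z - lz * inv;
    //
    // The dir0/dir1/dir2 shuffles merely pre-rotate (X Y Z) into the phases
    // X Y Z X / Y Z X Y / Z X Y Z so that a packed run of four 3-float positions
    // can be extruded with three plain subtracts per iteration.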
1979
    //---------------------------------------------------------------------
1980
    // Template to extrude vertices for point light.
1981
    template <bool srcAligned, bool destAligned>
1982
    struct ExtrudeVertices_SSE_PointLight
1983
    {
1984
        static void apply(
1985
            const Vector4& lightPos,
1986
            Real extrudeDist,
1987
            const float* pSrcPos,
1988
            float* pDestPos,
1989
            size_t numVertices)
1990
0
        {
1991
0
            typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
1992
0
            typedef SSEMemoryAccessor<destAligned> DestAccessor;
1993
1994
            // Point light: the extrusion direction is calculated for every vertex
1995
1996
            // Load light vector, unaligned
1997
0
            __m128 lp = _mm_loadu_ps(&lightPos.x);
1998
1999
            // Load extrude distance
2000
0
            __m128 extrudeDist4 = _mm_load_ps1(&extrudeDist);
2001
2002
0
            size_t numIterations = numVertices / 4;
2003
0
            numVertices &= 3;
2004
2005
            // Extruding 4 vertices per-iteration
2006
0
            for (size_t i = 0; i < numIterations; ++i)
2007
0
            {
2008
                // Load source positions
2009
0
                __m128 s0 = SrcAccessor::load(pSrcPos + 0);     // x0 y0 z0 x1
2010
0
                __m128 s1 = SrcAccessor::load(pSrcPos + 4);     // y1 z1 x2 y2
2011
0
                __m128 s2 = SrcAccessor::load(pSrcPos + 8);     // z2 x3 y3 z3
2012
0
                pSrcPos += 12;
2013
2014
                // Arrange to 3x4 component-major format for batch calculation
2015
0
                __MM_TRANSPOSE4x3_PS(s0, s1, s2);
2016
2017
                // Calculate unnormalised extrusion direction
2018
0
                __m128 dx = _mm_sub_ps(s0, __MM_SELECT(lp, 0)); // X0 X1 X2 X3
2019
0
                __m128 dy = _mm_sub_ps(s1, __MM_SELECT(lp, 1)); // Y0 Y1 Y2 Y3
2020
0
                __m128 dz = _mm_sub_ps(s2, __MM_SELECT(lp, 2)); // Z0 Z1 Z2 Z3
2021
2022
                // Normalise extrusion direction and multiply by extrude distance
2023
0
                __m128 tmp = __MM_DOT3x3_PS(dx, dy, dz, dx, dy, dz);
2024
0
                tmp = _mm_mul_ps(_mm_rsqrt_ps(tmp), extrudeDist4);
2025
0
                dx = _mm_mul_ps(dx, tmp);
2026
0
                dy = _mm_mul_ps(dy, tmp);
2027
0
                dz = _mm_mul_ps(dz, tmp);
2028
2029
                // Calculate extruded positions
2030
0
                __m128 d0 = _mm_add_ps(dx, s0);
2031
0
                __m128 d1 = _mm_add_ps(dy, s1);
2032
0
                __m128 d2 = _mm_add_ps(dz, s2);
2033
2034
                // Arrange back to 4x3 continuous format to store the results
2035
0
                __MM_TRANSPOSE3x4_PS(d0, d1, d2);
2036
2037
                // Store extruded positions
2038
0
                DestAccessor::store(pDestPos + 0, d0);
2039
0
                DestAccessor::store(pDestPos + 4, d1);
2040
0
                DestAccessor::store(pDestPos + 8, d2);
2041
0
                pDestPos += 12;
2042
0
            }
2043
2044
            // Dealing with remaining vertices
2045
0
            for (size_t j = 0; j  < numVertices; ++j)
2046
0
            {
2047
                // Load source position
2048
0
                __m128 src = _mm_loadh_pi(_mm_load_ss(pSrcPos + 0), (const __m64*)(pSrcPos + 1)); // x 0 y z
2049
0
                pSrcPos += 3;
2050
2051
                // Calculate unnormalised extrusion direction
2052
0
                __m128 dir = _mm_sub_ps(src, _mm_shuffle_ps(lp, lp, _MM_SHUFFLE(2,1,3,0))); // X 1 Y Z
2053
2054
                // Normalise extrusion direction and multiply by extrude distance
2055
0
                __m128 tmp = _mm_mul_ps(dir, dir);
2056
0
                tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_movehl_ps(tmp, tmp)), _mm_shuffle_ps(tmp, tmp, 3));
2057
                // VC7.1 appears to generate somewhat inefficient code for 'rsqrtss', so use 'rsqrtps' instead
2058
0
                tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), extrudeDist4);
2059
0
                dir = _mm_mul_ps(dir, __MM_SELECT(tmp, 0));
2060
2061
                // Calculate extruded position
2062
0
                __m128 dst = _mm_add_ps(dir, src);
2063
2064
                // Store extruded position
2065
0
                _mm_store_ss(pDestPos + 0, dst);
2066
0
                _mm_storeh_pi((__m64*)(pDestPos + 1), dst);
2067
0
                pDestPos += 3;
2068
0
            }
2069
0
        }
Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_PointLight<true, true>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long)
Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_PointLight<true, false>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long)
Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_PointLight<false, true>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long)
Unexecuted instantiation: Ogre::ExtrudeVertices_SSE_PointLight<false, false>::apply(Ogre::Vector<4, float> const&, float, float const*, float*, unsigned long)
2070
    };
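    // --- Editor's note: illustrative sketch, not part of the OGRE source. ---
    // Scalar form of the point-light extrusion above, for one vertex p and light
    // position l (needs <cmath>):
    //
    //     float dx = p.x - l.x, dy = p.y - l.y, dz = p.z - l.z;       // away from light
    //     float inv = extrudeDist / std::sqrt(dx*dx + dy*dy + dz*dz); // rsqrt in SIMD
    //     out.x = p.x + dx * inv;
    //     out.y = p.y + dy * inv;
    //     out.z = p.z + dz * inv;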
2071
    //---------------------------------------------------------------------
2072
    void OptimisedUtilSSE::extrudeVertices(
2073
        const Vector4& lightPos,
2074
        Real extrudeDist,
2075
        const float* pSrcPos,
2076
        float* pDestPos,
2077
        size_t numVertices)
2078
0
    {
2079
0
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
2080
2081
        // Note: Since pDestPos follows the tail of pSrcPos, we can't assume it is
2082
        // properly SIMD-aligned, so we must check for that here.
2083
        //
2084
        // TODO: Add an extra vertex to the vertex buffer to make sure pDestPos is
2085
        // aligned the same way as pSrcPos.
2086
        //
2087
2088
        // We use the SSE reciprocal square root directly while calculating the
2089
        // extrusion direction, since the precision loss is not that important here.
2090
        //
2091
0
        if (lightPos.w == 0.0f)
2092
0
        {
2093
0
            if (_isAlignedForSSE(pSrcPos))
2094
0
            {
2095
0
                if (_isAlignedForSSE(pDestPos))
2096
0
                    ExtrudeVertices_SSE_DirectionalLight<true, true>::apply(
2097
0
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2098
0
                else
2099
0
                    ExtrudeVertices_SSE_DirectionalLight<true, false>::apply(
2100
0
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2101
0
            }
2102
0
            else
2103
0
            {
2104
0
                if (_isAlignedForSSE(pDestPos))
2105
0
                    ExtrudeVertices_SSE_DirectionalLight<false, true>::apply(
2106
0
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2107
0
                else
2108
0
                    ExtrudeVertices_SSE_DirectionalLight<false, false>::apply(
2109
0
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2110
0
            }
2111
0
        }
2112
0
        else
2113
0
        {
2114
0
            assert(lightPos.w == 1.0f);
2115
2116
0
            if (_isAlignedForSSE(pSrcPos))
2117
0
            {
2118
0
                if (_isAlignedForSSE(pDestPos))
2119
0
                    ExtrudeVertices_SSE_PointLight<true, true>::apply(
2120
0
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2121
0
                else
2122
0
                    ExtrudeVertices_SSE_PointLight<true, false>::apply(
2123
0
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2124
0
            }
2125
0
            else
2126
0
            {
2127
0
                if (_isAlignedForSSE(pDestPos))
2128
0
                    ExtrudeVertices_SSE_PointLight<false, true>::apply(
2129
0
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2130
0
                else
2131
0
                    ExtrudeVertices_SSE_PointLight<false, false>::apply(
2132
0
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2133
0
            }
2134
0
        }
2135
0
    }
2136
    //---------------------------------------------------------------------
2137
    //---------------------------------------------------------------------
2138
    //---------------------------------------------------------------------
2139
    extern OptimisedUtil* _getOptimisedUtilSSE(void);
2140
    extern OptimisedUtil* _getOptimisedUtilSSE(void)
2141
2
    {
2142
2
        static OptimisedUtilSSE msOptimisedUtilSSE;
2143
#if defined(__OGRE_SIMD_ALIGN_STACK)
2144
        static OptimisedUtilWithStackAlign msOptimisedUtilWithStackAlign(&msOptimisedUtilSSE);
2145
        return &msOptimisedUtilWithStackAlign;
2146
#else
2147
2
        return &msOptimisedUtilSSE;
2148
2
#endif
2149
2
    }
2150
2151
}
2152
2153
#endif // __OGRE_HAVE_SSE || __OGRE_HAVE_NEON