Coverage Report

Created: 2023-12-08 06:53

/src/freeimage-svn/FreeImage/trunk/Source/OpenEXR/IlmImf/ImfOptimizedPixelReading.h
Line
Count
Source (jump to first uncovered line)
1
///////////////////////////////////////////////////////////////////////////
2
//
3
// Copyright (c) 2012, Autodesk, Inc.
4
// 
5
// All rights reserved.
6
//
7
// Implementation of IIF-specific file format and speed optimizations 
8
// provided by Innobec Technologies inc on behalf of Autodesk.
9
// 
10
// Redistribution and use in source and binary forms, with or without
11
// modification, are permitted provided that the following conditions are
12
// met:
13
// *       Redistributions of source code must retain the above copyright
14
// notice, this list of conditions and the following disclaimer.
15
// *       Redistributions in binary form must reproduce the above
16
// copyright notice, this list of conditions and the following disclaimer
17
// in the documentation and/or other materials provided with the
18
// distribution.
19
// *       Neither the name of Industrial Light & Magic nor the names of
20
// its contributors may be used to endorse or promote products derived
21
// from this software without specific prior written permission. 
22
// 
23
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
//
35
///////////////////////////////////////////////////////////////////////////
36
37
#pragma once
38
39
#ifndef INCLUDED_IMF_OPTIMIZED_PIXEL_READING_H
40
#define INCLUDED_IMF_OPTIMIZED_PIXEL_READING_H
41
42
#include "ImfSimd.h"
43
#include "ImfSystemSpecific.h"
44
#include <iostream>
45
#include "ImfChannelList.h"
46
#include "ImfFrameBuffer.h"
47
#include "ImfStringVectorAttribute.h"
48
49
OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER
50
51
//
// Small value type holding the two settings kept together in this header:
// whether the optimized path may be used, and a y sampling value.
//
class OptimizationMode
{
public:

    // Set to false by the default constructor.
    // NOTE(review): presumably "true" means the SSE code below may be used;
    // confirm against the code that fills this structure in.
    bool _optimizable;

    // NOTE(review): presumably the vertical sampling rate of the channels
    // being read; confirm against the caller.
    int _ySampling;

    //
    // Both members get a defined value so that reading _ySampling before it
    // has been assigned is not undefined behaviour.
    // NOTE(review): 1 was picked as a "no subsampling" default; confirm.
    //
    OptimizationMode() : _optimizable(false), _ySampling(1) {}

};
61
62
63
#if IMF_HAVE_SSE2
64
65
66
//------------------------------------------------------------------------
67
// Test for SSE pointer alignment
68
//------------------------------------------------------------------------
69
//
// Returns true when pPointer is a multiple of 16, i.e. its four low
// address bits are all zero.
//
EXR_FORCEINLINE
bool
isPointerSSEAligned (const void* EXR_RESTRICT pPointer)
{
    //
    // The address is converted to size_t rather than unsigned long:
    // unsigned long is only 32 bits wide on 64-bit Windows, where casting
    // a pointer to it truncates (and is rejected by some compilers).
    //
    const size_t address = reinterpret_cast<size_t> (pPointer);
    return (address & 15) == 0;
}
76
77
//------------------------------------------------------------------------
78
// Load SSE from address into register
79
//------------------------------------------------------------------------
80
//
// Reads 128 bits from loadAddress.  IS_ALIGNED is a compile-time constant,
// so only one of the two loads survives in each instantiation:
//   IS_ALIGNED == true  : _mm_load_si128  (address must be 16-byte aligned)
//   IS_ALIGNED == false : _mm_loadu_si128 (any address)
// loadAddress itself is not modified.
//
template<bool IS_ALIGNED>
EXR_FORCEINLINE
__m128i loadSSE (__m128i*& loadAddress)
{
    return IS_ALIGNED ? _mm_load_si128  (loadAddress)
                      : _mm_loadu_si128 (loadAddress);
}
101
102
//------------------------------------------------------------------------
103
// Store SSE from register into address
104
//------------------------------------------------------------------------
105
//
// Writes the 128 bits of dataToStore to storeAddress.  IS_ALIGNED is a
// compile-time constant, so only one of the two stores survives in each
// instantiation:
//   IS_ALIGNED == true  : _mm_stream_si128 (address must be 16-byte aligned)
//   IS_ALIGNED == false : _mm_storeu_si128 (any address)
// storeAddress itself is not modified; callers advance it themselves.
//
template<bool IS_ALIGNED>
EXR_FORCEINLINE
void storeSSE (__m128i*& storeAddress, __m128i& dataToStore)
{
    if (IS_ALIGNED)
        _mm_stream_si128 (storeAddress, dataToStore);
    else
        _mm_storeu_si128 (storeAddress, dataToStore);
}
127
128
129
130
//------------------------------------------------------------------------
131
//
132
// Write to RGBA
133
//
134
//------------------------------------------------------------------------
135
136
//
137
// Using SSE intrinsics
138
//
139
template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
140
EXR_FORCEINLINE 
141
void writeToRGBASSETemplate 
142
    (__m128i*& readPtrSSERed,
143
     __m128i*& readPtrSSEGreen,
144
     __m128i*& readPtrSSEBlue,
145
     __m128i*& readPtrSSEAlpha,
146
     __m128i*& writePtrSSE,
147
     const size_t& lPixelsToCopySSE)
148
0
{
149
0
    for (size_t i = 0; i < lPixelsToCopySSE; ++i)
150
0
    {
151
0
        __m128i redRegister   = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
152
0
        __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
153
0
        __m128i blueRegister  = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);
154
0
        __m128i alphaRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEAlpha);
155
156
0
        __m128i redGreenRegister  = _mm_unpacklo_epi16 (redRegister,
157
0
                                                        greenRegister);
158
0
        __m128i blueAlphaRegister = _mm_unpacklo_epi16 (blueRegister,
159
0
                                                        alphaRegister);
160
161
0
        __m128i pixel12Register   = _mm_unpacklo_epi32 (redGreenRegister,
162
0
                                                        blueAlphaRegister);
163
0
        __m128i pixel34Register   = _mm_unpackhi_epi32 (redGreenRegister,
164
0
                                                        blueAlphaRegister);
165
166
0
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
167
0
        ++writePtrSSE;
168
169
0
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
170
0
        ++writePtrSSE;
171
172
0
        redGreenRegister  = _mm_unpackhi_epi16 (redRegister, greenRegister);
173
0
        blueAlphaRegister = _mm_unpackhi_epi16 (blueRegister, alphaRegister);
174
175
0
        pixel12Register   = _mm_unpacklo_epi32 (redGreenRegister,
176
0
                                                blueAlphaRegister);
177
0
        pixel34Register   = _mm_unpackhi_epi32 (redGreenRegister,
178
0
                                                blueAlphaRegister);
179
180
0
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
181
0
        ++writePtrSSE;
182
        
183
0
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
184
0
        ++writePtrSSE;
185
186
0
        ++readPtrSSEAlpha;
187
0
        ++readPtrSSEBlue;
188
0
        ++readPtrSSEGreen;
189
0
        ++readPtrSSERed;
190
0
    }
191
0
}
Unexecuted instantiation: void Imf_2_2::writeToRGBASSETemplate<false, false>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&)
Unexecuted instantiation: void Imf_2_2::writeToRGBASSETemplate<false, true>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&)
Unexecuted instantiation: void Imf_2_2::writeToRGBASSETemplate<true, false>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&)
Unexecuted instantiation: void Imf_2_2::writeToRGBASSETemplate<true, true>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&)
192
193
//
194
// Not using SSE intrinsics.  This is still faster than the alternative
195
// because we have multiple read pointers and therefore we are able to
196
// take advantage of data locality for write operations.
197
//
198
//
// Scalar interleave: for each of lPixelsToCopy pixels, copies one sample
// from red, green, blue and alpha (in that order) to consecutive output
// slots.  Every pointer is a reference and ends up advanced past the
// samples it read or wrote.
//
EXR_FORCEINLINE
void writeToRGBANormal (unsigned short*& readPtrRed,
                        unsigned short*& readPtrGreen,
                        unsigned short*& readPtrBlue,
                        unsigned short*& readPtrAlpha,
                        unsigned short*& writePtr,
                        const size_t& lPixelsToCopy)
{
    size_t pixel = 0;

    while (pixel != lPixelsToCopy)
    {
        writePtr[0] = *readPtrRed;
        ++readPtrRed;

        writePtr[1] = *readPtrGreen;
        ++readPtrGreen;

        writePtr[2] = *readPtrBlue;
        ++readPtrBlue;

        writePtr[3] = *readPtrAlpha;
        ++readPtrAlpha;

        writePtr += 4;
        ++pixel;
    }
}
214
215
//
216
// Determine which (template) version to use by checking whether pointers
217
// are aligned
218
//
219
//
// Interleaves four planar 16-bit channels into RGBA.
//
// First runs writeToRGBASSETemplate for pixelsToCopySSE iterations, using
// the instantiation that matches the 16-byte alignment of the pointers
// (the read side counts as aligned only if all four channel pointers are),
// then runs writeToRGBANormal for pixelsToCopyNormal further pixels.
// All five pointers are references and are left advanced past everything
// that was read or written.
//
EXR_FORCEINLINE 
void optimizedWriteToRGBA (unsigned short*& readPtrRed,
                           unsigned short*& readPtrGreen,
                           unsigned short*& readPtrBlue,
                           unsigned short*& readPtrAlpha,
                           unsigned short*& writePtr,
                           const size_t& pixelsToCopySSE,
                           const size_t& pixelsToCopyNormal)
{
    const bool readPtrAreAligned = isPointerSSEAligned (readPtrRed)   &&
                                   isPointerSSEAligned (readPtrGreen) &&
                                   isPointerSSEAligned (readPtrBlue)  &&
                                   isPointerSSEAligned (readPtrAlpha);

    const bool writePtrIsAligned = isPointerSSEAligned (writePtr);

    //
    // The SSE routine advances its pointers through __m128i*& parameters.
    // Hand it properly typed local pointers instead of reinterpreting the
    // caller's unsigned short* objects as __m128i* objects, which would
    // break the aliasing rules; the advanced positions are copied back
    // below.
    //
    __m128i* redSSE   = reinterpret_cast<__m128i*> (readPtrRed);
    __m128i* greenSSE = reinterpret_cast<__m128i*> (readPtrGreen);
    __m128i* blueSSE  = reinterpret_cast<__m128i*> (readPtrBlue);
    __m128i* alphaSSE = reinterpret_cast<__m128i*> (readPtrAlpha);
    __m128i* writeSSE = reinterpret_cast<__m128i*> (writePtr);

    if (readPtrAreAligned)
    {
        if (writePtrIsAligned)
        {
            writeToRGBASSETemplate<true, true> (redSSE, greenSSE, blueSSE,
                                                alphaSSE, writeSSE,
                                                pixelsToCopySSE);
        }
        else
        {
            writeToRGBASSETemplate<true, false> (redSSE, greenSSE, blueSSE,
                                                 alphaSSE, writeSSE,
                                                 pixelsToCopySSE);
        }
    }
    else
    {
        if (writePtrIsAligned)
        {
            writeToRGBASSETemplate<false, true> (redSSE, greenSSE, blueSSE,
                                                 alphaSSE, writeSSE,
                                                 pixelsToCopySSE);
        }
        else
        {
            writeToRGBASSETemplate<false, false> (redSSE, greenSSE, blueSSE,
                                                  alphaSSE, writeSSE,
                                                  pixelsToCopySSE);
        }
    }

    // Publish how far the SSE pass got before the scalar pass continues.
    readPtrRed   = reinterpret_cast<unsigned short*> (redSSE);
    readPtrGreen = reinterpret_cast<unsigned short*> (greenSSE);
    readPtrBlue  = reinterpret_cast<unsigned short*> (blueSSE);
    readPtrAlpha = reinterpret_cast<unsigned short*> (alphaSSE);
    writePtr     = reinterpret_cast<unsigned short*> (writeSSE);

    writeToRGBANormal (readPtrRed, readPtrGreen, readPtrBlue, readPtrAlpha,
                       writePtr, pixelsToCopyNormal);
}
277
278
279
280
//------------------------------------------------------------------------
281
//
282
// Write to RGBA Fill A
283
//
284
//------------------------------------------------------------------------
285
286
//
287
// Using SSE intrinsics
288
//
289
template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
290
EXR_FORCEINLINE 
291
void
292
writeToRGBAFillASSETemplate (__m128i*& readPtrSSERed,
293
                             __m128i*& readPtrSSEGreen,
294
                             __m128i*& readPtrSSEBlue,
295
                             const unsigned short& alphaFillValue,
296
                             __m128i*& writePtrSSE,
297
                             const size_t& pixelsToCopySSE)
298
0
{
299
0
    const __m128i dummyAlphaRegister = _mm_set_epi16 (alphaFillValue,
300
0
                                                      alphaFillValue,
301
0
                                                      alphaFillValue,
302
0
                                                      alphaFillValue,
303
0
                                                      alphaFillValue,
304
0
                                                      alphaFillValue,
305
0
                                                      alphaFillValue,
306
0
                                                      alphaFillValue);
307
308
0
    for (size_t pixelCounter = 0; pixelCounter < pixelsToCopySSE; ++pixelCounter)
309
0
    {
310
0
        __m128i redRegister   = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
311
0
        __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
312
0
        __m128i blueRegister  = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);
313
314
0
        __m128i redGreenRegister  = _mm_unpacklo_epi16 (redRegister,
315
0
                                                        greenRegister);
316
0
        __m128i blueAlphaRegister = _mm_unpacklo_epi16 (blueRegister,
317
0
                                                        dummyAlphaRegister);
318
319
0
        __m128i pixel12Register   = _mm_unpacklo_epi32 (redGreenRegister,
320
0
                                                        blueAlphaRegister);
321
0
        __m128i pixel34Register   = _mm_unpackhi_epi32 (redGreenRegister,
322
0
                                                        blueAlphaRegister);
323
324
0
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
325
0
        ++writePtrSSE;
326
327
0
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
328
0
        ++writePtrSSE;
329
330
0
        redGreenRegister  = _mm_unpackhi_epi16 (redRegister,
331
0
                                                greenRegister);
332
0
        blueAlphaRegister = _mm_unpackhi_epi16 (blueRegister,
333
0
                                                dummyAlphaRegister);
334
335
0
        pixel12Register   = _mm_unpacklo_epi32 (redGreenRegister,
336
0
                                                blueAlphaRegister);
337
0
        pixel34Register   = _mm_unpackhi_epi32 (redGreenRegister,
338
0
                                                blueAlphaRegister);
339
340
0
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
341
0
        ++writePtrSSE;
342
343
0
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
344
0
        ++writePtrSSE;
345
346
0
        ++readPtrSSEBlue;
347
0
        ++readPtrSSEGreen;
348
0
        ++readPtrSSERed;
349
0
    }
350
0
}
Unexecuted instantiation: void Imf_2_2::writeToRGBAFillASSETemplate<false, false>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned short const&, long long __vector(2)*&, unsigned long const&)
Unexecuted instantiation: void Imf_2_2::writeToRGBAFillASSETemplate<false, true>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned short const&, long long __vector(2)*&, unsigned long const&)
Unexecuted instantiation: void Imf_2_2::writeToRGBAFillASSETemplate<true, false>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned short const&, long long __vector(2)*&, unsigned long const&)
Unexecuted instantiation: void Imf_2_2::writeToRGBAFillASSETemplate<true, true>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned short const&, long long __vector(2)*&, unsigned long const&)
351
352
//
353
// Not using SSE intrinsics.  This is still faster than the alternative
354
// because we have multiple read pointers and therefore we are able to
355
// take advantage of data locality for write operations.
356
//
357
//
// Scalar interleave with constant alpha: for each of pixelsToCopy pixels,
// copies one sample from red, green and blue (in that order) followed by
// alphaFillValue to consecutive output slots.  The pointer arguments are
// references and end up advanced past the samples they read or wrote.
//
EXR_FORCEINLINE
void
writeToRGBAFillANormal (unsigned short*& readPtrRed,
                        unsigned short*& readPtrGreen,
                        unsigned short*& readPtrBlue,
                        const unsigned short& alphaFillValue,
                        unsigned short*& writePtr,
                        const size_t& pixelsToCopy)
{
    size_t pixel = 0;

    while (pixel != pixelsToCopy)
    {
        writePtr[0] = *readPtrRed;
        ++readPtrRed;

        writePtr[1] = *readPtrGreen;
        ++readPtrGreen;

        writePtr[2] = *readPtrBlue;
        ++readPtrBlue;

        writePtr[3] = alphaFillValue;

        writePtr += 4;
        ++pixel;
    }
}
374
375
//
376
// Determine which (template) version to use by checking whether pointers
377
// are aligned.
378
//
379
//
// Interleaves three planar 16-bit channels into RGBA, filling alpha with
// alphaFillValue.
//
// First runs writeToRGBAFillASSETemplate for pixelsToCopySSE iterations,
// using the instantiation that matches the 16-byte alignment of the
// pointers (the read side counts as aligned only if all three channel
// pointers are), then runs writeToRGBAFillANormal for pixelsToCopyNormal
// further pixels.  The pointer arguments are references and are left
// advanced past everything that was read or written.
//
EXR_FORCEINLINE 
void
optimizedWriteToRGBAFillA (unsigned short*& readPtrRed,
                           unsigned short*& readPtrGreen,
                           unsigned short*& readPtrBlue,
                           const unsigned short& alphaFillValue,
                           unsigned short*& writePtr,
                           const size_t& pixelsToCopySSE,
                           const size_t& pixelsToCopyNormal)
{
    const bool readPtrAreAligned = isPointerSSEAligned (readPtrRed)   &&
                                   isPointerSSEAligned (readPtrGreen) &&
                                   isPointerSSEAligned (readPtrBlue);

    const bool writePtrIsAligned = isPointerSSEAligned (writePtr);

    //
    // The SSE routine advances its pointers through __m128i*& parameters.
    // Hand it properly typed local pointers instead of reinterpreting the
    // caller's unsigned short* objects as __m128i* objects, which would
    // break the aliasing rules; the advanced positions are copied back
    // below.
    //
    __m128i* redSSE   = reinterpret_cast<__m128i*> (readPtrRed);
    __m128i* greenSSE = reinterpret_cast<__m128i*> (readPtrGreen);
    __m128i* blueSSE  = reinterpret_cast<__m128i*> (readPtrBlue);
    __m128i* writeSSE = reinterpret_cast<__m128i*> (writePtr);

    if (readPtrAreAligned)
    {
        if (writePtrIsAligned)
        {
            writeToRGBAFillASSETemplate<true, true> (redSSE, greenSSE,
                                                     blueSSE, alphaFillValue,
                                                     writeSSE,
                                                     pixelsToCopySSE);
        }
        else
        {
            writeToRGBAFillASSETemplate<true, false> (redSSE, greenSSE,
                                                      blueSSE, alphaFillValue,
                                                      writeSSE,
                                                      pixelsToCopySSE);
        }
    }
    else
    {
        if (writePtrIsAligned)
        {
            writeToRGBAFillASSETemplate<false, true> (redSSE, greenSSE,
                                                      blueSSE, alphaFillValue,
                                                      writeSSE,
                                                      pixelsToCopySSE);
        }
        else
        {
            writeToRGBAFillASSETemplate<false, false> (redSSE, greenSSE,
                                                       blueSSE, alphaFillValue,
                                                       writeSSE,
                                                       pixelsToCopySSE);
        }
    }

    // Publish how far the SSE pass got before the scalar pass continues.
    readPtrRed   = reinterpret_cast<unsigned short*> (redSSE);
    readPtrGreen = reinterpret_cast<unsigned short*> (greenSSE);
    readPtrBlue  = reinterpret_cast<unsigned short*> (blueSSE);
    writePtr     = reinterpret_cast<unsigned short*> (writeSSE);

    writeToRGBAFillANormal (readPtrRed,
                            readPtrGreen, readPtrBlue, alphaFillValue,
                            writePtr, pixelsToCopyNormal);
}
438
439
440
441
//------------------------------------------------------------------------
442
//
443
// Write to RGB
444
//
445
//------------------------------------------------------------------------
446
447
//
448
// Using SSE intrinsics
449
//
450
template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
451
EXR_FORCEINLINE 
452
void
453
writeToRGBSSETemplate (__m128i*& readPtrSSERed,
454
                       __m128i*& readPtrSSEGreen,
455
                       __m128i*& readPtrSSEBlue,
456
                       __m128i*& writePtrSSE,
457
                       const size_t& pixelsToCopySSE)
458
0
{
459
460
0
    for (size_t pixelCounter = 0; pixelCounter < pixelsToCopySSE; ++pixelCounter)
461
0
    {
462
        //
463
        // Need to shuffle and unpack pointers to obtain my first register
464
        // We must save 8 pixels at a time, so we must have the following three registers at the end:
465
        // 1) R1 G1 B1 R2 G2 B2 R3 G3
466
        // 2) B3 R4 G4 B4 R5 G5 B5 R6
467
        // 3) G6 B6 R7 G7 B7 R8 G8 B8
468
        //
469
0
        __m128i redRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
470
0
        __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
471
0
        __m128i blueRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);
472
473
        //
474
        // First register: R1 G1 B1 R2 G2 B2 R3 G3
475
        // Construct 2 registers and then unpack them to obtain our final result:
476
        //
477
0
        __m128i redGreenRegister  = _mm_unpacklo_epi16 (redRegister,
478
0
                                                        greenRegister);
479
0
        __m128i redBlueRegister   = _mm_unpacklo_epi16 (redRegister,
480
0
                                                        blueRegister);
481
0
        __m128i greenBlueRegister = _mm_unpacklo_epi16 (greenRegister,
482
0
                                                        blueRegister);
483
484
        // Left Part (R1 G1 B1 R2)
485
0
        __m128i quarterRight = _mm_shufflelo_epi16 (redBlueRegister,
486
0
                                                    _MM_SHUFFLE(3,0,2,1));
487
0
        __m128i halfLeft     = _mm_unpacklo_epi32 (redGreenRegister,
488
0
                                                   quarterRight);
489
490
        // Right Part (G2 B2 R3 G3)
491
0
        __m128i quarterLeft  = _mm_shuffle_epi32 (greenBlueRegister,
492
0
                                                 _MM_SHUFFLE(3,2,0,1));
493
0
        quarterRight         = _mm_shuffle_epi32 (redGreenRegister,
494
0
                                                 _MM_SHUFFLE(3,0,1,2));
495
0
        __m128i halfRight    = _mm_unpacklo_epi32 (quarterLeft, quarterRight);
496
497
0
        __m128i fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
498
0
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
499
0
        ++writePtrSSE;
500
501
        //
502
        // Second register: B3 R4 G4 B4 R5 G5 B5 R6
503
        //
504
505
        // Left Part (B3, R4, G4, B4)
506
0
        quarterLeft  = _mm_shufflehi_epi16 (redBlueRegister,
507
0
                                            _MM_SHUFFLE(0, 3, 2, 1));
508
0
        quarterRight = _mm_shufflehi_epi16 (greenBlueRegister,
509
0
                                            _MM_SHUFFLE(1, 0, 3, 2));
510
0
        halfLeft     = _mm_unpackhi_epi32 (quarterLeft, quarterRight);
511
512
        // Update the registers
513
0
        redGreenRegister  = _mm_unpackhi_epi16 (redRegister, greenRegister);
514
0
        redBlueRegister   = _mm_unpackhi_epi16 (redRegister, blueRegister);
515
0
        greenBlueRegister = _mm_unpackhi_epi16 (greenRegister, blueRegister);
516
517
        // Right Part (R5 G5 B5 R6)
518
0
        quarterRight = _mm_shufflelo_epi16 (redBlueRegister,
519
0
                                            _MM_SHUFFLE(3,0,2,1));
520
0
        halfRight    = _mm_unpacklo_epi32 (redGreenRegister, quarterRight);
521
522
0
        fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
523
0
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
524
0
        ++writePtrSSE;
525
526
        //
527
        // Third register: G6 B6 R7 G7 B7 R8 G8 B8
528
        //
529
530
        // Left part (G6 B6 R7 G7)
531
0
        quarterLeft  = _mm_shuffle_epi32 (greenBlueRegister,
532
0
                                          _MM_SHUFFLE(3,2,0,1));
533
0
        quarterRight = _mm_shuffle_epi32 (redGreenRegister,
534
0
                                          _MM_SHUFFLE(3,0,1,2));
535
0
        halfLeft     = _mm_unpacklo_epi32 (quarterLeft, quarterRight);
536
537
        // Right part (B7 R8 G8 B8)
538
0
        quarterLeft  = _mm_shufflehi_epi16 (redBlueRegister,
539
0
                                            _MM_SHUFFLE(0, 3, 2, 1));
540
0
        quarterRight = _mm_shufflehi_epi16 (greenBlueRegister,
541
0
                                            _MM_SHUFFLE(1, 0, 3, 2));
542
0
        halfRight    = _mm_unpackhi_epi32 (quarterLeft, quarterRight);
543
544
0
        fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
545
0
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
546
0
        ++writePtrSSE;
547
548
        //
549
        // Increment read pointers
550
        //
551
0
        ++readPtrSSEBlue;
552
0
        ++readPtrSSEGreen;
553
0
        ++readPtrSSERed;
554
0
    }
555
0
}
Unexecuted instantiation: void Imf_2_2::writeToRGBSSETemplate<false, false>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&)
Unexecuted instantiation: void Imf_2_2::writeToRGBSSETemplate<false, true>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&)
Unexecuted instantiation: void Imf_2_2::writeToRGBSSETemplate<true, false>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&)
Unexecuted instantiation: void Imf_2_2::writeToRGBSSETemplate<true, true>(long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, long long __vector(2)*&, unsigned long const&)
556
557
//
558
// Not using SSE intrinsics.  This is still faster than the alternative
559
// because we have multiple read pointers and therefore we are able to
560
// take advantage of data locality for write operations.
561
//
562
//
// Scalar interleave: for each of pixelsToCopy pixels, copies one sample
// from red, green and blue (in that order) to consecutive output slots.
// Every pointer is a reference and ends up advanced past the samples it
// read or wrote.
//
EXR_FORCEINLINE
void
writeToRGBNormal (unsigned short*& readPtrRed,
                  unsigned short*& readPtrGreen,
                  unsigned short*& readPtrBlue,
                  unsigned short*& writePtr,
                  const size_t& pixelsToCopy)
{
    size_t pixel = 0;

    while (pixel != pixelsToCopy)
    {
        writePtr[0] = *readPtrRed;
        ++readPtrRed;

        writePtr[1] = *readPtrGreen;
        ++readPtrGreen;

        writePtr[2] = *readPtrBlue;
        ++readPtrBlue;

        writePtr += 3;
        ++pixel;
    }
}
577
578
//
579
// Determine which (template) version to use by checking whether pointers
580
// are aligned
581
//
582
//
// Interleaves three planar 16-bit channels into packed RGB.
//
// First runs writeToRGBSSETemplate for pixelsToCopySSE iterations, using
// the instantiation that matches the 16-byte alignment of the pointers
// (the read side counts as aligned only if all three channel pointers
// are), then runs writeToRGBNormal for pixelsToCopyNormal further pixels.
// All four pointers are references and are left advanced past everything
// that was read or written.
//
EXR_FORCEINLINE 
void optimizedWriteToRGB (unsigned short*& readPtrRed,
                          unsigned short*& readPtrGreen,
                          unsigned short*& readPtrBlue,
                          unsigned short*& writePtr,
                          const size_t& pixelsToCopySSE,
                          const size_t& pixelsToCopyNormal)
{
    const bool readPtrAreAligned = isPointerSSEAligned (readPtrRed)   &&
                                   isPointerSSEAligned (readPtrGreen) &&
                                   isPointerSSEAligned (readPtrBlue);

    const bool writePtrIsAligned = isPointerSSEAligned (writePtr);

    //
    // The SSE routine advances its pointers through __m128i*& parameters.
    // Hand it properly typed local pointers instead of reinterpreting the
    // caller's unsigned short* objects as __m128i* objects, which would
    // break the aliasing rules; the advanced positions are copied back
    // below.
    //
    __m128i* redSSE   = reinterpret_cast<__m128i*> (readPtrRed);
    __m128i* greenSSE = reinterpret_cast<__m128i*> (readPtrGreen);
    __m128i* blueSSE  = reinterpret_cast<__m128i*> (readPtrBlue);
    __m128i* writeSSE = reinterpret_cast<__m128i*> (writePtr);

    if (readPtrAreAligned)
    {
        if (writePtrIsAligned)
        {
            writeToRGBSSETemplate<true, true> (redSSE, greenSSE, blueSSE,
                                               writeSSE, pixelsToCopySSE);
        }
        else
        {
            writeToRGBSSETemplate<true, false> (redSSE, greenSSE, blueSSE,
                                                writeSSE, pixelsToCopySSE);
        }
    }
    else
    {
        if (writePtrIsAligned)
        {
            writeToRGBSSETemplate<false, true> (redSSE, greenSSE, blueSSE,
                                                writeSSE, pixelsToCopySSE);
        }
        else
        {
            writeToRGBSSETemplate<false, false> (redSSE, greenSSE, blueSSE,
                                                 writeSSE, pixelsToCopySSE);
        }
    }

    // Publish how far the SSE pass got before the scalar pass continues.
    readPtrRed   = reinterpret_cast<unsigned short*> (redSSE);
    readPtrGreen = reinterpret_cast<unsigned short*> (greenSSE);
    readPtrBlue  = reinterpret_cast<unsigned short*> (blueSSE);
    writePtr     = reinterpret_cast<unsigned short*> (writeSSE);

    writeToRGBNormal (readPtrRed, readPtrGreen, readPtrBlue,
                      writePtr, pixelsToCopyNormal);
}
635
636
637
638
639
#else // ! defined IMF_HAVE_SSE2
640
641
#endif // defined IMF_HAVE_SSE2
642
643
644
OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT
645
646
#endif