Coverage Report

Created: 2025-12-31 06:53

/src/opencv/3rdparty/openexr/IlmImf/ImfDwaCompressorSimd.h
Line | Count | Source
1
///////////////////////////////////////////////////////////////////////////
2
//
3
// Copyright (c) 2009-2014 DreamWorks Animation LLC. 
4
//
5
// All rights reserved.
6
//
7
// Redistribution and use in source and binary forms, with or without
8
// modification, are permitted provided that the following conditions are
9
// met:
10
// *       Redistributions of source code must retain the above copyright
11
// notice, this list of conditions and the following disclaimer.
12
// *       Redistributions in binary form must reproduce the above
13
// copyright notice, this list of conditions and the following disclaimer
14
// in the documentation and/or other materials provided with the
15
// distribution.
16
// *       Neither the name of DreamWorks Animation nor the names of
17
// its contributors may be used to endorse or promote products derived
18
// from this software without specific prior written permission.
19
//
20
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
//
32
///////////////////////////////////////////////////////////////////////////
33
34
#ifndef IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED
35
#define IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED
36
37
//
38
// Various SSE accelerated functions, used by Imf::DwaCompressor. 
39
// These are kept in a separate .h file, as the fast
40
// paths are done with template specialization.
41
//
42
// Unless otherwise noted, all pointers are assumed to be 32-byte 
43
// aligned. Unaligned pointers may seg-fault.
44
//
45
46
#include "ImfNamespace.h"
47
#include "ImfSimd.h"
48
#include "ImfSystemSpecific.h"
49
#include "OpenEXRConfig.h"
50
51
#include <half.h>
52
#include <assert.h>
53
54
#include <algorithm>
55
56
OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER
57
58
0
#define _SSE_ALIGNMENT        32
59
0
#define _SSE_ALIGNMENT_MASK 0x0F
60
#define _AVX_ALIGNMENT_MASK 0x1F
61
62
//
63
// Test if we should enable GCC inline asm paths for AVX
64
//
65
66
#ifdef OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX 
67
68
    #define IMF_HAVE_GCC_INLINEASM
69
70
    #ifdef __LP64__
71
        #define IMF_HAVE_GCC_INLINEASM_64
72
    #endif /* __LP64__ */
73
74
#endif /* OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX */
75
76
//
77
// A simple 64-element array, aligned properly for SIMD access. 
78
//
79
80
template <class T>
81
class SimdAlignedBuffer64
82
{
83
    public:
84
85
0
        SimdAlignedBuffer64(): _buffer (0), _handle (0)           
86
0
        {
87
0
            alloc();
88
0
        }
Unexecuted instantiation: Imf_opencv::SimdAlignedBuffer64<unsigned short>::SimdAlignedBuffer64()
Unexecuted instantiation: Imf_opencv::SimdAlignedBuffer64<float>::SimdAlignedBuffer64()
89
90
        SimdAlignedBuffer64(const SimdAlignedBuffer64 &rhs): _handle(0)
91
        {
92
            alloc();
93
            memcpy (_buffer, rhs._buffer, 64 * sizeof (T));
94
        }
95
96
        SimdAlignedBuffer64 &operator=(const SimdAlignedBuffer64 &rhs)
97
        {
98
            memcpy (_buffer, rhs._buffer, 64 * sizeof (T));
99
            return *this;
100
        }
101
102
#if __cplusplus >= 201103L
103
        SimdAlignedBuffer64(SimdAlignedBuffer64 &&rhs) noexcept
104
0
            : _handle(rhs._handle), _buffer(rhs._buffer)
105
0
        {
106
0
            rhs._handle = nullptr;
107
0
            rhs._buffer = nullptr;
108
0
        }
109
110
        SimdAlignedBuffer64 &operator=(SimdAlignedBuffer64 &&rhs) noexcept
111
        {
112
            std::swap(_handle, rhs._handle);
113
            std::swap(_buffer, rhs._buffer);
114
            return *this;
115
        }
116
#endif
117
        ~SimdAlignedBuffer64 ()
118
0
        {
119
0
            if (_handle)
120
0
                EXRFreeAligned (_handle);
121
0
            _handle = 0;
122
0
            _buffer = 0;
123
0
        }
Unexecuted instantiation: Imf_opencv::SimdAlignedBuffer64<float>::~SimdAlignedBuffer64()
Unexecuted instantiation: Imf_opencv::SimdAlignedBuffer64<unsigned short>::~SimdAlignedBuffer64()
124
125
        void alloc()
126
0
        {
127
            //
128
            // Try EXRAllocAligned first - but it might fallback to
129
            // unaligned allocs. If so, overalloc.
130
            //
131
132
0
            _handle = (char *) EXRAllocAligned
133
0
                (64 * sizeof(T), _SSE_ALIGNMENT);
134
135
0
            if (((size_t)_handle & (_SSE_ALIGNMENT - 1)) == 0)
136
0
            {
137
0
                _buffer = (T *)_handle;
138
0
                return;
139
0
            }
140
141
0
            EXRFreeAligned(_handle);
142
0
            _handle = (char *) EXRAllocAligned
143
0
                (64 * sizeof(T) + _SSE_ALIGNMENT, _SSE_ALIGNMENT);
144
145
0
            char *aligned = _handle;
146
147
0
            while ((size_t)aligned & (_SSE_ALIGNMENT - 1))
148
0
                aligned++;
149
150
0
            _buffer = (T *)aligned;    
151
0
        }
Unexecuted instantiation: Imf_opencv::SimdAlignedBuffer64<unsigned short>::alloc()
Unexecuted instantiation: Imf_opencv::SimdAlignedBuffer64<float>::alloc()
152
153
        T     *_buffer;
154
155
    private:
156
157
        char  *_handle;
158
};
159
160
typedef SimdAlignedBuffer64<float>          SimdAlignedBuffer64f;
161
typedef SimdAlignedBuffer64<unsigned short> SimdAlignedBuffer64us;
162
163
namespace {
164
165
//
166
// Color space conversion, Inverse 709 CSC, Y'CbCr -> R'G'B'
167
//
168
169
void
170
csc709Inverse (float &comp0, float &comp1, float &comp2)
171
0
{
172
0
    float src[3];
173
174
0
    src[0] = comp0;
175
0
    src[1] = comp1;
176
0
    src[2] = comp2;
177
178
0
    comp0 = src[0]                    + 1.5747f * src[2];
179
0
    comp1 = src[0] - 0.1873f * src[1] - 0.4682f * src[2];
180
0
    comp2 = src[0] + 1.8556f * src[1];
181
0
}
182
183
#ifndef IMF_HAVE_SSE2
184
185
186
//
187
// Scalar color space conversion, based on the 709 primary chromaticities.
188
// No scaling or offsets, just the matrix
189
//
190
191
void
192
csc709Inverse64 (float *comp0, float *comp1, float *comp2)
193
{
194
    for (int i = 0; i < 64; ++i)
195
        csc709Inverse (comp0[i], comp1[i], comp2[i]);
196
}
197
198
#else /* IMF_HAVE_SSE2 */
199
200
//
201
// SSE2 color space conversion
202
//
203
204
void
205
csc709Inverse64 (float *comp0, float *comp1, float *comp2)
206
0
{
207
0
    __m128 c0 = { 1.5747f,  1.5747f,  1.5747f,  1.5747f};
208
0
    __m128 c1 = { 1.8556f,  1.8556f,  1.8556f,  1.8556f};
209
0
    __m128 c2 = {-0.1873f, -0.1873f, -0.1873f, -0.1873f};
210
0
    __m128 c3 = {-0.4682f, -0.4682f, -0.4682f, -0.4682f}; 
211
212
0
    __m128 *r = (__m128 *)comp0;
213
0
    __m128 *g = (__m128 *)comp1;
214
0
    __m128 *b = (__m128 *)comp2;
215
0
    __m128 src[3];
216
217
0
    #define CSC_INVERSE_709_SSE2_LOOP(i)                       \
218
0
            src[0] = r[i];                                     \
219
0
            src[1] = g[i];                                     \
220
0
            src[2] = b[i];                                     \
221
0
                                                               \
222
0
            r[i] = _mm_add_ps (r[i], _mm_mul_ps (src[2], c0)); \
223
0
                                                               \
224
0
            g[i]   = _mm_mul_ps (g[i], c2);                    \
225
0
            src[2] = _mm_mul_ps (src[2], c3);                  \
226
0
            g[i]   = _mm_add_ps (g[i], src[0]);                \
227
0
            g[i]   = _mm_add_ps (g[i], src[2]);                \
228
0
                                                               \
229
0
            b[i] = _mm_mul_ps (c1,   src[1]);                  \
230
0
            b[i] = _mm_add_ps (b[i], src[0]);
231
232
0
    CSC_INVERSE_709_SSE2_LOOP (0)
233
0
    CSC_INVERSE_709_SSE2_LOOP (1)
234
0
    CSC_INVERSE_709_SSE2_LOOP (2)
235
0
    CSC_INVERSE_709_SSE2_LOOP (3)
236
237
0
    CSC_INVERSE_709_SSE2_LOOP (4)
238
0
    CSC_INVERSE_709_SSE2_LOOP (5)
239
0
    CSC_INVERSE_709_SSE2_LOOP (6)
240
0
    CSC_INVERSE_709_SSE2_LOOP (7)
241
242
0
    CSC_INVERSE_709_SSE2_LOOP (8)
243
0
    CSC_INVERSE_709_SSE2_LOOP (9)
244
0
    CSC_INVERSE_709_SSE2_LOOP (10)
245
0
    CSC_INVERSE_709_SSE2_LOOP (11)
246
247
0
    CSC_INVERSE_709_SSE2_LOOP (12)
248
0
    CSC_INVERSE_709_SSE2_LOOP (13)
249
0
    CSC_INVERSE_709_SSE2_LOOP (14)
250
0
    CSC_INVERSE_709_SSE2_LOOP (15)
251
0
}
252
253
#endif /* IMF_HAVE_SSE2 */
254
255
256
//
257
// Color space conversion, Forward 709 CSC, R'G'B' -> Y'CbCr
258
//
259
// Simple FPU color space conversion. Based on the 709
260
// primary chromaticities, with no scaling or offsets.
261
//
262
263
void
264
csc709Forward64 (float *comp0, float *comp1, float *comp2)
265
0
{
266
0
    float src[3];
267
268
0
    for (int i = 0; i<64; ++i)
269
0
    {
270
0
        src[0] = comp0[i];    
271
0
        src[1] = comp1[i]; 
272
0
        src[2] = comp2[i];     
273
274
0
        comp0[i] =  0.2126f * src[0] + 0.7152f * src[1] + 0.0722f * src[2];
275
0
        comp1[i] = -0.1146f * src[0] - 0.3854f * src[1] + 0.5000f * src[2];
276
0
        comp2[i] =  0.5000f * src[0] - 0.4542f * src[1] - 0.0458f * src[2];
277
0
    }
278
0
}
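
With both the forward matrix (above) and csc709Inverse() (earlier in this file) in scope, the two matrices can be sanity-checked with a small round trip. This is an illustrative, hypothetical snippet rather than part of the covered header; the residual of roughly 1e-4 comes from the 4-digit coefficients.

    // Hypothetical round-trip check: forward 709 on one pixel, then csc709Inverse(),
    // should approximately return the original R'G'B' triple.
    void
    csc709RoundTripExample ()
    {
        float r = 0.25f, g = 0.50f, b = 0.75f;

        float y  =  0.2126f * r + 0.7152f * g + 0.0722f * b;   // same rows as csc709Forward64
        float cb = -0.1146f * r - 0.3854f * g + 0.5000f * b;
        float cr =  0.5000f * r - 0.4542f * g - 0.0458f * b;

        csc709Inverse (y, cb, cr);   // y, cb, cr now hold approximately 0.25, 0.50, 0.75
    }
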
279
280
281
//
282
// Byte interleaving of 2 byte arrays:
283
//    src0 = AAAA 
284
//    src1 = BBBB
285
//    dst  = ABABABAB
286
//
287
// numBytes is the size of each of the source buffers
288
//
289
290
#ifndef IMF_HAVE_SSE2 
291
292
// 
293
// Scalar default implementation 
294
//
295
296
void
297
interleaveByte2 (char *dst, char *src0, char *src1, int numBytes)
298
{
299
    for (int x = 0; x < numBytes; ++x)
300
    {
301
        dst[2 * x]     = src0[x];
302
        dst[2 * x + 1] = src1[x];
303
    }
304
}
305
306
#else  /* IMF_HAVE_SSE2 */
307
308
// 
309
// SSE2 byte interleaving
310
//
311
312
void
313
interleaveByte2 (char *dst, char *src0, char *src1, int numBytes)
314
0
{
315
0
    int dstAlignment  = (size_t)dst  % 16;
316
0
    int src0Alignment = (size_t)src0 % 16;
317
0
    int src1Alignment = (size_t)src1 % 16;
318
319
0
    __m128i *dst_epi8  = (__m128i*)dst;
320
0
    __m128i *src0_epi8 = (__m128i*)src0;
321
0
    __m128i *src1_epi8 = (__m128i*)src1;
322
0
    int sseWidth  =  numBytes / 16;
323
324
0
    if ((!dstAlignment) && (!src0Alignment) && (!src1Alignment))
325
0
    {
326
0
        __m128i tmp0, tmp1;
327
328
        //
329
        // Aligned loads and stores
330
        //
331
332
0
        for (int x = 0; x < sseWidth; ++x)
333
0
        {
334
0
            tmp0 = src0_epi8[x];
335
0
            tmp1 = src1_epi8[x];
336
337
0
            _mm_stream_si128 (&dst_epi8[2 * x],
338
0
                              _mm_unpacklo_epi8 (tmp0, tmp1));
339
340
0
            _mm_stream_si128 (&dst_epi8[2 * x + 1],
341
0
                              _mm_unpackhi_epi8 (tmp0, tmp1));
342
0
        }
343
344
        //
345
        // Then run the leftovers one at a time
346
        //
347
348
0
        for (int x = 16 * sseWidth; x < numBytes; ++x)
349
0
        {
350
0
            dst[2 * x]     = src0[x];
351
0
            dst[2 * x + 1] = src1[x];
352
0
        }
353
0
    }
354
0
    else if ((!dstAlignment) && (src0Alignment == 8) && (src1Alignment == 8))
355
0
    {
356
        //
357
        // Aligned stores, but catch up a few values so we can 
358
        // use aligned loads
359
        //
360
    
361
0
        for (int x = 0; x < std::min (numBytes, 8); ++x)
362
0
        {
363
0
            dst[2 * x]     = src0[x];
364
0
            dst[2 * x + 1] = src1[x];
365
0
        }
366
367
0
        if (numBytes > 8) 
368
0
        {
369
0
            dst_epi8  = (__m128i*)&dst[16];
370
0
            src0_epi8 = (__m128i*)&src0[8];
371
0
            src1_epi8 = (__m128i*)&src1[8];
372
0
            sseWidth  =  (numBytes - 8) / 16;
373
374
0
            for (int x=0; x<sseWidth; ++x)
375
0
            {
376
0
                _mm_stream_si128 (&dst_epi8[2 * x],
377
0
                                  _mm_unpacklo_epi8 (src0_epi8[x], src1_epi8[x]));
378
379
0
                _mm_stream_si128 (&dst_epi8[2 * x + 1],
380
0
                                  _mm_unpackhi_epi8 (src0_epi8[x], src1_epi8[x]));
381
0
            }
382
383
            //
384
            // Then run the leftovers one at a time
385
            //
386
387
0
            for (int x = 16 * sseWidth + 8; x < numBytes; ++x)
388
0
            {
389
0
                dst[2 * x]     = src0[x];
390
0
                dst[2 * x + 1] = src1[x];
391
0
            }
392
0
        }
393
0
    }
394
0
    else
395
0
    {
396
        //
397
        // Unaligned everything
398
        //
399
400
0
        for (int x = 0; x < sseWidth; ++x)
401
0
        {
402
0
            __m128i tmpSrc0_epi8 = _mm_loadu_si128 (&src0_epi8[x]);
403
0
            __m128i tmpSrc1_epi8 = _mm_loadu_si128 (&src1_epi8[x]);
404
405
0
            _mm_storeu_si128 (&dst_epi8[2 * x],
406
0
                              _mm_unpacklo_epi8 (tmpSrc0_epi8, tmpSrc1_epi8));
407
408
0
            _mm_storeu_si128 (&dst_epi8[2 * x + 1],
409
0
                              _mm_unpackhi_epi8 (tmpSrc0_epi8, tmpSrc1_epi8));
410
0
        }
411
412
        //
413
        // Then run the leftovers one at a time
414
        //
415
416
0
        for (int x = 16 * sseWidth; x < numBytes; ++x)
417
0
        {
418
0
            dst[2 * x]     = src0[x];
419
0
            dst[2 * x + 1] = src1[x];
420
0
        }
421
0
    }
422
0
}
423
424
#endif /* IMF_HAVE_SSE2 */
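
A small usage sketch for interleaveByte2() (illustrative only; the 4-byte buffers are hypothetical and simply exercise the leftover path):

    char src0[4] = { 'A', 'B', 'C', 'D' };
    char src1[4] = { 'w', 'x', 'y', 'z' };
    char dst[8];

    interleaveByte2 (dst, src0, src1, 4);   // dst now holds 'A','w','B','x','C','y','D','z'
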
425
426
427
//
428
// Float -> half float conversion
429
//
430
// To enable F16C based conversion, we can't rely on compile-time
431
// detection, hence the multiple versions defined here. Pick one based
432
// on runtime cpuid detection.
433
//
434
435
//
436
// Default boring conversion
437
//
438
439
void 
440
convertFloatToHalf64_scalar (unsigned short *dst, float *src)
441
0
{
442
0
    for (int i=0; i<64; ++i)
443
0
        dst[i] = ((half)src[i]).bits();
444
0
}
445
446
447
//
448
// F16C conversion - Assumes aligned src and dst
449
//
450
451
void
452
convertFloatToHalf64_f16c (unsigned short *dst, float *src)
453
0
{
454
    //
455
    // Ordinarily, I'd avoid using inline asm and prefer intrinsics.
456
    // However, in order to get the intrinsics, we need to tell 
457
    // the compiler to generate VEX instructions.
458
    //
459
    // (On the GCC side, -mf16c goes ahead and activates -mavx,
460
    //  resulting in VEX code. Without -mf16c, no intrinsics..)
461
    //
462
    // Now, it's quite likely that we'll find ourselves in situations
463
    // where we want to build *without* VEX, in order to maintain
464
    // maximum compatibility. But to get there with intrinsics,
465
    // we'd need to break out code into a separate file. Bleh.
466
    // I'll take the asm.
467
    //
468
469
    #if defined IMF_HAVE_GCC_INLINEASM
470
        __asm__
471
           ("vmovaps       (%0),     %%ymm0         \n"
472
            "vmovaps   0x20(%0),     %%ymm1         \n"
473
            "vmovaps   0x40(%0),     %%ymm2         \n"
474
            "vmovaps   0x60(%0),     %%ymm3         \n"
475
            "vcvtps2ph $0,           %%ymm0, %%xmm0 \n"
476
            "vcvtps2ph $0,           %%ymm1, %%xmm1 \n"
477
            "vcvtps2ph $0,           %%ymm2, %%xmm2 \n"
478
            "vcvtps2ph $0,           %%ymm3, %%xmm3 \n"
479
            "vmovdqa   %%xmm0,       0x00(%1)       \n"
480
            "vmovdqa   %%xmm1,       0x10(%1)       \n"
481
            "vmovdqa   %%xmm2,       0x20(%1)       \n"
482
            "vmovdqa   %%xmm3,       0x30(%1)       \n"
483
            "vmovaps   0x80(%0),     %%ymm0         \n"
484
            "vmovaps   0xa0(%0),     %%ymm1         \n"
485
            "vmovaps   0xc0(%0),     %%ymm2         \n"
486
            "vmovaps   0xe0(%0),     %%ymm3         \n"
487
            "vcvtps2ph $0,           %%ymm0, %%xmm0 \n"
488
            "vcvtps2ph $0,           %%ymm1, %%xmm1 \n"
489
            "vcvtps2ph $0,           %%ymm2, %%xmm2 \n"
490
            "vcvtps2ph $0,           %%ymm3, %%xmm3 \n"
491
            "vmovdqa   %%xmm0,       0x40(%1)       \n"
492
            "vmovdqa   %%xmm1,       0x50(%1)       \n"
493
            "vmovdqa   %%xmm2,       0x60(%1)       \n"
494
            "vmovdqa   %%xmm3,       0x70(%1)       \n"
495
        #ifndef __AVX__
496
            "vzeroupper                             \n"
497
        #endif /* __AVX__ */
498
            : /* Output  */                
499
            : /* Input   */ "r"(src), "r"(dst)
500
        #ifndef __AVX__
501
            : /* Clobber */ "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory"
502
        #else
503
            : /* Clobber */ "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory"
504
        #endif /* __AVX__ */
505
           );
506
    #else
507
0
        convertFloatToHalf64_scalar (dst, src);
508
0
    #endif /* IMF_HAVE_GCC_INLINEASM */
509
0
}
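
For comparison, the same conversion written with F16C intrinsics would look roughly like the sketch below. As the comment above explains, this only compiles in a translation unit built with VEX/F16C enabled (e.g. -mf16c on GCC), which is precisely what the inline asm avoids; the function name is hypothetical.

    #include <immintrin.h>

    // Hypothetical intrinsics variant - requires building this file with -mf16c.
    void
    convertFloatToHalf64_f16c_intrinsics (unsigned short *dst, float *src)
    {
        for (int i = 0; i < 64; i += 8)
        {
            __m256  f = _mm256_load_ps (src + i);   // 8 floats, 32-byte aligned per the header contract
            __m128i h = _mm256_cvtps_ph (f, 0);     // round to nearest, matching the $0 immediate above
            _mm_store_si128 ((__m128i *) (dst + i), h);
        }
    }
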
510
511
512
//
513
// Convert an 8x8 block of HALF from zig-zag order to
514
// FLOAT in normal order. The order we want is:
515
//
516
//          src                           dst 
517
//  0  1  2  3  4  5  6  7       0  1  5  6 14 15 27 28
518
//  8  9 10 11 12 13 14 15       2  4  7 13 16 26 29 42
519
// 16 17 18 19 20 21 22 23       3  8 12 17 25 30 41 43
520
// 24 25 26 27 28 29 30 31       9 11 18 24 31 40 44 53  
521
// 32 33 34 35 36 37 38 39      10 19 23 32 39 45 52 54
522
// 40 41 42 43 44 45 46 47      20 22 33 38 46 51 55 60
523
// 48 49 50 51 52 53 54 55      21 34 37 47 50 56 59 61
524
// 56 57 58 59 60 61 62 63      35 36 48 49 57 58 62 63
525
//
526
527
void
528
fromHalfZigZag_scalar (unsigned short *src, float *dst)
529
0
{
530
0
    half *srcHalf = (half *)src;
531
532
0
    dst[0] = (float)srcHalf[0];
533
0
    dst[1] = (float)srcHalf[1];
534
0
    dst[2] = (float)srcHalf[5];
535
0
    dst[3] = (float)srcHalf[6];
536
0
    dst[4] = (float)srcHalf[14];
537
0
    dst[5] = (float)srcHalf[15];
538
0
    dst[6] = (float)srcHalf[27];
539
0
    dst[7] = (float)srcHalf[28];
540
0
    dst[8] = (float)srcHalf[2];
541
0
    dst[9] = (float)srcHalf[4];
542
543
0
    dst[10] = (float)srcHalf[7];
544
0
    dst[11] = (float)srcHalf[13];
545
0
    dst[12] = (float)srcHalf[16];
546
0
    dst[13] = (float)srcHalf[26];
547
0
    dst[14] = (float)srcHalf[29];
548
0
    dst[15] = (float)srcHalf[42];
549
0
    dst[16] = (float)srcHalf[3];
550
0
    dst[17] = (float)srcHalf[8];
551
0
    dst[18] = (float)srcHalf[12];
552
0
    dst[19] = (float)srcHalf[17];
553
554
0
    dst[20] = (float)srcHalf[25];
555
0
    dst[21] = (float)srcHalf[30];
556
0
    dst[22] = (float)srcHalf[41];
557
0
    dst[23] = (float)srcHalf[43];
558
0
    dst[24] = (float)srcHalf[9];
559
0
    dst[25] = (float)srcHalf[11];
560
0
    dst[26] = (float)srcHalf[18];
561
0
    dst[27] = (float)srcHalf[24];
562
0
    dst[28] = (float)srcHalf[31];
563
0
    dst[29] = (float)srcHalf[40];
564
565
0
    dst[30] = (float)srcHalf[44];
566
0
    dst[31] = (float)srcHalf[53];
567
0
    dst[32] = (float)srcHalf[10];
568
0
    dst[33] = (float)srcHalf[19];
569
0
    dst[34] = (float)srcHalf[23];
570
0
    dst[35] = (float)srcHalf[32];
571
0
    dst[36] = (float)srcHalf[39];
572
0
    dst[37] = (float)srcHalf[45];
573
0
    dst[38] = (float)srcHalf[52];
574
0
    dst[39] = (float)srcHalf[54];
575
576
0
    dst[40] = (float)srcHalf[20];
577
0
    dst[41] = (float)srcHalf[22];
578
0
    dst[42] = (float)srcHalf[33];
579
0
    dst[43] = (float)srcHalf[38];
580
0
    dst[44] = (float)srcHalf[46];
581
0
    dst[45] = (float)srcHalf[51];
582
0
    dst[46] = (float)srcHalf[55];
583
0
    dst[47] = (float)srcHalf[60];
584
0
    dst[48] = (float)srcHalf[21];
585
0
    dst[49] = (float)srcHalf[34];
586
587
0
    dst[50] = (float)srcHalf[37];
588
0
    dst[51] = (float)srcHalf[47];
589
0
    dst[52] = (float)srcHalf[50];
590
0
    dst[53] = (float)srcHalf[56];
591
0
    dst[54] = (float)srcHalf[59];
592
0
    dst[55] = (float)srcHalf[61];
593
0
    dst[56] = (float)srcHalf[35];
594
0
    dst[57] = (float)srcHalf[36];
595
0
    dst[58] = (float)srcHalf[48];
596
0
    dst[59] = (float)srcHalf[49];
597
598
0
    dst[60] = (float)srcHalf[57];
599
0
    dst[61] = (float)srcHalf[58];
600
0
    dst[62] = (float)srcHalf[62];
601
0
    dst[63] = (float)srcHalf[63];
602
0
}
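
The unrolled assignments above implement the zig-zag table from the comment; an equivalent table-driven form makes the mapping explicit. This is an illustrative sketch (zigZagOfDst and fromHalfZigZag_table are hypothetical names), not the shipped code:

    // zigZagOfDst[i] is the zig-zag index whose value lands at row-major position i.
    static const int zigZagOfDst[64] = {
         0,  1,  5,  6, 14, 15, 27, 28,
         2,  4,  7, 13, 16, 26, 29, 42,
         3,  8, 12, 17, 25, 30, 41, 43,
         9, 11, 18, 24, 31, 40, 44, 53,
        10, 19, 23, 32, 39, 45, 52, 54,
        20, 22, 33, 38, 46, 51, 55, 60,
        21, 34, 37, 47, 50, 56, 59, 61,
        35, 36, 48, 49, 57, 58, 62, 63
    };

    void
    fromHalfZigZag_table (unsigned short *src, float *dst)
    {
        half *srcHalf = (half *) src;

        for (int i = 0; i < 64; ++i)
            dst[i] = (float) srcHalf[zigZagOfDst[i]];
    }
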
603
604
605
//
606
// If we can form the correct ordering in xmm registers,
607
// we can use F16C to convert from HALF -> FLOAT. However,
608
// making the correct order isn't trivial. 
609
// 
610
// We want to re-order a source 8x8 matrix from:
611
//
612
//  0  1  2  3  4  5  6  7       0  1  5  6 14 15 27 28
613
//  8  9 10 11 12 13 14 15       2  4  7 13 16 26 29 42
614
// 16 17 18 19 20 21 22 23       3  8 12 17 25 30 41 43
615
// 24 25 26 27 28 29 30 31       9 11 18 24 31 40 44 53   (A)
616
// 32 33 34 35 36 37 38 39  --> 10 19 23 32 39 45 52 54
617
// 40 41 42 43 44 45 46 47      20 22 33 38 46 51 55 60
618
// 48 49 50 51 52 53 54 55      21 34 37 47 50 56 59 61
619
// 56 57 58 59 60 61 62 63      35 36 48 49 57 58 62 63
620
//
621
// Which looks like a mess, right? 
622
//
623
// Now, check out the NE/SW diagonals of (A). Along those lines, 
624
// we have runs of contiguous values! If we rewrite (A) a bit, we get:
625
//
626
//  0
627
//  1  2
628
//  5  4  3
629
//  6  7  8  9
630
// 14 13 12 11 10
631
// 15 16 17 18 19 20
632
// 27 26 25 24 23 22 21            (B)
633
// 28 29 30 31 32 33 34 35
634
//    42 41 40 39 38 37 36
635
//       43 44 45 46 47 48
636
//          53 52 51 50 49
637
//             54 55 56 57
638
//                60 59 58
639
//                   61 62
640
//                      63
641
//
642
// In this ordering, the columns are the rows of (A). If we can 'transpose'
643
// (B), we'll achieve our goal. But we want this to fit nicely into 
644
// xmm registers and still be able to load large runs efficiently.  
645
// Also, notice that the odd rows are in ascending order, while 
646
// the even rows are in descending order. 
647
//
648
// If we 'fold' the bottom half up into the top, we can preserve ordered
649
// runs across rows, and still keep all the correct values in columns.
650
// After transposing, we'll need to rotate things back into place. 
651
// This gives us:
652
//
653
//  0 | 42   41   40   39   38   37   36
654
//  1    2 | 43   44   45   46   47   48
655
//  5    4    3 | 53   52   51   50   49
656
//  6    7    8    9 | 54   55   56   57      (C)
657
// 14   13   12   11   10 | 60   59   58
658
// 15   16   17   18   19   20 | 61   62
659
// 27   26   25   24   23   22   21 | 63
660
// 28   29   30   31   32   33   34   35
661
//
662
// But hang on. We still have the backwards descending rows to deal with.
663
// Lets reverse the even rows so that all values are in ascending order
664
//
665
//  36   37  38   39   40   41   42 | 0
666
//  1    2 | 43   44   45   46   47   48
667
//  49   50  51   52   53 |  3    4    5  
668
//  6    7    8    9 | 54   55   56   57      (D)
669
// 58   59   60 | 10   11   12   13   14  
670
// 15   16   17   18   19   20 | 61   62
671
// 63 | 21   22   23   24   25   26   27 
672
// 28   29   30   31   32   33   34   35
673
//
674
// If we can form (D),  we will then:
675
//   1) Reverse the even rows
676
//   2) Transpose
677
//   3) Rotate the rows 
678
//
679
// and we'll have (A).
680
//
681
682
void 
683
fromHalfZigZag_f16c (unsigned short *src, float *dst)
684
0
{
685
    #if defined IMF_HAVE_GCC_INLINEASM_64
686
        __asm__
687
688
           /* x3 <- 0                    
689
            * x8 <- [ 0- 7]              
690
            * x6 <- [56-63]              
691
            * x9 <- [21-28]              
692
            * x7 <- [28-35]              
693
            * x3 <- [ 6- 9] (lower half) */
694
          
695
          ("vpxor   %%xmm3,  %%xmm3, %%xmm3   \n"
696
           "vmovdqa    (%0), %%xmm8           \n"
697
           "vmovdqa 112(%0), %%xmm6           \n"
698
           "vmovdqu  42(%0), %%xmm9           \n"
699
           "vmovdqu  56(%0), %%xmm7           \n"
700
           "vmovq    12(%0), %%xmm3           \n"
701
702
           /* Setup rows 0-2 of A in xmm0-xmm2 
703
            * x1 <- x8 >> 16 (1 value)     
704
            * x2 <- x8 << 32 (2 values)    
705
            * x0 <- alignr([35-42], x8, 2) 
706
            * x1 <- blend(x1, [41-48])     
707
            * x2 <- blend(x2, [49-56])     */
708
709
           "vpsrldq      $2, %%xmm8, %%xmm1   \n"      
710
           "vpslldq      $4, %%xmm8, %%xmm2   \n"      
711
           "vpalignr     $2, 70(%0), %%xmm8, %%xmm0 \n"
712
           "vpblendw  $0xfc, 82(%0), %%xmm1, %%xmm1 \n"
713
           "vpblendw  $0x1f, 98(%0), %%xmm2, %%xmm2 \n"
714
     
715
           /* Setup rows 4-6 of A in xmm4-xmm6 
716
            * x4 <- x6 >> 32 (2 values)   
717
            * x5 <- x6 << 16 (1 value)    
718
            * x6 <- alignr(x6,x9,14)      
719
            * x4 <- blend(x4, [ 7-14])    
720
            * x5 <- blend(x5, [15-22])    */
721
722
           "vpsrldq      $4, %%xmm6, %%xmm4         \n"
723
           "vpslldq      $2, %%xmm6, %%xmm5         \n"
724
           "vpalignr    $14, %%xmm6, %%xmm9, %%xmm6 \n"
725
           "vpblendw  $0xf8, 14(%0), %%xmm4, %%xmm4 \n"
726
           "vpblendw  $0x3f, 30(%0), %%xmm5, %%xmm5 \n"
727
728
           /* Load the upper half of row 3 into xmm3 
729
            * x3 <- [54-57] (upper half) */
730
731
           "vpinsrq      $1, 108(%0), %%xmm3, %%xmm3\n"
732
733
           /* Reverse the even rows. We're not using PSHUFB as
734
            * that requires loading an extra constant all the time,
735
            * and we're already pretty memory bound.
736
            */
737
738
           "vpshuflw $0x1b, %%xmm0, %%xmm0          \n" 
739
           "vpshuflw $0x1b, %%xmm2, %%xmm2          \n" 
740
           "vpshuflw $0x1b, %%xmm4, %%xmm4          \n" 
741
           "vpshuflw $0x1b, %%xmm6, %%xmm6          \n" 
742
743
           "vpshufhw $0x1b, %%xmm0, %%xmm0          \n" 
744
           "vpshufhw $0x1b, %%xmm2, %%xmm2          \n" 
745
           "vpshufhw $0x1b, %%xmm4, %%xmm4          \n" 
746
           "vpshufhw $0x1b, %%xmm6, %%xmm6          \n" 
747
748
           "vpshufd $0x4e, %%xmm0, %%xmm0          \n" 
749
           "vpshufd $0x4e, %%xmm2, %%xmm2          \n" 
750
           "vpshufd $0x4e, %%xmm4, %%xmm4          \n" 
751
           "vpshufd $0x4e, %%xmm6, %%xmm6          \n" 
752
753
           /* Transpose xmm0-xmm7 into xmm8-xmm15 */
754
755
           "vpunpcklwd %%xmm1, %%xmm0, %%xmm8       \n"
756
           "vpunpcklwd %%xmm3, %%xmm2, %%xmm9       \n"
757
           "vpunpcklwd %%xmm5, %%xmm4, %%xmm10      \n"
758
           "vpunpcklwd %%xmm7, %%xmm6, %%xmm11      \n"
759
           "vpunpckhwd %%xmm1, %%xmm0, %%xmm12      \n"
760
           "vpunpckhwd %%xmm3, %%xmm2, %%xmm13      \n"
761
           "vpunpckhwd %%xmm5, %%xmm4, %%xmm14      \n"
762
           "vpunpckhwd %%xmm7, %%xmm6, %%xmm15      \n"
763
     
764
           "vpunpckldq  %%xmm9,  %%xmm8, %%xmm0     \n"
765
           "vpunpckldq %%xmm11, %%xmm10, %%xmm1     \n"
766
           "vpunpckhdq  %%xmm9,  %%xmm8, %%xmm2     \n"
767
           "vpunpckhdq %%xmm11, %%xmm10, %%xmm3     \n"
768
           "vpunpckldq %%xmm13, %%xmm12, %%xmm4     \n"
769
           "vpunpckldq %%xmm15, %%xmm14, %%xmm5     \n"
770
           "vpunpckhdq %%xmm13, %%xmm12, %%xmm6     \n"
771
           "vpunpckhdq %%xmm15, %%xmm14, %%xmm7     \n"
772
     
773
           "vpunpcklqdq %%xmm1,  %%xmm0, %%xmm8     \n"
774
           "vpunpckhqdq %%xmm1,  %%xmm0, %%xmm9     \n"
775
           "vpunpcklqdq %%xmm3,  %%xmm2, %%xmm10    \n"
776
           "vpunpckhqdq %%xmm3,  %%xmm2, %%xmm11    \n"
777
           "vpunpcklqdq %%xmm4,  %%xmm5, %%xmm12    \n"
778
           "vpunpckhqdq %%xmm5,  %%xmm4, %%xmm13    \n"
779
           "vpunpcklqdq %%xmm7,  %%xmm6, %%xmm14    \n"
780
           "vpunpckhqdq %%xmm7,  %%xmm6, %%xmm15    \n"
781
782
           /* Rotate the rows to get the correct final order. 
783
            * Rotating xmm12 isn't needed, as we can handle
784
            * the rotation in the PUNPCKLQDQ above. Rotating
785
            * xmm8 isn't needed as it's already in the right order           
786
            */
787
788
           "vpalignr  $2,  %%xmm9,  %%xmm9,  %%xmm9 \n"
789
           "vpalignr  $4, %%xmm10, %%xmm10, %%xmm10 \n"
790
           "vpalignr  $6, %%xmm11, %%xmm11, %%xmm11 \n"
791
           "vpalignr $10, %%xmm13, %%xmm13, %%xmm13 \n"
792
           "vpalignr $12, %%xmm14, %%xmm14, %%xmm14 \n"
793
           "vpalignr $14, %%xmm15, %%xmm15, %%xmm15 \n"
794
795
            /* Convert from half -> float */
796
797
           "vcvtph2ps  %%xmm8, %%ymm8            \n"  
798
           "vcvtph2ps  %%xmm9, %%ymm9            \n"
799
           "vcvtph2ps %%xmm10, %%ymm10           \n"
800
           "vcvtph2ps %%xmm11, %%ymm11           \n"
801
           "vcvtph2ps %%xmm12, %%ymm12           \n"
802
           "vcvtph2ps %%xmm13, %%ymm13           \n"
803
           "vcvtph2ps %%xmm14, %%ymm14           \n"
804
           "vcvtph2ps %%xmm15, %%ymm15           \n"
805
           
806
           /* Move float values to dst */
807
808
           "vmovaps    %%ymm8,    (%1)           \n"
809
           "vmovaps    %%ymm9,  32(%1)           \n"
810
           "vmovaps   %%ymm10,  64(%1)           \n" 
811
           "vmovaps   %%ymm11,  96(%1)           \n" 
812
           "vmovaps   %%ymm12, 128(%1)           \n" 
813
           "vmovaps   %%ymm13, 160(%1)           \n" 
814
           "vmovaps   %%ymm14, 192(%1)           \n" 
815
           "vmovaps   %%ymm15, 224(%1)           \n"
816
        #ifndef __AVX__
817
            "vzeroupper                          \n"
818
        #endif /* __AVX__ */
819
            : /* Output  */                
820
            : /* Input   */ "r"(src), "r"(dst)
821
            : /* Clobber */ "memory",
822
        #ifndef __AVX__
823
                            "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", 
824
                            "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
825
                            "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
826
                            "%xmm12", "%xmm13", "%xmm14", "%xmm15"
827
        #else
828
                            "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3", 
829
                            "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
830
                            "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
831
                            "%ymm12", "%ymm13", "%ymm14", "%ymm15"
832
        #endif /* __AVX__ */
833
        );
834
835
    #else
836
0
        fromHalfZigZag_scalar(src, dst);
837
0
    #endif /* defined IMF_HAVE_GCC_INLINEASM_64 */
838
0
}
839
840
841
//
842
// Inverse 8x8 DCT, inverting only the DC component. This assumes that
843
// all AC frequencies are 0.
844
//
845
846
#ifndef IMF_HAVE_SSE2
847
848
void 
849
dctInverse8x8DcOnly (float *data)
850
{
851
    float val = data[0] * 3.535536e-01f * 3.535536e-01f;
852
853
    for (int i = 0; i < 64; ++i)
854
        data[i] = val;
855
}
856
857
#else  /* IMF_HAVE_SSE2 */
858
859
void
860
dctInverse8x8DcOnly (float *data)
861
0
{
862
0
    __m128 src = _mm_set1_ps (data[0] * 3.535536e-01f * 3.535536e-01f);
863
0
    __m128 *dst = (__m128 *)data;
864
865
0
    for (int i = 0; i < 16; ++i)
866
0
        dst[i] = src;
867
0
}
868
869
#endif /* IMF_HAVE_SSE2 */
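
For the record, 3.535536e-01 is a = 0.5 * cos(pi/4) = 1/(2*sqrt(2)), so the DC-only result is simply data[0] * a * a = data[0] / 8, replicated into all 64 samples by either implementation above.
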
870
871
872
//
873
// Full 8x8 Inverse DCT:
874
//
875
// Simple inverse DCT on an 8x8 block, with scalar ops only.
876
//  Operates on data in-place.
877
//
878
// This is based on the iDCT formulation (y = frequency domain,
879
//                                       x = spatial domain)
880
//
881
//    [x0]    [        ][y0]    [        ][y1] 
882
//    [x1] =  [  M1    ][y2]  + [  M2    ][y3] 
883
//    [x2]    [        ][y4]    [        ][y5] 
884
//    [x3]    [        ][y6]    [        ][y7]
885
//
886
//    [x7]    [        ][y0]    [        ][y1] 
887
//    [x6] =  [  M1    ][y2]  - [  M2    ][y3] 
888
//    [x5]    [        ][y4]    [        ][y5] 
889
//    [x4]    [        ][y6]    [        ][y7]
890
//
891
// where M1:             M2:
892
//
893
//   [a  c  a   f]     [b  d  e  g]
894
//   [a  f -a  -c]     [d -g -b -e]
895
//   [a -f -a   c]     [e -b  g  d]
896
//   [a -c  a  -f]     [g -e  d -b]
897
//
898
// and the constants are as defined below.
899
//
900
// If you know how many of the lower rows are zero, that can
901
// be passed in to help speed things up. If you don't know, 
902
// just set zeroedRows=0.
903
//
904
905
//
906
// Default implementation
907
//
908
909
template <int zeroedRows>
910
void
911
dctInverse8x8_scalar (float *data)
912
0
{
913
0
    const float a = .5f * cosf (3.14159f / 4.0f);
914
0
    const float b = .5f * cosf (3.14159f / 16.0f);
915
0
    const float c = .5f * cosf (3.14159f / 8.0f);
916
0
    const float d = .5f * cosf (3.f*3.14159f / 16.0f);
917
0
    const float e = .5f * cosf (5.f*3.14159f / 16.0f);
918
0
    const float f = .5f * cosf (3.f*3.14159f / 8.0f);
919
0
    const float g = .5f * cosf (7.f*3.14159f / 16.0f);
920
921
0
    float alpha[4], beta[4], theta[4], gamma[4];
922
923
0
    float *rowPtr = NULL;
924
925
    //
926
    // First pass - row wise.
927
    //
928
    // This looks less-compact than the description above in
929
    // an attempt to fold together common sub-expressions.
930
    //
931
932
0
    for (int row = 0; row < 8 - zeroedRows; ++row)
933
0
    {
934
0
        rowPtr = data + row * 8;
935
936
0
        alpha[0] = c * rowPtr[2]; 
937
0
        alpha[1] = f * rowPtr[2]; 
938
0
        alpha[2] = c * rowPtr[6]; 
939
0
        alpha[3] = f * rowPtr[6]; 
940
941
0
        beta[0] = b * rowPtr[1] + d * rowPtr[3] + e * rowPtr[5] + g * rowPtr[7];
942
0
        beta[1] = d * rowPtr[1] - g * rowPtr[3] - b * rowPtr[5] - e * rowPtr[7];
943
0
        beta[2] = e * rowPtr[1] - b * rowPtr[3] + g * rowPtr[5] + d * rowPtr[7];
944
0
        beta[3] = g * rowPtr[1] - e * rowPtr[3] + d * rowPtr[5] - b * rowPtr[7];
945
946
0
        theta[0] = a * (rowPtr[0] + rowPtr[4]);
947
0
        theta[3] = a * (rowPtr[0] - rowPtr[4]);
948
949
0
        theta[1] = alpha[0] + alpha[3]; 
950
0
        theta[2] = alpha[1] - alpha[2]; 
951
952
953
0
        gamma[0] = theta[0] + theta[1];
954
0
        gamma[1] = theta[3] + theta[2];
955
0
        gamma[2] = theta[3] - theta[2];
956
0
        gamma[3] = theta[0] - theta[1];
957
958
959
0
        rowPtr[0] = gamma[0] + beta[0];
960
0
        rowPtr[1] = gamma[1] + beta[1];
961
0
        rowPtr[2] = gamma[2] + beta[2];
962
0
        rowPtr[3] = gamma[3] + beta[3];
963
964
0
        rowPtr[4] = gamma[3] - beta[3];
965
0
        rowPtr[5] = gamma[2] - beta[2];
966
0
        rowPtr[6] = gamma[1] - beta[1];
967
0
        rowPtr[7] = gamma[0] - beta[0];
968
0
    }
969
970
    //
971
    // Second pass - column wise.
972
    //
973
974
0
    for (int column = 0; column < 8; ++column)
975
0
    {
976
0
        alpha[0] = c * data[16+column]; 
977
0
        alpha[1] = f * data[16+column]; 
978
0
        alpha[2] = c * data[48+column]; 
979
0
        alpha[3] = f * data[48+column]; 
980
981
0
        beta[0] = b * data[8+column]  + d * data[24+column] +
982
0
                  e * data[40+column] + g * data[56+column];
983
984
0
        beta[1] = d * data[8+column]  - g * data[24+column] -
985
0
                  b * data[40+column] - e * data[56+column];
986
987
0
        beta[2] = e * data[8+column]  - b * data[24+column] + 
988
0
                  g * data[40+column] + d * data[56+column];
989
990
0
        beta[3] = g * data[8+column]  - e * data[24+column] + 
991
0
                  d * data[40+column] - b * data[56+column];
992
993
0
        theta[0] = a * (data[column] + data[32+column]);
994
0
        theta[3] = a * (data[column] - data[32+column]);
995
996
0
        theta[1] = alpha[0] + alpha[3]; 
997
0
        theta[2] = alpha[1] - alpha[2]; 
998
999
0
        gamma[0] = theta[0] + theta[1];
1000
0
        gamma[1] = theta[3] + theta[2];
1001
0
        gamma[2] = theta[3] - theta[2];
1002
0
        gamma[3] = theta[0] - theta[1];
1003
1004
0
        data[     column] = gamma[0] + beta[0];
1005
0
        data[ 8 + column] = gamma[1] + beta[1];
1006
0
        data[16 + column] = gamma[2] + beta[2];
1007
0
        data[24 + column] = gamma[3] + beta[3];
1008
1009
0
        data[32 + column] = gamma[3] - beta[3];
1010
0
        data[40 + column] = gamma[2] - beta[2];
1011
0
        data[48 + column] = gamma[1] - beta[1];
1012
0
        data[56 + column] = gamma[0] - beta[0];
1013
0
    }
1014
0
}
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<0>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<1>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<2>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<3>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<4>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<5>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<6>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_scalar<7>(float*)
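
For reference, the constants above evaluate to roughly a = 0.353554, b = 0.490393, c = 0.461940, d = 0.415735, e = 0.277785, f = 0.191342 and g = 0.097546; these are the same values that appear as hard-coded literals in the SSE2 and AVX variants that follow.
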
1015
1016
1017
//
1018
// SSE2 Implementation
1019
//
1020
1021
template <int zeroedRows>
1022
void
1023
dctInverse8x8_sse2 (float *data)
1024
0
{
1025
0
    #ifdef IMF_HAVE_SSE2
1026
0
        __m128 a  = {3.535536e-01f,3.535536e-01f,3.535536e-01f,3.535536e-01f};
1027
0
        __m128 b  = {4.903927e-01f,4.903927e-01f,4.903927e-01f,4.903927e-01f};
1028
0
        __m128 c  = {4.619398e-01f,4.619398e-01f,4.619398e-01f,4.619398e-01f};
1029
0
        __m128 d  = {4.157349e-01f,4.157349e-01f,4.157349e-01f,4.157349e-01f};
1030
0
        __m128 e  = {2.777855e-01f,2.777855e-01f,2.777855e-01f,2.777855e-01f};
1031
0
        __m128 f  = {1.913422e-01f,1.913422e-01f,1.913422e-01f,1.913422e-01f};
1032
0
        __m128 g  = {9.754573e-02f,9.754573e-02f,9.754573e-02f,9.754573e-02f};
1033
1034
0
        __m128 c0 = {3.535536e-01f, 3.535536e-01f, 3.535536e-01f, 3.535536e-01f};
1035
0
        __m128 c1 = {4.619398e-01f, 1.913422e-01f,-1.913422e-01f,-4.619398e-01f};
1036
0
        __m128 c2 = {3.535536e-01f,-3.535536e-01f,-3.535536e-01f, 3.535536e-01f};
1037
0
        __m128 c3 = {1.913422e-01f,-4.619398e-01f, 4.619398e-01f,-1.913422e-01f};
1038
1039
0
        __m128 c4 = {4.903927e-01f, 4.157349e-01f, 2.777855e-01f, 9.754573e-02f};
1040
0
        __m128 c5 = {4.157349e-01f,-9.754573e-02f,-4.903927e-01f,-2.777855e-01f};
1041
0
        __m128 c6 = {2.777855e-01f,-4.903927e-01f, 9.754573e-02f, 4.157349e-01f};
1042
0
        __m128 c7 = {9.754573e-02f,-2.777855e-01f, 4.157349e-01f,-4.903927e-01f};
1043
1044
0
        __m128 *srcVec = (__m128 *)data;
1045
0
        __m128 x[8], evenSum, oddSum;
1046
0
        __m128 in[8], alpha[4], beta[4], theta[4], gamma[4];
1047
        
1048
        //
1049
        // Rows -   
1050
        //
1051
        //  Treat this just like matrix-vector multiplication. The
1052
        //  trick is to note that:
1053
        //
1054
        //    [M00 M01 M02 M03][v0]   [(v0 M00) + (v1 M01) + (v2 M02) + (v3 M03)]
1055
        //    [M10 M11 M12 M13][v1] = [(v0 M10) + (v1 M11) + (v2 M12) + (v3 M13)]
1056
        //    [M20 M21 M22 M23][v2]   [(v0 M20) + (v1 M21) + (v2 M22) + (v3 M23)]
1057
        //    [M30 M31 M32 M33][v3]   [(v0 M30) + (v1 M31) + (v2 M32) + (v3 M33)]
1058
        //
1059
        // Then, we can fill a register with v_i and multiply by the i-th column
1060
        // of M, accumulating across all i-s. 
1061
        //
1062
        // The kids refer to populating a register with a single value as
1063
        // "broadcasting", and it can be done with a shuffle instruction. It
1064
        // seems to be the slowest part of the whole ordeal.
1065
        //
1066
        // Our matrix columns are stored above in c0-c7. c0-3 make up M1, and
1067
        // c4-7 are from M2.
1068
        //
1069
1070
0
        #define DCT_INVERSE_8x8_SS2_ROW_LOOP(i)                             \
1071
            /*                                                              \
1072
             * Broadcast the components of the row                          \
1073
             */                                                             \
1074
0
                                                                            \
1075
0
            x[0] = _mm_shuffle_ps (srcVec[2 * i],                           \
1076
0
                                   srcVec[2 * i],                           \
1077
0
                                   _MM_SHUFFLE (0, 0, 0, 0));               \
1078
0
                                                                            \
1079
0
            x[1] = _mm_shuffle_ps (srcVec[2 * i],                           \
1080
0
                                   srcVec[2 * i],                           \
1081
0
                                   _MM_SHUFFLE (1, 1, 1, 1));               \
1082
0
                                                                            \
1083
0
            x[2] = _mm_shuffle_ps (srcVec[2 * i],                           \
1084
0
                                   srcVec[2 * i],                           \
1085
0
                                   _MM_SHUFFLE (2, 2, 2, 2));               \
1086
0
                                                                            \
1087
0
            x[3] = _mm_shuffle_ps (srcVec[2 * i],                           \
1088
0
                                   srcVec[2 * i],                           \
1089
0
                                   _MM_SHUFFLE (3, 3, 3, 3));               \
1090
0
                                                                            \
1091
0
            x[4] = _mm_shuffle_ps (srcVec[2 * i + 1],                       \
1092
0
                                   srcVec[2 * i + 1],                       \
1093
0
                                   _MM_SHUFFLE (0, 0, 0, 0));               \
1094
0
                                                                            \
1095
0
            x[5] = _mm_shuffle_ps (srcVec[2 * i + 1],                       \
1096
0
                                   srcVec[2 * i + 1],                       \
1097
0
                                   _MM_SHUFFLE (1, 1, 1, 1));               \
1098
0
                                                                            \
1099
0
            x[6] = _mm_shuffle_ps (srcVec[2 * i + 1],                       \
1100
0
                                   srcVec[2 * i + 1],                       \
1101
0
                                   _MM_SHUFFLE (2, 2, 2, 2));               \
1102
0
                                                                            \
1103
0
            x[7] = _mm_shuffle_ps (srcVec[2 * i + 1],                       \
1104
0
                                   srcVec[2 * i + 1],                       \
1105
0
                                   _MM_SHUFFLE (3, 3, 3, 3));               \
1106
            /*                                                              \
1107
             * Multiply the components by each column of the matrix         \
1108
             */                                                             \
1109
0
                                                                            \
1110
0
            x[0] = _mm_mul_ps (x[0], c0);                                   \
1111
0
            x[2] = _mm_mul_ps (x[2], c1);                                   \
1112
0
            x[4] = _mm_mul_ps (x[4], c2);                                   \
1113
0
            x[6] = _mm_mul_ps (x[6], c3);                                   \
1114
0
                                                                            \
1115
0
            x[1] = _mm_mul_ps (x[1], c4);                                   \
1116
0
            x[3] = _mm_mul_ps (x[3], c5);                                   \
1117
0
            x[5] = _mm_mul_ps (x[5], c6);                                   \
1118
0
            x[7] = _mm_mul_ps (x[7], c7);                                   \
1119
0
                                                                            \
1120
            /*                                                              \
1121
             * Add across                                                   \
1122
             */                                                             \
1123
0
                                                                            \
1124
0
            evenSum = _mm_setzero_ps();                                     \
1125
0
            evenSum = _mm_add_ps (evenSum, x[0]);                           \
1126
0
            evenSum = _mm_add_ps (evenSum, x[2]);                           \
1127
0
            evenSum = _mm_add_ps (evenSum, x[4]);                           \
1128
0
            evenSum = _mm_add_ps (evenSum, x[6]);                           \
1129
0
                                                                            \
1130
0
            oddSum = _mm_setzero_ps();                                      \
1131
0
            oddSum = _mm_add_ps (oddSum, x[1]);                             \
1132
0
            oddSum = _mm_add_ps (oddSum, x[3]);                             \
1133
0
            oddSum = _mm_add_ps (oddSum, x[5]);                             \
1134
0
            oddSum = _mm_add_ps (oddSum, x[7]);                             \
1135
0
                                                                            \
1136
            /*                                                              \
1137
             * Final Sum:                                                   \
1138
             *    out [0, 1, 2, 3] = evenSum + oddSum                       \
1139
             *    out [7, 6, 5, 4] = evenSum - oddSum                       \
1140
             */                                                             \
1141
0
                                                                            \
1142
0
            srcVec[2 * i]     = _mm_add_ps (evenSum, oddSum);               \
1143
0
            srcVec[2 * i + 1] = _mm_sub_ps (evenSum, oddSum);               \
1144
0
            srcVec[2 * i + 1] = _mm_shuffle_ps (srcVec[2 * i + 1],          \
1145
0
                                                srcVec[2 * i + 1],          \
1146
0
                                                _MM_SHUFFLE (0, 1, 2, 3));
1147
1148
0
        switch (zeroedRows)
1149
0
        {
1150
0
          case 0:
1151
0
          default:
1152
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1153
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1154
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1155
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1156
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
1157
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
1158
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (6)
1159
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (7)
1160
0
            break;
1161
1162
0
          case 1:
1163
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1164
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1165
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1166
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1167
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
1168
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
1169
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (6)
1170
0
            break;
1171
1172
0
          case 2:
1173
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1174
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1175
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1176
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1177
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
1178
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
1179
0
            break;
1180
1181
0
          case 3:
1182
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1183
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1184
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1185
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1186
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
1187
0
            break;
1188
1189
0
          case 4:
1190
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1191
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1192
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1193
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1194
0
            break;
1195
1196
0
          case 5:
1197
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1198
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1199
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1200
0
            break;
1201
1202
0
          case 6:
1203
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1204
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1205
0
            break;
1206
1207
0
          case 7:
1208
0
            DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1209
0
            break;
1210
0
        }
1211
1212
        //
1213
        // Columns -
1214
        //
1215
        // This is slightly more straightforward, if less readable. Here
1216
        // we just operate on 4 columns at a time, in two batches.
1217
        //
1218
        // The slight mess is to try and cache sub-expressions, which
1219
        // we ignore in the row-wise pass.
1220
        //
1221
1222
0
        for (int col = 0; col < 2; ++col)
1223
0
        {
1224
1225
0
            for (int i = 0; i < 8; ++i)
1226
0
                in[i] = srcVec[2 * i + col];
1227
1228
0
            alpha[0] = _mm_mul_ps (c, in[2]);
1229
0
            alpha[1] = _mm_mul_ps (f, in[2]);
1230
0
            alpha[2] = _mm_mul_ps (c, in[6]);
1231
0
            alpha[3] = _mm_mul_ps (f, in[6]);
1232
1233
0
            beta[0] = _mm_add_ps (_mm_add_ps (_mm_mul_ps (in[1], b),
1234
0
                                                          _mm_mul_ps (in[3], d)),
1235
0
                                              _mm_add_ps (_mm_mul_ps (in[5], e),
1236
0
                                                          _mm_mul_ps (in[7], g)));
1237
1238
0
            beta[1] = _mm_sub_ps (_mm_sub_ps (_mm_mul_ps (in[1], d),
1239
0
                                                          _mm_mul_ps (in[3], g)),
1240
0
                                              _mm_add_ps (_mm_mul_ps (in[5], b),
1241
0
                                                          _mm_mul_ps (in[7], e)));
1242
1243
0
            beta[2] = _mm_add_ps (_mm_sub_ps (_mm_mul_ps (in[1], e),
1244
0
                                                          _mm_mul_ps (in[3], b)),
1245
0
                                              _mm_add_ps (_mm_mul_ps (in[5], g),
1246
0
                                                          _mm_mul_ps (in[7], d)));
1247
1248
0
            beta[3] = _mm_add_ps (_mm_sub_ps (_mm_mul_ps (in[1], g),
1249
0
                                                          _mm_mul_ps (in[3], e)),
1250
0
                                              _mm_sub_ps (_mm_mul_ps (in[5], d),
1251
0
                                                          _mm_mul_ps (in[7], b)));
1252
1253
0
            theta[0] = _mm_mul_ps (a, _mm_add_ps (in[0], in[4]));
1254
0
            theta[3] = _mm_mul_ps (a, _mm_sub_ps (in[0], in[4]));
1255
1256
0
            theta[1] = _mm_add_ps (alpha[0], alpha[3]);
1257
0
            theta[2] = _mm_sub_ps (alpha[1], alpha[2]);
1258
1259
0
            gamma[0] = _mm_add_ps (theta[0], theta[1]);
1260
0
            gamma[1] = _mm_add_ps (theta[3], theta[2]);
1261
0
            gamma[2] = _mm_sub_ps (theta[3], theta[2]);
1262
0
            gamma[3] = _mm_sub_ps (theta[0], theta[1]);
1263
1264
0
            srcVec[  col] = _mm_add_ps (gamma[0], beta[0]);
1265
0
            srcVec[2+col] = _mm_add_ps (gamma[1], beta[1]);
1266
0
            srcVec[4+col] = _mm_add_ps (gamma[2], beta[2]);
1267
0
            srcVec[6+col] = _mm_add_ps (gamma[3], beta[3]);
1268
1269
0
            srcVec[ 8+col] = _mm_sub_ps (gamma[3], beta[3]);
1270
0
            srcVec[10+col] = _mm_sub_ps (gamma[2], beta[2]);
1271
0
            srcVec[12+col] = _mm_sub_ps (gamma[1], beta[1]);
1272
0
            srcVec[14+col] = _mm_sub_ps (gamma[0], beta[0]);
1273
0
        }
1274
1275
    #else /* IMF_HAVE_SSE2 */
1276
1277
        dctInverse8x8_scalar<zeroedRows> (data);
1278
1279
    #endif /* IMF_HAVE_SSE2 */
1280
0
}
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<0>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<1>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<2>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<3>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<4>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<5>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<6>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_sse2<7>(float*)
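
The row pass above is the usual broadcast-and-accumulate formulation of a matrix-vector product. In scalar form the same idea reads as follows (matVec4 is a hypothetical illustration, with the matrix stored column-major like c0-c7 above):

    // out = M * v, accumulated one column at a time: out += v[i] * column_i(M).
    void
    matVec4 (const float col[4][4], const float v[4], float out[4])
    {
        for (int j = 0; j < 4; ++j)
            out[j] = 0.0f;

        for (int i = 0; i < 4; ++i)              // "broadcast" v[i] ...
            for (int j = 0; j < 4; ++j)
                out[j] += v[i] * col[i][j];      // ... and multiply-accumulate with column i
    }
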
1281
1282
1283
//
1284
// AVX Implementation
1285
//
1286
1287
#define STR(A) #A
1288
1289
#define IDCT_AVX_SETUP_2_ROWS(_DST0,  _DST1,  _TMP0,  _TMP1, \
1290
                              _OFF00, _OFF01, _OFF10, _OFF11) \
1291
    "vmovaps                 " STR(_OFF00) "(%0),  %%xmm" STR(_TMP0) "  \n" \
1292
    "vmovaps                 " STR(_OFF01) "(%0),  %%xmm" STR(_TMP1) "  \n" \
1293
    "                                                                                \n" \
1294
    "vinsertf128  $1, " STR(_OFF10) "(%0), %%ymm" STR(_TMP0) ", %%ymm" STR(_TMP0) "  \n" \
1295
    "vinsertf128  $1, " STR(_OFF11) "(%0), %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP1) "  \n" \
1296
    "                                                                                \n" \
1297
    "vunpcklpd      %%ymm" STR(_TMP1) ",  %%ymm" STR(_TMP0) ",  %%ymm" STR(_DST0) "  \n" \
1298
    "vunpckhpd      %%ymm" STR(_TMP1) ",  %%ymm" STR(_TMP0) ",  %%ymm" STR(_DST1) "  \n" \
1299
    "                                                                                \n" \
1300
    "vunpcklps      %%ymm" STR(_DST1) ",  %%ymm" STR(_DST0) ",  %%ymm" STR(_TMP0) "  \n" \
1301
    "vunpckhps      %%ymm" STR(_DST1) ",  %%ymm" STR(_DST0) ",  %%ymm" STR(_TMP1) "  \n" \
1302
    "                                                                                \n" \
1303
    "vunpcklpd      %%ymm" STR(_TMP1) ",  %%ymm" STR(_TMP0) ",  %%ymm" STR(_DST0) "  \n" \
1304
    "vunpckhpd      %%ymm" STR(_TMP1) ",  %%ymm" STR(_TMP0) ",  %%ymm" STR(_DST1) "  \n" 
1305
1306
#define IDCT_AVX_MMULT_ROWS(_SRC)                       \
1307
    /* Broadcast the source values into y12-y15 */      \
1308
    "vpermilps $0x00, " STR(_SRC) ", %%ymm12       \n"  \
1309
    "vpermilps $0x55, " STR(_SRC) ", %%ymm13       \n"  \
1310
    "vpermilps $0xaa, " STR(_SRC) ", %%ymm14       \n"  \
1311
    "vpermilps $0xff, " STR(_SRC) ", %%ymm15       \n"  \
1312
                                                        \
1313
    /* Multiply the coefs by the broadcast values */     \
1314
    "vmulps    %%ymm12,  %%ymm8, %%ymm12     \n"        \
1315
    "vmulps    %%ymm13,  %%ymm9, %%ymm13     \n"        \
1316
    "vmulps    %%ymm14, %%ymm10, %%ymm14     \n"        \
1317
    "vmulps    %%ymm15, %%ymm11, %%ymm15     \n"        \
1318
                                                        \
1319
    /* Accumulate the result back into the source */    \
1320
    "vaddps    %%ymm13, %%ymm12, %%ymm12      \n"       \
1321
    "vaddps    %%ymm15, %%ymm14, %%ymm14      \n"       \
1322
    "vaddps    %%ymm14, %%ymm12, " STR(_SRC) "\n"     
1323
1324
#define IDCT_AVX_EO_TO_ROW_HALVES(_EVEN, _ODD, _FRONT, _BACK)      \
1325
    "vsubps   " STR(_ODD) "," STR(_EVEN) "," STR(_BACK)  "\n"  \
1326
    "vaddps   " STR(_ODD) "," STR(_EVEN) "," STR(_FRONT) "\n"  \
1327
    /* Reverse the back half                                */ \
1328
    "vpermilps $0x1b," STR(_BACK) "," STR(_BACK) "\n"  
1329
1330
/* In order to allow for fast paths when we know certain rows
1331
 * of the 8x8 block are zero, most of the body of the inverse DCT is
1332
 * in the following macro. Statements are wrapped in a ROWn()
1333
 * macro, where n is the lowest row in the 8x8 block on which
1334
 * they depend.
1335
 *
1336
 * This should work for the cases where we have 2-8 full rows.
1337
 * The 1-row case is special, and we'll handle it separately.
1338
 */
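/* Illustration (no additional functionality): when the template below is
 * instantiated with zeroedRows == 1, ROW7 is redefined to expand to nothing,
 *
 *     #undef  ROW7
 *     #define ROW7(_X)
 *
 * so every statement tagged ROW7(...) drops out of the generated asm string,
 * skipping the work that only row 7 would have needed.
 */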
1339
#define IDCT_AVX_BODY \
1340
    /* ==============================================               
1341
     *               Row 1D DCT                                     
1342
     * ----------------------------------------------
1343
     */                                                           \
1344
                                                                  \
1345
    /* Setup for the row-oriented 1D DCT. Assuming that (%0) holds 
1346
     * the row-major 8x8 block, load ymm0-3 with the even columns
1347
     * and ymm4-7 with the odd columns. The lower half of the ymm
1348
     * holds one row, while the upper half holds the next row.
1349
     *
1350
     * If our source is:
1351
     *    a0 a1 a2 a3   a4 a5 a6 a7
1352
     *    b0 b1 b2 b3   b4 b5 b6 b7
1353
     *
1354
     * We'll be forming:
1355
     *    a0 a2 a4 a6   b0 b2 b4 b6
1356
     *    a1 a3 a5 a7   b1 b3 b5 b7
1357
     */                                                              \
1358
    ROW0( IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15,    0,  16,  32,  48) ) \
1359
    ROW2( IDCT_AVX_SETUP_2_ROWS(1, 5, 12, 13,   64,  80,  96, 112) ) \
1360
    ROW4( IDCT_AVX_SETUP_2_ROWS(2, 6, 10, 11,  128, 144, 160, 176) ) \
1361
    ROW6( IDCT_AVX_SETUP_2_ROWS(3, 7,  8,  9,  192, 208, 224, 240) ) \
1362
                                                                     \
1363
    /* Multiply the even columns (ymm0-3) by the matrix M1
1364
     * storing the results back in ymm0-3
1365
     *
1366
     * Assume that (%1) holds the matrix in column major order
1367
     */                                                              \
1368
    "vbroadcastf128   (%1),  %%ymm8         \n"                      \
1369
    "vbroadcastf128 16(%1),  %%ymm9         \n"                      \
1370
    "vbroadcastf128 32(%1), %%ymm10         \n"                      \
1371
    "vbroadcastf128 48(%1), %%ymm11         \n"                      \
1372
                                                                     \
1373
    ROW0( IDCT_AVX_MMULT_ROWS(%%ymm0) )                              \
1374
    ROW2( IDCT_AVX_MMULT_ROWS(%%ymm1) )                              \
1375
    ROW4( IDCT_AVX_MMULT_ROWS(%%ymm2) )                              \
1376
    ROW6( IDCT_AVX_MMULT_ROWS(%%ymm3) )                              \
1377
                                                                     \
1378
    /* Repeat, but with the odd columns (ymm4-7) and the 
1379
     * matrix M2
1380
     */                                                              \
1381
    "vbroadcastf128  64(%1),  %%ymm8         \n"                     \
1382
    "vbroadcastf128  80(%1),  %%ymm9         \n"                     \
1383
    "vbroadcastf128  96(%1), %%ymm10         \n"                     \
1384
    "vbroadcastf128 112(%1), %%ymm11         \n"                     \
1385
                                                                     \
1386
    ROW0( IDCT_AVX_MMULT_ROWS(%%ymm4) )                              \
1387
    ROW2( IDCT_AVX_MMULT_ROWS(%%ymm5) )                              \
1388
    ROW4( IDCT_AVX_MMULT_ROWS(%%ymm6) )                              \
1389
    ROW6( IDCT_AVX_MMULT_ROWS(%%ymm7) )                              \
1390
                                                                     \
1391
    /* Sum the M1 (ymm0-3) and M2 (ymm4-7) results to get the 
1392
     * front halves of the results, and take the difference to get the
1393
     * back halves. The front halves end up in ymm0-3, the back
1394
     * halves end up in ymm12-15. 
1395
     */                                                                \
1396
    ROW0( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12) ) \
1397
    ROW2( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm1, %%ymm5, %%ymm1, %%ymm13) ) \
1398
    ROW4( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm2, %%ymm6, %%ymm2, %%ymm14) ) \
1399
    ROW6( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm3, %%ymm7, %%ymm3, %%ymm15) ) \
1400
                                                                       \
1401
    /* Reassemble the row halves into ymm0-7  */                       \
1402
    ROW7( "vperm2f128 $0x13, %%ymm3, %%ymm15, %%ymm7   \n" )           \
1403
    ROW6( "vperm2f128 $0x02, %%ymm3, %%ymm15, %%ymm6   \n" )           \
1404
    ROW5( "vperm2f128 $0x13, %%ymm2, %%ymm14, %%ymm5   \n" )           \
1405
    ROW4( "vperm2f128 $0x02, %%ymm2, %%ymm14, %%ymm4   \n" )           \
1406
    ROW3( "vperm2f128 $0x13, %%ymm1, %%ymm13, %%ymm3   \n" )           \
1407
    ROW2( "vperm2f128 $0x02, %%ymm1, %%ymm13, %%ymm2   \n" )           \
1408
    ROW1( "vperm2f128 $0x13, %%ymm0, %%ymm12, %%ymm1   \n" )           \
1409
    ROW0( "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n" )           \
1410
                                                                       \
1411
                                                                       \
1412
    /* ==============================================
1413
     *                Column 1D DCT 
1414
     * ----------------------------------------------
1415
     */                                                                \
1416
                                                                       \
1417
    /* Rows should be in ymm0-7, and M2 columns should still be 
1418
     * preserved in ymm8-11.  M2 has 4 unique values (and +- 
1419
     * versions of each), and all (positive) values appear in 
1420
     * the first column (and row), which is in ymm8.
1421
     *
1422
     * For the column-wise DCT, we need to:
1423
     *   1) Broadcast each element of a row of M2 into 4 vectors
1424
     *   2) Multiply the odd rows (ymm1,3,5,7) by the broadcasts.
1425
     *   3) Accumulate into ymm12-15 for the odd outputs.
1426
     *
1427
     * Instead of doing 16 broadcasts for each element in M2, 
1428
     * do 4, filling y8-11 with:
1429
     *
1430
     *     ymm8:  [ b  b  b  b  | b  b  b  b ]
1431
     *     ymm9:  [ d  d  d  d  | d  d  d  d ]
1432
     *     ymm10: [ e  e  e  e  | e  e  e  e ]
1433
     *     ymm11: [ g  g  g  g  | g  g  g  g ]
1434
     * 
1435
     * And deal with the negative values by subtracting during accum.
1436
     */                                                                \
1437
    "vpermilps        $0xff,  %%ymm8, %%ymm11  \n"                     \
1438
    "vpermilps        $0xaa,  %%ymm8, %%ymm10  \n"                     \
1439
    "vpermilps        $0x55,  %%ymm8, %%ymm9   \n"                     \
1440
    "vpermilps        $0x00,  %%ymm8, %%ymm8   \n"                     \
1441
                                                                       \
1442
    /* This one is easy, since we have ymm12-15 open for scratch   
1443
     *    ymm12 = b ymm1 + d ymm3 + e ymm5 + g ymm7 
1444
     */                                                                \
1445
    ROW1( "vmulps    %%ymm1,  %%ymm8, %%ymm12    \n" )                 \
1446
    ROW3( "vmulps    %%ymm3,  %%ymm9, %%ymm13    \n" )                 \
1447
    ROW5( "vmulps    %%ymm5, %%ymm10, %%ymm14    \n" )                 \
1448
    ROW7( "vmulps    %%ymm7, %%ymm11, %%ymm15    \n" )                 \
1449
                                                                       \
1450
    ROW3( "vaddps   %%ymm12, %%ymm13, %%ymm12    \n" )                 \
1451
    ROW7( "vaddps   %%ymm14, %%ymm15, %%ymm14    \n" )                 \
1452
    ROW5( "vaddps   %%ymm12, %%ymm14, %%ymm12    \n" )                 \
1453
                                                                       \
1454
    /* Trickier, since only y13-15 are open for scratch
1455
     *    ymm13 = d ymm1 - g ymm3 - b ymm5 - e ymm7 
1456
     */                                                                \
1457
    ROW1( "vmulps    %%ymm1,   %%ymm9, %%ymm13   \n" )                 \
1458
    ROW3( "vmulps    %%ymm3,  %%ymm11, %%ymm14   \n" )                 \
1459
    ROW5( "vmulps    %%ymm5,   %%ymm8, %%ymm15   \n" )                 \
1460
                                                                       \
1461
    ROW5( "vaddps    %%ymm14, %%ymm15, %%ymm14   \n" )                 \
1462
    ROW3( "vsubps    %%ymm14, %%ymm13, %%ymm13   \n" )                 \
1463
                                                                       \
1464
    ROW7( "vmulps    %%ymm7,  %%ymm10, %%ymm15   \n" )                 \
1465
    ROW7( "vsubps    %%ymm15, %%ymm13, %%ymm13   \n" )                 \
1466
                                                                       \
1467
    /* Trickier still, as only y14-15 are open for scratch
1468
     *    ymm14 = e ymm1 - b ymm3 + g ymm5 + d ymm7 
1469
     */                                                                \
1470
    ROW1( "vmulps     %%ymm1, %%ymm10,  %%ymm14  \n" )                 \
1471
    ROW3( "vmulps     %%ymm3,  %%ymm8,  %%ymm15  \n" )                 \
1472
                                                                       \
1473
    ROW3( "vsubps    %%ymm15, %%ymm14, %%ymm14   \n" )                 \
1474
                                                                       \
1475
    ROW5( "vmulps     %%ymm5, %%ymm11, %%ymm15   \n" )                 \
1476
    ROW5( "vaddps    %%ymm15, %%ymm14, %%ymm14   \n" )                 \
1477
                                                                       \
1478
    ROW7( "vmulps    %%ymm7,   %%ymm9, %%ymm15   \n" )                 \
1479
    ROW7( "vaddps    %%ymm15, %%ymm14, %%ymm14   \n" )                 \
1480
                                                                       \
1481
                                                                       \
1482
    /* Easy, as we can blow away ymm1,3,5,7 for scratch
1483
     *    ymm15 = g ymm1 - e ymm3 + d ymm5 - b ymm7 
1484
     */                                                                \
1485
    ROW1( "vmulps    %%ymm1, %%ymm11, %%ymm15    \n" )                 \
1486
    ROW3( "vmulps    %%ymm3, %%ymm10,  %%ymm3    \n" )                 \
1487
    ROW5( "vmulps    %%ymm5,  %%ymm9,  %%ymm5    \n" )                 \
1488
    ROW7( "vmulps    %%ymm7,  %%ymm8,  %%ymm7    \n" )                 \
1489
                                                                       \
1490
    ROW5( "vaddps   %%ymm15,  %%ymm5, %%ymm15    \n" )                 \
1491
    ROW7( "vaddps    %%ymm3,  %%ymm7,  %%ymm3    \n" )                 \
1492
    ROW3( "vsubps    %%ymm3, %%ymm15, %%ymm15    \n" )                 \
1493
                                                                       \
1494
                                                                       \
1495
    /* Load coefs for M1. Because we're going to broadcast
1496
     * coefs, we don't need to load the actual structure from
1497
     * M1. Instead, just load enough that we can broadcast.
1498
     * There are only 6 unique values in M1, but they're in +-
1499
     * pairs, leaving only 3 unique coefs if we add and subtract 
1500
     * properly.
1501
     *
1502
     * Fill      ymm1 with coef[2] = [ a  a  c  f | a  a  c  f ]
1503
     * Broadcast ymm5 with           [ f  f  f  f | f  f  f  f ]
1504
     * Broadcast ymm3 with           [ c  c  c  c | c  c  c  c ]
1505
     * Broadcast ymm1 with           [ a  a  a  a | a  a  a  a ]
1506
     */                                                                \
1507
    "vbroadcastf128   8(%1),  %%ymm1          \n"                      \
1508
    "vpermilps        $0xff,  %%ymm1, %%ymm5  \n"                      \
1509
    "vpermilps        $0xaa,  %%ymm1, %%ymm3  \n"                      \
1510
    "vpermilps        $0x00,  %%ymm1, %%ymm1  \n"                      \
1511
                                                                       \
1512
    /* If we expand E = [M1] [x0 x2 x4 x6]^t, we get the following 
1513
     * common expressions:
1514
     *
1515
     *   E_0 = ymm8  = (a ymm0 + a ymm4) + (c ymm2 + f ymm6) 
1516
     *   E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6)
1517
     * 
1518
     *   E_1 = ymm9  = (a ymm0 - a ymm4) + (f ymm2 - c ymm6)
1519
     *   E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6)
1520
     *
1521
     * Afterwards, ymm8-11 will hold the even outputs.
1522
     */                                                                \
1523
                                                                       \
1524
    /*  ymm11 = (a ymm0 + a ymm4),   ymm1 = (a ymm0 - a ymm4) */       \
1525
    ROW0( "vmulps    %%ymm1,  %%ymm0, %%ymm11   \n" )                  \
1526
    ROW4( "vmulps    %%ymm1,  %%ymm4,  %%ymm4   \n" )                  \
1527
    ROW0( "vmovaps   %%ymm11, %%ymm1            \n" )                  \
1528
    ROW4( "vaddps    %%ymm4, %%ymm11, %%ymm11   \n" )                  \
1529
    ROW4( "vsubps    %%ymm4,  %%ymm1,  %%ymm1   \n" )                  \
1530
                                                                       \
1531
    /* ymm7 = (c ymm2 + f ymm6) */                                     \
1532
    ROW2( "vmulps    %%ymm3, %%ymm2,  %%ymm7    \n" )                  \
1533
    ROW6( "vmulps    %%ymm5, %%ymm6,  %%ymm9    \n" )                  \
1534
    ROW6( "vaddps    %%ymm9, %%ymm7,  %%ymm7    \n" )                  \
1535
                                                                       \
1536
    /* E_0 = ymm8  = (a ymm0 + a ymm4) + (c ymm2 + f ymm6) 
1537
     * E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6) 
1538
     */                                                                \
1539
    ROW0( "vmovaps   %%ymm11, %%ymm8            \n" )                  \
1540
    ROW2( "vaddps     %%ymm7, %%ymm8,  %%ymm8   \n" )                  \
1541
    ROW2( "vsubps     %%ymm7, %%ymm11, %%ymm11  \n" )                  \
1542
                                                                       \
1543
    /* ymm7 = (f ymm2 - c ymm6) */                                     \
1544
    ROW2( "vmulps     %%ymm5,  %%ymm2, %%ymm7   \n" )                  \
1545
    ROW6( "vmulps     %%ymm3,  %%ymm6, %%ymm9   \n" )                  \
1546
    ROW6( "vsubps     %%ymm9,  %%ymm7, %%ymm7   \n" )                  \
1547
                                                                       \
1548
    /* E_1 = ymm9  = (a ymm0 - a ymm4) + (f ymm2 - c ymm6) 
1549
     * E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6)
1550
     */                                                                \
1551
    ROW0( "vmovaps   %%ymm1,  %%ymm9            \n" )                  \
1552
    ROW0( "vmovaps   %%ymm1, %%ymm10            \n" )                  \
1553
    ROW2( "vaddps    %%ymm7,  %%ymm1,  %%ymm9   \n" )                  \
1554
    ROW2( "vsubps    %%ymm7,  %%ymm1,  %%ymm10  \n" )                  \
1555
                                                                       \
1556
    /* Add the even (ymm8-11) and the odds (ymm12-15), 
1557
     * placing the results into ymm0-7 
1558
     */                                                                \
1559
    "vaddps   %%ymm12,  %%ymm8, %%ymm0       \n"                       \
1560
    "vaddps   %%ymm13,  %%ymm9, %%ymm1       \n"                       \
1561
    "vaddps   %%ymm14, %%ymm10, %%ymm2       \n"                       \
1562
    "vaddps   %%ymm15, %%ymm11, %%ymm3       \n"                       \
1563
                                                                       \
1564
    "vsubps   %%ymm12,  %%ymm8, %%ymm7       \n"                       \
1565
    "vsubps   %%ymm13,  %%ymm9, %%ymm6       \n"                       \
1566
    "vsubps   %%ymm14, %%ymm10, %%ymm5       \n"                       \
1567
    "vsubps   %%ymm15, %%ymm11, %%ymm4       \n"                       \
1568
                                                                       \
1569
    /* Copy out the results from ymm0-7  */                            \
1570
    "vmovaps   %%ymm0,    (%0)                   \n"                   \
1571
    "vmovaps   %%ymm1,  32(%0)                   \n"                   \
1572
    "vmovaps   %%ymm2,  64(%0)                   \n"                   \
1573
    "vmovaps   %%ymm3,  96(%0)                   \n"                   \
1574
    "vmovaps   %%ymm4, 128(%0)                   \n"                   \
1575
    "vmovaps   %%ymm5, 160(%0)                   \n"                   \
1576
    "vmovaps   %%ymm6, 192(%0)                   \n"                   \
1577
    "vmovaps   %%ymm7, 224(%0)                   \n"            
1578
1579
/* Output, input, and clobber (OIC) sections of the inline asm */
1580
#define IDCT_AVX_OIC(_IN0)                          \
1581
        : /* Output  */                            \
1582
        : /* Input   */ "r"(_IN0), "r"(sAvxCoef)      \
1583
        : /* Clobber */ "memory",                  \
1584
                        "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", \
1585
                        "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7", \
1586
                        "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",\
1587
                        "%xmm12", "%xmm13", "%xmm14", "%xmm15" 
1588
1589
/* Include vzeroupper for non-AVX builds, to avoid AVX/SSE transition penalties */
1590
#ifndef __AVX__ 
1591
    #define IDCT_AVX_ASM(_IN0)   \
1592
        __asm__(                 \
1593
            IDCT_AVX_BODY        \
1594
            "vzeroupper      \n" \
1595
            IDCT_AVX_OIC(_IN0)   \
1596
        );                       
1597
#else /* __AVX__ */
1598
    #define IDCT_AVX_ASM(_IN0)   \
1599
        __asm__(                 \
1600
            IDCT_AVX_BODY        \
1601
            IDCT_AVX_OIC(_IN0)   \
1602
        );                       
1603
#endif /* __AVX__ */
1604
1605
template <int zeroedRows>
1606
void
1607
dctInverse8x8_avx (float *data)
1608
0
{
1609
    #if defined IMF_HAVE_GCC_INLINEASM_64
1610
1611
    /* The column-major version of M1, followed by the 
1612
     * column-major version of M2:
1613
     *   
1614
     *          [ a  c  a  f ]          [ b  d  e  g ]
1615
     *   M1  =  [ a  f -a -c ]    M2 =  [ d -g -b -e ]
1616
     *          [ a -f -a  c ]          [ e -b  g  d ]
1617
     *          [ a -c  a -f ]          [ g -e  d -b ]
1618
     */   
1619
    const float sAvxCoef[32]  __attribute__((aligned(32))) = {
1620
        3.535536e-01,  3.535536e-01,  3.535536e-01,  3.535536e-01, /* a  a  a  a */
1621
        4.619398e-01,  1.913422e-01, -1.913422e-01, -4.619398e-01, /* c  f -f -c */
1622
        3.535536e-01, -3.535536e-01, -3.535536e-01,  3.535536e-01, /* a -a -a  a */
1623
        1.913422e-01, -4.619398e-01,  4.619398e-01, -1.913422e-01, /* f -c  c -f */
1624
1625
        4.903927e-01,  4.157349e-01,  2.777855e-01,  9.754573e-02, /* b  d  e  g */
1626
        4.157349e-01, -9.754573e-02, -4.903927e-01, -2.777855e-01, /* d -g -b -e */
1627
        2.777855e-01, -4.903927e-01,  9.754573e-02,  4.157349e-01, /* e -b  g  d */
1628
        9.754573e-02, -2.777855e-01,  4.157349e-01, -4.903927e-01  /* g -e  d -b */
1629
    };
1630
1631
        #define ROW0(_X) _X
1632
        #define ROW1(_X) _X
1633
        #define ROW2(_X) _X
1634
        #define ROW3(_X) _X 
1635
        #define ROW4(_X) _X
1636
        #define ROW5(_X) _X 
1637
        #define ROW6(_X) _X
1638
        #define ROW7(_X) _X 
1639
1640
        if (zeroedRows == 0) {
1641
1642
            IDCT_AVX_ASM(data)
1643
1644
        } else if (zeroedRows == 1) {
1645
1646
            #undef  ROW7
1647
            #define ROW7(_X)
1648
            IDCT_AVX_ASM(data)
1649
1650
        } else if (zeroedRows == 2) {
1651
1652
            #undef  ROW6
1653
            #define ROW6(_X)
1654
            IDCT_AVX_ASM(data)
1655
1656
        } else if (zeroedRows == 3) {
1657
1658
            #undef  ROW5
1659
            #define ROW5(_X)
1660
            IDCT_AVX_ASM(data)
1661
1662
        } else if (zeroedRows == 4) {
1663
1664
            #undef  ROW4
1665
            #define ROW4(_X)
1666
            IDCT_AVX_ASM(data)
1667
1668
        } else if (zeroedRows == 5) {
1669
1670
            #undef  ROW3
1671
            #define ROW3(_X)
1672
            IDCT_AVX_ASM(data)
1673
1674
        } else if (zeroedRows == 6) {
1675
1676
            #undef  ROW2
1677
            #define ROW2(_X)
1678
            IDCT_AVX_ASM(data)
1679
1680
        } else if (zeroedRows == 7) {
1681
1682
            __asm__(  
1683
1684
                /* ==============================================
1685
                 *                Row 1D DCT 
1686
                 * ----------------------------------------------
1687
                 */ 
1688
                IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15,    0,  16,  32,  48) 
1689
1690
                "vbroadcastf128   (%1),  %%ymm8         \n"
1691
                "vbroadcastf128 16(%1),  %%ymm9         \n"
1692
                "vbroadcastf128 32(%1), %%ymm10         \n"
1693
                "vbroadcastf128 48(%1), %%ymm11         \n"
1694
1695
                /* Stash a vector of [a a a a | a a a a] away  in ymm2 */
1696
                "vinsertf128 $1,  %%xmm8,  %%ymm8,  %%ymm2 \n"
1697
1698
                IDCT_AVX_MMULT_ROWS(%%ymm0) 
1699
1700
                "vbroadcastf128  64(%1),  %%ymm8         \n"
1701
                "vbroadcastf128  80(%1),  %%ymm9         \n"
1702
                "vbroadcastf128  96(%1), %%ymm10         \n"
1703
                "vbroadcastf128 112(%1), %%ymm11         \n"
1704
1705
                IDCT_AVX_MMULT_ROWS(%%ymm4) 
1706
1707
                IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12) 
1708
1709
                "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n" 
1710
1711
                /* ==============================================
1712
                 *                Column 1D DCT 
1713
                 * ----------------------------------------------
1714
                 */ 
1715
1716
                /* DC only, so multiply by a and we're done */
1717
                "vmulps   %%ymm2, %%ymm0, %%ymm0  \n"
1718
1719
                /* Copy out results  */
1720
                "vmovaps %%ymm0,    (%0)          \n"
1721
                "vmovaps %%ymm0,  32(%0)          \n"
1722
                "vmovaps %%ymm0,  64(%0)          \n"
1723
                "vmovaps %%ymm0,  96(%0)          \n"
1724
                "vmovaps %%ymm0, 128(%0)          \n"
1725
                "vmovaps %%ymm0, 160(%0)          \n"
1726
                "vmovaps %%ymm0, 192(%0)          \n"
1727
                "vmovaps %%ymm0, 224(%0)          \n"
1728
1729
                #ifndef __AVX__
1730
                    "vzeroupper                   \n" 
1731
                #endif /* __AVX__ */
1732
                IDCT_AVX_OIC(data)
1733
            );
1734
        } else {
1735
            assert(false); // Invalid template instance parameter
1736
        }
1737
    #else  /* IMF_HAVE_GCC_INLINEASM_64 */
1738
1739
0
        dctInverse8x8_scalar<zeroedRows>(data);
1740
1741
0
    #endif /*  IMF_HAVE_GCC_INLINEASM_64 */
1742
0
}
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<0>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<1>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<2>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<3>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<4>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<5>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<6>(float*)
Unexecuted instantiation: ImfDwaCompressor.cpp:void Imf_opencv::(anonymous namespace)::dctInverse8x8_avx<7>(float*)
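//
// Usage sketch (hypothetical; not part of the original DWA code): one way a
// caller could select the right specialization, given how many trailing rows
// of the 8x8 block are known to be zero. The data pointer is assumed to be
// suitably (32-byte) aligned.
//

typedef void (*DctInverse8x8Fn) (float *);

static inline void
dctInverse8x8_avx_dispatchExample (float *data, int zeroedRows)
{
    static const DctInverse8x8Fn lut[8] = {
        dctInverse8x8_avx<0>, dctInverse8x8_avx<1>,
        dctInverse8x8_avx<2>, dctInverse8x8_avx<3>,
        dctInverse8x8_avx<4>, dctInverse8x8_avx<5>,
        dctInverse8x8_avx<6>, dctInverse8x8_avx<7>
    };

    lut[zeroedRows] (data);
}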
1743
1744
1745
//
1746
// Full 8x8 Forward DCT:
1747
//
1748
// Base forward 8x8 DCT implementation. Works on the data in-place
1749
//
1750
// The implementation is described in Pennebaker + Mitchell,
1751
//  section 4.3.2, and illustrated in figure 4-7
1752
//
1753
// The basic idea is that the 1D DCT math reduces to:
1754
//
1755
//   2*out_0            = c_4 [(s_07 + s_34) + (s_12 + s_56)]
1756
//   2*out_4            = c_4 [(s_07 + s_34) - (s_12 + s_56)]
1757
//
1758
//   {2*out_2, 2*out_6} = rot_6 ((d_12 - d_56), (s_07 - s_34))
1759
//
1760
//   {2*out_3, 2*out_5} = rot_-3 (d_07 - c_4 (s_12 - s_56),
1761
//                                d_34 - c_4 (d_12 + d_56))
1762
//
1763
//   {2*out_1, 2*out_7} = rot_-1 (d_07 + c_4 (s_12 - s_56),
1764
//                               -d_34 - c_4 (d_12 + d_56))
1765
//
1766
// where:
1767
//
1768
//    c_i  = cos(i*pi/16)
1769
//    s_i  = sin(i*pi/16)
1770
//
1771
//    s_ij = in_i + in_j
1772
//    d_ij = in_i - in_j
1773
//
1774
//    rot_i(x, y) = {c_i*x + s_i*y, -s_i*x + c_i*y} 
1775
//
1776
// We'll run the DCT in two passes. First, run the 1D DCT on 
1777
// the rows, in-place. Then, run over the columns in-place, 
1778
// and be done with it.
1779
//
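//
// Reference sketch (illustrative only; not used by the compressor): the
// direct form of the same 1D DCT that the butterflies below factor, handy
// as a cross-check against a single row of dctForward8x8. Assumes cosf()
// is available (the scalar path below already uses it).
//

static inline void
dctForward1x8_reference (const float in[8], float out[8])
{
    for (int k = 0; k < 8; ++k)
    {
        // The DC term carries an extra 1/sqrt(2), matching c_4 above.
        const float alpha = (k == 0) ? 0.70710678f : 1.0f;

        float sum = 0.0f;

        for (int n = 0; n < 8; ++n)
            sum += in[n] * cosf (3.14159f * (2 * n + 1) * k / 16.0f);

        out[k] = .5f * alpha * sum;
    }
}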
1780
1781
#ifndef IMF_HAVE_SSE2
1782
1783
//
1784
// Default implementation
1785
//
1786
1787
void 
1788
dctForward8x8 (float *data)
1789
{
1790
    float A0, A1, A2, A3, A4, A5, A6, A7;
1791
    float K0, K1, rot_x, rot_y;
1792
1793
    float *srcPtr = data;
1794
    float *dstPtr = data;
1795
1796
    const float c1 = cosf (3.14159f * 1.0f / 16.0f);
1797
    const float c2 = cosf (3.14159f * 2.0f / 16.0f);
1798
    const float c3 = cosf (3.14159f * 3.0f / 16.0f);
1799
    const float c4 = cosf (3.14159f * 4.0f / 16.0f);
1800
    const float c5 = cosf (3.14159f * 5.0f / 16.0f);
1801
    const float c6 = cosf (3.14159f * 6.0f / 16.0f);
1802
    const float c7 = cosf (3.14159f * 7.0f / 16.0f);
1803
1804
    const float c1Half = .5f * c1; 
1805
    const float c2Half = .5f * c2;
1806
    const float c3Half = .5f * c3;
1807
    const float c5Half = .5f * c5;
1808
    const float c6Half = .5f * c6;
1809
    const float c7Half = .5f * c7;
1810
1811
    //
1812
    // First pass - do a 1D DCT over the rows and write the 
1813
    //              results back in place
1814
    //
1815
1816
    for (int row=0; row<8; ++row)
1817
    {
1818
        float *srcRowPtr = srcPtr + 8 * row;
1819
        float *dstRowPtr = dstPtr + 8 * row;
1820
1821
        A0 = srcRowPtr[0] + srcRowPtr[7];
1822
        A1 = srcRowPtr[1] + srcRowPtr[2];
1823
        A2 = srcRowPtr[1] - srcRowPtr[2];
1824
        A3 = srcRowPtr[3] + srcRowPtr[4];
1825
        A4 = srcRowPtr[3] - srcRowPtr[4];
1826
        A5 = srcRowPtr[5] + srcRowPtr[6];
1827
        A6 = srcRowPtr[5] - srcRowPtr[6];
1828
        A7 = srcRowPtr[0] - srcRowPtr[7];      
1829
1830
        K0 = c4 * (A0 + A3); 
1831
        K1 = c4 * (A1 + A5); 
1832
1833
        dstRowPtr[0] = .5f * (K0 + K1);
1834
        dstRowPtr[4] = .5f * (K0 - K1);
1835
1836
        //
1837
        // (2*dst2, 2*dst6) = rot 6 (d12 - d56,  s07 - s34)
1838
        //
1839
1840
        rot_x = A2 - A6;
1841
        rot_y = A0 - A3;
1842
1843
        dstRowPtr[2] =  c6Half * rot_x + c2Half * rot_y;
1844
        dstRowPtr[6] =  c6Half * rot_y - c2Half * rot_x;
1845
1846
        //
1847
        // K0 and K1 stay live until dst[1], dst[7],
1848
        //  dst[3], and dst[5] have all been computed.
1849
        //
1850
1851
        K0 = c4 * (A1 - A5);      
1852
        K1 = -1 * c4 * (A2 + A6); 
1853
1854
        //
1855
        // Two ways to do a rotation:
1856
        //
1857
        //  rot i (x, y) = 
1858
        //           X =  c_i*x + s_i*y
1859
        //           Y = -s_i*x + c_i*y
1860
        //
1861
        //        OR
1862
        //
1863
        //           X = c_i*(x+y) + (s_i-c_i)*y
1864
        //           Y = c_i*y     - (s_i+c_i)*x
1865
        //
1866
        // The first case uses 4 multiplies but fewer constants,
1867
        // while the second uses fewer multiplies but needs more constants.
1868
1869
        //
1870
        // (2*dst3, 2*dst5) = rot -3 ( d07 - K0,  d34 + K1 )
1871
        //
1872
1873
        rot_x = A7 - K0;
1874
        rot_y = A4 + K1;
1875
1876
        dstRowPtr[3] = c3Half * rot_x - c5Half * rot_y;
1877
        dstRowPtr[5] = c5Half * rot_x + c3Half * rot_y;
1878
1879
        //
1880
        // (2*dst1, 2*dst7) = rot -1 ( d07 + K0,  K1  - d34 )
1881
        //
1882
1883
        rot_x = A7 + K0;
1884
        rot_y = K1 - A4;
1885
1886
        //
1887
        // A4 and A7 are no longer needed; all the A values are now inactive.
1888
        //
1889
1890
        dstRowPtr[1] = c1Half * rot_x - c7Half * rot_y;
1891
        dstRowPtr[7] = c7Half * rot_x + c1Half * rot_y;
1892
    }
1893
1894
    //
1895
    // Second pass - do the same, but on the columns
1896
    //
1897
1898
    for (int column = 0; column < 8; ++column)
1899
    {
1900
1901
        A0 = srcPtr[     column] + srcPtr[56 + column];
1902
        A7 = srcPtr[     column] - srcPtr[56 + column];
1903
1904
        A1 = srcPtr[ 8 + column] + srcPtr[16 + column];
1905
        A2 = srcPtr[ 8 + column] - srcPtr[16 + column];
1906
1907
        A3 = srcPtr[24 + column] + srcPtr[32 + column];
1908
        A4 = srcPtr[24 + column] - srcPtr[32 + column];
1909
1910
        A5 = srcPtr[40 + column] + srcPtr[48 + column];
1911
        A6 = srcPtr[40 + column] - srcPtr[48 + column];
1912
1913
        K0 = c4 * (A0 + A3); 
1914
        K1 = c4 * (A1 + A5); 
1915
1916
        dstPtr[   column] = .5f * (K0 + K1);
1917
        dstPtr[32+column] = .5f * (K0 - K1);
1918
1919
        //
1920
        // (2*dst2, 2*dst6) = rot 6 ( d12 - d56,  s07 - s34 )
1921
        //
1922
1923
        rot_x = A2 - A6;
1924
        rot_y = A0 - A3;
1925
1926
        dstPtr[16+column] = .5f * (c6 * rot_x + c2 * rot_y);
1927
        dstPtr[48+column] = .5f * (c6 * rot_y - c2 * rot_x);
1928
1929
        //
1930
        // K0 and K1 stay live until dst[1], dst[7],
1931
        //  dst[3], and dst[5] have all been computed.
1932
        //
1933
1934
        K0 = c4 * (A1 - A5);      
1935
        K1 = -1 * c4 * (A2 + A6); 
1936
1937
        //
1938
        // (2*dst3, 2*dst5) = rot -3 ( d07 - K0,  d34 + K1 )
1939
        //
1940
1941
        rot_x = A7 - K0;
1942
        rot_y = A4 + K1;
1943
1944
        dstPtr[24+column] = .5f * (c3 * rot_x - c5 * rot_y);
1945
        dstPtr[40+column] = .5f * (c5 * rot_x + c3 * rot_y);
1946
1947
        //
1948
        // (2*dst1, 2*dst7) = rot -1 ( d07 + K0,  K1  - d34 )
1949
        //
1950
1951
        rot_x = A7 + K0;
1952
        rot_y = K1 - A4;
1953
1954
        dstPtr[ 8+column] = .5f * (c1 * rot_x - c7 * rot_y);
1955
        dstPtr[56+column] = .5f * (c7 * rot_x + c1 * rot_y);
1956
    }
1957
}
1958
1959
#else  /* IMF_HAVE_SSE2 */
1960
1961
//
1962
// SSE2 implementation
1963
//
1964
// Here, we always operate column-wise, with transposes in between.
1965
// It might be faster to handle the row-wise and column-wise passes
1966
// differently.
1967
//
1968
1969
void 
1970
dctForward8x8 (float *data)
1971
0
{
1972
0
    __m128 *srcVec = (__m128 *)data;
1973
0
    __m128  a0Vec, a1Vec, a2Vec, a3Vec, a4Vec, a5Vec, a6Vec, a7Vec;
1974
0
    __m128  k0Vec, k1Vec, rotXVec, rotYVec;
1975
0
    __m128  transTmp[4], transTmp2[4];
1976
1977
0
    __m128  c4Vec     = { .70710678f,  .70710678f,  .70710678f,  .70710678f};
1978
0
    __m128  c4NegVec  = {-.70710678f, -.70710678f, -.70710678f, -.70710678f};
1979
1980
0
    __m128  c1HalfVec = {.490392640f, .490392640f, .490392640f, .490392640f}; 
1981
0
    __m128  c2HalfVec = {.461939770f, .461939770f, .461939770f, .461939770f};
1982
0
    __m128  c3HalfVec = {.415734810f, .415734810f, .415734810f, .415734810f}; 
1983
0
    __m128  c5HalfVec = {.277785120f, .277785120f, .277785120f, .277785120f}; 
1984
0
    __m128  c6HalfVec = {.191341720f, .191341720f, .191341720f, .191341720f};
1985
0
    __m128  c7HalfVec = {.097545161f, .097545161f, .097545161f, .097545161f}; 
1986
1987
0
    __m128  halfVec   = {.5f, .5f, .5f, .5f};
1988
1989
0
    for (int iter = 0; iter < 2; ++iter)
1990
0
    {
1991
        //
1992
        //  Operate on 4 columns at a time. The
1993
        //    offsets into our row-major array are:
1994
        //                  0:  0      1
1995
        //                  1:  2      3
1996
        //                  2:  4      5
1997
        //                  3:  6      7
1998
        //                  4:  8      9
1999
        //                  5: 10     11
2000
        //                  6: 12     13
2001
        //                  7: 14     15
2002
        //
2003
2004
0
        for (int pass=0; pass<2; ++pass)
2005
0
        {
2006
0
            a0Vec = _mm_add_ps (srcVec[ 0 + pass], srcVec[14 + pass]);
2007
0
            a1Vec = _mm_add_ps (srcVec[ 2 + pass], srcVec[ 4 + pass]);
2008
0
            a3Vec = _mm_add_ps (srcVec[ 6 + pass], srcVec[ 8 + pass]);
2009
0
            a5Vec = _mm_add_ps (srcVec[10 + pass], srcVec[12 + pass]);
2010
 
2011
0
            a7Vec = _mm_sub_ps (srcVec[ 0 + pass], srcVec[14 + pass]);
2012
0
            a2Vec = _mm_sub_ps (srcVec[ 2 + pass], srcVec[ 4 + pass]);
2013
0
            a4Vec = _mm_sub_ps (srcVec[ 6 + pass], srcVec[ 8 + pass]);
2014
0
            a6Vec = _mm_sub_ps (srcVec[10 + pass], srcVec[12 + pass]);
2015
2016
            //
2017
            // First stage; Compute out_0 and out_4
2018
            //
2019
2020
0
            k0Vec = _mm_add_ps (a0Vec, a3Vec);
2021
0
            k1Vec = _mm_add_ps (a1Vec, a5Vec);
2022
2023
0
            k0Vec = _mm_mul_ps (c4Vec, k0Vec);
2024
0
            k1Vec = _mm_mul_ps (c4Vec, k1Vec);
2025
2026
0
            srcVec[0 + pass] = _mm_add_ps (k0Vec, k1Vec);
2027
0
            srcVec[8 + pass] = _mm_sub_ps (k0Vec, k1Vec);
2028
2029
0
            srcVec[0 + pass] = _mm_mul_ps (srcVec[0 + pass], halfVec );
2030
0
            srcVec[8 + pass] = _mm_mul_ps (srcVec[8 + pass], halfVec );
2031
2032
2033
            //
2034
            // Second stage; Compute out_2 and out_6
2035
            //
2036
            
2037
0
            k0Vec = _mm_sub_ps (a2Vec, a6Vec);
2038
0
            k1Vec = _mm_sub_ps (a0Vec, a3Vec);
2039
2040
0
            srcVec[ 4 + pass] = _mm_add_ps (_mm_mul_ps (c6HalfVec, k0Vec),
2041
0
                                            _mm_mul_ps (c2HalfVec, k1Vec));
2042
2043
0
            srcVec[12 + pass] = _mm_sub_ps (_mm_mul_ps (c6HalfVec, k1Vec), 
2044
0
                                            _mm_mul_ps (c2HalfVec, k0Vec));
2045
2046
            //
2047
            // Precompute K0 and K1 for the remaining stages
2048
            //
2049
2050
0
            k0Vec = _mm_mul_ps (_mm_sub_ps (a1Vec, a5Vec), c4Vec);
2051
0
            k1Vec = _mm_mul_ps (_mm_add_ps (a2Vec, a6Vec), c4NegVec); 
2052
2053
            //
2054
            // Third Stage, compute out_3 and out_5
2055
            //
2056
2057
0
            rotXVec = _mm_sub_ps (a7Vec, k0Vec);
2058
0
            rotYVec = _mm_add_ps (a4Vec, k1Vec);
2059
2060
0
            srcVec[ 6 + pass] = _mm_sub_ps (_mm_mul_ps (c3HalfVec, rotXVec),
2061
0
                                            _mm_mul_ps (c5HalfVec, rotYVec));
2062
2063
0
            srcVec[10 + pass] = _mm_add_ps (_mm_mul_ps (c5HalfVec, rotXVec),
2064
0
                                            _mm_mul_ps (c3HalfVec, rotYVec));
2065
2066
            //
2067
            // Fourth Stage, compute out_1 and out_7
2068
            //
2069
2070
0
            rotXVec = _mm_add_ps (a7Vec, k0Vec);
2071
0
            rotYVec = _mm_sub_ps (k1Vec, a4Vec);
2072
2073
0
            srcVec[ 2 + pass] = _mm_sub_ps (_mm_mul_ps (c1HalfVec, rotXVec),
2074
0
                                            _mm_mul_ps (c7HalfVec, rotYVec));
2075
2076
0
            srcVec[14 + pass] = _mm_add_ps (_mm_mul_ps (c7HalfVec, rotXVec), 
2077
0
                                            _mm_mul_ps (c1HalfVec, rotYVec));
2078
0
        }
2079
2080
        //
2081
        // Transpose the matrix in 4x4 blocks. So, if we have our
2082
        // 8x8 matrix divided into 4x4 blocks:
2083
        //
2084
        //         M0 | M1         M0t | M2t
2085
        //        ----+---   -->  -----+------
2086
        //         M2 | M3         M1t | M3t
2087
        //
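        //
        // Each 4x4 block is transposed in two shuffle rounds: masks
        // 0x44/0xEE first gather the low/high float pairs from a pair of
        // rows, then masks 0x88/0xDD interleave the even/odd elements,
        // producing the four transposed rows.
        //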
2088
2089
        //
2090
        // M0t, done in place, the first half.
2091
        //
2092
2093
0
        transTmp[0] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0x44);
2094
0
        transTmp[1] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0x44);
2095
0
        transTmp[3] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0xEE);
2096
0
        transTmp[2] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0xEE);
2097
2098
        //
2099
        // M3t, also done in place, the first half.
2100
        //
2101
2102
0
        transTmp2[0] = _mm_shuffle_ps (srcVec[ 9], srcVec[11], 0x44);
2103
0
        transTmp2[1] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0x44);
2104
0
        transTmp2[2] = _mm_shuffle_ps (srcVec[ 9], srcVec[11], 0xEE);
2105
0
        transTmp2[3] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0xEE);
2106
2107
        //
2108
        // M0t, the second half.
2109
        //
2110
2111
0
        srcVec[0] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88);
2112
0
        srcVec[4] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88);
2113
0
        srcVec[2] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD);
2114
0
        srcVec[6] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD);
2115
2116
        //
2117
        // M3t, the second half.
2118
        //
2119
2120
0
        srcVec[ 9] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88);
2121
0
        srcVec[13] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88);
2122
0
        srcVec[11] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD);
2123
0
        srcVec[15] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD);
2124
2125
        //
2126
        // M1 and M2 need to be done at the same time, because we're
2127
        //  swapping. 
2128
        //
2129
        // First, the first half of M1t
2130
        //
2131
2132
0
        transTmp[0] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0x44);
2133
0
        transTmp[1] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0x44);
2134
0
        transTmp[2] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0xEE);
2135
0
        transTmp[3] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0xEE);
2136
2137
        //
2138
        // And the first half of M2t
2139
        //
2140
2141
0
        transTmp2[0] = _mm_shuffle_ps (srcVec[ 8], srcVec[10], 0x44);
2142
0
        transTmp2[1] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0x44);
2143
0
        transTmp2[2] = _mm_shuffle_ps (srcVec[ 8], srcVec[10], 0xEE);
2144
0
        transTmp2[3] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0xEE);
2145
2146
        //
2147
        // Second half of M1t
2148
        //
2149
2150
0
        srcVec[ 8] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88);
2151
0
        srcVec[12] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88);
2152
0
        srcVec[10] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD);
2153
0
        srcVec[14] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD);
2154
2155
        //
2156
        // Second half of M2t
2157
        //
2158
2159
0
        srcVec[1] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88);
2160
0
        srcVec[5] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88);
2161
0
        srcVec[3] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD);
2162
        srcVec[7] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD);
2163
0
    }
2164
0
}
2165
2166
#endif /* IMF_HAVE_SSE2 */
2167
2168
} // anonymous namespace
2169
2170
OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT
2171
2172
#endif